-rw-r--r--  share/man/man4/netmap.4          | 1131
-rw-r--r--  sys/dev/e1000/if_em.c            |    2
-rw-r--r--  sys/dev/e1000/if_igb.c           |    4
-rw-r--r--  sys/dev/e1000/if_lem.c           |    2
-rw-r--r--  sys/dev/ixgbe/ixgbe.c            |    2
-rw-r--r--  sys/dev/netmap/if_em_netmap.h    |   52
-rw-r--r--  sys/dev/netmap/if_igb_netmap.h   |   52
-rw-r--r--  sys/dev/netmap/if_lem_netmap.h   |   57
-rw-r--r--  sys/dev/netmap/if_re_netmap.h    |   84
-rw-r--r--  sys/dev/netmap/ixgbe_netmap.h    |   74
-rw-r--r--  sys/dev/netmap/netmap.c          |  835
-rw-r--r--  sys/dev/netmap/netmap_freebsd.c  |   26
-rw-r--r--  sys/dev/netmap/netmap_generic.c  | 1008
-rw-r--r--  sys/dev/netmap/netmap_kern.h     |  490
-rw-r--r--  sys/dev/netmap/netmap_mbq.c      |   15
-rw-r--r--  sys/dev/netmap/netmap_mbq.h      |    2
-rw-r--r--  sys/dev/netmap/netmap_mem2.c     |   20
-rw-r--r--  sys/dev/netmap/netmap_mem2.h     |    2
-rw-r--r--  sys/dev/netmap/netmap_vale.c     |  437
-rw-r--r--  sys/net/netmap.h                 |  381
-rw-r--r--  sys/net/netmap_user.h            |  189
-rw-r--r--  tools/tools/netmap/bridge.c      |   36
-rw-r--r--  tools/tools/netmap/nm_util.c     |   91
-rw-r--r--  tools/tools/netmap/nm_util.h     |    8
-rw-r--r--  tools/tools/netmap/pcap.c        |   18
-rw-r--r--  tools/tools/netmap/pkt-gen.c     |  297
-rw-r--r--  tools/tools/netmap/vale-ctl.c    |   16
27 files changed, 3118 insertions, 2213 deletions
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4
index 7975572b1e8a..523d8ddb8e5b 100644
--- a/share/man/man4/netmap.4
+++ b/share/man/man4/netmap.4
@@ -1,4 +1,4 @@
-.\" Copyright (c) 2011-2013 Matteo Landi, Luigi Rizzo, Universita` di Pisa
+.\" Copyright (c) 2011-2014 Matteo Landi, Luigi Rizzo, Universita` di Pisa
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
@@ -27,434 +27,546 @@
.\"
.\" $FreeBSD$
.\"
-.Dd October 18, 2013
+.Dd January 4, 2014
.Dt NETMAP 4
.Os
.Sh NAME
.Nm netmap
.Nd a framework for fast packet I/O
+.br
+.Nm VALE
+.Nd a fast VirtuAl Local Ethernet using the netmap API
.Sh SYNOPSIS
.Cd device netmap
.Sh DESCRIPTION
.Nm
is a framework for extremely fast and efficient packet I/O
-(reaching 14.88 Mpps with a single core at less than 1 GHz)
for both userspace and kernel clients.
-Userspace clients can use the netmap API
-to send and receive raw packets through physical interfaces
-or ports of the
-.Xr VALE 4
-switch.
+It runs on FreeBSD and Linux,
+and includes
+.Nm VALE ,
+a very fast and modular in-kernel software switch/dataplane.
.Pp
+.Nm
+and
.Nm VALE
-is a very fast (reaching 20 Mpps per port)
-and modular software switch,
-implemented within the kernel, which can interconnect
-virtual ports, physical devices, and the native host stack.
+are one order of magnitude faster than sockets, bpf or
+native switches based on
+.Xr tun/tap 4 ,
+reaching 14.88 Mpps with much less than one core on a 10 Gbit NIC,
+and 20 Mpps per core for VALE ports.
.Pp
+Userspace clients can dynamically switch NICs into
.Nm
-uses a memory mapped region to share packet buffers,
-descriptors and queues with the kernel.
-Simple
-.Pa ioctl()s
-are used to bind interfaces/ports to file descriptors and
-implement non-blocking I/O, whereas blocking I/O uses
-.Pa select()/poll() .
-.Nm
-can exploit the parallelism in multiqueue devices and
-multicore systems.
+mode and send and receive raw packets through
+memory mapped buffers.
+A selectable file descriptor supports
+synchronization and blocking I/O.
+.Pp
+Similarly,
+.Nm VALE
+can dynamically create switch instances and ports,
+providing high speed packet I/O between processes,
+virtual machines, NICs and the host stack.
.Pp
-For the best performance,
+For best performance,
.Nm
requires explicit support in device drivers;
-a generic emulation layer is available to implement the
+however, the
.Nm
-API on top of unmodified device drivers,
+API can be emulated on top of unmodified device drivers,
at the price of reduced performance
-(but still better than what can be achieved with
-sockets or BPF/pcap).
+(but still better than sockets or BPF/pcap).
.Pp
-For a list of devices with native
+In the rest of this (long) manual page we document
+various aspects of the
.Nm
-support, see the end of this manual page.
-.Sh OPERATION - THE NETMAP API
+and
+.Nm VALE
+architecture, features and usage.
+.Pp
+.Sh ARCHITECTURE
.Nm
-clients must first
-.Pa open("/dev/netmap") ,
-and then issue an
-.Pa ioctl(fd, NIOCREGIF, (struct nmreq *)arg)
-to bind the file descriptor to a specific interface or port.
+supports raw packet I/O through a
+.Em port ,
+which can be connected to a physical interface
+.Em ( NIC ) ,
+to the host stack,
+or to a
+.Nm VALE
+switch.
+Ports use preallocated circular queues of buffers
+.Em ( rings )
+residing in an mmapped region.
+There is one ring for each transmit/receive queue of a
+NIC or virtual port.
+An additional ring pair connects to the host stack.
+.Pp
+After binding a file descriptor to a port, a
.Nm
-has multiple modes of operation controlled by the
-content of the
-.Pa struct nmreq
-passed to the
-.Pa ioctl() .
-In particular, the
-.Em nr_name
-field specifies whether the client operates on a physical network
-interface or on a port of a
+client can send or receive packets in batches through
+the rings, and possibly implement zero-copy forwarding
+between ports.
+.Pp
+All NICs operating in
+.Nm
+mode use the same memory region,
+accessible to all processes who own
+.Nm /dev/netmap
+file descriptors bound to NICs.
.Nm VALE
-switch, as indicated below. Additional fields in the
-.Pa struct nmreq
-control the details of operation.
+ports instead use separate memory regions.
+.Pp
+.Sh ENTERING AND EXITING NETMAP MODE
+Ports and rings are created and controlled through a file descriptor,
+created by opening a special device
+.Dl fd = open("/dev/netmap");
+and then bound to a specific port with an
+.Dl ioctl(fd, NIOCREGIF, (struct nmreq *)arg);
+.Pp
+.Nm
+has multiple modes of operation controlled by the
+.Vt struct nmreq
+argument.
+.Va arg.nr_name
+specifies the port name, as follows:
.Bl -tag -width XXXX
-.It Dv Interface name (e.g. 'em0', 'eth1', ... )
-The data path of the interface is disconnected from the host stack.
-Depending on additional arguments,
-the file descriptor is bound to the NIC (one or all queues),
-or to the host stack.
+.It Dv OS network interface name (e.g. 'em0', 'eth1', ... )
+the data path of the NIC is disconnected from the host stack,
+and the file descriptor is bound to the NIC (one or all queues),
+or to the host stack;
.It Dv valeXXX:YYY (arbitrary XXX and YYY)
-The file descriptor is bound to port YYY of a VALE switch called XXX,
-where XXX and YYY are arbitrary alphanumeric strings.
+the file descriptor is bound to port YYY of a VALE switch called XXX,
+both dynamically created if necessary.
The string cannot exceed IFNAMSIZ characters, and YYY cannot
-matching the name of any existing interface.
-.Pp
-The switch and the port are created if not existing.
-.It Dv valeXXX:ifname (ifname is an existing interface)
-Flags in the argument control whether the physical interface
-(and optionally the corrisponding host stack endpoint)
-are connected or disconnected from the VALE switch named XXX.
-.Pp
-In this case the
-.Pa ioctl()
-is used only for configuring the VALE switch, typically through the
-.Nm vale-ctl
-command.
-The file descriptor cannot be used for I/O, and should be
-.Pa close()d
-after issuing the
-.Pa ioctl().
+be the name of any existing OS network interface.
.El
.Pp
-The binding can be removed (and the interface returns to
-regular operation, or the virtual port destroyed) with a
-.Pa close()
-on the file descriptor.
+On return,
+.Va arg
+indicates the size of the shared memory region,
+and the number, size and location of all the
+.Nm
+data structures, which can be accessed by mmapping the memory
+.Dl char *mem = mmap(0, arg.nr_memsize, fd);
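+.Pp
+As an illustration only, the whole binding sequence might be written
+as in the following sketch
+(the interface name 'em0' is just an example, error checks are omitted,
+and the mmap arguments are spelled out in full):
+.Bd -literal -compact
+	struct nmreq nmr;
+	void *mem;
+	int fd;
+
+	fd = open("/dev/netmap", O_RDWR);
+	bzero(&nmr, sizeof(nmr));
+	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
+	nmr.nr_version = NETMAP_API;
+	ioctl(fd, NIOCREGIF, &nmr);	/* em0 enters netmap mode */
+	mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
+	    MAP_SHARED, fd, 0);	/* map buffers, rings, netmap_if */
+.Ed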
.Pp
-The processes owning the file descriptor can then
-.Pa mmap()
-the memory region that contains pre-allocated
-buffers, descriptors and queues, and use them to
-read/write raw packets.
Non blocking I/O is done with special
-.Pa ioctl()'s ,
-whereas the file descriptor can be passed to
-.Pa select()/poll()
-to be notified about incoming packet or available transmit buffers.
-.Ss DATA STRUCTURES
-The data structures in the mmapped memory are described below
-(see
-.Xr sys/net/netmap.h
-for reference).
-All physical devices operating in
+.Xr ioctl 2
+.Xr select 2
+and
+.Xr poll 2
+on the file descriptor permit blocking I/O.
+.Xr epoll 2
+and
+.Xr kqueue 2
+are not supported on
.Nm
-mode use the same memory region,
-shared by the kernel and all processes who own
-.Pa /dev/netmap
-descriptors bound to those devices
-(NOTE: visibility may be restricted in future implementations).
-Virtual ports instead use separate memory regions,
-shared only with the kernel.
-.Pp
-All references between the shared data structure
-are relative (offsets or indexes). Some macros help converting
-them into actual pointers.
+file descriptors.
+.Pp
+While a NIC is in
+.Nm
+mode, the OS will still believe the interface is up and running.
+OS-generated packets for that NIC end up into a
+.Nm
+ring, and another ring is used to send packets into the OS network stack.
+A
+.Xr close 2
+on the file descriptor removes the binding,
+and returns the NIC to normal mode (reconnecting the data path
+to the host stack), or destroys the virtual port.
+.Pp
+.Sh DATA STRUCTURES
+The data structures in the mmapped memory region are detailed in
+.Xr sys/net/netmap.h ,
+which is the ultimate reference for the
+.Nm
+API. The main structures and fields are indicated below:
.Bl -tag -width XXX
.It Dv struct netmap_if (one per interface)
-indicates the number of rings supported by an interface, their
-sizes, and the offsets of the
-.Pa netmap_rings
-associated to the interface.
-.Pp
-.Pa struct netmap_if
-is at offset
-.Pa nr_offset
-in the shared memory region is indicated by the
-field in the structure returned by the
-.Pa NIOCREGIF
-(see below).
.Bd -literal
struct netmap_if {
- char ni_name[IFNAMSIZ]; /* name of the interface. */
- const u_int ni_version; /* API version */
- const u_int ni_rx_rings; /* number of rx ring pairs */
- const u_int ni_tx_rings; /* if 0, same as ni_rx_rings */
- const ssize_t ring_ofs[]; /* offset of tx and rx rings */
+ ...
+ const uint32_t ni_flags; /* properties */
+ ...
+ const uint32_t ni_tx_rings; /* NIC tx rings */
+ const uint32_t ni_rx_rings; /* NIC rx rings */
+ const uint32_t ni_extra_tx_rings; /* extra tx rings */
+ const uint32_t ni_extra_rx_rings; /* extra rx rings */
+ ...
};
.Ed
+.Pp
+Indicates the number of available rings
+.Pa ( struct netmap_rings )
+and their position in the mmapped region.
+The number of tx and rx rings
+.Pa ( ni_tx_rings , ni_rx_rings )
+normally depends on the hardware.
+NICs also have an extra tx/rx ring pair connected to the host stack.
+.Em NIOCREGIF
+can request additional tx/rx rings,
+to be used between multiple processes/threads
+accessing the same
+.Nm
+port.
.It Dv struct netmap_ring (one per ring)
-Contains the positions in the transmit and receive rings to
-synchronize the kernel and the application,
-and an array of
-.Pa slots
-describing the buffers.
-'reserved' is used in receive rings to tell the kernel the
-number of slots after 'cur' that are still in usr
-indicates how many slots starting from 'cur'
-the
-.Pp
-Each physical interface has one
-.Pa netmap_ring
-for each hardware transmit and receive ring,
-plus one extra transmit and one receive structure
-that connect to the host stack.
.Bd -literal
struct netmap_ring {
- const ssize_t buf_ofs; /* see details */
- const uint32_t num_slots; /* number of slots in the ring */
- uint32_t avail; /* number of usable slots */
- uint32_t cur; /* 'current' read/write index */
- uint32_t reserved; /* not refilled before current */
-
- const uint16_t nr_buf_size;
- uint16_t flags;
-#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
-#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
-#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */
- struct timeval ts;
- struct netmap_slot slot[0]; /* array of slots */
+ ...
+ const uint32_t num_slots; /* slots in each ring */
+ const uint32_t nr_buf_size; /* size of each buffer */
+ ...
+ uint32_t head; /* (u) first buf owned by user */
+ uint32_t cur; /* (u) wakeup position */
+ const uint32_t tail; /* (k) first buf owned by kernel */
+ ...
+ uint32_t flags;
+ struct timeval ts; /* (k) time of last rxsync() */
+ ...
+ struct netmap_slot slot[0]; /* array of slots */
}
.Ed
.Pp
-In transmit rings, after a system call 'cur' indicates
-the first slot that can be used for transmissions,
-and 'avail' reports how many of them are available.
-Before the next netmap-related system call on the file
-descriptor, the application should fill buffers and
-slots with data, and update 'cur' and 'avail'
-accordingly, as shown in the figure below:
+Implements transmit and receive rings, with read/write
+pointers, metadata and an array of
+.Pa slots
+describing the buffers.
+.Pp
+.It Dv struct netmap_slot (one per buffer)
.Bd -literal
-
- cur
- |----- avail ---| (after syscall)
- v
- TX [*****aaaaaaaaaaaaaaaaa**]
- TX [*****TTTTTaaaaaaaaaaaa**]
- ^
- |-- avail --| (before syscall)
- cur
+struct netmap_slot {
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* packet length */
+ uint16_t flags; /* buf changed, etc. */
+ uint64_t ptr; /* address for indirect buffers */
+};
.Ed
-In receive rings, after a system call 'cur' indicates
-the first slot that contains a valid packet,
-and 'avail' reports how many of them are available.
-Before the next netmap-related system call on the file
-descriptor, the application can process buffers and
-release them to the kernel updating
-'cur' and 'avail' accordingly, as shown in the figure below.
-Receive rings have an additional field called 'reserved'
-to indicate how many buffers before 'cur' are still
-under processing and cannot be released.
+.Pp
+Describes a packet buffer, which normally is identified by
+an index and resides in the mmapped region.
+.It Dv packet buffers
+Fixed size (normally 2 KB) packet buffers allocated by the kernel.
+.El
+.Pp
+The offset of the
+.Pa struct netmap_if
+in the mmapped region is indicated by the
+.Pa nr_offset
+field in the structure returned by
+.Pa NIOCREGIF .
+From there, all other objects are reachable through
+relative references (offsets or indexes).
+Macros and functions in <net/netmap_user.h>
+help converting them into actual pointers:
+.Pp
+.Dl struct netmap_if *nifp = NETMAP_IF(mem, arg.nr_offset);
+.Dl struct netmap_ring *txr = NETMAP_TXRING(nifp, ring_index);
+.Dl struct netmap_ring *rxr = NETMAP_RXRING(nifp, ring_index);
+.Pp
+.Dl char *buf = NETMAP_BUF(ring, buffer_index);
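+.Pp
+For instance (a minimal sketch, with
+.Va mem
+and
+.Va nmr
+as in the binding sketch above), the buffer behind the current slot of
+the first transmit ring is reached with:
+.Bd -literal -compact
+	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
+	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);
+	struct netmap_slot *slot = &txr->slot[txr->cur];
+	char *p = NETMAP_BUF(txr, slot->buf_idx);
+.Ed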
+.Sh RINGS, BUFFERS AND DATA I/O
+.Va Rings
+are circular queues of packets with three indexes/pointers
+.Va ( head , cur , tail ) ;
+one slot is always kept empty.
+The ring size
+.Va ( num_slots )
+should not be assumed to be a power of two.
+.br
+(NOTE: older versions of netmap used head/count format to indicate
+the content of a ring).
+.Pp
+.Va head
+is the first slot available to userspace;
+.br
+.Va cur
+is the wakeup point:
+select/poll will unblock when
+.Va tail
+passes
+.Va cur ;
+.br
+.Va tail
+is the first slot reserved to the kernel.
+.Pp
+Slot indexes MUST only move forward;
+for convenience, the function
+.Dl nm_ring_next(ring, index)
+returns the next index modulo the ring size.
+.Pp
+.Va head
+and
+.Va cur
+are only modified by the user program;
+.Va tail
+is only modified by the kernel.
+The kernel only reads/writes the
+.Vt struct netmap_ring
+slots and buffers
+during the execution of a netmap-related system call.
+The only exception are slots (and buffers) in the range
+.Va tail\ . . . head-1 ,
+that are explicitly assigned to the kernel.
+.Pp
+.Ss TRANSMIT RINGS
+On transmit rings, after a
+.Nm
+system call, slots in the range
+.Va head\ . . . tail-1
+are available for transmission.
+User code should fill the slots sequentially
+and advance
+.Va head
+and
+.Va cur
+past slots ready to transmit.
+.Va cur
+may be moved further ahead if the user code needs
+more slots before further transmissions (see
+.Sx SCATTER GATHER I/O ) .
+.Pp
+At the next NIOCTXSYNC/select()/poll(),
+slots up to
+.Va head-1
+are pushed to the port, and
+.Va tail
+may advance if further slots have become available.
+Below is an example of the evolution of a TX ring:
+.Pp
.Bd -literal
- cur
- |-res-|-- avail --| (after syscall)
- v
- RX [**rrrrrrRRRRRRRRRRRR******]
- RX [**...........rrrrRRR******]
- |res|--|<avail (before syscall)
- ^
- cur
+ after the syscall, slots between cur and tail are (a)vailable
+ head=cur tail
+ | |
+ v v
+ TX [.....aaaaaaaaaaa.............]
+ user creates new packets to (T)ransmit
+ head=cur tail
+ | |
+ v v
+ TX [.....TTTTTaaaaaa.............]
+
+ NIOCTXSYNC/poll()/select() sends packets and reports new slots
+ head=cur tail
+ | |
+ v v
+ TX [..........aaaaaaaaaaa........]
.Ed
-.It Dv struct netmap_slot (one per packet)
-contains the metadata for a packet:
+.Pp
+select() and poll() will block if there is no space in the ring, i.e.
+.Dl ring->cur == ring->tail
+and return when new slots have become available.
+.Pp
+High speed applications may want to amortize the cost of system calls
+by preparing as many packets as possible before issuing them.
+.Pp
+A transmit ring with pending transmissions has
+.Dl ring->head != ring->tail + 1 (modulo the ring size).
+The function
+.Va int nm_tx_pending(ring)
+implements this test.
+.Pp
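+A minimal transmit loop following these rules is sketched below;
+.Va make_packet()
+is a hypothetical application routine that fills a buffer and returns
+the frame length, and
+.Va fd
+is the descriptor bound to the port:
+.Bd -literal -compact
+	while (!nm_ring_empty(txr)) {
+		uint32_t i = txr->cur;
+		struct netmap_slot *slot = &txr->slot[i];
+		char *p = NETMAP_BUF(txr, slot->buf_idx);
+
+		slot->len = make_packet(p, txr->nr_buf_size);
+		txr->head = txr->cur = nm_ring_next(txr, i);
+	}
+	ioctl(fd, NIOCTXSYNC, NULL);	/* push the whole batch out */
+.Ed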
+.Ss RECEIVE RINGS
+On receive rings, after a
+.Nm
+system call, the slots in the range
+.Va head\& . . . tail-1
+contain received packets.
+User code should process them and advance
+.Va head
+and
+.Va cur
+past slots it wants to return to the kernel.
+.Va cur
+may be moved further ahead if the user code wants to
+wait for more packets
+without returning all the previous slots to the kernel.
+.Pp
+At the next NIOCRXSYNC/select()/poll(),
+slots up to
+.Va head-1
+are returned to the kernel for further receives, and
+.Va tail
+may advance to report new incoming packets.
+.br
+Below is an example of the evolution of an RX ring:
.Bd -literal
-struct netmap_slot {
- uint32_t buf_idx; /* buffer index */
- uint16_t len; /* packet length */
- uint16_t flags; /* buf changed, etc. */
-#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */
-#define NS_REPORT 0x0002 /* tell hw to report results
- * e.g. by generating an interrupt
- */
-#define NS_FORWARD 0x0004 /* pass packet to the other endpoint
- * (host stack or device)
- */
-#define NS_NO_LEARN 0x0008
-#define NS_INDIRECT 0x0010
-#define NS_MOREFRAG 0x0020
-#define NS_PORT_SHIFT 8
-#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
-#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
- uint64_t ptr; /* buffer address (indirect buffers) */
-};
+ after the syscall, there are some (h)eld and some (R)eceived slots
+ head cur tail
+ | | |
+ v v v
+ RX [..hhhhhhRRRRRRRR..........]
+
+ user advances head and cur, releasing some slots and holding others
+ head cur tail
+ | | |
+ v v v
+ RX [..*****hhhRRRRRR...........]
+
+ NIOCRXSYNC/poll()/select() recovers slots and reports new packets
+ head cur tail
+ | | |
+ v v v
+ RX [.......hhhRRRRRRRRRRRR....]
.Ed
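+.Pp
+A matching receive loop can be sketched as follows, where
+.Va consume_pkt()
+is a hypothetical application routine and
+.Va rxr
+a receive ring obtained with NETMAP_RXRING():
+.Bd -literal -compact
+	ioctl(fd, NIOCRXSYNC, NULL);	/* or poll() with POLLIN */
+	while (!nm_ring_empty(rxr)) {
+		uint32_t i = rxr->cur;
+		struct netmap_slot *slot = &rxr->slot[i];
+		char *p = NETMAP_BUF(rxr, slot->buf_idx);
+
+		consume_pkt(p, slot->len);	/* process the packet */
+		rxr->head = rxr->cur = nm_ring_next(rxr, i);
+	}
+.Ed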
-The flags control how the the buffer associated to the slot
-should be managed.
-.It Dv packet buffers
-are normally fixed size (2 Kbyte) buffers allocated by the kernel
-that contain packet data. Buffers addresses are computed through
-macros.
-.El
-.Bl -tag -width XXX
-Some macros support the access to objects in the shared memory
-region. In particular,
-.It NETMAP_TXRING(nifp, i)
-.It NETMAP_RXRING(nifp, i)
-return the address of the i-th transmit and receive ring,
-respectively, whereas
-.It NETMAP_BUF(ring, buf_idx)
-returns the address of the buffer with index buf_idx
-(which can be part of any ring for the given interface).
-.El
.Pp
-Normally, buffers are associated to slots when interfaces are bound,
-and one packet is fully contained in a single buffer.
-Clients can however modify the mapping using the
-following flags:
-.Ss FLAGS
+.Sh SLOTS AND PACKET BUFFERS
+Normally, packets should be stored in the netmap-allocated buffers
+assigned to slots when ports are bound to a file descriptor.
+One packet is fully contained in a single buffer.
+.Pp
+The following flags affect slot and buffer processing:
.Bl -tag -width XXX
.It NS_BUF_CHANGED
-indicates that the buf_idx in the slot has changed.
-This can be useful if the client wants to implement
-some form of zero-copy forwarding (e.g. by passing buffers
-from an input interface to an output interface), or
-needs to process packets out of order.
+it MUST be used when the buf_idx in the slot is changed.
+This can be used to implement
+zero-copy forwarding, see
+.Sx ZERO-COPY FORWARDING .
.Pp
-The flag MUST be used whenever the buffer index is changed.
.It NS_REPORT
-indicates that we want to be woken up when this buffer
-has been transmitted. This reduces performance but insures
-a prompt notification when a buffer has been sent.
+reports when this buffer has been transmitted.
Normally,
.Nm
notifies transmit completions in batches, hence signals
-can be delayed indefinitely. However, we need such notifications
-before closing a descriptor.
+can be delayed indefinitely. This flag helps detect
+when packets have been sent and a file descriptor can be closed.
.It NS_FORWARD
-When the device is open in 'transparent' mode,
-the client can mark slots in receive rings with this flag.
-For all marked slots, marked packets are forwarded to
-the other endpoint at the next system call, thus restoring
-(in a selective way) the connection between the NIC and the
-host stack.
+When a ring is in 'transparent' mode (see
+.Sx TRANSPARENT MODE ) ,
+packets marked with this flag are forwarded to the other endpoint
+at the next system call, thus restoring (in a selective way)
+the connection between a NIC and the host stack.
.It NS_NO_LEARN
tells the forwarding code that the SRC MAC address for this
-packet should not be used in the learning bridge
+packet must not be used in the learning bridge code.
.It NS_INDIRECT
-indicates that the packet's payload is not in the netmap
-supplied buffer, but in a user-supplied buffer whose
-user virtual address is in the 'ptr' field of the slot.
+indicates that the packet's payload is in a user-supplied buffer,
+whose user virtual address is in the 'ptr' field of the slot.
The size can reach 65535 bytes.
-.Em This is only supported on the transmit ring of virtual ports
+.br
+This is only supported on the transmit ring of
+.Nm VALE
+ports, and it helps reduce data copies in the interconnection
+of virtual machines.
.It NS_MOREFRAG
indicates that the packet continues with subsequent buffers;
the last buffer in a packet must have the flag clear.
+.El
+.Sh SCATTER GATHER I/O
+Packets can span multiple slots if the
+.Va NS_MOREFRAG
+flag is set in all but the last slot.
The maximum length of a chain is 64 buffers.
-.Em This is only supported on virtual ports
-.It NS_RFRAGS(slot)
-on receive rings, returns the number of remaining buffers
-in a packet, including this one.
-Slots with a value greater than 1 also have NS_MOREFRAG set.
-The length refers to the individual buffer, there is no
-field for the total length.
+This is normally used with
+.Nm VALE
+ports when connecting virtual machines, as they generate large
+TSO segments that are not split unless they reach a physical device.
.Pp
-On transmit rings, if NS_DST is set, it is passed to the lookup
-function, which can use it e.g. as the index of the destination
-port instead of doing an address lookup.
-.El
+NOTE: The length field always refers to the individual
+fragment; there is no field reporting the total length of a packet.
+.Pp
+On receive rings the macro
+.Va NS_RFRAGS(slot)
+indicates the remaining number of slots for this packet,
+including the current one.
+Slots with a value greater than 1 also have NS_MOREFRAG set.
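+.Pp
+As a sketch, a payload larger than one buffer could be sent on the
+transmit ring of a
+.Nm VALE
+port as follows (assuming enough free slots are available; the
+.Va data
+and
+.Va len
+variables belong to the application):
+.Bd -literal -compact
+	while (len > 0) {
+		uint32_t i = txr->cur;
+		struct netmap_slot *slot = &txr->slot[i];
+		u_int frag = len > txr->nr_buf_size ?
+		    txr->nr_buf_size : len;
+
+		memcpy(NETMAP_BUF(txr, slot->buf_idx), data, frag);
+		slot->len = frag;
+		data += frag;
+		len -= frag;
+		/* NS_MOREFRAG on all but the last fragment */
+		slot->flags = (len > 0) ? NS_MOREFRAG : 0;
+		txr->head = txr->cur = nm_ring_next(txr, i);
+	}
+.Ed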
.Sh IOCTLS
.Nm
-supports some ioctl() to synchronize the state of the rings
-between the kernel and the user processes, plus some
-to query and configure the interface.
-The former do not require any argument, whereas the latter
-use a
-.Pa struct nmreq
-defined as follows:
+uses two ioctls (NIOCTXSYNC, NIOCRXSYNC)
+for non-blocking I/O. They take no argument.
+Two more ioctls (NIOCGINFO, NIOCREGIF) are used
+to query and configure ports, with the following argument:
.Bd -literal
struct nmreq {
- char nr_name[IFNAMSIZ];
- uint32_t nr_version; /* API version */
-#define NETMAP_API 4 /* current version */
- uint32_t nr_offset; /* nifp offset in the shared region */
- uint32_t nr_memsize; /* size of the shared region */
- uint32_t nr_tx_slots; /* slots in tx rings */
- uint32_t nr_rx_slots; /* slots in rx rings */
- uint16_t nr_tx_rings; /* number of tx rings */
- uint16_t nr_rx_rings; /* number of tx rings */
- uint16_t nr_ringid; /* ring(s) we care about */
-#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */
-#define NETMAP_SW_RING 0x2000 /* we process the sw ring */
-#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
-#define NETMAP_RING_MASK 0xfff /* the actual ring number */
- uint16_t nr_cmd;
-#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
-#define NETMAP_BDG_DETACH 2 /* detach the NIC */
-#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */
-#define NETMAP_BDG_LIST 4 /* get bridge's info */
- uint16_t nr_arg1;
- uint16_t nr_arg2;
- uint32_t spare2[3];
+ char nr_name[IFNAMSIZ]; /* (i) port name */
+ uint32_t nr_version; /* (i) API version */
+ uint32_t nr_offset; /* (o) nifp offset in mmap region */
+ uint32_t nr_memsize; /* (o) size of the mmap region */
+ uint32_t nr_tx_slots; /* (o) slots in tx rings */
+ uint32_t nr_rx_slots; /* (o) slots in rx rings */
+ uint16_t nr_tx_rings; /* (o) number of tx rings */
+ uint16_t nr_rx_rings; /* (o) number of rx rings */
+ uint16_t nr_ringid; /* (i) ring(s) we care about */
+ uint16_t nr_cmd; /* (i) special command */
+ uint16_t nr_arg1; /* (i) extra arguments */
+ uint16_t nr_arg2; /* (i) extra arguments */
+ ...
};
-
.Ed
-A device descriptor obtained through
+.Pp
+A file descriptor obtained through
.Pa /dev/netmap
-also supports the ioctl supported by network devices.
+also supports the ioctl supported by network devices, see
+.Xr netintro 4 .
.Pp
-The netmap-specific
-.Xr ioctl 2
-command codes below are defined in
-.In net/netmap.h
-and are:
.Bl -tag -width XXXX
.It Dv NIOCGINFO
-returns EINVAL if the named device does not support netmap.
+returns EINVAL if the named port does not support netmap.
Otherwise, it returns 0 and (advisory) information
-about the interface.
+about the port.
Note that all the information below can change before the
interface is actually put in netmap mode.
.Pp
-.Pa nr_memsize
-indicates the size of the netmap
-memory region. Physical devices all share the same memory region,
-whereas VALE ports may have independent regions for each port.
-These sizes can be set through system-wise sysctl variables.
-.Pa nr_tx_slots, nr_rx_slots
+.Bl -tag -width XX
+.It Pa nr_memsize
+indicates the size of the
+.Nm
+memory region. NICs in
+.Nm
+mode all share the same memory region,
+whereas
+.Nm VALE
+ports have independent regions for each port.
+.It Pa nr_tx_slots , nr_rx_slots
indicate the size of transmit and receive rings.
-.Pa nr_tx_rings, nr_rx_rings
+.It Pa nr_tx_rings , nr_rx_rings
indicate the number of transmit
and receive rings.
Both ring number and sizes may be configured at runtime
using interface-specific functions (e.g.
-.Pa sysctl
-or
-.Pa ethtool .
+.Xr ethtool
+).
+.El
.It Dv NIOCREGIF
-puts the interface named in nr_name into netmap mode, disconnecting
-it from the host stack, and/or defines which rings are controlled
-through this file descriptor.
+binds the port named in
+.Va nr_name
+to the file descriptor. For a physical device this also switches it into
+.Nm
+mode, disconnecting
+it from the host stack.
+Multiple file descriptors can be bound to the same port,
+with proper synchronization left to the user.
+.Pp
On return, it gives the same info as NIOCGINFO, and nr_ringid
indicates the identity of the rings controlled through the file
descriptor.
.Pp
-Possible values for nr_ringid are
+.Va nr_ringid
+selects which rings are controlled through this file descriptor.
+Possible values are:
.Bl -tag -width XXXXX
.It 0
-default, all hardware rings
+(default) all hardware rings
.It NETMAP_SW_RING
-the ``host rings'' connecting to the host stack
-.It NETMAP_HW_RING + i
-the i-th hardware ring
+the ``host rings'', connecting to the host stack.
+.It NETMAP_HW_RING | i
+the i-th hardware ring.
.El
+.Pp
By default, a
-.Nm poll
+.Xr poll 2
or
-.Nm select
+.Xr select 2
call pushes out any pending packets on the transmit ring, even if
no write events are specified.
The feature can be disabled by or-ing
-.Nm NETMAP_NO_TX_SYNC
-to nr_ringid.
-But normally you should keep this feature unless you are using
-separate file descriptors for the send and receive rings, because
-otherwise packets are pushed out only if NETMAP_TXSYNC is called,
-or the send queue is full.
-.Pp
-.Pa NIOCREGIF
-can be used multiple times to change the association of a
-file descriptor to a ring pair, always within the same device.
+.Va NETMAP_NO_TX_SYNC
+to the value written to
+.Va nr_ringid.
+When this feature is used,
+packets are transmitted only when
+.Va ioctl(NIOCTXSYNC)
+or select()/poll() is called with a write event (POLLOUT/wfdset),
+or when the ring is full (see the sketch after this list).
.Pp
When registering a virtual interface that is dynamically created to a
.Xr vale 4
@@ -467,6 +579,164 @@ number of slots available for transmission.
tells the hardware of consumed packets, and asks for newly available
packets.
.El
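+.Pp
+As an example of the
+.Va nr_ringid
+values above, the following sketch binds a file descriptor obtained
+from /dev/netmap to hardware ring pair 2 only
+(the interface name is arbitrary):
+.Bd -literal -compact
+	struct nmreq nmr;
+
+	bzero(&nmr, sizeof(nmr));
+	strncpy(nmr.nr_name, "ix0", sizeof(nmr.nr_name));
+	nmr.nr_version = NETMAP_API;
+	nmr.nr_ringid = NETMAP_HW_RING | 2;	/* hw ring pair 2 */
+	ioctl(fd, NIOCREGIF, &nmr);
+.Ed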
+.Sh SELECT AND POLL
+.Xr select 2
+and
+.Xr poll 2
+on a
+.Nm
+file descriptor process rings as indicated in
+.Sx TRANSMIT RINGS
+and
+.Sx RECEIVE RINGS
+when write (POLLOUT) and read (POLLIN) events are requested.
+.Pp
+Both block if no slots are available in the ring
+.Va ( ring->cur == ring->tail ) .
+.Pp
+Packets in transmit rings are normally pushed out even without
+requesting write events. Passing the NETMAP_NO_TX_SYNC flag to
+.Em NIOCREGIF
+disables this feature.
+.Sh LIBRARIES
+The
+.Nm
+API is supposed to be used directly, both because of its simplicity and
+for efficient integration with applications.
+.Pp
+For convenience, the
+.Va <net/netmap_user.h>
+header provides a few macros and functions to ease creating
+a file descriptor and doing I/O with a
+.Nm
+port. These are loosely modeled after the
+.Xr pcap 3
+API, to ease porting of libpcap-based applications to
+.Nm .
+To use these extra functions, programs should
+.Dl #define NETMAP_WITH_LIBS
+before
+.Dl #include <net/netmap_user.h>
+.Pp
+The following functions are available:
+.Bl -tag -width XXXXX
+.It Va struct nm_desc_t * nm_open(const char *ifname, const char *ring_name, int flags, int ring_flags)
+similar to
+.Xr pcap_open ,
+binds a file descriptor to a port.
+.Bl -tag -width XX
+.It Va ifname
+is a port name, in the form "netmap:XXX" for a NIC and "valeXXX:YYY" for a
+.Nm VALE
+port.
+.It Va flags
+can be set to
+.Va NETMAP_SW_RING
+to bind to the host ring pair,
+or to NETMAP_HW_RING to bind to a specific ring.
+.Va ring_name ,
+used together with NETMAP_HW_RING,
+is interpreted as a string or an integer indicating the ring to use.
+.It Va ring_flags
+is copied directly into the ring flags, to specify additional parameters
+such as NR_TIMESTAMP or NR_FORWARD.
+.El
+.It Va int nm_close(struct nm_desc_t *d)
+closes the file descriptor, unmaps memory, frees resources.
+.It Va int nm_inject(struct nm_desc_t *d, const void *buf, size_t size)
+similar to pcap_inject(), pushes a packet to a ring, returns the size
+of the packet if successful, or 0 on error;
+.It Va int nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
+similar to pcap_dispatch(), applies a callback to incoming packets
+.It Va u_char * nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr)
+similar to pcap_next(), fetches the next packet
+.Pp
+.El
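+.Pp
+For example, a minimal sender built on these helpers could be
+sketched as follows (the frame content in
+.Va pkt
+is left to the application):
+.Bd -literal -compact
+	struct nm_desc_t *d = nm_open("netmap:ix0", NULL, 0, 0);
+	char pkt[60];	/* minimum-size frame, filled elsewhere */
+	...
+	while (nm_inject(d, pkt, sizeof(pkt)) == 0) {
+		/* ring full: flush pending packets and retry */
+		ioctl(NETMAP_FD(d), NIOCTXSYNC, NULL);
+	}
+	nm_close(d);
+.Ed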
+.Sh SUPPORTED DEVICES
+.Nm
+natively supports the following devices:
+.Pp
+On FreeBSD:
+.Xr em 4 ,
+.Xr igb 4 ,
+.Xr ixgbe 4 ,
+.Xr lem 4 ,
+.Xr re 4 .
+.Pp
+On Linux
+.Xr e1000 4 ,
+.Xr e1000e 4 ,
+.Xr igb 4 ,
+.Xr ixgbe 4 ,
+.Xr mlx4 4 ,
+.Xr forcedeth 4 ,
+.Xr r8169 4 .
+.Pp
+NICs without native support can still be used in
+.Nm
+mode through emulation. Performance is inferior to native netmap
+mode but still significantly higher than sockets, and approaching
+that of in-kernel solutions such as Linux's
+.Xr pktgen .
+.Pp
+Emulation is also available for devices with native netmap support,
+which can be used for testing or performance comparison.
+The sysctl variable
+.Va dev.netmap.admode
+globally controls how netmap mode is implemented.
+.Sh SYSCTL VARIABLES AND MODULE PARAMETERS
+Some aspects of the operation of
+.Nm
+are controlled through sysctl variables on FreeBSD
+.Em ( dev.netmap.* )
+and module parameters on Linux
+.Em ( /sys/module/netmap_lin/parameters/* ) :
+.Pp
+.Bl -tag -width indent
+.It Va dev.netmap.admode: 0
+Controls the use of native or emulated adapter mode.
+0 uses the best available option, 1 forces native and
+fails if not available, 2 forces emulated hence never fails.
+.It Va dev.netmap.generic_ringsize: 1024
+Ring size used for emulated netmap mode
+.It Va dev.netmap.generic_mit: 100000
+Controls interrupt moderation for emulated mode
+.It Va dev.netmap.mmap_unreg: 0
+.It Va dev.netmap.fwd: 0
+Forces NS_FORWARD mode
+.It Va dev.netmap.flags: 0
+.It Va dev.netmap.txsync_retry: 2
+.It Va dev.netmap.no_pendintr: 1
+Forces recovery of transmit buffers on system calls
+.It Va dev.netmap.mitigate: 1
+Propagates interrupt mitigation to user processes
+.It Va dev.netmap.no_timestamp: 0
+Disables the update of the timestamp in the netmap ring
+.It Va dev.netmap.verbose: 0
+Verbose kernel messages
+.It Va dev.netmap.buf_num: 163840
+.It Va dev.netmap.buf_size: 2048
+.It Va dev.netmap.ring_num: 200
+.It Va dev.netmap.ring_size: 36864
+.It Va dev.netmap.if_num: 100
+.It Va dev.netmap.if_size: 1024
+Sizes and number of objects (netmap_if, netmap_ring, buffers)
+for the global memory region. The only parameter worth modifying is
+.Va dev.netmap.buf_num
+as it impacts the total amount of memory used by netmap.
+.It Va dev.netmap.buf_curr_num: 0
+.It Va dev.netmap.buf_curr_size: 0
+.It Va dev.netmap.ring_curr_num: 0
+.It Va dev.netmap.ring_curr_size: 0
+.It Va dev.netmap.if_curr_num: 0
+.It Va dev.netmap.if_curr_size: 0
+Actual values in use.
+.It Va dev.netmap.bridge_batch: 1024
+Batch size used when moving packets across a
+.Nm VALE
+switch. Values above 64 generally guarantee good
+performance.
+.El
.Sh SYSTEM CALLS
.Nm
uses
@@ -476,6 +746,9 @@ and
to wake up processes when significant events occur, and
.Xr mmap 2
to map memory.
+.Xr ioctl 2
+is used to configure ports and
+.Nm VALE switches .
.Pp
Applications may need to create threads and bind them to
specific cores to improve performance, using standard
@@ -484,47 +757,176 @@ OS primitives, see
In particular,
.Xr pthread_setaffinity_np 3
may be of use.
+.Sh CAVEATS
+No matter how fast the CPU and OS are,
+achieving line rate on 10G and faster interfaces
+requires hardware with sufficient performance.
+Several NICs are unable to sustain line rate with
+small packet sizes. Insufficient PCIe or memory bandwidth
+can also cause reduced performance.
+.Pp
+Another frequent reason for low performance is the use
+of flow control on the link: a slow receiver can limit
+the transmit speed.
+Be sure to disable flow control when running high
+speed experiments.
+.Pp
+.Ss SPECIAL NIC FEATURES
+.Nm
+is orthogonal to some NIC features such as
+multiqueue, schedulers, packet filters.
+.Pp
+Multiple transmit and receive rings are supported natively
+and can be configured with ordinary OS tools,
+such as
+.Xr ethtool
+or
+device-specific sysctl variables.
+The same goes for Receive Packet Steering (RPS)
+and filtering of incoming traffic.
+.Pp
+.Nm
+.Em does not use
+features such as
+.Em checksum offloading , TCP segmentation offloading ,
+.Em encryption , VLAN encapsulation/decapsulation ,
+etc.
+When using netmap to exchange packets with the host stack,
+make sure to disable these features.
.Sh EXAMPLES
+.Ss TEST PROGRAMS
+.Nm
+comes with a few programs that can be used for testing or
+simple applications.
+See the
+.Va examples/
+directory in
+.Nm
+distributions, or
+.Va tools/tools/netmap/
+directory in FreeBSD distributions.
+.Pp
+.Xr pkt-gen
+is a general purpose traffic source/sink.
+.Pp
+As an example
+.Dl pkt-gen -i ix0 -f tx -l 60
+can generate an infinite stream of minimum size packets, and
+.Dl pkt-gen -i ix0 -f rx
+is a traffic sink.
+Both print traffic statistics, to help monitor
+how the system performs.
+.Pp
+.Xr pkt-gen
+has many options that can be used to set packet sizes, addresses,
+and rates, and to use multiple send/receive threads and cores.
+.Pp
+.Xr bridge
+is another test program which interconnects two
+.Nm
+ports. It can be used for transparent forwarding between
+interfaces, as in
+.Dl bridge -i ix0 -i ix1
+or even connect the NIC to the host stack using netmap
+.Dl bridge -i ix0 -i ix0
+.Ss USING THE NATIVE API
The following code implements a traffic generator
.Pp
.Bd -literal -compact
-#include <net/netmap.h>
#include <net/netmap_user.h>
-struct netmap_if *nifp;
-struct netmap_ring *ring;
-struct nmreq nmr;
+...
+void sender(void)
+{
+ struct netmap_if *nifp;
+ struct netmap_ring *ring;
+ struct nmreq nmr;
+ struct pollfd fds;
-fd = open("/dev/netmap", O_RDWR);
-bzero(&nmr, sizeof(nmr));
-strcpy(nmr.nr_name, "ix0");
-nmr.nm_version = NETMAP_API;
-ioctl(fd, NIOCREGIF, &nmr);
-p = mmap(0, nmr.nr_memsize, fd);
-nifp = NETMAP_IF(p, nmr.nr_offset);
-ring = NETMAP_TXRING(nifp, 0);
-fds.fd = fd;
-fds.events = POLLOUT;
-for (;;) {
- poll(list, 1, -1);
- for ( ; ring->avail > 0 ; ring->avail--) {
- i = ring->cur;
- buf = NETMAP_BUF(ring, ring->slot[i].buf_index);
- ... prepare packet in buf ...
- ring->slot[i].len = ... packet length ...
- ring->cur = NETMAP_RING_NEXT(ring, i);
+ fd = open("/dev/netmap", O_RDWR);
+ bzero(&nmr, sizeof(nmr));
+ strcpy(nmr.nr_name, "ix0");
+ nmr.nr_version = NETMAP_API;
+ ioctl(fd, NIOCREGIF, &nmr);
+ p = mmap(0, nmr.nr_memsize, fd);
+ nifp = NETMAP_IF(p, nmr.nr_offset);
+ ring = NETMAP_TXRING(nifp, 0);
+ fds.fd = fd;
+ fds.events = POLLOUT;
+ for (;;) {
+ poll(&fds, 1, -1);
+ while (!nm_ring_empty(ring)) {
+ i = ring->cur;
+ buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+ ... prepare packet in buf ...
+ ring->slot[i].len = ... packet length ...
+ ring->head = ring->cur = nm_ring_next(ring, i);
+ }
}
}
.Ed
-.Sh SUPPORTED INTERFACES
+.Ss HELPER FUNCTIONS
+A simple receiver can be implemented using the helper functions
+.Bd -literal -compact
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+...
+void receiver(void)
+{
+ struct nm_desc_t *d;
+ struct pollfd fds;
+ u_char *buf;
+ struct nm_hdr_t h;
+ ...
+ d = nm_open("netmap:ix0", NULL, 0, 0);
+ fds.fd = NETMAP_FD(d);
+ fds.events = POLLIN;
+ for (;;) {
+ poll(&fds, 1, -1);
+ while ( (buf = nm_nextpkt(d, &h)) )
+ consume_pkt(buf, h.len);
+ }
+ nm_close(d);
+}
+.Ed
+.Ss ZERO-COPY FORWARDING
+Since physical interfaces share the same memory region,
+it is possible to do packet forwarding between ports
+by swapping buffers. The buffer from the transmit ring is used
+to replenish the receive ring:
+.Bd -literal -compact
+ uint32_t tmp;
+ struct netmap_slot *src, *dst;
+ ...
+ src = &rxr->slot[rxr->cur];
+ dst = &txr->slot[txr->cur];
+ tmp = dst->buf_idx;
+ dst->buf_idx = src->buf_idx;
+ dst->len = src->len;
+ dst->flags = NS_BUF_CHANGED;
+ src->buf_idx = tmp;
+ src->flags = NS_BUF_CHANGED;
+ rxr->head = rxr->cur = nm_ring_next(rxr, rxr->cur);
+ txr->head = txr->cur = nm_ring_next(txr, txr->cur);
+ ...
+.Ed
+.Ss ACCESSING THE HOST STACK
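+To exchange packets with the host stack, bind a file descriptor
+to the ``host rings'' of the interface by passing
+.Va NETMAP_SW_RING
+in
+.Va nr_ringid
+(see
+.Sx IOCTLS ) .
+A minimal sketch:
+.Bd -literal -compact
+	bzero(&nmr, sizeof(nmr));
+	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
+	nmr.nr_version = NETMAP_API;
+	nmr.nr_ringid = NETMAP_SW_RING;	/* host rings only */
+	ioctl(fd, NIOCREGIF, &nmr);
+	/*
+	 * the RX host ring now carries packets that the host stack
+	 * would have sent on em0; packets written to the TX host
+	 * ring are delivered to the host stack.
+	 */
+.Ed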
+.Ss VALE SWITCH
+A simple way to test the performance of a
+.Nm VALE
+switch is to attach a sender and a receiver to it,
+e.g. running the following in two different terminals:
+.Dl pkt-gen -i vale1:a -f rx # receiver
+.Dl pkt-gen -i vale1:b -f tx # sender
+.Pp
+The following command attaches an interface and the host stack
+to a switch:
+.Dl vale-ctl -h vale2:em0
+Other
.Nm
-supports the following interfaces:
-.Xr em 4 ,
-.Xr igb 4 ,
-.Xr ixgbe 4 ,
-.Xr lem 4 ,
-.Xr re 4
+clients attached to the same switch can now communicate
+with the network card or the host.
+.Pp
.Sh SEE ALSO
-.Xr vale 4
.Pp
http://info.iet.unipi.it/~luigi/netmap/
.Pp
@@ -551,3 +953,20 @@ and
.Nm VALE
have been funded by the European Commission within FP7 Projects
CHANGE (257422) and OPENLAB (287581).
+.Pp
+.Ss SPECIAL MODES
+When the device name has the form
+.Dl valeXXX:ifname (ifname is an existing interface)
+the physical interface
+(and optionally the corresponding host stack endpoint)
+are connected or disconnected from the
+.Nm VALE
+switch named XXX.
+In this case the
+.Pa ioctl()
+is used only for configuration, typically through the
+.Xr vale-ctl
+command.
+The file descriptor cannot be used for I/O, and should be
+closed after issuing the
+.Pa ioctl() .
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index 580407a529fd..428612a4a695 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -4352,7 +4352,7 @@ em_initialize_receive_unit(struct adapter *adapter)
* preserve the rx buffers passed to userspace.
*/
if (ifp->if_capenable & IFCAP_NETMAP)
- rdt -= NA(adapter->ifp)->rx_rings[i].nr_hwavail;
+ rdt -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[i]);
#endif /* DEV_NETMAP */
E1000_WRITE_REG(hw, E1000_RDT(i), rdt);
}
diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c
index 57e4f893ab35..2134e29625cc 100644
--- a/sys/dev/e1000/if_igb.c
+++ b/sys/dev/e1000/if_igb.c
@@ -4630,13 +4630,13 @@ igb_initialize_receive_units(struct adapter *adapter)
* an init() while a netmap client is active must
* preserve the rx buffers passed to userspace.
* In this driver it means we adjust RDT to
- * somthing different from next_to_refresh
+ * something different from next_to_refresh
* (which is not used in netmap mode).
*/
if (ifp->if_capenable & IFCAP_NETMAP) {
struct netmap_adapter *na = NA(adapter->ifp);
struct netmap_kring *kring = &na->rx_rings[i];
- int t = rxr->next_to_refresh - kring->nr_hwavail;
+ int t = rxr->next_to_refresh - nm_kr_rxspace(kring);
if (t >= adapter->num_rx_desc)
t -= adapter->num_rx_desc;
diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c
index a3da50c176ed..8014a0f9fde7 100644
--- a/sys/dev/e1000/if_lem.c
+++ b/sys/dev/e1000/if_lem.c
@@ -3367,7 +3367,7 @@ lem_initialize_receive_unit(struct adapter *adapter)
#ifdef DEV_NETMAP
/* preserve buffers already made available to clients */
if (ifp->if_capenable & IFCAP_NETMAP)
- rctl -= NA(adapter->ifp)->rx_rings[0].nr_hwavail;
+ rctl -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[0]);
#endif /* DEV_NETMAP */
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), rctl);
diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c
index 740f7709e5b2..6dfec02cc8d9 100644
--- a/sys/dev/ixgbe/ixgbe.c
+++ b/sys/dev/ixgbe/ixgbe.c
@@ -1245,7 +1245,7 @@ ixgbe_init_locked(struct adapter *adapter)
if (ifp->if_capenable & IFCAP_NETMAP) {
struct netmap_adapter *na = NA(adapter->ifp);
struct netmap_kring *kring = &na->rx_rings[i];
- int t = na->num_rx_desc - 1 - kring->nr_hwavail;
+ int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring);
IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t);
} else
diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h
index dbbee4222407..17b4c4fd2e14 100644
--- a/sys/dev/netmap/if_em_netmap.h
+++ b/sys/dev/netmap/if_em_netmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -120,9 +120,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, new_slots;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_txsync_prologue(kring, &new_slots);
+ u_int const head = kring->rhead;
/* generate an interrupt approximately every half ring */
u_int report_frequency = kring->nkr_num_slots >> 1;
@@ -130,9 +130,6 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct adapter *adapter = ifp->if_softc;
struct tx_ring *txr = &adapter->tx_rings[ring_nr];
- if (cur > lim) /* error checking in nm_txsync_prologue() */
- return netmap_ring_reinit(kring);
-
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -141,9 +138,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) { /* we have new packets to send */
+ if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@@ -175,9 +172,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwcur = cur; /* the saved ring->cur */
- /* decrease avail by # of packets sent minus previous ones */
- kring->nr_hwavail -= new_slots;
+ kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
@@ -190,26 +185,20 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
- if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
- int delta;
-
+ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
}
- delta = nic_i - txr->next_to_clean;
- if (delta) {
- /* some completed, increment hwavail. */
- if (delta < 0)
- delta += kring->nkr_num_slots;
+ if (nic_i != txr->next_to_clean) {
txr->next_to_clean = nic_i;
- kring->nr_hwavail += delta;
+ kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
}
- nm_txsync_finalize(kring, cur);
+ nm_txsync_finalize(kring);
return 0;
}
@@ -226,16 +215,16 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, resvd;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
+ u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
- if (cur > lim)
+ if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@@ -251,7 +240,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i = rxr->next_to_check;
nm_i = netmap_idx_n2k(kring, nic_i);
- for (n = 0; ; n++) {
+ for (n = 0; ; n++) { // XXX no need to count
struct e1000_rx_desc *curr = &rxr->rx_base[nic_i];
uint32_t staterr = le32toh(curr->status);
@@ -268,7 +257,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
if (n) { /* update the state variables */
rxr->next_to_check = nic_i;
- kring->nr_hwavail += n;
+ kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
@@ -277,9 +266,9 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) {
+ if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@@ -302,8 +291,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwavail -= n;
- kring->nr_hwcur = cur;
+ kring->nr_hwcur = head;
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
@@ -311,12 +299,12 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
- nic_i = (nic_i == 0) ? lim : nic_i - 1;
+ nic_i = nm_prev(nic_i, lim);
E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i);
}
/* tell userspace that there might be new packets */
- ring->avail = kring->nr_hwavail - resvd;
+ nm_rxsync_finalize(kring);
return 0;
diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h
index b91d0baba06f..e1929f0918e2 100644
--- a/sys/dev/netmap/if_igb_netmap.h
+++ b/sys/dev/netmap/if_igb_netmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -88,9 +88,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, new_slots;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_txsync_prologue(kring, &new_slots);
+ u_int const head = kring->rhead;
/* generate an interrupt approximately every half ring */
u_int report_frequency = kring->nkr_num_slots >> 1;
@@ -101,9 +101,6 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
u32 olinfo_status =
(adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
- if (cur > lim) /* error checking in nm_txsync_prologue() */
- return netmap_ring_reinit(kring);
-
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -112,9 +109,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) { /* we have new packets to send */
+ if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@@ -155,9 +152,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwcur = cur; /* the saved ring->cur */
- /* decrease avail by # of packets sent minus previous ones */
- kring->nr_hwavail -= new_slots;
+ kring->nr_hwcur = head;
/* Set the watchdog XXX ? */
txr->queue_status = IGB_QUEUE_WORKING;
@@ -174,26 +169,18 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
- if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
- int delta;
-
+ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
}
- delta = nic_i - txr->next_to_clean;
- if (delta) {
- /* some completed, increment hwavail. */
- if (delta < 0)
- delta += kring->nkr_num_slots;
- txr->next_to_clean = nic_i;
- kring->nr_hwavail += delta;
- }
+ txr->next_to_clean = nic_i;
+ kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
- nm_txsync_finalize(kring, cur);
+ nm_txsync_finalize(kring);
return 0;
}
@@ -210,16 +197,16 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, resvd;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
+ u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
- if (cur > lim)
+ if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@@ -250,7 +237,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
if (n) { /* update the state variables */
rxr->next_to_check = nic_i;
- kring->nr_hwavail += n;
+ kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
@@ -259,9 +246,9 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) {
+ if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@@ -284,8 +271,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwavail -= n;
- kring->nr_hwcur = cur;
+ kring->nr_hwcur = head;
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
@@ -293,12 +279,12 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
- nic_i = (nic_i == 0) ? lim : nic_i - 1;
+ nic_i = nm_prev(nic_i, lim);
E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i);
}
/* tell userspace that there might be new packets */
- ring->avail = kring->nr_hwavail - resvd;
+ nm_rxsync_finalize(kring);
return 0;
diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h
index 8ad3b7a2a352..4fce5c988d09 100644
--- a/sys/dev/netmap/if_lem_netmap.h
+++ b/sys/dev/netmap/if_lem_netmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -91,18 +91,14 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, new_slots;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_txsync_prologue(kring, &new_slots);
+ u_int const head = kring->rhead;
/* generate an interrupt approximately every half ring */
u_int report_frequency = kring->nkr_num_slots >> 1;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- if (cur > lim) /* error checking in nm_txsync_prologue() */
- return netmap_ring_reinit(kring);
-
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -111,9 +107,9 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) { /* we have new packets to send */
+ if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@@ -145,9 +141,7 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwcur = cur; /* the saved ring->cur */
- /* decrease avail by # of packets sent minus previous ones */
- kring->nr_hwavail -= new_slots;
+ kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
@@ -160,26 +154,19 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
- if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
- int delta;
-
+ if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
+ kring->last_reclaim = ticks;
/* record completed transmissions using TDH */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
}
- delta = nic_i - adapter->next_tx_to_clean;
- if (delta) {
- /* some completed, increment hwavail. */
- if (delta < 0)
- delta += kring->nkr_num_slots;
- adapter->next_tx_to_clean = nic_i;
- kring->nr_hwavail += delta;
- }
+ adapter->next_tx_to_clean = nic_i;
+ kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
- nm_txsync_finalize(kring, cur);
+ nm_txsync_finalize(kring);
return 0;
}
@@ -196,15 +183,15 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, resvd;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
+ u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- if (cur > lim)
+ if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@@ -241,9 +228,14 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i = nm_next(nic_i, lim);
}
if (n) { /* update the state variables */
+ ND("%d new packets at nic %d nm %d tail %d",
+ n,
+ adapter->next_rx_desc_to_check,
+ netmap_idx_n2k(kring, adapter->next_rx_desc_to_check),
+ kring->nr_hwtail);
adapter->next_rx_desc_to_check = nic_i;
// ifp->if_ipackets += n;
- kring->nr_hwavail += n;
+ kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
@@ -252,9 +244,9 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) {
+ if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@@ -277,20 +269,19 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwavail -= n;
- kring->nr_hwcur = cur;
+ kring->nr_hwcur = head;
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
/*
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
- nic_i = (nic_i == 0) ? lim : nic_i - 1;
+ nic_i = nm_prev(nic_i, lim);
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i);
}
/* tell userspace that there might be new packets */
- ring->avail = kring->nr_hwavail - resvd;
+ nm_rxsync_finalize(kring);
return 0;
diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h
index 2c7ba060cffd..10abe4f49f83 100644
--- a/sys/dev/netmap/if_re_netmap.h
+++ b/sys/dev/netmap/if_re_netmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -72,17 +72,14 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, new_slots;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_txsync_prologue(kring, &new_slots);
+ u_int const head = kring->rhead;
/* device-specific */
struct rl_softc *sc = ifp->if_softc;
struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc;
- if (cur > lim) /* error checking in nm_txsync_prologue() */
- return netmap_ring_reinit(kring);
-
bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag,
sc->rl_ldata.rl_tx_list_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ?
@@ -91,11 +88,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* First part: process new packets to send.
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) { /* we have new packets to send */
+ if (nm_i != head) { /* we have new packets to send */
nic_i = sc->rl_ldata.rl_tx_prodidx;
// XXX or netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@@ -132,9 +129,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i = nm_next(nic_i, lim);
}
sc->rl_ldata.rl_tx_prodidx = nic_i;
- /* decrease avail by # of packets sent minus previous ones */
- kring->nr_hwcur = cur; /* the saved ring->cur */
- kring->nr_hwavail -= new_slots;
+ kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag,
@@ -148,7 +143,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/*
* Second part: reclaim buffers for completed transmissions.
*/
- if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
+ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
nic_i = sc->rl_ldata.rl_tx_considx;
for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx;
n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) {
@@ -160,11 +155,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
if (n > 0) {
sc->rl_ldata.rl_tx_considx = nic_i;
sc->rl_ldata.rl_tx_free += n;
- kring->nr_hwavail += n;
+ kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
}
- nm_txsync_finalize(kring, cur);
+ nm_txsync_finalize(kring);
return 0;
}
@@ -181,16 +176,16 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, resvd;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
+ u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct rl_softc *sc = ifp->if_softc;
struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc;
- if (cur > lim)
+ if (head > lim)
return netmap_ring_reinit(kring);
bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag,
@@ -202,16 +197,17 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*
* This device uses all the buffers in the ring, so we need
* another termination condition in addition to RL_RDESC_STAT_OWN
- * cleared (all buffers could have it cleared. The easiest one
- * is to limit the amount of data reported up to 'lim'
+ * cleared (all buffers could have it cleared). The easiest one
+ * is to stop right before nm_hwcur.
*/
if (netmap_no_pendintr || force_update) {
uint16_t slot_flags = kring->nkr_slot_flags;
+ uint32_t stop_i = nm_prev(kring->nr_hwcur, lim);
nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */
nm_i = netmap_idx_n2k(kring, nic_i);
- for (n = kring->nr_hwavail; n < lim ; n++) {
+ while (nm_i != stop_i) {
struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i];
uint32_t rxstat = le32toh(cur_rx->rl_cmdstat);
uint32_t total_len;
@@ -226,14 +222,12 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* sync was in re_newbuf() */
bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD);
+ // sc->rl_ifp->if_ipackets++;
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- if (n != kring->nr_hwavail) {
- sc->rl_ldata.rl_rx_prodidx = nic_i;
- sc->rl_ifp->if_ipackets += n - kring->nr_hwavail;
- kring->nr_hwavail = n;
- }
+ sc->rl_ldata.rl_rx_prodidx = nic_i;
+ kring->nr_hwtail = nm_i;
kring->nr_kflags &= ~NKR_PENDINTR;
}
@@ -241,9 +235,9 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Second part: skip past packets that userspace has released.
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) {
+ if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@@ -272,8 +266,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwavail -= n;
- kring->nr_hwcur = cur;
+ kring->nr_hwcur = head;
bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag,
sc->rl_ldata.rl_rx_list_map,
@@ -281,7 +274,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
/* tell userspace that there might be new packets */
- ring->avail = kring->nr_hwavail - resvd;
+ nm_rxsync_finalize(kring);
return 0;
@@ -336,36 +329,35 @@ re_netmap_rx_init(struct rl_softc *sc)
struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0);
struct rl_desc *desc = sc->rl_ldata.rl_rx_list;
uint32_t cmdstat;
- int i, n, max_avail;
+ uint32_t nic_i, max_avail;
+ uint32_t const n = sc->rl_ldata.rl_rx_desc_cnt;
if (!slot)
return;
- n = sc->rl_ldata.rl_rx_desc_cnt;
/*
- * Userspace owned hwavail packets before the reset,
- * so the NIC that last hwavail descriptors of the ring
- * are still owned by the driver (and keep one empty).
+ * Do not release the slots owned by userspace,
+ * and also keep one empty.
*/
- max_avail = n - 1 - na->rx_rings[0].nr_hwavail;
- for (i = 0; i < n; i++) {
+ max_avail = n - 1 - nm_kr_rxspace(&na->rx_rings[0]);
+ for (nic_i = 0; nic_i < n; nic_i++) {
void *addr;
uint64_t paddr;
- int l = netmap_idx_n2k(&na->rx_rings[0], i);
+ uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i);
- addr = PNMB(slot + l, &paddr);
+ addr = PNMB(slot + nm_i, &paddr);
netmap_reload_map(sc->rl_ldata.rl_rx_mtag,
- sc->rl_ldata.rl_rx_desc[i].rx_dmamap, addr);
+ sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr);
bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
- sc->rl_ldata.rl_rx_desc[i].rx_dmamap, BUS_DMASYNC_PREREAD);
- desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
- desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
+ sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD);
+ desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
+ desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
cmdstat = NETMAP_BUF_SIZE;
- if (i == n - 1) /* mark the end of ring */
+ if (nic_i == n - 1) /* mark the end of ring */
cmdstat |= RL_RDESC_CMD_EOR;
- if (i < max_avail)
+ if (nic_i < max_avail)
cmdstat |= RL_RDESC_CMD_OWN;
- desc[i].rl_cmdstat = htole32(cmdstat);
+ desc[nic_i].rl_cmdstat = htole32(cmdstat);
}
}
diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h
index 4dea6639d325..a617cc4c2429 100644
--- a/sys/dev/netmap/ixgbe_netmap.h
+++ b/sys/dev/netmap/ixgbe_netmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -141,14 +141,13 @@ ixgbe_netmap_reg(struct netmap_adapter *na, int onoff)
/*
* Reconcile kernel and user view of the transmit ring.
*
- * Userspace wants to send packets up to the one before ring->cur,
+ * All information is in the kring.
+ * Userspace wants to send packets up to the one before kring->rhead,
* kernel knows kring->nr_hwcur is the first unsent packet.
*
* Here we push packets out (as many as possible), and possibly
* reclaim buffers from previously completed transmission.
*
- * ring->avail is not used on input, but it is updated on return.
- *
* The caller (netmap) guarantees that there is only one instance
* running at any time. Any interference with other driver
* methods should be handled by the individual drivers.
@@ -161,9 +160,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, new_slots;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_txsync_prologue(kring, &new_slots);
+ u_int const head = kring->rhead;
/*
* interrupts on every tx packet are expensive so request
* them every half ring, or where NS_REPORT is set
@@ -175,9 +174,6 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct tx_ring *txr = &adapter->tx_rings[ring_nr];
int reclaim_tx;
- if (cur > lim) /* error checking in nm_txsync_prologue() */
- return netmap_ring_reinit(kring);
-
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -199,7 +195,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
/*
- * If we have packets to send (kring->nr_hwcur != ring->cur)
+ * If we have packets to send (kring->nr_hwcur != kring->rhead)
* iterate over the netmap ring, fetch length and update
* the corresponding slot in the NIC ring. Some drivers also
* need to update the buffer's physical address in the NIC slot
@@ -217,13 +213,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) { /* we have new packets to send */
+ if (nm_i != head) { /* we have new packets to send */
nic_i = netmap_idx_k2n(kring, nm_i);
__builtin_prefetch(&ring->slot[nm_i]);
__builtin_prefetch(&txr->tx_buffers[nic_i]);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
@@ -262,9 +258,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwcur = cur; /* the saved ring->cur */
- /* decrease avail by # of packets sent minus previous ones */
- kring->nr_hwavail -= new_slots;
+ kring->nr_hwcur = head;
/* synchronize the NIC ring */
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
@@ -281,7 +275,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
if (flags & NAF_FORCE_RECLAIM) {
reclaim_tx = 1; /* forced reclaim */
- } else if (kring->nr_hwavail > 0) {
+ } else if (!nm_kr_txempty(kring)) {
reclaim_tx = 0; /* have buffers, no reclaim */
} else {
/*
@@ -321,21 +315,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i -= kring->nkr_num_slots;
}
if (nic_i != txr->next_to_clean) {
- n = (nic_i + lim + 1) - txr->next_to_clean;
- if (n > lim)
- n -= lim + 1;
/* some tx completed, increment avail */
txr->next_to_clean = nic_i;
- kring->nr_hwavail += n;
- if (kring->nr_hwavail > lim) {
- RD(5, "bad hwavail %d",
- kring->nr_hwavail);
- return netmap_ring_reinit(kring);
- }
+ kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
}
}
- nm_txsync_finalize(kring, cur);
+ nm_txsync_finalize(kring);
return 0;
}
@@ -347,14 +333,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
 * The caller guarantees a single invocation, but races against
* the rest of the driver should be handled here.
*
- * When called, userspace has released buffers up to
- * ring->cur - ring->reserved (last one excluded).
- *
- * The last interrupt reported kring->nr_hwavail slots available
- * after kring->nr_hwcur.
- * We must subtract the newly consumed slots (cur - nr_hwcur)
- * from nr_hwavail, make the descriptors available for the next reads,
- * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail.
+ * On call, kring->rhead is the first packet that userspace wants
+ * to keep, and kring->rcur is the wakeup point.
+ * The kernel has previously reported packets up to kring->rtail.
*
* If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
* of whether or not we received an interrupt.
@@ -367,16 +348,16 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
- u_int n, resvd;
+ u_int n;
u_int const lim = kring->nkr_num_slots - 1;
- u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */
+ u_int const head = nm_rxsync_prologue(kring);
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
/* device-specific */
struct adapter *adapter = ifp->if_softc;
struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
- if (cur > lim)
+ if (head > lim)
return netmap_ring_reinit(kring);
/* XXX check sync modes */
@@ -391,8 +372,8 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* and they may differ in case if_init() has been called while
* in netmap mode. For the receive ring we have
*
- * nm_i = (kring->nr_hwcur + kring->nr_hwavail) % ring_size
* nic_i = rxr->next_to_check;
+ * nm_i = kring->nr_hwtail (previous)
* and
* nm_i == (nic_i + kring->nkr_hwofs) % ring_size
*
@@ -402,7 +383,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
int crclen = ix_crcstrip ? 0 : 4;
uint16_t slot_flags = kring->nkr_slot_flags;
- nic_i = rxr->next_to_check;
+ nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail)
nm_i = netmap_idx_n2k(kring, nic_i);
for (n = 0; ; n++) {
@@ -425,23 +406,23 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
ix_rx_miss_bufs += n;
}
rxr->next_to_check = nic_i;
- kring->nr_hwavail += n;
+ kring->nr_hwtail = nm_i;
}
kring->nr_kflags &= ~NKR_PENDINTR;
}
/*
* Second part: skip past packets that userspace has released.
- * (kring->nr_hwcur to ring->cur - ring->reserved excluded),
+ * (kring->nr_hwcur to kring->rhead excluded),
* and make the buffers available for reception.
* As usual nm_i is the index in the netmap ring,
* nic_i is the index in the NIC ring, and
* nm_i == (nic_i + kring->nkr_hwofs) % ring_size
*/
nm_i = kring->nr_hwcur;
- if (nm_i != cur) {
+ if (nm_i != head) {
nic_i = netmap_idx_k2n(kring, nm_i);
- for (n = 0; nm_i != cur; n++) {
+ for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(slot, &paddr);
@@ -464,8 +445,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
- kring->nr_hwavail -= n;
- kring->nr_hwcur = cur;
+ kring->nr_hwcur = head;
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
@@ -473,12 +453,12 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* IMPORTANT: we must leave one free slot in the ring,
* so move nic_i back by one unit
*/
- nic_i = (nic_i == 0) ? lim : nic_i - 1;
+ nic_i = nm_prev(nic_i, lim);
IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i);
}
/* tell userspace that there might be new packets */
- ring->avail = kring->nr_hwavail - resvd;
+ nm_rxsync_finalize(kring);
return 0;
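
The driver hunks consistently replace the open-coded decrement "(nic_i == 0) ? lim : nic_i - 1" with nm_prev(). The sketch below shows the circular-index helpers as they are expected to behave, with 'lim' being the last valid slot index; this is an illustrative standalone form, not the netmap_kern.h source.

#include <stdint.h>

/* step forward/backward on a ring whose last valid index is 'lim' */
static inline uint32_t
nm_next(uint32_t i, uint32_t lim)
{
        return (i == lim) ? 0 : i + 1;
}

static inline uint32_t
nm_prev(uint32_t i, uint32_t lim)
{
        return (i == 0) ? lim : i - 1;
}
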
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index 478d9374937f..358d4693dcb3 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -151,7 +151,6 @@ ports attached to the switch)
#include <machine/bus.h> /* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>
-#include <sys/jail.h>
/* reduce conditional code */
@@ -226,9 +225,6 @@ enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */
NETMAP_ADMODE_NATIVE, /* either native or none */
NETMAP_ADMODE_GENERIC, /* force generic */
NETMAP_ADMODE_LAST };
-#define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */
-#define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */
-#define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */
static int netmap_admode = NETMAP_ADMODE_BEST;
int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */
@@ -252,6 +248,10 @@ nm_kr_get(struct netmap_kring *kr)
}
+/*
+ * mark the ring as stopped, and run through the locks
+ * to make sure other users get to see it.
+ */
void
netmap_disable_ring(struct netmap_kring *kr)
{
@@ -380,7 +380,6 @@ nm_dump_buf(char *p, int len, int lim, char *dst)
}
-
/*
* Fetch configuration from the device, to cope with dynamic
* reconfigurations after loading the module.
@@ -432,6 +431,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail
u_int i, len, ndesc;
struct netmap_kring *kring;
+ // XXX additional space for extra rings ?
len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
@@ -441,19 +441,23 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail
}
na->rx_rings = na->tx_rings + ntx;
+ /*
+ * All fields in krings are 0 except the ones initialized below,
+ * but it is better to be explicit on the important kring fields.
+ */
ndesc = na->num_tx_desc;
for (i = 0; i < ntx; i++) { /* Transmit rings */
kring = &na->tx_rings[i];
bzero(kring, sizeof(*kring));
kring->na = na;
+ kring->ring_id = i;
kring->nkr_num_slots = ndesc;
/*
- * IMPORTANT:
- * Always keep one slot empty, so we can detect new
- * transmissions comparing cur and nr_hwcur (they are
- * the same only if there are no new transmissions).
+ * IMPORTANT: Always keep one slot empty.
*/
- kring->nr_hwavail = ndesc - 1;
+ kring->rhead = kring->rcur = kring->nr_hwcur = 0;
+ kring->rtail = kring->nr_hwtail = ndesc - 1;
+ snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
init_waitqueue_head(&kring->si);
}
@@ -463,7 +467,11 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail
kring = &na->rx_rings[i];
bzero(kring, sizeof(*kring));
kring->na = na;
+ kring->ring_id = i;
kring->nkr_num_slots = ndesc;
+ kring->rhead = kring->rcur = kring->nr_hwcur = 0;
+ kring->rtail = kring->nr_hwtail = 0;
+ snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
init_waitqueue_head(&kring->si);
}
@@ -473,10 +481,10 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail
na->tailroom = na->rx_rings + nrx;
return 0;
-
}
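
A standalone sketch, outside the patch itself, of why the initial values above (tx: rtail = nr_hwtail = ndesc - 1; rx: rtail = nr_hwtail = 0) already encode the one-empty-slot rule: the usable space is the forward distance from cur to tail, so a fresh tx kring exposes ndesc - 1 slots and a fresh rx kring exposes none. ring_space() is a hypothetical helper, not netmap API.

#include <stdio.h>
#include <stdint.h>

/* forward distance from cur to tail on a ring of n slots (0 .. n-1) */
static uint32_t
ring_space(uint32_t cur, uint32_t tail, uint32_t n)
{
        int32_t space = (int32_t)tail - (int32_t)cur;

        if (space < 0)
                space += n;
        return (uint32_t)space;
}

int
main(void)
{
        uint32_t ndesc = 1024;

        /* tx kring right after netmap_krings_create(): one slot kept empty */
        printf("tx slots available: %u\n", ring_space(0, ndesc - 1, ndesc));
        /* rx kring right after netmap_krings_create(): nothing received yet */
        printf("rx slots available: %u\n", ring_space(0, 0, ndesc));
        return 0;
}
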
+/* XXX check boundaries */
void
netmap_krings_delete(struct netmap_adapter *na)
{
@@ -493,6 +501,23 @@ netmap_krings_delete(struct netmap_adapter *na)
}
+/*
+ * Destructor for NIC ports. They also have an mbuf queue
+ * on the rings connected to the host so we need to purge
+ * them first.
+ */
+static void
+netmap_hw_krings_delete(struct netmap_adapter *na)
+{
+ struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
+
+ ND("destroy sw mbq with len %d", mbq_len(q));
+ mbq_purge(q);
+ mbq_safe_destroy(q);
+ netmap_krings_delete(na);
+}
+
+
static struct netmap_if*
netmap_if_new(const char *ifname, struct netmap_adapter *na)
{
@@ -721,6 +746,7 @@ netmap_dtor(void *data)
/*
* pass a chain of buffers to the host stack as coming from 'dst'
+ * We do not need to lock because the queue is private.
*/
static void
netmap_send_up(struct ifnet *dst, struct mbq *q)
@@ -739,39 +765,30 @@ netmap_send_up(struct ifnet *dst, struct mbq *q)
/*
* put a copy of the buffers marked NS_FORWARD into an mbuf chain.
- * Run from hwcur to cur - reserved
+ * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
+ * and pass them up. Drop remaining packets in the unlikely event
+ * of an mbuf shortage.
*/
static void
netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
{
- /* Take packets from hwcur to cur-reserved and pass them up.
- * In case of no buffers we give up. At the end of the loop,
- * the queue is drained in all cases.
- * XXX handle reserved
- */
- u_int lim = kring->nkr_num_slots - 1;
- struct mbuf *m;
- u_int k = kring->ring->cur, n = kring->ring->reserved;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = kring->ring->head;
+ u_int n;
struct netmap_adapter *na = kring->na;
- /* compute the final position, ring->cur - ring->reserved */
- if (n > 0) {
- if (k < n)
- k += kring->nkr_num_slots;
- k += n;
- }
- for (n = kring->nr_hwcur; n != k;) {
+ for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
+ struct mbuf *m;
struct netmap_slot *slot = &kring->ring->slot[n];
- n = nm_next(n, lim);
if ((slot->flags & NS_FORWARD) == 0 && !force)
continue;
if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
- D("bad pkt at %d len %d", n, slot->len);
+ RD(5, "bad pkt at %d len %d", n, slot->len);
continue;
}
slot->flags &= ~NS_FORWARD; // XXX needed ?
- /* XXX adapt to the case of a multisegment packet */
+ /* XXX TODO: adapt to the case of a multisegment packet */
m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
if (m == NULL)
@@ -782,69 +799,54 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
/*
- * The host ring has packets from nr_hwcur to (cur - reserved)
- * to be sent down to the NIC.
- * We need to use the queue lock on the source (host RX ring)
- * to protect against netmap_transmit.
- * If the user is well behaved we do not need to acquire locks
- * on the destination(s),
- * so we only need to make sure that there are no panics because
- * of user errors.
- * XXX verify
- *
- * We scan the tx rings, which have just been
- * flushed so nr_hwcur == cur. Pushing packets down means
- * increment cur and decrement avail.
- * XXX to be verified
+ * Send to the NIC rings packets marked NS_FORWARD between
+ * kring->nr_hwcur and kring->rhead.
+ * Called under kring->rx_queue.lock on the sw rx ring.
*/
-static void
+static u_int
netmap_sw_to_nic(struct netmap_adapter *na)
{
struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
- struct netmap_kring *k1 = &na->tx_rings[0];
- u_int i, howmany, src_lim, dst_lim;
-
- /* XXX we should also check that the carrier is on */
- if (kring->nkr_stopped)
- return;
+ struct netmap_slot *rxslot = kring->ring->slot;
+ u_int i, rxcur = kring->nr_hwcur;
+ u_int const head = kring->rhead;
+ u_int const src_lim = kring->nkr_num_slots - 1;
+ u_int sent = 0;
+
+ /* scan rings to find space, then fill as much as possible */
+ for (i = 0; i < na->num_tx_rings; i++) {
+ struct netmap_kring *kdst = &na->tx_rings[i];
+ struct netmap_ring *rdst = kdst->ring;
+ u_int const dst_lim = kdst->nkr_num_slots - 1;
+
+ /* XXX do we trust ring or kring->rcur,rtail ? */
+ for (; rxcur != head && !nm_ring_empty(rdst);
+ rxcur = nm_next(rxcur, src_lim) ) {
+ struct netmap_slot *src, *dst, tmp;
+ u_int dst_cur = rdst->cur;
- mtx_lock(&kring->q_lock);
+ src = &rxslot[rxcur];
+ if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
+ continue;
- if (kring->nkr_stopped)
- goto out;
+ sent++;
- howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */
+ dst = &rdst->slot[dst_cur];
- src_lim = kring->nkr_num_slots - 1;
- for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
- ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
- dst_lim = k1->nkr_num_slots - 1;
- while (howmany > 0 && k1->ring->avail > 0) {
- struct netmap_slot *src, *dst, tmp;
- src = &kring->ring->slot[kring->nr_hwcur];
- dst = &k1->ring->slot[k1->ring->cur];
tmp = *src;
+
src->buf_idx = dst->buf_idx;
src->flags = NS_BUF_CHANGED;
dst->buf_idx = tmp.buf_idx;
dst->len = tmp.len;
dst->flags = NS_BUF_CHANGED;
- ND("out len %d buf %d from %d to %d",
- dst->len, dst->buf_idx,
- kring->nr_hwcur, k1->ring->cur);
-
- kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
- howmany--;
- kring->nr_hwavail--;
- k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
- k1->ring->avail--;
+
+ rdst->cur = nm_next(dst_cur, dst_lim);
}
- kring->ring->cur = kring->nr_hwcur; // XXX
- k1++; // XXX why?
+ /* if (sent) XXX txsync ? */
}
-out:
- mtx_unlock(&kring->q_lock);
+ return sent;
}
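
The forwarding loop above moves packets from the host rx ring to a NIC tx ring by swapping buffer indices rather than copying payload. A self-contained sketch of that idiom follows, using local stand-ins for struct netmap_slot and NS_BUF_CHANGED; both slots are flagged so the next *_txsync()/*_rxsync() reloads the DMA mappings.

#include <stdint.h>

#define BUF_CHANGED     0x0001          /* stand-in for NS_BUF_CHANGED */

struct pkt_slot {                       /* stand-in for struct netmap_slot */
        uint32_t buf_idx;
        uint16_t len;
        uint16_t flags;
};

/* zero-copy handoff: give dst the full buffer, give src the spare one */
static void
swap_slots(struct pkt_slot *src, struct pkt_slot *dst)
{
        struct pkt_slot tmp = *src;

        src->buf_idx = dst->buf_idx;
        src->flags = BUF_CHANGED;

        dst->buf_idx = tmp.buf_idx;
        dst->len = tmp.len;
        dst->flags = BUF_CHANGED;
}
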
@@ -859,7 +861,8 @@ netmap_txsync_to_host(struct netmap_adapter *na)
{
struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
struct netmap_ring *ring = kring->ring;
- u_int k, lim = kring->nkr_num_slots - 1;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = nm_txsync_prologue(kring);
struct mbq q;
int error;
@@ -869,22 +872,27 @@ netmap_txsync_to_host(struct netmap_adapter *na)
D("ring %p busy (user error)", kring);
return;
}
- k = ring->cur;
- if (k > lim) {
+ if (head > lim) {
D("invalid ring index in stack TX kring %p", kring);
netmap_ring_reinit(kring);
nm_kr_put(kring);
return;
}
- /* Take packets from hwcur to cur and pass them up.
+ /* Take packets from hwcur to head and pass them up.
+ * force ring->cur = head since netmap_grab_packets() stops at head
* In case of no buffers we give up. At the end of the loop,
* the queue is drained in all cases.
*/
mbq_init(&q);
- netmap_grab_packets(kring, &q, 1);
- kring->nr_hwcur = k;
- kring->nr_hwavail = ring->avail = lim;
+ ring->cur = head;
+ netmap_grab_packets(kring, &q, 1 /* force */);
+ ND("have %d pkts in queue", mbq_len(&q));
+ kring->nr_hwcur = head;
+ kring->nr_hwtail = head + lim;
+ if (kring->nr_hwtail > lim)
+ kring->nr_hwtail -= lim + 1;
+ nm_txsync_finalize(kring);
nm_kr_put(kring);
netmap_send_up(na->ifp, &q);
@@ -893,60 +901,89 @@ netmap_txsync_to_host(struct netmap_adapter *na)
/*
* rxsync backend for packets coming from the host stack.
- * They have been put in the queue by netmap_transmit() so we
- * need to protect access to the kring using a lock.
+ * They have been put in kring->rx_queue by netmap_transmit().
+ * We protect access to the kring using kring->rx_queue.lock
*
* This routine also does the selrecord if called from the poll handler
* (we know because td != NULL).
*
* NOTE: on linux, selrecord() is defined as a macro and uses pwait
* as an additional hidden argument.
+ * returns the number of packets delivered to tx queues in
+ * transparent mode, or a negative value if error
*/
-static void
+int
netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
{
struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
struct netmap_ring *ring = kring->ring;
- u_int j, n, lim = kring->nkr_num_slots;
- u_int k = ring->cur, resvd = ring->reserved;
+ u_int nm_i, n;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = nm_rxsync_prologue(kring);
+ int ret = 0;
+ struct mbq *q = &kring->rx_queue;
(void)pwait; /* disable unused warnings */
- if (kring->nkr_stopped) /* check a first time without lock */
- return;
+ if (head > lim) {
+ netmap_ring_reinit(kring);
+ return EINVAL;
+ }
- mtx_lock(&kring->q_lock);
+ if (kring->nkr_stopped) /* check a first time without lock */
+ return EBUSY;
- if (kring->nkr_stopped) /* check again with lock held */
- goto unlock_out;
+ mtx_lock(&q->lock);
- if (k >= lim) {
- netmap_ring_reinit(kring);
+ if (kring->nkr_stopped) { /* check again with lock held */
+ ret = EBUSY;
goto unlock_out;
}
- /* new packets are already set in nr_hwavail */
- /* skip past packets that userspace has released */
- j = kring->nr_hwcur;
- if (resvd > 0) {
- if (resvd + ring->avail >= lim + 1) {
- D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
- ring->reserved = resvd = 0; // XXX panic...
+
+ /* First part: import newly received packets */
+ n = mbq_len(q);
+ if (n) { /* grab packets from the queue */
+ struct mbuf *m;
+ uint32_t stop_i;
+
+ nm_i = kring->nr_hwtail;
+ stop_i = nm_prev(nm_i, lim);
+ while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
+ int len = MBUF_LEN(m);
+ struct netmap_slot *slot = &ring->slot[nm_i];
+
+ m_copydata(m, 0, len, BDG_NMB(na, slot));
+ ND("nm %d len %d", nm_i, len);
+ if (netmap_verbose)
+ D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
+
+ slot->len = len;
+ slot->flags = kring->nkr_slot_flags;
+ nm_i = nm_next(nm_i, lim);
}
- k = (k >= resvd) ? k - resvd : k + lim - resvd;
+ kring->nr_hwtail = nm_i;
}
- if (j != k) {
- n = k >= j ? k - j : k + lim - j;
- kring->nr_hwavail -= n;
- kring->nr_hwcur = k;
+
+ /*
+ * Second part: skip past packets that userspace has released.
+ */
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) { /* something was released */
+ if (netmap_fwd || kring->ring->flags & NR_FORWARD)
+ ret = netmap_sw_to_nic(na);
+ kring->nr_hwcur = head;
}
- k = ring->avail = kring->nr_hwavail - resvd;
- if (k == 0 && td)
+
+ nm_rxsync_finalize(kring);
+
+ /* access copies of cur,tail in the kring */
+ if (kring->rcur == kring->rtail && td) /* no bufs available */
selrecord(td, &kring->si);
- if (k && (netmap_verbose & NM_VERB_HOST))
- D("%d pkts from stack", k);
+
unlock_out:
- mtx_unlock(&kring->q_lock);
+ mtx_unlock(&q->lock);
+ return ret;
}
@@ -1042,7 +1079,7 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
// XXX add a refcount ?
netmap_adapter_get(prev_na);
}
- D("Created generic NA %p (prev %p)", gna, gna->prev);
+ ND("Created generic NA %p (prev %p)", gna, gna->prev);
return 0;
}
@@ -1113,154 +1150,167 @@ out:
/*
* validate parameters on entry for *_txsync()
* Returns ring->cur if ok, or something >= kring->nkr_num_slots
- * in case of error. The extra argument is a pointer to
- * 'new_bufs'. XXX this may be deprecated at some point.
+ * in case of error.
*
- * Below is a correct configuration on input. ring->cur
- * must be in the region covered by kring->hwavail,
- * and ring->avail and kring->avail should end at the same slot.
+ * rhead, rcur and rtail=hwtail are stored from the previous round.
+ * hwcur is the next packet to send to the ring.
*
- * +-hwcur
- * |
- * v<--hwres-->|<-----hwavail---->
- * ------+------------------------------+-------- ring
- * |
- * |<---avail--->
- * +--cur
+ * We want
+ * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
*
+ * hwcur, rhead, rtail and hwtail are reliable
*/
u_int
-nm_txsync_prologue(struct netmap_kring *kring, u_int *new_slots)
+nm_txsync_prologue(struct netmap_kring *kring)
{
struct netmap_ring *ring = kring->ring;
+ u_int head = ring->head; /* read only once */
u_int cur = ring->cur; /* read only once */
- u_int avail = ring->avail; /* read only once */
u_int n = kring->nkr_num_slots;
- u_int kstart, kend, a;
-#if 1 /* kernel sanity checks */
- if (kring->nr_hwcur >= n ||
- kring->nr_hwreserved >= n || kring->nr_hwavail >= n ||
- kring->nr_hwreserved + kring->nr_hwavail >= n)
+ ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
+ kring->name,
+ kring->nr_hwcur, kring->nr_hwtail,
+ ring->head, ring->cur, ring->tail);
+#if 1 /* kernel sanity checks; but we can trust the kring. */
+ if (kring->nr_hwcur >= n || kring->rhead >= n ||
+ kring->rtail >= n || kring->nr_hwtail >= n)
goto error;
#endif /* kernel sanity checks */
- kstart = kring->nr_hwcur + kring->nr_hwreserved;
- if (kstart >= n)
- kstart -= n;
- kend = kstart + kring->nr_hwavail;
- /* user sanity checks. a is the expected avail */
- if (cur < kstart) {
- /* too low, but maybe wraparound */
- if (cur + n > kend)
+ /*
+ * user sanity checks. We only use 'cur',
+ * A, B, ... are possible positions for cur:
+ *
+ * 0 A cur B tail C n-1
+ * 0 D tail E cur F n-1
+ *
+ * B, F, D are valid. A, C, E are wrong
+ */
+ if (kring->rtail >= kring->rhead) {
+ /* want rhead <= head <= rtail */
+ if (head < kring->rhead || head > kring->rtail)
goto error;
- *new_slots = cur + n - kstart;
- a = kend - cur - n;
- } else {
- if (cur > kend)
+ /* and also head <= cur <= rtail */
+ if (cur < head || cur > kring->rtail)
+ goto error;
+ } else { /* here rtail < rhead */
+ /* we need head outside rtail .. rhead */
+ if (head > kring->rtail && head < kring->rhead)
goto error;
- *new_slots = cur - kstart;
- a = kend - cur;
+
+ /* two cases now: head <= rtail or head >= rhead */
+ if (head <= kring->rtail) {
+ /* want head <= cur <= rtail */
+ if (cur < head || cur > kring->rtail)
+ goto error;
+ } else { /* head >= rhead */
+ /* cur must be outside rtail..head */
+ if (cur > kring->rtail && cur < head)
+ goto error;
+ }
}
- if (a != avail) {
- RD(5, "wrong but fixable avail have %d need %d",
- avail, a);
- ring->avail = avail = a;
+ if (ring->tail != kring->rtail) {
+ RD(5, "tail overwritten was %d need %d",
+ ring->tail, kring->rtail);
+ ring->tail = kring->rtail;
}
- return cur;
+ kring->rhead = head;
+ kring->rcur = cur;
+ return head;
error:
- RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d",
+ RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
+ kring->name,
kring->nr_hwcur,
- kring->nr_hwreserved, kring->nr_hwavail,
- cur, avail);
+ kring->rcur, kring->nr_hwtail,
+ cur, ring->tail);
return n;
}
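
Both prologues reduce to the same question: does an index fall inside a circular interval that may wrap past the end of the ring? Below is a hedged standalone sketch of that check (in_between() is a hypothetical helper, not part of the netmap sources) with a few assertions mirroring the "A .. F" cases in the comment above.

#include <assert.h>
#include <stdint.h>

/* is x inside [lo .. hi], walking forward on a circular ring? */
static int
in_between(uint32_t x, uint32_t lo, uint32_t hi)
{
        if (lo <= hi)                           /* interval does not wrap */
                return (x >= lo && x <= hi);
        return (x >= lo || x <= hi);            /* interval wraps past n-1 */
}

int
main(void)
{
        /* e.g. head must satisfy rhead <= head <= rtail, modulo wrap */
        assert(in_between(5, 2, 9));            /* plain interval */
        assert(in_between(1, 9, 3));            /* wrapped: 9,0,1,2,3 */
        assert(!in_between(5, 9, 3));           /* outside the wrapped one */
        return 0;
}
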
/*
* validate parameters on entry for *_rxsync()
- * Returns ring->cur - ring->reserved if ok,
- * or something >= kring->nkr_num_slots
- * in case of error. The extra argument is a pointer to
- * 'resvd'. XXX this may be deprecated at some point.
+ * Returns ring->head if ok, kring->nkr_num_slots on error.
*
- * Below is a correct configuration on input. ring->cur and
- * ring->reserved must be in the region covered by kring->hwavail,
- * and ring->avail and kring->avail should end at the same slot.
+ * For a valid configuration,
+ * hwcur <= head <= cur <= tail <= hwtail
*
- * +-hwcur
- * |
- * v<-------hwavail---------->
- * ---------+--------------------------+-------- ring
- * |<--res-->|
- * |<---avail--->
- * +--cur
+ * We only consider head and cur.
+ * hwcur and hwtail are reliable.
*
*/
u_int
-nm_rxsync_prologue(struct netmap_kring *kring, u_int *resvd)
+nm_rxsync_prologue(struct netmap_kring *kring)
{
struct netmap_ring *ring = kring->ring;
- u_int cur = ring->cur; /* read only once */
- u_int avail = ring->avail; /* read only once */
- u_int res = ring->reserved; /* read only once */
- u_int n = kring->nkr_num_slots;
- u_int kend = kring->nr_hwcur + kring->nr_hwavail;
- u_int a;
+ uint32_t const n = kring->nkr_num_slots;
+ uint32_t head, cur;
+ ND("%s kc %d kt %d h %d c %d t %d",
+ kring->name,
+ kring->nr_hwcur, kring->nr_hwtail,
+ ring->head, ring->cur, ring->tail);
+ /*
+ * Before storing the new values, we should check they do not
+ * move backwards. However:
+ * - head is not an issue because the previous value is hwcur;
+ * - cur could in principle go back, however it does not matter
+ * because we are processing a brand new rxsync()
+ */
+ cur = kring->rcur = ring->cur; /* read only once */
+ head = kring->rhead = ring->head; /* read only once */
#if 1 /* kernel sanity checks */
- if (kring->nr_hwcur >= n || kring->nr_hwavail >= n)
+ if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
goto error;
#endif /* kernel sanity checks */
/* user sanity checks */
- if (res >= n)
- goto error;
- /* check that cur is valid, a is the expected value of avail */
- if (cur < kring->nr_hwcur) {
- /* too low, but maybe wraparound */
- if (cur + n > kend)
+ if (kring->nr_hwtail >= kring->nr_hwcur) {
+ /* want hwcur <= rhead <= hwtail */
+ if (head < kring->nr_hwcur || head > kring->nr_hwtail)
goto error;
- a = kend - (cur + n);
- } else {
- if (cur > kend)
+ /* and also rhead <= rcur <= hwtail */
+ if (cur < head || cur > kring->nr_hwtail)
goto error;
- a = kend - cur;
- }
- if (a != avail) {
- RD(5, "wrong but fixable avail have %d need %d",
- avail, a);
- ring->avail = avail = a;
- }
- if (res != 0) {
- /* then repeat the check for cur + res */
- cur = (cur >= res) ? cur - res : n + cur - res;
- if (cur < kring->nr_hwcur) {
- /* too low, but maybe wraparound */
- if (cur + n > kend)
- goto error;
- } else if (cur > kend) {
+ } else {
+ /* we need rhead outside hwtail..hwcur */
+ if (head < kring->nr_hwcur && head > kring->nr_hwtail)
goto error;
+ /* two cases now: head <= hwtail or head >= hwcur */
+ if (head <= kring->nr_hwtail) {
+ /* want head <= cur <= hwtail */
+ if (cur < head || cur > kring->nr_hwtail)
+ goto error;
+ } else {
+ /* cur must be outside hwtail..head */
+ if (cur < head && cur > kring->nr_hwtail)
+ goto error;
}
}
- *resvd = res;
- return cur;
+ if (ring->tail != kring->rtail) {
+ RD(5, "%s tail overwritten was %d need %d",
+ kring->name,
+ ring->tail, kring->rtail);
+ ring->tail = kring->rtail;
+ }
+ return head;
error:
- RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d res %d",
+ RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
kring->nr_hwcur,
- kring->nr_hwreserved, kring->nr_hwavail,
- ring->cur, avail, res);
+ kring->rcur, kring->nr_hwtail,
+ kring->rhead, kring->rcur, ring->tail);
return n;
}
+
/*
* Error routine called when txsync/rxsync detects an error.
- * Can't do much more than resetting cur = hwcur, avail = hwavail.
+ * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
* Return 1 on reinit.
*
* This routine is only called by the upper half of the kernel.
* It only reads hwcur (which is changed only by the upper half, too)
- * and hwavail (which may be changed by the lower half, but only on
+ * and hwtail (which may be changed by the lower half, but only on
* a tx ring and only to increase it, so any error will be recovered
* on the next call). For the above, we don't strictly need to call
* it under lock.
@@ -1274,36 +1324,38 @@ netmap_ring_reinit(struct netmap_kring *kring)
// XXX KASSERT nm_kr_tryget
RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
+ // XXX probably wrong to trust userspace
+ kring->rhead = ring->head;
+ kring->rcur = ring->cur;
+ kring->rtail = ring->tail;
+
if (ring->cur > lim)
errors++;
+ if (ring->head > lim)
+ errors++;
+ if (ring->tail > lim)
+ errors++;
for (i = 0; i <= lim; i++) {
u_int idx = ring->slot[i].buf_idx;
u_int len = ring->slot[i].len;
if (idx < 2 || idx >= netmap_total_buffers) {
- if (!errors++)
- D("bad buffer at slot %d idx %d len %d ", i, idx, len);
+ RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
ring->slot[i].buf_idx = 0;
ring->slot[i].len = 0;
} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
ring->slot[i].len = 0;
- if (!errors++)
- D("bad len %d at slot %d idx %d",
- len, i, idx);
+ RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
}
}
if (errors) {
- int pos = kring - kring->na->tx_rings;
- int n = kring->na->num_tx_rings + 1;
-
RD(10, "total %d errors", errors);
- errors++;
- RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
- NM_IFPNAME(kring->na->ifp),
- pos < n ? "TX" : "RX", pos < n ? pos : pos - n,
+ RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
+ kring->name,
ring->cur, kring->nr_hwcur,
- ring->avail, kring->nr_hwavail);
- ring->cur = kring->nr_hwcur;
- ring->avail = kring->nr_hwavail;
+ ring->tail, kring->nr_hwtail);
+ ring->head = kring->rhead = kring->nr_hwcur;
+ ring->cur = kring->rcur = kring->nr_hwcur;
+ ring->tail = kring->rtail = kring->nr_hwtail;
}
return (errors ? 1 : 0);
}
@@ -1436,7 +1488,6 @@ out:
* - NIOCGINFO
* - SIOCGIFADDR just for convenience
* - NIOCREGIF
- * - NIOCUNREGIF
* - NIOCTXSYNC
* - NIOCRXSYNC
*
@@ -1472,6 +1523,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
} while (0)
#endif /* linux */
+ if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
+ /* truncate name */
+ nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
+ if (nmr->nr_version != NETMAP_API) {
+ D("API mismatch for %s got %d need %d",
+ nmr->nr_name,
+ nmr->nr_version, NETMAP_API);
+ nmr->nr_version = NETMAP_API;
+ return EINVAL;
+ }
+ }
CURVNET_SET(TD_TO_VNET(td));
error = devfs_get_cdevpriv((void **)&priv);
@@ -1482,16 +1544,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
return (error == ENOENT ? ENXIO : error);
}
- nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */
switch (cmd) {
case NIOCGINFO: /* return capabilities etc */
- if (nmr->nr_version != NETMAP_API) {
- D("API mismatch got %d have %d",
- nmr->nr_version, NETMAP_API);
- nmr->nr_version = NETMAP_API;
- error = EINVAL;
- break;
- }
if (nmr->nr_cmd == NETMAP_BDG_LIST) {
error = netmap_bdg_ctl(nmr, NULL);
break;
@@ -1531,11 +1585,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
break;
case NIOCREGIF:
- if (nmr->nr_version != NETMAP_API) {
- nmr->nr_version = NETMAP_API;
- error = EINVAL;
- break;
- }
/* possibly attach/detach NIC and VALE switch */
i = nmr->nr_cmd;
if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
@@ -1593,12 +1642,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
NMG_UNLOCK();
break;
- case NIOCUNREGIF:
- // XXX we have no data here ?
- D("deprecated, data is %p", nmr);
- error = EINVAL;
- break;
-
case NIOCTXSYNC:
case NIOCRXSYNC:
nifp = priv->np_nifp;
@@ -1649,7 +1692,11 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
D("pre txsync ring %d cur %d hwcur %d",
i, kring->ring->cur,
kring->nr_hwcur);
- na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
+ if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
+ netmap_ring_reinit(kring);
+ } else {
+ na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
+ }
if (netmap_verbose & NM_VERB_TXSYNC)
D("post txsync ring %d cur %d hwcur %d",
i, kring->ring->cur,
@@ -1726,8 +1773,8 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
struct ifnet *ifp;
struct netmap_kring *kring;
u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
- u_int lim_tx, lim_rx, host_forwarded = 0;
- struct mbq q;
+ u_int lim_tx, lim_rx;
+ struct mbq q; /* packets from hw queues to host stack */
void *pwait = dev; /* linux compatibility */
/*
@@ -1735,7 +1782,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
* txsync and rxsync if we decide to do a selrecord().
* retry_tx (and retry_rx, later) prevent looping forever.
*/
- int retry_tx = 1;
+ int retry_tx = 1, retry_rx = 1;
(void)pwait;
mbq_init(&q);
@@ -1769,6 +1816,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
lim_rx = na->num_rx_rings;
if (priv->np_qfirst == NETMAP_SW_RING) {
+ // XXX locking ?
/* handle the host stack ring */
if (priv->np_txpoll || want_tx) {
/* push any packets up, then we are always ready */
@@ -1777,29 +1825,15 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
}
if (want_rx) {
kring = &na->rx_rings[lim_rx];
- if (kring->ring->avail == 0)
+ /* XXX replace with rxprologue etc. */
+ if (nm_ring_empty(kring->ring))
netmap_rxsync_from_host(na, td, dev);
- if (kring->ring->avail > 0) {
+ if (!nm_ring_empty(kring->ring))
revents |= want_rx;
- }
}
return (revents);
}
- /*
- * If we are in transparent mode, check also the host rx ring
- * XXX Transparent mode at the moment requires to bind all
- * rings to a single file descriptor.
- */
- kring = &na->rx_rings[lim_rx];
- if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
- && want_rx
- && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
- if (kring->ring->avail == 0)
- netmap_rxsync_from_host(na, td, dev);
- if (kring->ring->avail > 0)
- revents |= want_rx;
- }
/*
* check_all_{tx|rx} are set if the card has more than one queue AND
@@ -1825,81 +1859,71 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
* We start with a lock free round which is cheap if we have
* slots available. If this fails, then lock and call the sync
* routines.
- * XXX rather than ring->avail >0 should check that
- * ring->cur has not reached hwcur+hwavail
*/
for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
kring = &na->rx_rings[i];
- if (kring->ring->avail > 0) {
+ /* XXX compare ring->cur and kring->tail */
+ if (!nm_ring_empty(kring->ring)) {
revents |= want_rx;
want_rx = 0; /* also breaks the loop */
}
}
for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
kring = &na->tx_rings[i];
- if (kring->ring->avail > 0) {
+ /* XXX compare ring->cur and kring->tail */
+ if (!nm_ring_empty(kring->ring)) {
revents |= want_tx;
want_tx = 0; /* also breaks the loop */
}
}
/*
- * If we to push packets out (priv->np_txpoll) or want_tx is
- * still set, we do need to run the txsync calls (on all rings,
- * to avoid that the tx rings stall).
+ * If we want to push packets out (priv->np_txpoll) or
+ * want_tx is still set, we must issue txsync calls
+ * (on all rings, to prevent the tx rings from stalling).
* XXX should also check cur != hwcur on the tx rings.
* Fortunately, normal tx mode has np_txpoll set.
*/
if (priv->np_txpoll || want_tx) {
- /* If we really want to be woken up (want_tx),
- * do a selrecord, either on the global or on
- * the private structure. Then issue the txsync
- * so there is no race in the selrecord/selwait
+ /*
+ * The first round checks if anyone is ready; if not,
+ * do a selrecord and a second round to handle races.
+ * want_tx goes to 0 if any space is found, and is
+ * used to skip rings with no pending transmissions.
*/
flush_tx:
for (i = priv->np_qfirst; i < lim_tx; i++) {
+ int found = 0;
+
kring = &na->tx_rings[i];
- /*
- * Skip this ring if want_tx == 0
- * (we have already done a successful sync on
- * a previous ring) AND kring->cur == kring->hwcur
- * (there are no pending transmissions for this ring).
- */
if (!want_tx && kring->ring->cur == kring->nr_hwcur)
continue;
- /* make sure only one user thread is doing this */
+ /* only one thread does txsync */
if (nm_kr_tryget(kring)) {
- ND("ring %p busy is %d",
- kring, (int)kring->nr_busy);
- revents |= POLLERR;
- goto out;
+ D("%p lost race on txring %d, ok", priv, i);
+ continue;
}
-
- if (netmap_verbose & NM_VERB_TXSYNC)
- D("send %d on %s %d",
- kring->ring->cur, NM_IFPNAME(ifp), i);
- if (na->nm_txsync(na, i, 0))
+ if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
+ netmap_ring_reinit(kring);
revents |= POLLERR;
+ } else {
+ if (na->nm_txsync(na, i, 0))
+ revents |= POLLERR;
+ }
- /* Check avail and call selrecord only if
- * called with POLLOUT and run out of bufs.
- * XXX Note, we cannot trust much ring->avail
- * as it is exposed to userspace (even though
- * just updated by txsync). We should really
- * check kring->nr_hwavail or better have
- * txsync set a flag telling if we need
- * to do a selrecord().
+ /*
+ * If we found new slots, notify potential
+ * listeners on the same ring.
+ * Since we just did a txsync, look at the copies
+ * of cur,tail in the kring.
*/
- if (want_tx) {
- if (kring->ring->avail > 0) {
- /* stop at the first ring. We don't risk
- * starvation.
- */
- revents |= want_tx;
- want_tx = 0;
- }
- }
+ found = kring->rcur != kring->rtail;
nm_kr_put(kring);
+ if (found) { /* notify other listeners */
+ revents |= want_tx;
+ want_tx = 0;
+ na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY);
+ }
}
if (want_tx && retry_tx) {
selrecord(td, check_all_tx ?
@@ -1910,21 +1934,27 @@ flush_tx:
}
/*
- * now if want_rx is still set we need to lock and rxsync.
+ * If want_rx is still set scan receive rings.
* Do it on all rings because otherwise we starve.
*/
if (want_rx) {
- int retry_rx = 1;
+ int send_down = 0; /* transparent mode */
+ /* two rounds here for race avoidance */
do_retry_rx:
for (i = priv->np_qfirst; i < lim_rx; i++) {
+ int found = 0;
+
kring = &na->rx_rings[i];
if (nm_kr_tryget(kring)) {
- revents |= POLLERR;
- goto out;
+ D("%p lost race on rxring %d, ok", priv, i);
+ continue;
}
- /* XXX NR_FORWARD should only be read on
+ /*
+ * transparent mode support: collect packets
+ * from the rxring(s).
+ * XXX NR_FORWARD should only be read on
* physical or NIC ports
*/
if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
@@ -1939,49 +1969,65 @@ do_retry_rx:
kring->ring->flags & NR_TIMESTAMP) {
microtime(&kring->ring->ts);
}
-
- if (kring->ring->avail > 0) {
+ /* after an rxsync we can use kring->rcur, rtail */
+ found = kring->rcur != kring->rtail;
+ nm_kr_put(kring);
+ if (found) {
revents |= want_rx;
retry_rx = 0;
+ na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY);
}
- nm_kr_put(kring);
}
- if (retry_rx) {
- retry_rx = 0;
+
+ /* transparent mode XXX only during first pass ? */
+ kring = &na->rx_rings[lim_rx];
+ if (check_all_rx
+ && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
+ /* XXX fix to use kring fields */
+ if (nm_ring_empty(kring->ring))
+ send_down = netmap_rxsync_from_host(na, td, dev);
+ if (!nm_ring_empty(kring->ring))
+ revents |= want_rx;
+ }
+
+ if (retry_rx)
selrecord(td, check_all_rx ?
&na->rx_si : &na->rx_rings[priv->np_qfirst].si);
- goto do_retry_rx;
+ if (send_down > 0 || retry_rx) {
+ retry_rx = 0;
+ if (send_down)
+ goto flush_tx; /* and retry_rx */
+ else
+ goto do_retry_rx;
}
}
- /* forward host to the netmap ring.
- * I am accessing nr_hwavail without lock, but netmap_transmit
- * can only increment it, so the operation is safe.
+ /*
+ * Transparent mode: marked buffers on the rx rings between
+ * kring->nr_hwcur and ring->head are passed to the
+ * other endpoint.
+ *
+ * In this mode we also scan the sw rxring, which in
+ * turn passes packets up.
+ *
+ * XXX Transparent mode at the moment requires to bind all
+ * rings to a single file descriptor.
*/
- kring = &na->rx_rings[lim_rx];
- if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
- && (netmap_fwd || kring->ring->flags & NR_FORWARD)
- && kring->nr_hwavail > 0 && !host_forwarded) {
- netmap_sw_to_nic(na);
- host_forwarded = 1; /* prevent another pass */
- want_rx = 0;
- goto flush_tx;
- }
if (q.head)
netmap_send_up(na->ifp, &q);
-out:
-
return (revents);
}
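
Since this change retires ring->avail and ring->reserved in favor of head/cur/tail, a userspace sender now publishes work by advancing head (buffers released to the kernel) and cur (wakeup point) and reads tail to learn how far it may go. Below is a hedged sketch of such a loop; the two small helpers are local equivalents of what netmap_user.h provides, and send_batch() is a hypothetical function, not part of the netmap distribution.

#include <stdint.h>
#include <string.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static inline uint32_t
tx_avail(struct netmap_ring *ring)      /* slots we may still fill */
{
        int32_t n = (int32_t)(ring->tail - ring->cur);

        if (n < 0)
                n += ring->num_slots;
        return (uint32_t)n;
}

static inline uint32_t
ring_next(struct netmap_ring *ring, uint32_t i)
{
        return (i + 1 == ring->num_slots) ? 0 : i + 1;
}

/* queue up to 'count' copies of a packet; caller then issues NIOCTXSYNC/poll */
static uint32_t
send_batch(struct netmap_ring *ring, const char *pkt, uint16_t len, uint32_t count)
{
        uint32_t i = ring->cur, n;

        for (n = 0; n < count && tx_avail(ring) > 0; n++) {
                struct netmap_slot *slot = &ring->slot[i];

                memcpy(NETMAP_BUF(ring, slot->buf_idx), pkt, len);
                slot->len = len;
                i = ring_next(ring, i);
                ring->cur = i;          /* wakeup point moves with us */
        }
        ring->head = i;                 /* release the filled slots */
        return n;
}
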
-/*------- driver support routines ------*/
+
+/*-------------------- driver support routines -------------------*/
static int netmap_hw_krings_create(struct netmap_adapter *);
static int
-netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags)
+netmap_notify(struct netmap_adapter *na, u_int n_ring,
+ enum txrx tx, int flags)
{
struct netmap_kring *kring;
@@ -2012,10 +2058,18 @@ netmap_attach_common(struct netmap_adapter *na)
return EINVAL;
}
WNA(ifp) = na;
+
+ /* the following is only needed for na that use the host port.
+ * XXX do we have something similar for linux ?
+ */
+#ifdef __FreeBSD__
+ na->if_input = ifp->if_input; /* for netmap_send_up */
+#endif /* __FreeBSD__ */
+
NETMAP_SET_CAPABLE(ifp);
if (na->nm_krings_create == NULL) {
na->nm_krings_create = netmap_hw_krings_create;
- na->nm_krings_delete = netmap_krings_delete;
+ na->nm_krings_delete = netmap_hw_krings_delete;
}
if (na->nm_notify == NULL)
na->nm_notify = netmap_notify;
@@ -2051,12 +2105,8 @@ netmap_detach_common(struct netmap_adapter *na)
* of hardware rings):
* krings 0..N-1 are for the hardware queues.
* kring N is for the host stack queue
- * kring N+1 is only used for the selinfo for all queues.
+ * kring N+1 is only used for the selinfo for all queues. // XXX still true ?
* Return 0 on success, ENOMEM otherwise.
- *
- * By default the receive and transmit adapter ring counts are both initialized
- * to num_queues. na->num_tx_rings can be set for cards with different tx/rx
- * setups.
*/
int
netmap_attach(struct netmap_adapter *arg)
@@ -2132,8 +2182,14 @@ NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
int
netmap_hw_krings_create(struct netmap_adapter *na)
{
- return netmap_krings_create(na,
+ int ret = netmap_krings_create(na,
na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
+ if (ret == 0) {
+ /* initialize the mbq for the sw rx ring */
+ mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
+ ND("initialized sw rx queue %d", na->num_rx_rings);
+ }
+ return ret;
}
@@ -2162,6 +2218,10 @@ netmap_detach(struct ifnet *ifp)
/*
* Intercept packets from the network stack and pass them
* to netmap as incoming packets on the 'software' ring.
+ *
+ * We only store packets in a bounded mbq and then copy them
+ * in the relevant rxsync routine.
+ *
* We rely on the OS to make sure that the ifp and na do not go
* away (typically the caller checks for IFF_DRV_RUNNING or the like).
* In nm_register() or whenever there is a reinitialization,
@@ -2172,63 +2232,60 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
struct netmap_adapter *na = NA(ifp);
struct netmap_kring *kring;
- u_int i, len = MBUF_LEN(m);
- u_int error = EBUSY, lim;
- struct netmap_slot *slot;
+ u_int len = MBUF_LEN(m);
+ u_int error = ENOBUFS;
+ struct mbq *q;
+ int space;
// XXX [Linux] we do not need this lock
// if we follow the down/configure/up protocol -gl
// mtx_lock(&na->core_lock);
+
if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
- /* interface not in netmap mode anymore */
+ D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
error = ENXIO;
goto done;
}
kring = &na->rx_rings[na->num_rx_rings];
- lim = kring->nkr_num_slots - 1;
- if (netmap_verbose & NM_VERB_HOST)
- D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
- kring->nr_hwcur + kring->nr_hwavail, len);
+ q = &kring->rx_queue;
+
// XXX reconsider long packets if we handle fragments
if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
goto done;
}
- /* protect against other instances of netmap_transmit,
- * and userspace invocations of rxsync().
+
+ /* protect against rxsync_from_host(), netmap_sw_to_nic()
+ * and maybe other instances of netmap_transmit (the latter
+ * not possible on Linux).
+ * Also avoid overflowing the queue.
*/
- // XXX [Linux] there can be no other instances of netmap_transmit
- // on this same ring, but we still need this lock to protect
- // concurrent access from netmap_sw_to_nic() -gl
- mtx_lock(&kring->q_lock);
- if (kring->nr_hwavail >= lim) {
- if (netmap_verbose)
- D("stack ring %s full\n", NM_IFPNAME(ifp));
+ mtx_lock(&q->lock);
+
+ space = kring->nr_hwtail - kring->nr_hwcur;
+ if (space < 0)
+ space += kring->nkr_num_slots;
+ if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
+ RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
+ NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
+ len, m);
} else {
- /* compute the insert position */
- i = nm_kr_rxpos(kring);
- slot = &kring->ring->slot[i];
- m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
- slot->len = len;
- slot->flags = kring->nkr_slot_flags;
- kring->nr_hwavail++;
- if (netmap_verbose & NM_VERB_HOST)
- D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings);
- na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
+ mbq_enqueue(q, m);
+ ND(10, "%s %d bufs in queue len %d m %p",
+ NM_IFPNAME(ifp), mbq_len(q), len, m);
+ /* notify outside the lock */
+ m = NULL;
error = 0;
}
- mtx_unlock(&kring->q_lock);
+ mtx_unlock(&q->lock);
done:
- // mtx_unlock(&na->core_lock);
-
- /* release the mbuf in either cases of success or failure. As an
- * alternative, put the mbuf in a free list and free the list
- * only when really necessary.
- */
- m_freem(m);
+ if (m)
+ m_freem(m);
+ /* unconditionally wake up listeners */
+ na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
return (error);
}
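
For context, the consumer side of this bounded-mbq handoff lives in the host-ring rxsync path (netmap_rxsync_from_host(), declared further below but not shown in this diff). A minimal sketch of what such a drain loop might look like, built only from the helpers visible in this patch (mbq_safe_dequeue(), nm_next()/nm_prev(), MBUF_LEN(), NMB()) and not the actual implementation:

	/* sketch: drain the mbq filled by netmap_transmit() into the host RX ring */
	static void
	host_ring_drain_sketch(struct netmap_kring *kring)
	{
		u_int const lim = kring->nkr_num_slots - 1;
		u_int nm_i = kring->nr_hwtail;			/* first empty slot */
		u_int stop_i = nm_prev(kring->nr_hwcur, lim);	/* do not overrun hwcur */
		struct mbuf *m;

		while (nm_i != stop_i &&
		       (m = mbq_safe_dequeue(&kring->rx_queue)) != NULL) {
			int len = MBUF_LEN(m);

			m_copydata(m, 0, len, NMB(&kring->ring->slot[nm_i]));
			kring->ring->slot[nm_i].len = len;
			kring->ring->slot[nm_i].flags = kring->nkr_slot_flags;
			m_freem(m);
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwtail = nm_i;	/* expose the new packets to userspace */
	}
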
@@ -2267,27 +2324,32 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
if (n >= na->num_tx_rings)
return NULL;
kring = na->tx_rings + n;
+ // XXX check whether we should use hwcur or rcur
new_hwofs = kring->nr_hwcur - new_cur;
} else {
if (n >= na->num_rx_rings)
return NULL;
kring = na->rx_rings + n;
- new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
+ new_hwofs = kring->nr_hwtail - new_cur;
}
lim = kring->nkr_num_slots - 1;
if (new_hwofs > lim)
new_hwofs -= lim + 1;
/* Always set the new offset value and realign the ring. */
- D("%s hwofs %d -> %d, hwavail %d -> %d",
- tx == NR_TX ? "TX" : "RX",
+ if (netmap_verbose)
+ D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
+ NM_IFPNAME(na->ifp),
+ tx == NR_TX ? "TX" : "RX", n,
kring->nkr_hwofs, new_hwofs,
- kring->nr_hwavail,
- tx == NR_TX ? lim : kring->nr_hwavail);
+ kring->nr_hwtail,
+ tx == NR_TX ? lim : kring->nr_hwtail);
kring->nkr_hwofs = new_hwofs;
- if (tx == NR_TX)
- kring->nr_hwavail = lim;
- kring->nr_hwreserved = 0;
+ if (tx == NR_TX) {
+ kring->nr_hwtail = kring->nr_hwcur + lim;
+ if (kring->nr_hwtail > lim)
+ kring->nr_hwtail -= lim + 1;
+ }
#if 0 // def linux
/* XXX check that the mappings are correct */
@@ -2351,6 +2413,7 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
}
}
+
/*
* Default functions to handle rx/tx interrupts from a physical device.
* "work_done" is non-null on the RX path, NULL for the TX path.
@@ -2397,6 +2460,7 @@ netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
static struct cdev *netmap_dev; /* /dev/netmap character device. */
extern struct cdevsw netmap_cdevsw;
+
void
netmap_fini(void)
{
@@ -2408,6 +2472,7 @@ netmap_fini(void)
printf("netmap: unloaded module.\n");
}
+
int
netmap_init(void)
{
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
index c2814146d2ef..6716168526dc 100644
--- a/sys/dev/netmap/netmap_freebsd.c
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -86,21 +86,31 @@ netmap_catch_rx(struct netmap_adapter *na, int intercept)
return 0;
}
+
/*
* Intercept the packet steering routine in the tx path,
* so that we can decide which queue is used for an mbuf.
* Second argument is non-zero to intercept, 0 to restore.
*
+ * actually we also need to redirect the if_transmit ?
+ *
* XXX see if FreeBSD has such a mechanism
*/
void
-netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable)
+netmap_catch_tx(struct netmap_generic_adapter *gna, int enable)
{
+ struct netmap_adapter *na = &gna->up.up;
+ struct ifnet *ifp = na->ifp;
+
if (enable) {
+ na->if_transmit = ifp->if_transmit;
+ ifp->if_transmit = netmap_transmit;
} else {
+ ifp->if_transmit = na->if_transmit;
}
}
+
/* Transmit routine used by generic_netmap_txsync(). Returns 0 on success
* and non-zero on error (which may be packet drops or other errors).
* addr and len identify the netmap buffer, m is the (preallocated)
@@ -126,16 +136,16 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m,
// copy data to the mbuf
m_copyback(m, 0, len, addr);
-
// inc refcount. We are alone, so we can skip the atomic
atomic_fetchadd_int(m->m_ext.ref_cnt, 1);
m->m_flags |= M_FLOWID;
m->m_pkthdr.flowid = ring_nr;
m->m_pkthdr.rcvif = ifp; /* used for tx notification */
- ret = ifp->if_transmit(ifp, m);
+ ret = NA(ifp)->if_transmit(ifp, m);
return ret;
}
+
/*
* The following two functions are empty until we have a generic
* way to extract the info from the ifp
@@ -147,6 +157,7 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
return 0;
}
+
void
generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
{
@@ -155,6 +166,7 @@ generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
*rxq = 1;
}
+
void netmap_mitigation_init(struct netmap_generic_adapter *na)
{
ND("called");
@@ -167,22 +179,26 @@ void netmap_mitigation_start(struct netmap_generic_adapter *na)
ND("called");
}
+
void netmap_mitigation_restart(struct netmap_generic_adapter *na)
{
ND("called");
}
+
int netmap_mitigation_active(struct netmap_generic_adapter *na)
{
ND("called");
return 0;
}
+
void netmap_mitigation_cleanup(struct netmap_generic_adapter *na)
{
ND("called");
}
+
/*
* In order to track whether pages are still mapped, we hook into
* the standard cdev_pager and intercept the constructor and
@@ -194,6 +210,7 @@ struct netmap_vm_handle_t {
struct netmap_priv_d *priv;
};
+
static int
netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
@@ -218,6 +235,7 @@ netmap_dev_pager_dtor(void *handle)
dev_rel(dev);
}
+
static int
netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
int prot, vm_page_t *mres)
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index 2c42db3f8862..109a734cac9f 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -82,7 +82,7 @@ __FBSDID("$FreeBSD$");
#include <dev/netmap/netmap_mem2.h>
#define rtnl_lock() D("rtnl_lock called");
-#define rtnl_unlock() D("rtnl_lock called");
+#define rtnl_unlock() D("rtnl_unlock called");
#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid)
#define smp_mb()
@@ -101,9 +101,9 @@ __FBSDID("$FreeBSD$");
* (or reinstall the buffer ?)
*/
#define SET_MBUF_DESTRUCTOR(m, fn) do { \
- (m)->m_ext.ext_free = (void *)fn; \
- (m)->m_ext.ext_type = EXT_EXTREF; \
- } while (0)
+ (m)->m_ext.ext_free = (void *)fn; \
+ (m)->m_ext.ext_type = EXT_EXTREF; \
+} while (0)
#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1)
@@ -137,43 +137,43 @@ __FBSDID("$FreeBSD$");
#ifdef RATE
#define IFRATE(x) x
struct rate_stats {
- unsigned long txpkt;
- unsigned long txsync;
- unsigned long txirq;
- unsigned long rxpkt;
- unsigned long rxirq;
- unsigned long rxsync;
+ unsigned long txpkt;
+ unsigned long txsync;
+ unsigned long txirq;
+ unsigned long rxpkt;
+ unsigned long rxirq;
+ unsigned long rxsync;
};
struct rate_context {
- unsigned refcount;
- struct timer_list timer;
- struct rate_stats new;
- struct rate_stats old;
+ unsigned refcount;
+ struct timer_list timer;
+ struct rate_stats new;
+ struct rate_stats old;
};
#define RATE_PRINTK(_NAME_) \
- printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD);
+ printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD);
#define RATE_PERIOD 2
static void rate_callback(unsigned long arg)
{
- struct rate_context * ctx = (struct rate_context *)arg;
- struct rate_stats cur = ctx->new;
- int r;
-
- RATE_PRINTK(txpkt);
- RATE_PRINTK(txsync);
- RATE_PRINTK(txirq);
- RATE_PRINTK(rxpkt);
- RATE_PRINTK(rxsync);
- RATE_PRINTK(rxirq);
- printk("\n");
-
- ctx->old = cur;
- r = mod_timer(&ctx->timer, jiffies +
- msecs_to_jiffies(RATE_PERIOD * 1000));
- if (unlikely(r))
- D("[v1000] Error: mod_timer()");
+ struct rate_context * ctx = (struct rate_context *)arg;
+ struct rate_stats cur = ctx->new;
+ int r;
+
+ RATE_PRINTK(txpkt);
+ RATE_PRINTK(txsync);
+ RATE_PRINTK(txirq);
+ RATE_PRINTK(rxpkt);
+ RATE_PRINTK(rxsync);
+ RATE_PRINTK(rxirq);
+ printk("\n");
+
+ ctx->old = cur;
+ r = mod_timer(&ctx->timer, jiffies +
+ msecs_to_jiffies(RATE_PERIOD * 1000));
+ if (unlikely(r))
+ D("[v1000] Error: mod_timer()");
}
static struct rate_context rate_ctx;
@@ -197,150 +197,150 @@ netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done)
if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP)))
return;
- netmap_common_irq(ifp, q, work_done);
+ netmap_common_irq(ifp, q, work_done);
}
/* Enable/disable netmap mode for a generic network interface. */
-int generic_netmap_register(struct netmap_adapter *na, int enable)
+static int
+generic_netmap_register(struct netmap_adapter *na, int enable)
{
- struct ifnet *ifp = na->ifp;
- struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
- struct mbuf *m;
- int error;
- int i, r;
+ struct ifnet *ifp = na->ifp;
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ struct mbuf *m;
+ int error;
+ int i, r;
- if (!na)
- return EINVAL;
+ if (!na)
+ return EINVAL;
#ifdef REG_RESET
- error = ifp->netdev_ops->ndo_stop(ifp);
- if (error) {
- return error;
- }
+ error = ifp->netdev_ops->ndo_stop(ifp);
+ if (error) {
+ return error;
+ }
#endif /* REG_RESET */
- if (enable) { /* Enable netmap mode. */
- /* Initialize the rx queue, as generic_rx_handler() can
- * be called as soon as netmap_catch_rx() returns.
- */
- for (r=0; r<na->num_rx_rings; r++) {
- mbq_safe_init(&na->rx_rings[r].rx_queue);
- na->rx_rings[r].nr_ntc = 0;
- }
-
- /* Init the mitigation timer. */
- netmap_mitigation_init(gna);
-
- /*
- * Preallocate packet buffers for the tx rings.
- */
- for (r=0; r<na->num_tx_rings; r++) {
- na->tx_rings[r].nr_ntc = 0;
- na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *),
- M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!na->tx_rings[r].tx_pool) {
- D("tx_pool allocation failed");
- error = ENOMEM;
- goto free_tx_pool;
- }
- for (i=0; i<na->num_tx_desc; i++) {
- m = netmap_get_mbuf(GENERIC_BUF_SIZE);
- if (!m) {
- D("tx_pool[%d] allocation failed", i);
- error = ENOMEM;
- goto free_mbufs;
- }
- na->tx_rings[r].tx_pool[i] = m;
- }
- }
- rtnl_lock();
- /* Prepare to intercept incoming traffic. */
- error = netmap_catch_rx(na, 1);
- if (error) {
- D("netdev_rx_handler_register() failed");
- goto register_handler;
- }
- ifp->if_capenable |= IFCAP_NETMAP;
-
- /* Make netmap control the packet steering. */
- netmap_catch_packet_steering(gna, 1);
-
- rtnl_unlock();
+ if (enable) { /* Enable netmap mode. */
+ /* Initialize the rx queue, as generic_rx_handler() can
+ * be called as soon as netmap_catch_rx() returns.
+ */
+ for (r=0; r<na->num_rx_rings; r++) {
+ mbq_safe_init(&na->rx_rings[r].rx_queue);
+ }
+
+ /* Init the mitigation timer. */
+ netmap_mitigation_init(gna);
+
+ /*
+ * Preallocate packet buffers for the tx rings.
+ */
+ for (r=0; r<na->num_tx_rings; r++)
+ na->tx_rings[r].tx_pool = NULL;
+ for (r=0; r<na->num_tx_rings; r++) {
+ na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!na->tx_rings[r].tx_pool) {
+ D("tx_pool allocation failed");
+ error = ENOMEM;
+ goto free_tx_pools;
+ }
+ for (i=0; i<na->num_tx_desc; i++)
+ na->tx_rings[r].tx_pool[i] = NULL;
+ for (i=0; i<na->num_tx_desc; i++) {
+ m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ if (!m) {
+ D("tx_pool[%d] allocation failed", i);
+ error = ENOMEM;
+ goto free_tx_pools;
+ }
+ na->tx_rings[r].tx_pool[i] = m;
+ }
+ }
+ rtnl_lock();
+ /* Prepare to intercept incoming traffic. */
+ error = netmap_catch_rx(na, 1);
+ if (error) {
+ D("netdev_rx_handler_register() failed");
+ goto register_handler;
+ }
+ ifp->if_capenable |= IFCAP_NETMAP;
+
+ /* Make netmap control the packet steering. */
+ netmap_catch_tx(gna, 1);
+
+ rtnl_unlock();
#ifdef RATE
- if (rate_ctx.refcount == 0) {
- D("setup_timer()");
- memset(&rate_ctx, 0, sizeof(rate_ctx));
- setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx);
- if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) {
- D("Error: mod_timer()");
- }
- }
- rate_ctx.refcount++;
+ if (rate_ctx.refcount == 0) {
+ D("setup_timer()");
+ memset(&rate_ctx, 0, sizeof(rate_ctx));
+ setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx);
+ if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) {
+ D("Error: mod_timer()");
+ }
+ }
+ rate_ctx.refcount++;
#endif /* RATE */
- } else { /* Disable netmap mode. */
- rtnl_lock();
+ } else { /* Disable netmap mode. */
+ rtnl_lock();
- ifp->if_capenable &= ~IFCAP_NETMAP;
+ ifp->if_capenable &= ~IFCAP_NETMAP;
- /* Release packet steering control. */
- netmap_catch_packet_steering(gna, 0);
+ /* Release packet steering control. */
+ netmap_catch_tx(gna, 0);
- /* Do not intercept packets on the rx path. */
- netmap_catch_rx(na, 0);
+ /* Do not intercept packets on the rx path. */
+ netmap_catch_rx(na, 0);
- rtnl_unlock();
+ rtnl_unlock();
- /* Free the mbufs going to the netmap rings */
- for (r=0; r<na->num_rx_rings; r++) {
- mbq_safe_purge(&na->rx_rings[r].rx_queue);
- mbq_safe_destroy(&na->rx_rings[r].rx_queue);
- }
+ /* Free the mbufs going to the netmap rings */
+ for (r=0; r<na->num_rx_rings; r++) {
+ mbq_safe_purge(&na->rx_rings[r].rx_queue);
+ mbq_safe_destroy(&na->rx_rings[r].rx_queue);
+ }
- netmap_mitigation_cleanup(gna);
+ netmap_mitigation_cleanup(gna);
- for (r=0; r<na->num_tx_rings; r++) {
- for (i=0; i<na->num_tx_desc; i++) {
- m_freem(na->tx_rings[r].tx_pool[i]);
- }
- free(na->tx_rings[r].tx_pool, M_DEVBUF);
- }
+ for (r=0; r<na->num_tx_rings; r++) {
+ for (i=0; i<na->num_tx_desc; i++) {
+ m_freem(na->tx_rings[r].tx_pool[i]);
+ }
+ free(na->tx_rings[r].tx_pool, M_DEVBUF);
+ }
#ifdef RATE
- if (--rate_ctx.refcount == 0) {
- D("del_timer()");
- del_timer(&rate_ctx.timer);
- }
+ if (--rate_ctx.refcount == 0) {
+ D("del_timer()");
+ del_timer(&rate_ctx.timer);
+ }
#endif
- }
+ }
#ifdef REG_RESET
- error = ifp->netdev_ops->ndo_open(ifp);
- if (error) {
- goto alloc_tx_pool;
- }
+ error = ifp->netdev_ops->ndo_open(ifp);
+ if (error) {
+ goto alloc_tx_pool;
+ }
#endif
- return 0;
+ return 0;
register_handler:
- rtnl_unlock();
-free_tx_pool:
- r--;
- i = na->num_tx_desc; /* Useless, but just to stay safe. */
-free_mbufs:
- i--;
- for (; r>=0; r--) {
- for (; i>=0; i--) {
- m_freem(na->tx_rings[r].tx_pool[i]);
- }
- free(na->tx_rings[r].tx_pool, M_DEVBUF);
- i = na->num_tx_desc - 1;
- }
-
- return error;
+ rtnl_unlock();
+free_tx_pools:
+ for (r=0; r<na->num_tx_rings; r++) {
+ if (na->tx_rings[r].tx_pool == NULL)
+ continue;
+ for (i=0; i<na->num_tx_desc; i++)
+ if (na->tx_rings[r].tx_pool[i])
+ m_freem(na->tx_rings[r].tx_pool[i]);
+ free(na->tx_rings[r].tx_pool, M_DEVBUF);
+ }
+
+ return error;
}
/*
@@ -351,93 +351,88 @@ free_mbufs:
static void
generic_mbuf_destructor(struct mbuf *m)
{
- if (netmap_verbose)
- D("Tx irq (%p) queue %d", m, MBUF_TXQ(m));
- netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL);
+ if (netmap_verbose)
+ D("Tx irq (%p) queue %d", m, MBUF_TXQ(m));
+ netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL);
#ifdef __FreeBSD__
- m->m_ext.ext_type = EXT_PACKET;
- m->m_ext.ext_free = NULL;
- if (*(m->m_ext.ref_cnt) == 0)
- *(m->m_ext.ref_cnt) = 1;
- uma_zfree(zone_pack, m);
+ m->m_ext.ext_type = EXT_PACKET;
+ m->m_ext.ext_free = NULL;
+ if (*(m->m_ext.ref_cnt) == 0)
+ *(m->m_ext.ref_cnt) = 1;
+ uma_zfree(zone_pack, m);
#endif /* __FreeBSD__ */
- IFRATE(rate_ctx.new.txirq++);
+ IFRATE(rate_ctx.new.txirq++);
}
-/* Record completed transmissions and update hwavail.
+/* Record completed transmissions and update hwtail.
*
- * nr_ntc is the oldest tx buffer not yet completed
- * (same as nr_hwavail + nr_hwcur + 1),
+ * The oldest tx buffer not yet completed is at nr_hwtail + 1,
* nr_hwcur is the first unsent buffer.
- * When cleaning, we try to recover buffers between nr_ntc and nr_hwcur.
*/
-static int
+static u_int
generic_netmap_tx_clean(struct netmap_kring *kring)
{
- u_int num_slots = kring->nkr_num_slots;
- u_int ntc = kring->nr_ntc;
- u_int hwcur = kring->nr_hwcur;
- u_int n = 0;
- struct mbuf **tx_pool = kring->tx_pool;
-
- while (ntc != hwcur) { /* buffers not completed */
- struct mbuf *m = tx_pool[ntc];
-
- if (unlikely(m == NULL)) {
- /* try to replenish the entry */
- tx_pool[ntc] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
- if (unlikely(m == NULL)) {
- D("mbuf allocation failed, XXX error");
- // XXX how do we proceed ? break ?
- return -ENOMEM;
- }
- } else if (GET_MBUF_REFCNT(m) != 1) {
- break; /* This mbuf is still busy: its refcnt is 2. */
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int nm_i = nm_next(kring->nr_hwtail, lim);
+ u_int hwcur = kring->nr_hwcur;
+ u_int n = 0;
+ struct mbuf **tx_pool = kring->tx_pool;
+
+ while (nm_i != hwcur) { /* buffers not completed */
+ struct mbuf *m = tx_pool[nm_i];
+
+ if (unlikely(m == NULL)) {
+ /* this is done, try to replenish the entry */
+ tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ if (unlikely(m == NULL)) {
+ D("mbuf allocation failed, XXX error");
+ // XXX how do we proceed ? break ?
+ return -ENOMEM;
+ }
+ } else if (GET_MBUF_REFCNT(m) != 1) {
+ break; /* This mbuf is still busy: its refcnt is 2. */
+ }
+ n++;
+ nm_i = nm_next(nm_i, lim);
}
- if (unlikely(++ntc == num_slots)) {
- ntc = 0;
- }
- n++;
- }
- kring->nr_ntc = ntc;
- kring->nr_hwavail += n;
- ND("tx completed [%d] -> hwavail %d", n, kring->nr_hwavail);
-
- return n;
+ kring->nr_hwtail = nm_prev(nm_i, lim);
+ ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail);
+
+ return n;
}
/*
- * We have pending packets in the driver between nr_ntc and j.
+ * We have pending packets in the driver between nr_hwtail +1 and hwcur.
* Compute a position in the middle, to be used to generate
* a notification.
*/
static inline u_int
generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
{
- u_int n = kring->nkr_num_slots;
- u_int ntc = kring->nr_ntc;
- u_int e;
-
- if (hwcur >= ntc) {
- e = (hwcur + ntc) / 2;
- } else { /* wrap around */
- e = (hwcur + n + ntc) / 2;
- if (e >= n) {
- e -= n;
- }
- }
-
- if (unlikely(e >= n)) {
- D("This cannot happen");
- e = 0;
- }
-
- return e;
+ u_int n = kring->nkr_num_slots;
+ u_int ntc = nm_next(kring->nr_hwtail, n-1);
+ u_int e;
+
+ if (hwcur >= ntc) {
+ e = (hwcur + ntc) / 2;
+ } else { /* wrap around */
+ e = (hwcur + n + ntc) / 2;
+ if (e >= n) {
+ e -= n;
+ }
+ }
+
+ if (unlikely(e >= n)) {
+ D("This cannot happen");
+ e = 0;
+ }
+
+ return e;
}
/*
- * We have pending packets in the driver between nr_ntc and hwcur.
+ * We have pending packets in the driver between nr_hwtail+1 and hwcur.
* Schedule a notification approximately in the middle of the two.
* There is a race but this is only called within txsync which does
* a double check.
@@ -445,28 +440,28 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
static void
generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
{
- struct mbuf *m;
- u_int e;
-
- if (kring->nr_ntc == hwcur) {
- return;
- }
- e = generic_tx_event_middle(kring, hwcur);
-
- m = kring->tx_pool[e];
- if (m == NULL) {
- /* This can happen if there is already an event on the netmap
- slot 'e': There is nothing to do. */
- return;
- }
- ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m));
- kring->tx_pool[e] = NULL;
- SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
-
- // XXX wmb() ?
- /* Decrement the refcount an free it if we have the last one. */
- m_freem(m);
- smp_mb();
+ struct mbuf *m;
+ u_int e;
+
+ if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) {
+ return; /* all buffers are free */
+ }
+ e = generic_tx_event_middle(kring, hwcur);
+
+ m = kring->tx_pool[e];
+ if (m == NULL) {
+ /* This can happen if there is already an event on the netmap
+ slot 'e': There is nothing to do. */
+ return;
+ }
+ ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m));
+ kring->tx_pool[e] = NULL;
+ SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+
+ // XXX wmb() ?
+ /* Decrement the refcount and free it if we have the last one. */
+ m_freem(m);
+ smp_mb();
}
@@ -480,133 +475,108 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
static int
generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
- struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
- struct netmap_ring *ring = kring->ring;
- u_int j, k, num_slots = kring->nkr_num_slots;
- int new_slots, ntx;
-
- IFRATE(rate_ctx.new.txsync++);
-
- // TODO: handle the case of mbuf allocation failure
- /* first, reclaim completed buffers */
- generic_netmap_tx_clean(kring);
-
- /* Take a copy of ring->cur now, and never read it again. */
- k = ring->cur;
- if (unlikely(k >= num_slots)) {
- return netmap_ring_reinit(kring);
- }
-
- rmb();
- j = kring->nr_hwcur;
- /*
- * 'new_slots' counts how many new slots have been added:
- * everything from hwcur to cur, excluding reserved ones, if any.
- * nr_hwreserved start from hwcur and counts how many slots were
- * not sent to the NIC from the previous round.
- */
- new_slots = k - j - kring->nr_hwreserved;
- if (new_slots < 0) {
- new_slots += num_slots;
- }
- ntx = 0;
- if (j != k) {
- /* Process new packets to send:
- * j is the current index in the netmap ring.
+ struct ifnet *ifp = na->ifp;
+ struct netmap_kring *kring = &na->tx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ u_int nm_i; /* index into the netmap ring */ // j
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = kring->rhead;
+
+ IFRATE(rate_ctx.new.txsync++);
+
+ // TODO: handle the case of mbuf allocation failure
+
+ rmb();
+
+ /*
+ * First part: process new packets to send.
*/
- while (j != k) {
- struct netmap_slot *slot = &ring->slot[j]; /* Current slot in the netmap ring */
- void *addr = NMB(slot);
- u_int len = slot->len;
- struct mbuf *m;
- int tx_ret;
-
- if (unlikely(addr == netmap_buffer_base || len > NETMAP_BUF_SIZE)) {
- return netmap_ring_reinit(kring);
- }
- /* Tale a mbuf from the tx pool and copy in the user packet. */
- m = kring->tx_pool[j];
- if (unlikely(!m)) {
- RD(5, "This should never happen");
- kring->tx_pool[j] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
- if (unlikely(m == NULL)) {
- D("mbuf allocation failed");
- break;
- }
- }
- /* XXX we should ask notifications when NS_REPORT is set,
- * or roughly every half frame. We can optimize this
- * by lazily requesting notifications only when a
- * transmission fails. Probably the best way is to
- * break on failures and set notifications when
- * ring->avail == 0 || j != k
- */
- tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
- if (unlikely(tx_ret)) {
- RD(5, "start_xmit failed: err %d [%u,%u,%u,%u]",
- tx_ret, kring->nr_ntc, j, k, kring->nr_hwavail);
- /*
- * No room for this mbuf in the device driver.
- * Request a notification FOR A PREVIOUS MBUF,
- * then call generic_netmap_tx_clean(kring) to do the
- * double check and see if we can free more buffers.
- * If there is space continue, else break;
- * NOTE: the double check is necessary if the problem
- * occurs in the txsync call after selrecord().
- * Also, we need some way to tell the caller that not
- * all buffers were queued onto the device (this was
- * not a problem with native netmap driver where space
- * is preallocated). The bridge has a similar problem
- * and we solve it there by dropping the excess packets.
- */
- generic_set_tx_event(kring, j);
- if (generic_netmap_tx_clean(kring)) { /* space now available */
- continue;
- } else {
- break;
- }
- }
- slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
- if (unlikely(++j == num_slots))
- j = 0;
- ntx++;
- }
-
- /* Update hwcur to the next slot to transmit. */
- kring->nr_hwcur = j;
-
- /*
- * Report all new slots as unavailable, even those not sent.
- * We account for them with with hwreserved, so that
- * nr_hwreserved =:= cur - nr_hwcur
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) { /* we have new packets to send */
+ while (nm_i != head) {
+ struct netmap_slot *slot = &ring->slot[nm_i];
+ u_int len = slot->len;
+ void *addr = NMB(slot);
+
+ /* device-specific */
+ struct mbuf *m;
+ int tx_ret;
+
+ NM_CHECK_ADDR_LEN(addr, len);
+
+ /* Take an mbuf from the tx pool and copy in the user packet. */
+ m = kring->tx_pool[nm_i];
+ if (unlikely(!m)) {
+ RD(5, "This should never happen");
+ kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ if (unlikely(m == NULL)) {
+ D("mbuf allocation failed");
+ break;
+ }
+ }
+ /* XXX we should ask notifications when NS_REPORT is set,
+ * or roughly every half frame. We can optimize this
+ * by lazily requesting notifications only when a
+ * transmission fails. Probably the best way is to
+ * break on failures and set notifications when
+ * ring->cur == ring->tail || nm_i != cur
+ */
+ tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
+ if (unlikely(tx_ret)) {
+ RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
+ tx_ret, nm_i, head, kring->nr_hwtail);
+ /*
+ * No room for this mbuf in the device driver.
+ * Request a notification FOR A PREVIOUS MBUF,
+ * then call generic_netmap_tx_clean(kring) to do the
+ * double check and see if we can free more buffers.
+ * If there is space continue, else break;
+ * NOTE: the double check is necessary if the problem
+ * occurs in the txsync call after selrecord().
+ * Also, we need some way to tell the caller that not
+ * all buffers were queued onto the device (this was
+ * not a problem with native netmap driver where space
+ * is preallocated). The bridge has a similar problem
+ * and we solve it there by dropping the excess packets.
+ */
+ generic_set_tx_event(kring, nm_i);
+ if (generic_netmap_tx_clean(kring)) { /* space now available */
+ continue;
+ } else {
+ break;
+ }
+ }
+ slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
+ nm_i = nm_next(nm_i, lim);
+ }
+
+ /* Update hwcur to the next slot to transmit. */
+ kring->nr_hwcur = nm_i; /* not head, we could break early */
+
+ IFRATE(rate_ctx.new.txpkt += ntx);
+ }
+
+ /*
+ * Second, reclaim completed buffers
*/
- kring->nr_hwavail -= new_slots;
- kring->nr_hwreserved = k - j;
- if (kring->nr_hwreserved < 0) {
- kring->nr_hwreserved += num_slots;
- }
-
- IFRATE(rate_ctx.new.txpkt += ntx);
-
- if (!kring->nr_hwavail) {
- /* No more available slots? Set a notification event
- * on a netmap slot that will be cleaned in the future.
- * No doublecheck is performed, since txsync() will be
- * called twice by netmap_poll().
- */
- generic_set_tx_event(kring, j);
- }
- ND("tx #%d, hwavail = %d", n, kring->nr_hwavail);
- }
-
- /* Synchronize the user's view to the kernel view. */
- ring->avail = kring->nr_hwavail;
- ring->reserved = kring->nr_hwreserved;
-
- return 0;
+ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
+ /* No more available slots? Set a notification event
+ * on a netmap slot that will be cleaned in the future.
+ * No doublecheck is performed, since txsync() will be
+ * called twice by netmap_poll().
+ */
+ generic_set_tx_event(kring, nm_i);
+ }
+ ND("tx #%d, hwtail = %d", n, kring->nr_hwtail);
+
+ generic_netmap_tx_clean(kring);
+
+ nm_txsync_finalize(kring);
+
+ return 0;
}
+
/*
* This handler is registered (through netmap_catch_rx())
* within the attached network interface
@@ -615,38 +585,38 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Stolen packets are put in a queue where the
* generic_netmap_rxsync() callback can extract them.
*/
-void generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
+void
+generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
{
- struct netmap_adapter *na = NA(ifp);
- struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
- u_int work_done;
- u_int rr = 0; // receive ring number
-
- ND("called");
- /* limit the size of the queue */
- if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
- m_freem(m);
- } else {
- mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m);
- }
-
- if (netmap_generic_mit < 32768) {
- /* no rx mitigation, pass notification up */
- netmap_generic_irq(na->ifp, rr, &work_done);
- IFRATE(rate_ctx.new.rxirq++);
- } else {
- /* same as send combining, filter notification if there is a
- * pending timer, otherwise pass it up and start a timer.
- */
- if (likely(netmap_mitigation_active(gna))) {
- /* Record that there is some pending work. */
- gna->mit_pending = 1;
- } else {
- netmap_generic_irq(na->ifp, rr, &work_done);
- IFRATE(rate_ctx.new.rxirq++);
- netmap_mitigation_start(gna);
- }
- }
+ struct netmap_adapter *na = NA(ifp);
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ u_int work_done;
+ u_int rr = 0; // receive ring number
+
+ /* limit the size of the queue */
+ if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
+ m_freem(m);
+ } else {
+ mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m);
+ }
+
+ if (netmap_generic_mit < 32768) {
+ /* no rx mitigation, pass notification up */
+ netmap_generic_irq(na->ifp, rr, &work_done);
+ IFRATE(rate_ctx.new.rxirq++);
+ } else {
+ /* same as send combining, filter notification if there is a
+ * pending timer, otherwise pass it up and start a timer.
+ */
+ if (likely(netmap_mitigation_active(gna))) {
+ /* Record that there is some pending work. */
+ gna->mit_pending = 1;
+ } else {
+ netmap_generic_irq(na->ifp, rr, &work_done);
+ IFRATE(rate_ctx.new.rxirq++);
+ netmap_mitigation_start(gna);
+ }
+ }
}
/*
@@ -658,105 +628,99 @@ void generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
static int
generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
- struct netmap_ring *ring = kring->ring;
- u_int j, n, lim = kring->nkr_num_slots - 1;
- int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
- u_int k, resvd = ring->reserved;
-
- if (ring->cur > lim)
- return netmap_ring_reinit(kring);
-
- /* Import newly received packets into the netmap ring. */
- if (netmap_no_pendintr || force_update) {
- uint16_t slot_flags = kring->nkr_slot_flags;
- struct mbuf *m;
-
- n = 0;
- j = kring->nr_ntc; /* first empty slot in the receive ring */
- /* extract buffers from the rx queue, stop at most one
- * slot before nr_hwcur (index k)
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ u_int nm_i; /* index into the netmap ring */ //j,
+ u_int n;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = nm_rxsync_prologue(kring);
+ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
+
+ if (head > lim)
+ return netmap_ring_reinit(kring);
+
+ /*
+ * First part: import newly received packets.
+ */
+ if (netmap_no_pendintr || force_update) {
+ /* extract buffers from the rx queue, stop at most one
+ * slot before nr_hwcur (stop_i)
+ */
+ uint16_t slot_flags = kring->nkr_slot_flags;
+ u_int stop_i = nm_prev(kring->nr_hwcur, lim);
+
+ nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */
+ for (n = 0; nm_i != stop_i; n++) {
+ int len;
+ void *addr = NMB(&ring->slot[nm_i]);
+ struct mbuf *m;
+
+ /* we only check the address here on generic rx rings */
+ if (addr == netmap_buffer_base) { /* Bad buffer */
+ return netmap_ring_reinit(kring);
+ }
+ /*
+ * Call the locked version of the function.
+ * XXX Ideally we could grab a batch of mbufs at once
+ * and save some locking overhead.
+ */
+ m = mbq_safe_dequeue(&kring->rx_queue);
+ if (!m) /* no more data */
+ break;
+ len = MBUF_LEN(m);
+ m_copydata(m, 0, len, addr);
+ ring->slot[nm_i].len = len;
+ ring->slot[nm_i].flags = slot_flags;
+ m_freem(m);
+ nm_i = nm_next(nm_i, lim);
+ n++;
+ }
+ if (n) {
+ kring->nr_hwtail = nm_i;
+ IFRATE(rate_ctx.new.rxpkt += n);
+ }
+ kring->nr_kflags &= ~NKR_PENDINTR;
+ }
+
+ // XXX should we invert the order ?
+ /*
+ * Second part: skip past packets that userspace has released.
*/
- k = (kring->nr_hwcur) ? kring->nr_hwcur-1 : lim;
- while (j != k) {
- int len;
- void *addr = NMB(&ring->slot[j]);
-
- if (addr == netmap_buffer_base) { /* Bad buffer */
- return netmap_ring_reinit(kring);
- }
- /*
- * Call the locked version of the function.
- * XXX Ideally we could grab a batch of mbufs at once,
- * by changing rx_queue into a ring.
- */
- m = mbq_safe_dequeue(&kring->rx_queue);
- if (!m)
- break;
- len = MBUF_LEN(m);
- m_copydata(m, 0, len, addr);
- ring->slot[j].len = len;
- ring->slot[j].flags = slot_flags;
- m_freem(m);
- if (unlikely(j++ == lim))
- j = 0;
- n++;
- }
- if (n) {
- kring->nr_ntc = j;
- kring->nr_hwavail += n;
- IFRATE(rate_ctx.new.rxpkt += n);
- }
- kring->nr_kflags &= ~NKR_PENDINTR;
- }
-
- // XXX should we invert the order ?
- /* Skip past packets that userspace has released */
- j = kring->nr_hwcur;
- k = ring->cur;
- if (resvd > 0) {
- if (resvd + ring->avail >= lim + 1) {
- D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
- ring->reserved = resvd = 0; // XXX panic...
- }
- k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
- }
- if (j != k) {
- /* Userspace has released some packets. */
- for (n = 0; j != k; n++) {
- struct netmap_slot *slot = &ring->slot[j];
-
- slot->flags &= ~NS_BUF_CHANGED;
- if (unlikely(j++ == lim))
- j = 0;
- }
- kring->nr_hwavail -= n;
- kring->nr_hwcur = k;
- }
- /* Tell userspace that there are new packets. */
- ring->avail = kring->nr_hwavail - resvd;
- IFRATE(rate_ctx.new.rxsync++);
-
- return 0;
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) {
+ /* Userspace has released some packets. */
+ for (n = 0; nm_i != head; n++) {
+ struct netmap_slot *slot = &ring->slot[nm_i];
+
+ slot->flags &= ~NS_BUF_CHANGED;
+ nm_i = nm_next(nm_i, lim);
+ }
+ kring->nr_hwcur = head;
+ }
+ /* tell userspace that there might be new packets. */
+ nm_rxsync_finalize(kring);
+ IFRATE(rate_ctx.new.rxsync++);
+
+ return 0;
}
static void
generic_netmap_dtor(struct netmap_adapter *na)
{
- struct ifnet *ifp = na->ifp;
- struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na;
- struct netmap_adapter *prev_na = gna->prev;
-
- if (prev_na != NULL) {
- D("Released generic NA %p", gna);
- if_rele(na->ifp);
- netmap_adapter_put(prev_na);
- }
- if (ifp != NULL) {
- WNA(ifp) = prev_na;
- D("Restored native NA %p", prev_na);
- na->ifp = NULL;
- }
+ struct ifnet *ifp = na->ifp;
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na;
+ struct netmap_adapter *prev_na = gna->prev;
+
+ if (prev_na != NULL) {
+ D("Released generic NA %p", gna);
+ if_rele(na->ifp);
+ netmap_adapter_put(prev_na);
+ }
+ if (ifp != NULL) {
+ WNA(ifp) = prev_na;
+ D("Restored native NA %p", prev_na);
+ na->ifp = NULL;
+ }
}
/*
@@ -773,46 +737,46 @@ generic_netmap_dtor(struct netmap_adapter *na)
int
generic_netmap_attach(struct ifnet *ifp)
{
- struct netmap_adapter *na;
- struct netmap_generic_adapter *gna;
- int retval;
- u_int num_tx_desc, num_rx_desc;
-
- num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */
-
- generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc);
- ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc);
-
- gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (gna == NULL) {
- D("no memory on attach, give up");
- return ENOMEM;
- }
- na = (struct netmap_adapter *)gna;
- na->ifp = ifp;
- na->num_tx_desc = num_tx_desc;
- na->num_rx_desc = num_rx_desc;
- na->nm_register = &generic_netmap_register;
- na->nm_txsync = &generic_netmap_txsync;
- na->nm_rxsync = &generic_netmap_rxsync;
- na->nm_dtor = &generic_netmap_dtor;
- /* when using generic, IFCAP_NETMAP is set so we force
- * NAF_SKIP_INTR to use the regular interrupt handler
- */
- na->na_flags = NAF_SKIP_INTR;
-
- ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)",
- ifp->num_tx_queues, ifp->real_num_tx_queues,
- ifp->tx_queue_len);
- ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)",
- ifp->num_rx_queues, ifp->real_num_rx_queues);
-
- generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
-
- retval = netmap_attach_common(na);
- if (retval) {
- free(gna, M_DEVBUF);
- }
-
- return retval;
+ struct netmap_adapter *na;
+ struct netmap_generic_adapter *gna;
+ int retval;
+ u_int num_tx_desc, num_rx_desc;
+
+ num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */
+
+ generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc);
+ ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc);
+
+ gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (gna == NULL) {
+ D("no memory on attach, give up");
+ return ENOMEM;
+ }
+ na = (struct netmap_adapter *)gna;
+ na->ifp = ifp;
+ na->num_tx_desc = num_tx_desc;
+ na->num_rx_desc = num_rx_desc;
+ na->nm_register = &generic_netmap_register;
+ na->nm_txsync = &generic_netmap_txsync;
+ na->nm_rxsync = &generic_netmap_rxsync;
+ na->nm_dtor = &generic_netmap_dtor;
+ /* when using generic, IFCAP_NETMAP is set so we force
+ * NAF_SKIP_INTR to use the regular interrupt handler
+ */
+ na->na_flags = NAF_SKIP_INTR;
+
+ ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)",
+ ifp->num_tx_queues, ifp->real_num_tx_queues,
+ ifp->tx_queue_len);
+ ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)",
+ ifp->num_rx_queues, ifp->real_num_rx_queues);
+
+ generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
+
+ retval = netmap_attach_common(na);
+ if (retval) {
+ free(gna, M_DEVBUF);
+ }
+
+ return retval;
}
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 9381cd4cedd3..74a46297ff3d 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
- * Copyright (C) 2013 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -53,7 +53,7 @@
#define NM_SELINFO_T struct selinfo
#define MBUF_LEN(m) ((m)->m_pkthdr.len)
#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif)
-#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m)
+#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m)
#define NM_ATOMIC_T volatile int // XXX ?
/* atomic operations */
@@ -76,7 +76,11 @@ struct hrtimer {
#define NM_SELINFO_T wait_queue_head_t
#define MBUF_LEN(m) ((m)->len)
#define MBUF_IFP(m) ((m)->dev)
-#define NM_SEND_UP(ifp, m) netif_rx(m)
+#define NM_SEND_UP(ifp, m) \
+ do { \
+ m->priority = NM_MAGIC_PRIORITY; \
+ netif_rx(m); \
+ } while (0)
#define NM_ATOMIC_T volatile long unsigned int
@@ -125,9 +129,9 @@ struct hrtimer {
do { \
struct timeval __xxts; \
microtime(&__xxts); \
- printf("%03d.%06d %s [%d] " format "\n", \
+ printf("%03d.%06d [%4d] %-25s " format "\n", \
(int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \
- __FUNCTION__, __LINE__, ##__VA_ARGS__); \
+ __LINE__, __FUNCTION__, ##__VA_ARGS__); \
} while (0)
/* rate limited, lps indicates how many per second */
@@ -158,15 +162,23 @@ extern NMG_LOCK_T netmap_global_lock;
* a ring across system calls.
*
* nr_hwcur index of the next buffer to refill.
- * It corresponds to ring->cur - ring->reserved
+ * It corresponds to ring->head
+ * at the time the system call returns.
*
- * nr_hwavail the number of slots "owned" by userspace.
- * nr_hwavail =:= ring->avail + ring->reserved
+ * nr_hwtail index of the first buffer owned by the kernel.
+ * On RX, hwcur->hwtail are receive buffers
+ * not yet released. hwcur is advanced following
+ * ring->head, hwtail is advanced on incoming packets,
+ * and a wakeup is generated when hwtail passes ring->cur
+ * On TX, hwcur->rcur have been filled by the sender
+ * but not sent yet to the NIC; rcur->hwtail are available
+ * for new transmissions, and hwtail->hwcur-1 are pending
+ * transmissions not yet acknowledged.
*
* The indexes in the NIC and netmap rings are offset by nkr_hwofs slots.
* This is so that, on a reset, buffers owned by userspace are not
* modified by the kernel. In particular:
- * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with
+ * RX rings: the next empty buffer (hwtail + hwofs) coincides with
* the next empty buffer as known by the hardware (next_to_check or so).
* TX rings: hwcur + hwofs coincides with next_to_send
*
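
As a small illustration of the hwcur/hwtail convention described above, the number of slots currently owned by the kernel (RX buffers not yet released, or TX buffers pending completion) is obtained with the usual wraparound subtraction; this is essentially what the nm_kr_rxspace() helper further below computes:

	/* slots between hwcur and hwtail, modulo the ring size */
	int space = kring->nr_hwtail - kring->nr_hwcur;
	if (space < 0)
		space += kring->nkr_num_slots;
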
@@ -184,44 +196,76 @@ extern NMG_LOCK_T netmap_global_lock;
* from nr_hwlease, advances it, then does the
* copy outside the lock.
* In RX rings (used for VALE ports),
- * nkr_hwcur + nkr_hwavail <= nkr_hwlease < nkr_hwcur+N-1
+ * nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1
* In TX rings (used for NIC or host stack ports)
- * nkr_hwcur <= nkr_hwlease < nkr_hwcur+ nkr_hwavail
+ * nkr_hwcur <= nkr_hwlease < nkr_hwtail
* nkr_leases array of nkr_num_slots where writers can report
* completion of their block. NR_NOSLOT (~0) indicates
* that the writer has not finished yet
* nkr_lease_idx index of next free slot in nr_leases, to be assigned
*
* The kring is manipulated by txsync/rxsync and generic netmap function.
- * q_lock is used to arbitrate access to the kring from within the netmap
- * code, and this and other protections guarantee that there is never
- * more than 1 concurrent call to txsync or rxsync. So we are free
- * to manipulate the kring from within txsync/rxsync without any extra
- * locks.
+ *
+ * Concurrent rxsync or txsync on the same ring are prevented through
+ * by nm_kr_lock() which in turn uses nr_busy. This is all we need
+ * for NIC rings, and for TX rings attached to the host stack.
+ *
+ * RX rings attached to the host stack use an mbq (rx_queue) on both
+ * rxsync_from_host() and netmap_transmit(). The mbq is protected
+ * by its internal lock.
+ *
+ * RX rings attached to the VALE switch are accessed by both sender
+ * and receiver. They are protected through the q_lock on the RX ring.
*/
struct netmap_kring {
- struct netmap_ring *ring;
- uint32_t nr_hwcur;
- uint32_t nr_hwavail;
- uint32_t nr_kflags; /* private driver flags */
- int32_t nr_hwreserved;
-#define NKR_PENDINTR 0x1 // Pending interrupt.
- uint32_t nkr_num_slots;
- int32_t nkr_hwofs; /* offset between NIC and netmap ring */
+ struct netmap_ring *ring;
+
+ uint32_t nr_hwcur;
+ uint32_t nr_hwtail;
+
+ /*
+ * Copies of values in user rings, so we do not need to look
+ * at the ring (which could be modified). These are set in the
+ * *sync_prologue()/finalize() routines.
+ */
+ uint32_t rhead;
+ uint32_t rcur;
+ uint32_t rtail;
+
+ uint32_t nr_kflags; /* private driver flags */
+#define NKR_PENDINTR 0x1 // Pending interrupt.
+ uint32_t nkr_num_slots;
+
+ /*
+ * On a NIC reset, the NIC ring indexes may be reset but the
+ * indexes in the netmap rings remain the same. nkr_hwofs
+ * keeps track of the offset between the two.
+ */
+ int32_t nkr_hwofs;
uint16_t nkr_slot_flags; /* initial value for flags */
+
+ /* last_reclaim is an opaque marker to help reduce the frequency
+ * of operations such as reclaiming tx buffers. A possible use
+ * is to set it to ticks and do the reclaim only once per tick.
+ */
+ uint64_t last_reclaim;
+
+
+ NM_SELINFO_T si; /* poll/select wait queue */
+ NM_LOCK_T q_lock; /* protects kring and ring. */
+ NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */
+
struct netmap_adapter *na;
- struct nm_bdg_fwd *nkr_ft;
- uint32_t *nkr_leases;
-#define NR_NOSLOT ((uint32_t)~0)
- uint32_t nkr_hwlease;
- uint32_t nkr_lease_idx;
- NM_SELINFO_T si; /* poll/select wait queue */
- NM_LOCK_T q_lock; /* protects kring and ring. */
- NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */
+ /* The following fields are for VALE switch support */
+ struct nm_bdg_fwd *nkr_ft;
+ uint32_t *nkr_leases;
+#define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */
+ uint32_t nkr_hwlease;
+ uint32_t nkr_lease_idx;
- volatile int nkr_stopped;
+ volatile int nkr_stopped; // XXX what for ?
/* support for adapters without native netmap support.
* On tx rings we preallocate an array of tx buffers
@@ -230,8 +274,11 @@ struct netmap_kring {
* XXX who writes to the rx queue ?
*/
struct mbuf **tx_pool;
- u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */
- struct mbq rx_queue; /* A queue for intercepted rx mbufs. */
+ // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */
+ struct mbq rx_queue; /* intercepted rx mbufs. */
+
+ uint32_t ring_id; /* debugging */
+ char name[64]; /* diagnostic */
} __attribute__((__aligned__(64)));
@@ -243,6 +290,15 @@ nm_next(uint32_t i, uint32_t lim)
return unlikely (i == lim) ? 0 : i + 1;
}
+
+/* return the previous index, with wraparound */
+static inline uint32_t
+nm_prev(uint32_t i, uint32_t lim)
+{
+ return unlikely (i == 0) ? lim : i - 1;
+}
+
+
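
A quick usage note on the two helpers above: with lim = nkr_num_slots - 1, nm_next(lim, lim) wraps to 0 and nm_prev(0, lim) wraps back to lim, so ring indexes can be advanced without explicit modulo arithmetic. A typical (hypothetical) fragment, where head stands for a validated copy of ring->head:

	u_int const lim = kring->nkr_num_slots - 1;
	u_int nm_i;

	/* walk all slots from hwcur up to (but not including) head */
	for (nm_i = kring->nr_hwcur; nm_i != head; nm_i = nm_next(nm_i, lim)) {
		/* ... process kring->ring->slot[nm_i] ... */
	}
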
/*
*
* Here is the layout for the Rx and Tx rings.
@@ -253,36 +309,36 @@ nm_next(uint32_t i, uint32_t lim)
| | | |
|XXX free slot XXX| |XXX free slot XXX|
+-----------------+ +-----------------+
- | |<-hwcur | |<-hwcur
- | reserved h | | (ready |
- +----------- w -+ | to be |
- cur->| a | | sent) h |
- | v | +---------- w |
- | a | cur->| (being a |
- | i | | prepared) v |
- | avail l | | a |
- +-----------------+ + a ------ i +
- | | ... | v l |<-hwlease
- | (being | ... | a | ...
- | prepared) | ... | i | ...
- +-----------------+ ... | l | ...
- | |<-hwlease +-----------------+
+head->| owned by user |<-hwcur | not sent to nic |<-hwcur
+ | | | yet |
+ +-----------------+ | |
+ cur->| available to | | |
+ | user, not read | +-----------------+
+ | yet | cur->| (being |
+ | | | prepared) |
| | | |
+ +-----------------+ + ------ +
+tail->| |<-hwtail | |<-hwlease
+ | (being | ... | | ...
+ | prepared) | ... | | ...
+ +-----------------+ ... | | ...
+ | |<-hwlease +-----------------+
+ | | tail->| |<-hwtail
| | | |
| | | |
| | | |
+-----------------+ +-----------------+
- * The cur/avail (user view) and hwcur/hwavail (kernel view)
+ * The cur/tail (user view) and hwcur/hwtail (kernel view)
* are used in the normal operation of the card.
*
* When a ring is the output of a switch port (Rx ring for
* a VALE port, Tx ring for the host stack or NIC), slots
* are reserved in blocks through 'hwlease' which points
* to the next unused slot.
- * On an Rx ring, hwlease is always after hwavail,
- * and completions cause avail to advance.
- * On a Tx ring, hwlease is always between cur and hwavail,
+ * On an Rx ring, hwlease is always after hwtail,
+ * and completions cause hwtail to advance.
+ * On a Tx ring, hwlease is always between cur and hwtail,
* and completions cause cur to advance.
*
* nm_kr_space() returns the maximum number of slots that
@@ -294,7 +350,6 @@ nm_next(uint32_t i, uint32_t lim)
-
enum txrx { NR_RX = 0, NR_TX = 1 };
/*
@@ -349,6 +404,7 @@ struct netmap_adapter {
*/
struct netmap_kring *tx_rings; /* array of TX rings. */
struct netmap_kring *rx_rings; /* array of RX rings. */
+
void *tailroom; /* space below the rings array */
/* (used for leases) */
@@ -360,11 +416,38 @@ struct netmap_adapter {
*/
int (*if_transmit)(struct ifnet *, struct mbuf *);
+ /* copy of if_input for netmap_send_up() */
+ void (*if_input)(struct ifnet *, struct mbuf *);
+
/* references to the ifnet and device routines, used by
* the generic netmap functions.
*/
struct ifnet *ifp; /* adapter is ifp->if_softc */
+ /*---- callbacks for this netmap adapter -----*/
+ /*
+ * nm_dtor() is the cleanup routine called when destroying
+ * the adapter.
+ *
+ * nm_register() is called on NIOCREGIF and close() to enter
+ * or exit netmap mode on the NIC
+ *
+ * nm_txsync() pushes packets to the underlying hw/switch
+ *
+ * nm_rxsync() collects packets from the underlying hw/switch
+ *
+ * nm_config() returns configuration information from the OS
+ *
+ * nm_krings_create() XXX
+ *
+ * nm_krings_delete() XXX
+ *
+ * nm_notify() is used to act after data have become available.
+ * For hw devices this is typically a selwakeup(),
+ * but for NIC/host ports attached to a switch (or vice-versa)
+ * we also need to invoke the 'txsync' code downstream.
+ */
+
/* private cleanup */
void (*nm_dtor)(struct netmap_adapter *);
@@ -403,6 +486,7 @@ struct netmap_adapter {
void *na_private;
};
+
/*
* If the NIC is owned by the kernel
* (i.e., bridge), neither another bridge nor user can use it;
@@ -433,13 +517,15 @@ struct netmap_vp_adapter { /* VALE software port */
u_int offset; /* Offset of ethernet header for each packet. */
};
+
struct netmap_hw_adapter { /* physical device */
struct netmap_adapter up;
struct net_device_ops nm_ndo; // XXX linux only
};
-struct netmap_generic_adapter { /* non-native device */
+
+struct netmap_generic_adapter { /* emulated device */
struct netmap_hw_adapter up;
/* Pointer to a previously used netmap adapter. */
@@ -455,16 +541,20 @@ struct netmap_generic_adapter { /* non-native device */
struct hrtimer mit_timer;
int mit_pending;
+#ifdef linux
+ netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *);
+#endif
};
#ifdef WITH_VALE
-/* bridge wrapper for non VALE ports. It is used to connect real devices to the bridge.
+/*
+ * Bridge wrapper for non VALE ports attached to a VALE switch.
*
- * The real device must already have its own netmap adapter (hwna). The
- * bridge wrapper and the hwna adapter share the same set of netmap rings and
- * buffers, but they have two separate sets of krings descriptors, with tx/rx
- * meanings swapped:
+ * The real device must already have its own netmap adapter (hwna).
+ * The bridge wrapper and the hwna adapter share the same set of
+ * netmap rings and buffers, but they have two separate sets of
+ * krings descriptors, with tx/rx meanings swapped:
*
* netmap
* bwrap krings rings krings hwna
@@ -478,23 +568,28 @@ struct netmap_generic_adapter { /* non-native device */
* | | +------+ +-----+ +------+ | |
* +------+ +------+
*
- * - packets coming from the bridge go to the brwap rx rings, which are also the
- * hwna tx rings. The bwrap notify callback will then complete the hwna tx
- * (see netmap_bwrap_notify).
- * - packets coming from the outside go to the hwna rx rings, which are also the
- * bwrap tx rings. The (overwritten) hwna notify method will then complete
- * the bridge tx (see netmap_bwrap_intr_notify).
+ * - packets coming from the bridge go to the bwrap rx rings,
+ * which are also the hwna tx rings. The bwrap notify callback
+ * will then complete the hwna tx (see netmap_bwrap_notify).
*
- * The bridge wrapper may optionally connect the hwna 'host' rings to the
- * bridge. This is done by using a second port in the bridge and connecting it
- * to the 'host' netmap_vp_adapter contained in the netmap_bwrap_adapter.
- * The brwap host adapter cross-links the hwna host rings in the same way as shown above.
+ * - packets coming from the outside go to the hwna rx rings,
+ * which are also the bwrap tx rings. The (overwritten) hwna
+ * notify method will then complete the bridge tx
+ * (see netmap_bwrap_intr_notify).
*
- * - packets coming from the bridge and directed to host stack are handled by the
- * bwrap host notify callback (see netmap_bwrap_host_notify)
- * - packets coming from the host stack are still handled by the overwritten
- * hwna notify callback (netmap_bwrap_intr_notify), but are diverted to the
- * host adapter depending on the ring number.
+ * The bridge wrapper may optionally connect the hwna 'host' rings
+ * to the bridge. This is done by using a second port in the
+ * bridge and connecting it to the 'host' netmap_vp_adapter
+ * contained in the netmap_bwrap_adapter. The bwrap host adapter
+ * cross-links the hwna host rings in the same way as shown above.
+ *
+ * - packets coming from the bridge and directed to the host stack
+ * are handled by the bwrap host notify callback
+ * (see netmap_bwrap_host_notify)
+ *
+ * - packets coming from the host stack are still handled by the
+ * overwritten hwna notify callback (netmap_bwrap_intr_notify),
+ * but are diverted to the host adapter depending on the ring number.
*
*/
struct netmap_bwrap_adapter {
@@ -505,103 +600,39 @@ struct netmap_bwrap_adapter {
/* backup of the hwna notify callback */
int (*save_notify)(struct netmap_adapter *,
u_int ring, enum txrx, int flags);
- /* When we attach a physical interface to the bridge, we
+
+ /*
+ * When we attach a physical interface to the bridge, we
* allow the controlling process to terminate, so we need
* a place to store the netmap_priv_d data structure.
- * This is only done when physical interfaces are attached to a bridge.
+ * This is only done when physical interfaces
+ * are attached to a bridge.
*/
struct netmap_priv_d *na_kpriv;
};
-/*
- * Available space in the ring. Only used in VALE code
- */
-static inline uint32_t
-nm_kr_space(struct netmap_kring *k, int is_rx)
-{
- int space;
-
- if (is_rx) {
- int busy = k->nkr_hwlease - k->nr_hwcur + k->nr_hwreserved;
- if (busy < 0)
- busy += k->nkr_num_slots;
- space = k->nkr_num_slots - 1 - busy;
- } else {
- space = k->nr_hwcur + k->nr_hwavail - k->nkr_hwlease;
- if (space < 0)
- space += k->nkr_num_slots;
- }
-#if 0
- // sanity check
- if (k->nkr_hwlease >= k->nkr_num_slots ||
- k->nr_hwcur >= k->nkr_num_slots ||
- k->nr_hwavail >= k->nkr_num_slots ||
- busy < 0 ||
- busy >= k->nkr_num_slots) {
- D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease,
- k->nkr_lease_idx, k->nkr_num_slots);
- }
-#endif
- return space;
-}
-
-
+#endif /* WITH_VALE */
-/* make a lease on the kring for N positions. return the
- * lease index
- */
+/* return slots reserved to rx clients; used in drivers */
static inline uint32_t
-nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
+nm_kr_rxspace(struct netmap_kring *k)
{
- uint32_t lim = k->nkr_num_slots - 1;
- uint32_t lease_idx = k->nkr_lease_idx;
-
- k->nkr_leases[lease_idx] = NR_NOSLOT;
- k->nkr_lease_idx = nm_next(lease_idx, lim);
+ int space = k->nr_hwtail - k->nr_hwcur;
+ if (space < 0)
+ space += k->nkr_num_slots;
+ ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail);
- if (n > nm_kr_space(k, is_rx)) {
- D("invalid request for %d slots", n);
- panic("x");
- }
- /* XXX verify that there are n slots */
- k->nkr_hwlease += n;
- if (k->nkr_hwlease > lim)
- k->nkr_hwlease -= lim + 1;
-
- if (k->nkr_hwlease >= k->nkr_num_slots ||
- k->nr_hwcur >= k->nkr_num_slots ||
- k->nr_hwavail >= k->nkr_num_slots ||
- k->nkr_lease_idx >= k->nkr_num_slots) {
- D("invalid kring %s, cur %d avail %d lease %d lease_idx %d lim %d",
- k->na->ifp->if_xname,
- k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease,
- k->nkr_lease_idx, k->nkr_num_slots);
- }
- return lease_idx;
+ return space;
}
-#endif /* WITH_VALE */
-/* return update position */
-static inline uint32_t
-nm_kr_rxpos(struct netmap_kring *k)
+/* True if no space in the tx ring. only valid after txsync_prologue */
+static inline int
+nm_kr_txempty(struct netmap_kring *kring)
{
- uint32_t pos = k->nr_hwcur + k->nr_hwavail;
- if (pos >= k->nkr_num_slots)
- pos -= k->nkr_num_slots;
-#if 0
- if (pos >= k->nkr_num_slots ||
- k->nkr_hwlease >= k->nkr_num_slots ||
- k->nr_hwcur >= k->nkr_num_slots ||
- k->nr_hwavail >= k->nkr_num_slots ||
- k->nkr_lease_idx >= k->nkr_num_slots) {
- D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease,
- k->nkr_lease_idx, k->nkr_num_slots);
- }
-#endif
- return pos;
+ return kring->rcur == kring->nr_hwtail;
}
@@ -613,11 +644,13 @@ nm_kr_rxpos(struct netmap_kring *k)
#define NM_KR_BUSY 1
#define NM_KR_STOPPED 2
+
static __inline void nm_kr_put(struct netmap_kring *kr)
{
NM_ATOMIC_CLEAR(&kr->nr_busy);
}
+
static __inline int nm_kr_tryget(struct netmap_kring *kr)
{
/* check a first time without taking the lock
@@ -640,7 +673,7 @@ static __inline int nm_kr_tryget(struct netmap_kring *kr)
/*
- * The following are support routines used by individual drivers to
+ * The following functions are used by individual drivers to
* support netmap operation.
*
* netmap_attach() initializes a struct netmap_adapter, allocating the
@@ -666,7 +699,17 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na,
enum txrx tx, u_int n, u_int new_cur);
int netmap_ring_reinit(struct netmap_kring *);
-/* set/clear native flags. XXX maybe also if_transmit ? */
+/* default functions to handle rx/tx interrupts */
+int netmap_rx_irq(struct ifnet *, u_int, u_int *);
+#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
+void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);
+
+void netmap_disable_all_rings(struct ifnet *);
+void netmap_enable_all_rings(struct ifnet *);
+void netmap_disable_ring(struct netmap_kring *kr);
+
+
+/* set/clear native flags and if_transmit/netdev_ops */
static inline void
nm_set_native_flags(struct netmap_adapter *na)
{
@@ -685,6 +728,7 @@ nm_set_native_flags(struct netmap_adapter *na)
#endif
}
+
static inline void
nm_clear_native_flags(struct netmap_adapter *na)
{
@@ -701,36 +745,58 @@ nm_clear_native_flags(struct netmap_adapter *na)
#endif
}
+
/*
- * validates parameters in the ring/kring, returns a value for cur,
- * and the 'new_slots' value in the argument.
- * If any error, returns cur > lim to force a reinit.
+ * validates parameters in the ring/kring, returns a value for head
+ * If any error, returns ring_size to force a reinit.
*/
-u_int nm_txsync_prologue(struct netmap_kring *, u_int *);
+uint32_t nm_txsync_prologue(struct netmap_kring *);
+
/*
- * validates parameters in the ring/kring, returns a value for cur,
+ * validates parameters in the ring/kring, returns a value for head,
* and the 'reserved' value in the argument.
- * If any error, returns cur > lim to force a reinit.
+ * If any error, returns ring_size to force a reinit.
+ */
+uint32_t nm_rxsync_prologue(struct netmap_kring *);
+
+
+/*
+ * update kring and ring at the end of txsync.
*/
-u_int nm_rxsync_prologue(struct netmap_kring *, u_int *);
+static inline void
+nm_txsync_finalize(struct netmap_kring *kring)
+{
+ /* update ring head/tail to what the kernel knows */
+ kring->ring->tail = kring->rtail = kring->nr_hwtail;
+ kring->ring->head = kring->rhead = kring->nr_hwcur;
+
+ /* note, head/rhead/hwcur might be behind cur/rcur
+ * if no carrier
+ */
+ ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
+ kring->name, kring->nr_hwcur, kring->nr_hwtail,
+ kring->rhead, kring->rcur, kring->rtail);
+}
+
/*
- * update kring and ring at the end of txsync
+ * update kring and ring at the end of rxsync
*/
static inline void
-nm_txsync_finalize(struct netmap_kring *kring, u_int cur)
+nm_rxsync_finalize(struct netmap_kring *kring)
{
- /* recompute hwreserved */
- kring->nr_hwreserved = cur - kring->nr_hwcur;
- if (kring->nr_hwreserved < 0)
- kring->nr_hwreserved += kring->nkr_num_slots;
-
- /* update avail and reserved to what the kernel knows */
- kring->ring->avail = kring->nr_hwavail;
- kring->ring->reserved = kring->nr_hwreserved;
+ /* tell userspace that there might be new packets */
+ //struct netmap_ring *ring = kring->ring;
+ ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail,
+ kring->nr_hwtail);
+ kring->ring->tail = kring->rtail = kring->nr_hwtail;
+ /* make a copy of the state for next round */
+ kring->rhead = kring->ring->head;
+ kring->rcur = kring->ring->cur;
}
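
Taken together, the prologue/finalize pair defines the shape of a native txsync handler: validate the user-visible ring state, push new slots to the NIC, reclaim completions, then publish the new head/tail. A simplified sketch follows; the hardware-specific steps are hypothetical placeholders, not part of this patch:

/* Sketch of a native txsync built on nm_txsync_prologue()/nm_txsync_finalize().
 * Descriptor programming and completion handling are placeholders. */
static int
sketch_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->tx_rings[ring_nr];
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = nm_txsync_prologue(kring);
	u_int nm_i;

	if (head > lim)
		return netmap_ring_reinit(kring);

	/* first part: send slots [nr_hwcur .. head-1] */
	for (nm_i = kring->nr_hwcur; nm_i != head; nm_i = nm_next(nm_i, lim)) {
		/* ... program one hardware descriptor from kring->ring->slot[nm_i] ... */
	}
	kring->nr_hwcur = head;

	/* second part: reclaim completed buffers and advance nr_hwtail */
	/* kring->nr_hwtail = <index derived from hardware completion state>; */

	nm_txsync_finalize(kring);	/* publish head/tail back to the user ring */
	return 0;
}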
+
/* check/fix address and len in tx rings */
#if 1 /* debug version */
#define NM_CHECK_ADDR_LEN(_a, _l) do { \
@@ -755,6 +821,8 @@ nm_txsync_finalize(struct netmap_kring *kring, u_int cur)
int netmap_update_config(struct netmap_adapter *na);
int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom);
void netmap_krings_delete(struct netmap_adapter *na);
+int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
+
struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
@@ -766,10 +834,13 @@ u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg);
int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na);
+
#ifdef WITH_VALE
/*
- * The following bridge-related interfaces are used by other kernel modules
- * In the version that only supports unicast or broadcast, the lookup
+ * The following bridge-related functions are used by other
+ * kernel modules.
+ *
+ * VALE only supports unicast or broadcast. The lookup
* function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports,
* NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown.
* XXX in practice "unknown" might be handled same as broadcast.
@@ -799,8 +870,6 @@ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);
/* Various prototypes */
int netmap_poll(struct cdev *dev, int events, struct thread *td);
-
-
int netmap_init(void);
void netmap_fini(void);
int netmap_get_memory(struct netmap_priv_d* p);
@@ -811,7 +880,8 @@ int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct t
/* netmap_adapter creation/destruction */
#define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie")
-#define NM_DEBUG_PUTGET 1
+
+// #define NM_DEBUG_PUTGET 1
#ifdef NM_DEBUG_PUTGET
@@ -844,12 +914,15 @@ int netmap_adapter_put(struct netmap_adapter *na);
#endif /* !NM_DEBUG_PUTGET */
+/*
+ * module variables
+ */
extern u_int netmap_buf_size;
#define NETMAP_BUF_SIZE netmap_buf_size // XXX remove
-extern int netmap_mitigate;
+extern int netmap_mitigate; // XXX not really used
extern int netmap_no_pendintr;
-extern u_int netmap_total_buffers;
-extern char *netmap_buffer_base;
+extern u_int netmap_total_buffers; // global allocator
+extern char *netmap_buffer_base; // global allocator
extern int netmap_verbose; // XXX debugging
enum { /* verbose flags */
NM_VERB_ON = 1, /* generic verbose */
@@ -908,7 +981,7 @@ extern int netmap_generic_ringsize;
#ifdef __FreeBSD__
-/* Callback invoked by the dma machinery after a successfull dmamap_load */
+/* Callback invoked by the dma machinery after a successful dmamap_load */
static void netmap_dmamap_cb(__unused void *arg,
__unused bus_dma_segment_t * segs, __unused int nseg, __unused int error)
{
@@ -1053,31 +1126,27 @@ BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot)
lut[0].vaddr : lut[i].vaddr;
}
-/* default functions to handle rx/tx interrupts */
-int netmap_rx_irq(struct ifnet *, u_int, u_int *);
-#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
-void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);
void netmap_txsync_to_host(struct netmap_adapter *na);
-void netmap_disable_all_rings(struct ifnet *);
-void netmap_enable_all_rings(struct ifnet *);
-void netmap_disable_ring(struct netmap_kring *kr);
-/* Structure associated to each thread which registered an interface.
+/*
+ * Structure associated to each thread which registered an interface.
*
* The first 4 fields of this structure are written by NIOCREGIF and
* read by poll() and NIOC?XSYNC.
- * There is low contention among writers (actually, a correct user program
- * should have no contention among writers) and among writers and readers,
- * so we use a single global lock to protect the structure initialization.
- * Since initialization involves the allocation of memory, we reuse the memory
- * allocator lock.
+ *
+ * There is low contention among writers (a correct user program
+ * should have none) and among writers and readers, so we use a
+ * single global lock to protect the structure initialization;
+ * since initialization involves the allocation of memory,
+ * we reuse the memory allocator lock.
+ *
* Read access to the structure is lock free. Readers must check that
* np_nifp is not NULL before using the other fields.
- * If np_nifp is NULL initialization has not been performed, so they should
- * return an error to userlevel.
+ * If np_nifp is NULL initialization has not been performed,
+ * so they should return an error to userspace.
*
* The ref_done field is used to regulate access to the refcount in the
* memory allocator. The refcount must be incremented at most once for
@@ -1091,38 +1160,29 @@ struct netmap_priv_d {
struct netmap_if * volatile np_nifp; /* netmap if descriptor. */
struct netmap_adapter *np_na;
- int np_ringid; /* from the ioctl */
- u_int np_qfirst, np_qlast; /* range of rings to scan */
- uint16_t np_txpoll;
+ int np_ringid; /* from the ioctl */
+ u_int np_qfirst, np_qlast; /* range of rings to scan */
+ uint16_t np_txpoll;
struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */
/* np_refcount is only used on FreeBSD */
- int np_refcount; /* use with NMG_LOCK held */
+ int np_refcount; /* use with NMG_LOCK held */
};
/*
* generic netmap emulation for devices that do not have
* native netmap support.
- * XXX generic_netmap_register() is only exported to implement
- * nma_is_generic().
*/
-int generic_netmap_register(struct netmap_adapter *na, int enable);
int generic_netmap_attach(struct ifnet *ifp);
int netmap_catch_rx(struct netmap_adapter *na, int intercept);
 void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);
-void netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable);
+void netmap_catch_tx(struct netmap_generic_adapter *na, int enable);
int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr);
int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
-static __inline int
-nma_is_generic(struct netmap_adapter *na)
-{
- return na->nm_register == generic_netmap_register;
-}
-
/*
* netmap_mitigation API. This is used by the generic adapter
* to reduce the number of interrupt requests/selwakeup
@@ -1134,6 +1194,4 @@ void netmap_mitigation_restart(struct netmap_generic_adapter *na);
int netmap_mitigation_active(struct netmap_generic_adapter *na);
void netmap_mitigation_cleanup(struct netmap_generic_adapter *na);
-// int generic_timer_handler(struct hrtimer *t);
-
#endif /* _NET_NETMAP_KERN_H_ */
diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c
index c8e581b69fe5..2606b13d48dc 100644
--- a/sys/dev/netmap/netmap_mbq.c
+++ b/sys/dev/netmap/netmap_mbq.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Vincenzo Maffione. All rights reserved.
+ * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -47,17 +47,20 @@ static inline void __mbq_init(struct mbq *q)
q->count = 0;
}
+
void mbq_safe_init(struct mbq *q)
{
mtx_init(&q->lock, "mbq", NULL, MTX_SPIN);
__mbq_init(q);
}
+
void mbq_init(struct mbq *q)
{
__mbq_init(q);
}
+
static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m)
{
m->m_nextpkt = NULL;
@@ -70,6 +73,7 @@ static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m)
q->count++;
}
+
void mbq_safe_enqueue(struct mbq *q, struct mbuf *m)
{
mtx_lock(&q->lock);
@@ -77,11 +81,13 @@ void mbq_safe_enqueue(struct mbq *q, struct mbuf *m)
mtx_unlock(&q->lock);
}
+
void mbq_enqueue(struct mbq *q, struct mbuf *m)
{
__mbq_enqueue(q, m);
}
+
static inline struct mbuf *__mbq_dequeue(struct mbq *q)
{
struct mbuf *ret = NULL;
@@ -99,6 +105,7 @@ static inline struct mbuf *__mbq_dequeue(struct mbq *q)
return ret;
}
+
struct mbuf *mbq_safe_dequeue(struct mbq *q)
{
struct mbuf *ret;
@@ -110,11 +117,13 @@ struct mbuf *mbq_safe_dequeue(struct mbq *q)
return ret;
}
+
struct mbuf *mbq_dequeue(struct mbq *q)
{
return __mbq_dequeue(q);
}
+
/* XXX seems pointless to have a generic purge */
static void __mbq_purge(struct mbq *q, int safe)
{
@@ -130,16 +139,19 @@ static void __mbq_purge(struct mbq *q, int safe)
}
}
+
void mbq_purge(struct mbq *q)
{
__mbq_purge(q, 0);
}
+
void mbq_safe_purge(struct mbq *q)
{
__mbq_purge(q, 1);
}
+
void mbq_safe_destroy(struct mbq *q)
{
mtx_destroy(&q->lock);
@@ -149,4 +161,3 @@ void mbq_safe_destroy(struct mbq *q)
void mbq_destroy(struct mbq *q)
{
}
-
diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h
index ad023b617a5d..d273d8a8fa23 100644
--- a/sys/dev/netmap/netmap_mbq.h
+++ b/sys/dev/netmap/netmap_mbq.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Vincenzo Maffione. All rights reserved.
+ * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c
index f28f2c04751a..b25f79cef3a4 100644
--- a/sys/dev/netmap/netmap_mem2.c
+++ b/sys/dev/netmap/netmap_mem2.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
+ * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -506,7 +506,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj
p->r_objsize = objsize;
#define MAX_CLUSTSIZE (1<<17)
-#define LINE_ROUND 64
+#define LINE_ROUND NM_CACHE_ALIGN // 64
if (objsize >= MAX_CLUSTSIZE) {
/* we could do it but there is no point */
D("unsupported allocation for %d bytes", objsize);
@@ -960,13 +960,15 @@ netmap_mem_rings_create(struct netmap_adapter *na)
ND("txring[%d] at %p ofs %d", i, ring);
kring->ring = ring;
*(uint32_t *)(uintptr_t)&ring->num_slots = ndesc;
- *(ssize_t *)(uintptr_t)&ring->buf_ofs =
+ *(int64_t *)(uintptr_t)&ring->buf_ofs =
(na->nm_mem->pools[NETMAP_IF_POOL].memtotal +
na->nm_mem->pools[NETMAP_RING_POOL].memtotal) -
netmap_ring_offset(na->nm_mem, ring);
- ring->avail = kring->nr_hwavail;
- ring->cur = kring->nr_hwcur;
+ /* copy values from kring */
+ ring->head = kring->rhead;
+ ring->cur = kring->rcur;
+ ring->tail = kring->rtail;
*(uint16_t *)(uintptr_t)&ring->nr_buf_size =
NETMAP_BDG_BUF_SIZE(na->nm_mem);
ND("initializing slots for txring");
@@ -989,13 +991,15 @@ netmap_mem_rings_create(struct netmap_adapter *na)
kring->ring = ring;
*(uint32_t *)(uintptr_t)&ring->num_slots = ndesc;
- *(ssize_t *)(uintptr_t)&ring->buf_ofs =
+ *(int64_t *)(uintptr_t)&ring->buf_ofs =
(na->nm_mem->pools[NETMAP_IF_POOL].memtotal +
na->nm_mem->pools[NETMAP_RING_POOL].memtotal) -
netmap_ring_offset(na->nm_mem, ring);
- ring->cur = kring->nr_hwcur;
- ring->avail = kring->nr_hwavail;
+ /* copy values from kring */
+ ring->head = kring->rhead;
+ ring->cur = kring->rcur;
+ ring->tail = kring->rtail;
*(int *)(uintptr_t)&ring->nr_buf_size =
NETMAP_BDG_BUF_SIZE(na->nm_mem);
ND("initializing slots for rxring[%d]", i);
diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h
index f492f9814b79..8e6c58cbc4ee 100644
--- a/sys/dev/netmap/netmap_mem2.h
+++ b/sys/dev/netmap/netmap_mem2.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
+ * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c
index 32d6422de120..f988b84e78b2 100644
--- a/sys/dev/netmap/netmap_vale.c
+++ b/sys/dev/netmap/netmap_vale.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Universita` di Pisa. All rights reserved.
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -251,44 +251,6 @@ struct nm_bridge nm_bridges[NM_BRIDGES];
/*
- * A few function to tell which kind of port are we using.
- * XXX should we hold a lock ?
- *
- * nma_is_vp() virtual port
- * nma_is_host() port connected to the host stack
- * nma_is_hw() port connected to a NIC
- * nma_is_generic() generic netmap adapter XXX stop this madness
- */
-static __inline int
-nma_is_vp(struct netmap_adapter *na)
-{
- return na->nm_register == bdg_netmap_reg;
-}
-
-
-static __inline int
-nma_is_host(struct netmap_adapter *na)
-{
- return na->nm_register == NULL;
-}
-
-
-static __inline int
-nma_is_hw(struct netmap_adapter *na)
-{
- /* In case of sw adapter, nm_register is NULL */
- return !nma_is_vp(na) && !nma_is_host(na) && !nma_is_generic(na);
-}
-
-static __inline int
-nma_is_bwrap(struct netmap_adapter *na)
-{
- return na->nm_register == netmap_bwrap_register;
-}
-
-
-
-/*
* this is a slightly optimized copy routine which rounds
* to multiple of 64 bytes and is often faster than dealing
* with other odd sizes. We assume there is enough room
@@ -318,7 +280,6 @@ pkt_copy(void *_src, void *_dst, int l)
}
-
/*
* locate a bridge among the existing ones.
* MUST BE CALLED WITH NMG_LOCK()
@@ -393,8 +354,8 @@ nm_free_bdgfwd(struct netmap_adapter *na)
struct netmap_kring *kring;
NMG_LOCK_ASSERT();
- nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
- kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+ nrings = na->num_tx_rings;
+ kring = na->tx_rings;
for (i = 0; i < nrings; i++) {
if (kring[i].nkr_ft) {
free(kring[i].nkr_ft, M_DEVBUF);
@@ -502,6 +463,7 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
}
}
+
static void
netmap_adapter_vp_dtor(struct netmap_adapter *na)
{
@@ -520,6 +482,16 @@ netmap_adapter_vp_dtor(struct netmap_adapter *na)
na->ifp = NULL;
}
+
+/* Try to get a reference to a netmap adapter attached to a VALE switch.
+ * If the adapter is found (or is created), this function returns 0, a
+ * non NULL pointer is returned into *na, and the caller holds a
+ * reference to the adapter.
+ * If an adapter is not found, then no reference is grabbed and the
+ * function returns an error code, or 0 if there is just a VALE prefix
+ * mismatch. Therefore the caller holds a reference when
+ * (*na != NULL && return == 0).
+ */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
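
A hedged sketch of the caller pattern implied by the comment above (the function and its flow are illustrative only): the reference is held, and must later be dropped, only when the call returned 0 and *na is non-NULL.

/* Sketch: how a caller of netmap_get_bdg_na() handles the three outcomes. */
static int
sketch_bdg_caller(struct nmreq *nmr)
{
	struct netmap_adapter *na = NULL;
	int error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);

	if (error)
		return error;		/* lookup failed, no reference held */
	if (na == NULL)
		return EINVAL;		/* not a VALE name, no reference held */
	/* ... use the adapter ... */
	netmap_adapter_put(na);		/* drop the reference when done */
	return 0;
}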
@@ -688,18 +660,12 @@ nm_bdg_attach(struct nmreq *nmr)
return ENOMEM;
NMG_LOCK();
/* XXX probably netmap_get_bdg_na() */
- error = netmap_get_na(nmr, &na, 1 /* create if not exists */);
+ error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
if (error) /* no device, or another bridge or user owns the device */
goto unlock_exit;
- /* netmap_get_na() sets na_bdg if this is a physical interface
- * that we can attach to a switch.
- */
- if (!nma_is_bwrap(na)) {
- /* got reference to a virtual port or direct access to a NIC.
- * perhaps specified no bridge prefix or wrong NIC name
- */
+ if (na == NULL) { /* VALE prefix missing */
error = EINVAL;
- goto unref_exit;
+ goto unlock_exit;
}
if (na->active_fds > 0) { /* already registered */
@@ -727,6 +693,7 @@ unlock_exit:
return error;
}
+
static int
nm_bdg_detach(struct nmreq *nmr)
{
@@ -736,17 +703,15 @@ nm_bdg_detach(struct nmreq *nmr)
int last_instance;
NMG_LOCK();
- error = netmap_get_na(nmr, &na, 0 /* don't create */);
+ error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
if (error) { /* no device, or another bridge or user owns the device */
goto unlock_exit;
}
- if (!nma_is_bwrap(na)) {
- /* got reference to a virtual port or direct access to a NIC.
- * perhaps specified no bridge's prefix or wrong NIC's name
- */
+ if (na == NULL) { /* VALE prefix missing */
error = EINVAL;
- goto unref_exit;
+ goto unlock_exit;
}
+
bna = (struct netmap_bwrap_adapter *)na;
if (na->active_fds == 0) { /* not registered */
@@ -890,12 +855,13 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
case NETMAP_BDG_OFFSET:
NMG_LOCK();
error = netmap_get_bdg_na(nmr, &na, 0);
- if (!error) {
+ if (na && !error) {
vpna = (struct netmap_vp_adapter *)na;
if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
vpna->offset = nmr->nr_arg1;
D("Using offset %d for %p", vpna->offset, vpna);
+ netmap_adapter_put(na);
}
NMG_UNLOCK();
break;
@@ -947,6 +913,7 @@ netmap_vp_krings_create(struct netmap_adapter *na)
return 0;
}
+
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
@@ -1027,10 +994,6 @@ nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
}
-/*
- *---- support for virtual bridge -----
- */
-
/* ----- FreeBSD if_bridge hash function ------- */
/*
@@ -1052,6 +1015,7 @@ do { \
c -= a; c -= b; c ^= (b >> 15); \
} while (/*CONSTCOND*/0)
+
static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
@@ -1144,6 +1108,77 @@ netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
/*
+ * Available space in the ring. Only used in VALE code
+ * and only with is_rx = 1
+ */
+static inline uint32_t
+nm_kr_space(struct netmap_kring *k, int is_rx)
+{
+ int space;
+
+ if (is_rx) {
+ int busy = k->nkr_hwlease - k->nr_hwcur;
+ if (busy < 0)
+ busy += k->nkr_num_slots;
+ space = k->nkr_num_slots - 1 - busy;
+ } else {
+ /* XXX never used in this branch */
+ space = k->nr_hwtail - k->nkr_hwlease;
+ if (space < 0)
+ space += k->nkr_num_slots;
+ }
+#if 0
+ // sanity check
+ if (k->nkr_hwlease >= k->nkr_num_slots ||
+ k->nr_hwcur >= k->nkr_num_slots ||
+ k->nr_tail >= k->nkr_num_slots ||
+ busy < 0 ||
+ busy >= k->nkr_num_slots) {
+ D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
+ k->nkr_lease_idx, k->nkr_num_slots);
+ }
+#endif
+ return space;
+}
+
+
+
+
+/* make a lease on the kring for N positions. return the
+ * lease index
+ * XXX only used in VALE code and with is_rx = 1
+ */
+static inline uint32_t
+nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
+{
+ uint32_t lim = k->nkr_num_slots - 1;
+ uint32_t lease_idx = k->nkr_lease_idx;
+
+ k->nkr_leases[lease_idx] = NR_NOSLOT;
+ k->nkr_lease_idx = nm_next(lease_idx, lim);
+
+ if (n > nm_kr_space(k, is_rx)) {
+ D("invalid request for %d slots", n);
+ panic("x");
+ }
+ /* XXX verify that there are n slots */
+ k->nkr_hwlease += n;
+ if (k->nkr_hwlease > lim)
+ k->nkr_hwlease -= lim + 1;
+
+ if (k->nkr_hwlease >= k->nkr_num_slots ||
+ k->nr_hwcur >= k->nkr_num_slots ||
+ k->nr_hwtail >= k->nkr_num_slots ||
+ k->nkr_lease_idx >= k->nkr_num_slots) {
+ D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
+ k->na->ifp->if_xname,
+ k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
+ k->nkr_lease_idx, k->nkr_num_slots);
+ }
+ return lease_idx;
+}
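+
As an aid to reading the flush routine below, a rough sketch of how a sender uses a lease; the locking and the completion/retry logic of the real code are omitted and the function is illustrative only:

/* Sketch: lease 'howmany' slots on a destination rx kring, fill them,
 * then report completion through the lease slot.  nr_hwtail only
 * advances once all earlier leases have completed as well. */
static void
sketch_lease_use(struct netmap_kring *kring, u_int howmany)
{
	u_int lim = kring->nkr_num_slots - 1;
	u_int j = kring->nkr_hwlease;			/* first slot we will own */
	uint32_t lease_idx = nm_kr_lease(kring, howmany, 1 /* rx */);

	while (howmany-- > 0) {
		/* ... copy one packet into kring->ring->slot[j] ... */
		j = nm_next(j, lim);
	}
	kring->nkr_leases[lease_idx] = j;		/* report "done up to j" */
}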
+
+/*
* This flush routine supports only unicast and broadcast but a large
* number of ports, and lets us replace the learn and dispatch functions.
*/
@@ -1357,28 +1392,30 @@ retry:
dst = BDG_NMB(&dst_na->up, slot);
if (unlikely(fix_mismatch)) {
- if (na->offset > dst_na->offset) {
- src += na->offset - dst_na->offset;
- copy_len -= na->offset - dst_na->offset;
- dst_len = copy_len;
- } else {
- bzero(dst, dst_na->offset - na->offset);
- dst_len += dst_na->offset - na->offset;
- dst += dst_na->offset - na->offset;
- }
- /* fix the first fragment only */
- fix_mismatch = 0;
- /* completely skip an header only fragment */
- if (copy_len == 0) {
- ft_p++;
- continue;
- }
+ /* We are processing the first fragment
+ * and there is a mismatch between source
+ * and destination offsets. Create a zeroed
+ * header for the destination, independently
+ * of the source header length and content.
+ */
+ src += na->offset;
+ copy_len -= na->offset;
+ bzero(dst, dst_na->offset);
+ dst += dst_na->offset;
+ dst_len = dst_na->offset + copy_len;
+ /* fix the first fragment only */
+ fix_mismatch = 0;
+ /* Here it could be copy_len == dst_len == 0,
+ * and so a zero length fragment is passed.
+ */
}
+
+ ND("send [%d] %d(%d) bytes at %s:%d",
+ i, (int)copy_len, (int)dst_len,
+ NM_IFPNAME(dst_ifp), j);
/* round to a multiple of 64 */
copy_len = (copy_len + 63) & ~63;
- ND("send %d %d bytes at %s:%d",
- i, ft_p->ft_len, NM_IFPNAME(dst_ifp), j);
if (ft_p->ft_flags & NS_INDIRECT) {
if (copyin(src, dst, copy_len)) {
// invalid user pointer, pretend len is 0
@@ -1426,7 +1463,7 @@ retry:
}
p[lease_idx] = j; /* report I am done */
- update_pos = nm_kr_rxpos(kring);
+ update_pos = kring->nr_hwtail;
if (my_start == update_pos) {
/* all slots before my_start have been reported,
@@ -1443,15 +1480,7 @@ retry:
* means there are new buffers to report
*/
if (likely(j != my_start)) {
- uint32_t old_avail = kring->nr_hwavail;
-
- kring->nr_hwavail = (j >= kring->nr_hwcur) ?
- j - kring->nr_hwcur :
- j + lim + 1 - kring->nr_hwcur;
- if (kring->nr_hwavail < old_avail) {
- D("avail shrink %d -> %d",
- old_avail, kring->nr_hwavail);
- }
+ kring->nr_hwtail = j;
dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
still_locked = 0;
mtx_unlock(&kring->q_lock);
@@ -1471,35 +1500,32 @@ cleanup:
return 0;
}
+
static int
netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
{
struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
- struct netmap_ring *ring = kring->ring;
- u_int j, k, lim = kring->nkr_num_slots - 1;
-
- k = ring->cur;
- if (k > lim)
- return netmap_ring_reinit(kring);
+ u_int done;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const cur = kring->rcur;
if (bridge_batch <= 0) { /* testing only */
- j = k; // used all
+ done = cur; // used all
goto done;
}
if (bridge_batch > NM_BDG_BATCH)
bridge_batch = NM_BDG_BATCH;
- j = nm_bdg_preflush(na, ring_nr, kring, k);
- if (j != k)
- D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
- /* k-j modulo ring size is the number of slots processed */
- if (k < j)
- k += kring->nkr_num_slots;
- kring->nr_hwavail = lim - (k - j);
-
+ done = nm_bdg_preflush(na, ring_nr, kring, cur);
done:
- kring->nr_hwcur = j;
- ring->avail = kring->nr_hwavail;
+ if (done != cur)
+ D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
+ /*
+ * packets between 'done' and 'cur' are left unsent.
+ */
+ kring->nr_hwcur = done;
+ kring->nr_hwtail = nm_prev(done, lim);
+ nm_txsync_finalize(kring);
if (netmap_verbose)
D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
return 0;
@@ -1518,46 +1544,30 @@ bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
return netmap_vp_txsync(vpna, ring_nr, flags);
}
-
-/*
- * user process reading from a VALE switch.
- * Already protected against concurrent calls from userspace,
- * but we must acquire the queue's lock to protect against
- * writers on the same queue.
- */
static int
-bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
- u_int j, lim = kring->nkr_num_slots - 1;
- u_int k = ring->cur, resvd = ring->reserved;
+ u_int nm_i, lim = kring->nkr_num_slots - 1;
+ u_int head = nm_rxsync_prologue(kring);
int n;
- mtx_lock(&kring->q_lock);
- if (k > lim) {
+ if (head > lim) {
D("ouch dangerous reset!!!");
n = netmap_ring_reinit(kring);
goto done;
}
- /* skip past packets that userspace has released */
- j = kring->nr_hwcur; /* netmap ring index */
- if (resvd > 0) {
- if (resvd + ring->avail >= lim + 1) {
- D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
- ring->reserved = resvd = 0; // XXX panic...
- }
- k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
- }
+ /* First part, import newly received packets. */
+ /* actually nothing to do here, they are already in the kring */
- if (j != k) { /* userspace has released some packets. */
- n = k - j;
- if (n < 0)
- n += kring->nkr_num_slots;
- ND("userspace releases %d packets", n);
- for (n = 0; likely(j != k); n++) {
- struct netmap_slot *slot = &ring->slot[j];
+ /* Second part, skip past packets that userspace has released. */
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) {
+ /* consistency check, but nothing really important here */
+ for (n = 0; likely(nm_i != head); n++) {
+ struct netmap_slot *slot = &ring->slot[nm_i];
void *addr = BDG_NMB(na, slot);
if (addr == netmap_buffer_base) { /* bad buf */
@@ -1565,19 +1575,37 @@ bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
slot->buf_idx);
}
slot->flags &= ~NS_BUF_CHANGED;
- j = nm_next(j, lim);
+ nm_i = nm_next(nm_i, lim);
}
- kring->nr_hwavail -= n;
- kring->nr_hwcur = k;
+ kring->nr_hwcur = head;
}
+
/* tell userspace that there are new packets */
- ring->avail = kring->nr_hwavail - resvd;
+ nm_rxsync_finalize(kring);
n = 0;
done:
+ return n;
+}
+
+/*
+ * user process reading from a VALE switch.
+ * Already protected against concurrent calls from userspace,
+ * but we must acquire the queue's lock to protect against
+ * writers on the same queue.
+ */
+static int
+bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ int n;
+
+ mtx_lock(&kring->q_lock);
+ n = netmap_vp_rxsync(na, ring_nr, flags);
mtx_unlock(&kring->q_lock);
return n;
}
+
static int
bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
{
@@ -1627,6 +1655,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
return 0;
}
+
static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
@@ -1652,16 +1681,22 @@ netmap_bwrap_dtor(struct netmap_adapter *na)
}
+
/*
- * Pass packets from nic to the bridge.
+ * Intr callback for NICs connected to a bridge.
+ * Simply ignore tx interrupts (maybe we could try to recover space ?)
+ * and pass received packets from nic to the bridge.
+ *
* XXX TODO check locking: this is called from the interrupt
* handler so we should make sure that the interface is not
* disconnected while passing down an interrupt.
*
- * Note, no user process can access this NIC so we can ignore
- * the info in the 'ring'.
- */
-/* callback that overwrites the hwna notify callback.
+ * Note, no user process can access this NIC or the host stack.
+ * The only part of the ring that is significant are the slots,
+ * and head/cur/tail are set from the kring as needed
+ * (part as a receive ring, part as a transmit ring).
+ *
+ * callback that overwrites the hwna notify callback.
* Packets come from the outside or from the host stack and are put on an hwna rx ring.
* The bridge wrapper then sends the packets through the bridge.
*/
@@ -1677,21 +1712,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
struct netmap_vp_adapter *vpna = &bna->up;
int error = 0;
- ND("%s[%d] %s %x", NM_IFPNAME(ifp), ring_nr, (tx == NR_TX ? "TX" : "RX"), flags);
+ if (netmap_verbose)
+ D("%s %s%d 0x%x", NM_IFPNAME(ifp),
+ (tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
if (flags & NAF_DISABLE_NOTIFY) {
kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
- if (kring->nkr_stopped)
- netmap_disable_ring(bkring);
+ if (kring[ring_nr].nkr_stopped)
+ netmap_disable_ring(&bkring[ring_nr]);
else
- bkring->nkr_stopped = 0;
+ bkring[ring_nr].nkr_stopped = 0;
return 0;
}
if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
return 0;
+ /* we only care about receive interrupts */
if (tx == NR_TX)
return 0;
@@ -1707,7 +1745,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
goto put_out;
}
+ /* Here we expect ring->head = ring->cur = ring->tail
+ * because everything has been released from the previous round.
+ * However the ring is shared and we might have info from
+ * the wrong side (the tx ring). Hence we overwrite with
+ * the info from the rx kring.
+ */
+ if (netmap_verbose)
+ D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
+ ring->head, ring->cur, ring->tail,
+ kring->rhead, kring->rcur, kring->rtail);
+
+ ring->head = kring->rhead;
+ ring->cur = kring->rcur;
+ ring->tail = kring->rtail;
+
+ /* simulate a user wakeup on the rx ring */
if (is_host_ring) {
+ netmap_rxsync_from_host(na, NULL, NULL);
vpna = hostna;
ring_nr = 0;
} else {
@@ -1718,23 +1773,46 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
if (error)
goto put_out;
}
- if (kring->nr_hwavail == 0 && netmap_verbose) {
+ if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
D("how strange, interrupt with no packets on %s",
NM_IFPNAME(ifp));
goto put_out;
}
- /* XXX avail ? */
- ring->cur = nm_kr_rxpos(kring);
+
+ /* new packets are ring->cur to ring->tail, and the bkring
+ * had hwcur == ring->cur. So advance ring->cur to ring->tail
+ * to push all packets out.
+ */
+ ring->head = ring->cur = ring->tail;
+
+ /* also set tail to what the bwrap expects */
+ bkring = &vpna->up.tx_rings[ring_nr];
+ ring->tail = bkring->nr_hwtail; // rtail too ?
+
+ /* pass packets to the switch */
+ nm_txsync_prologue(bkring); // XXX error checking ?
netmap_vp_txsync(vpna, ring_nr, flags);
- if (!is_host_ring)
+ /* mark all buffers as released on this ring */
+ ring->head = ring->cur = kring->nr_hwtail;
+ ring->tail = kring->rtail;
+ /* another call to actually release the buffers */
+ if (!is_host_ring) {
error = na->nm_rxsync(na, ring_nr, 0);
+ } else {
+ /* mark all packets as released, as in the
+ * second part of netmap_rxsync_from_host()
+ */
+ kring->nr_hwcur = kring->nr_hwtail;
+ nm_rxsync_finalize(kring);
+ }
put_out:
nm_kr_put(kring);
return error;
}
+
static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
@@ -1744,7 +1822,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
struct netmap_vp_adapter *hostna = &bna->host;
int error;
- ND("%s %d", NM_IFPNAME(ifp), onoff);
+ ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
if (onoff) {
int i;
@@ -1788,6 +1866,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
return 0;
}
+
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
u_int *rxr, u_int *rxd)
@@ -1807,6 +1886,7 @@ netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
return 0;
}
+
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
@@ -1834,6 +1914,7 @@ netmap_bwrap_krings_create(struct netmap_adapter *na)
return 0;
}
+
static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
@@ -1847,6 +1928,7 @@ netmap_bwrap_krings_delete(struct netmap_adapter *na)
netmap_vp_krings_delete(na);
}
+
/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
@@ -1856,7 +1938,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
struct netmap_adapter *hwna = bna->hwna;
struct netmap_kring *kring, *hw_kring;
struct netmap_ring *ring;
- u_int lim, k;
+ u_int lim;
int error = 0;
if (tx == NR_TX)
@@ -1865,35 +1947,49 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
kring = &na->rx_rings[ring_n];
hw_kring = &hwna->tx_rings[ring_n];
ring = kring->ring;
-
lim = kring->nkr_num_slots - 1;
- k = nm_kr_rxpos(kring);
if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
return 0;
- ring->cur = k;
- ND("%s[%d] PRE rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)",
+ /* first step: simulate a user wakeup on the rx ring */
+ netmap_vp_rxsync(na, ring_n, flags);
+ ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
NM_IFPNAME(na->ifp), ring_n,
- kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved,
- ring->cur, ring->avail, ring->reserved,
- hw_kring->nr_hwcur, hw_kring->nr_hwavail);
+ kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
+ ring->head, ring->cur, ring->tail,
+		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
+ /* second step: the simulated user consumes all new packets */
+ ring->head = ring->cur = ring->tail;
+
+ /* third step: the new packets are sent on the tx ring
+ * (which is actually the same ring)
+ */
+ /* set tail to what the hw expects */
+ ring->tail = hw_kring->rtail;
if (ring_n == na->num_rx_rings) {
netmap_txsync_to_host(hwna);
} else {
+ nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
error = hwna->nm_txsync(hwna, ring_n, flags);
}
- kring->nr_hwcur = ring->cur;
- kring->nr_hwavail = 0;
- kring->nr_hwreserved = lim - ring->avail;
- ND("%s[%d] PST rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)",
+
+	/* fourth step: now we are back on the rx ring */
+ /* claim ownership on all hw owned bufs */
+ ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
+ ring->tail = kring->rtail; /* restore saved value of tail, for safety */
+
+ /* fifth step: the user goes to sleep again, causing another rxsync */
+ netmap_vp_rxsync(na, ring_n, flags);
+ ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
NM_IFPNAME(na->ifp), ring_n,
- kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved,
- ring->cur, ring->avail, ring->reserved,
- hw_kring->nr_hwcur, hw_kring->nr_hwavail);
+ kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
+ ring->head, ring->cur, ring->tail,
+ hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
return error;
}
+
static int
netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
@@ -1904,6 +2000,7 @@ netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx,
return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
}
+
/* attach a bridge wrapper to the 'real' device */
static int
netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
@@ -1957,7 +2054,8 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
hostna->nm_mem = na->nm_mem;
hostna->na_private = bna;
- D("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname,
+ ND("%s<->%s txr %d txd %d rxr %d rxd %d",
+ fake->if_xname, real->if_xname,
na->num_tx_rings, na->num_tx_desc,
na->num_rx_rings, na->num_rx_desc);
@@ -1970,6 +2068,7 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
return 0;
}
+
void
netmap_init_bridges(void)
{
diff --git a/sys/net/netmap.h b/sys/net/netmap.h
index 50e230934dd0..a5ee9b55edc9 100644
--- a/sys/net/netmap.h
+++ b/sys/net/netmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -39,6 +39,16 @@
#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_
+#define NETMAP_API 10 /* current API version */
+
+/*
+ * Some fields should be cache-aligned to reduce contention.
+ * The alignment is architecture and OS dependent, but rather than
+ * digging into OS headers to find the exact value we use an estimate
+ * that should cover most architectures.
+ */
+#define NM_CACHE_ALIGN 128
+
/*
* --- Netmap data structures ---
*
@@ -52,23 +62,23 @@
====================================================================
|
USERSPACE | struct netmap_ring
- +---->+--------------+
- / | cur |
- struct netmap_if (nifp, 1 per fd) / | avail |
- +---------------+ / | buf_ofs |
- | ni_tx_rings | / +==============+
- | ni_rx_rings | / | buf_idx, len | slot[0]
- | | / | flags, ptr |
- | | / +--------------+
- +===============+ / | buf_idx, len | slot[1]
- | txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
- | txring_ofs[1] | +--------------+
- (ni_tx_rings+1 entries) (num_slots entries)
- | txring_ofs[t] | | buf_idx, len | slot[n-1]
- +---------------+ | flags, ptr |
- | rxring_ofs[0] | +--------------+
+ +---->+---------------+
+ / | head,cur,tail |
+ struct netmap_if (nifp, 1 per fd) / | buf_ofs |
+ +---------------+ / | other fields |
+ | ni_tx_rings | / +===============+
+ | ni_rx_rings | / | buf_idx, len | slot[0]
+ | | / | flags, ptr |
+ | | / +---------------+
+ +===============+ / | buf_idx, len | slot[1]
+ | txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
+ | txring_ofs[1] | +---------------+
+ (tx+1+extra_tx entries) (num_slots entries)
+ | txring_ofs[t] | | buf_idx, len | slot[n-1]
+ +---------------+ | flags, ptr |
+ | rxring_ofs[0] | +---------------+
| rxring_ofs[1] |
- (ni_rx_rings+1 entries)
+ (rx+1+extra_rx entries)
| rxring_ofs[r] |
+---------------+
@@ -93,122 +103,115 @@
/*
* struct netmap_slot is a buffer descriptor
- *
- * buf_idx the index of the buffer associated to the slot.
- * len the length of the payload
- * flags control operation on the slot, as defined below
- *
- * NS_BUF_CHANGED must be set whenever userspace wants
- * to change buf_idx (it might be necessary to
- * reprogram the NIC)
- *
- * NS_REPORT must be set if we want the NIC to generate an interrupt
- * when this slot is used. Leaving it to 0 improves
- * performance.
- *
- * NS_FORWARD if set on a receive ring, and the device is in
- * transparent mode, buffers released with the flag set
- * will be forwarded to the 'other' side (host stack
- * or NIC, respectively) on the next select() or ioctl()
- *
- * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for
- * this packet.
- *
- * NS_INDIRECT (tx rings only) data is in a userspace buffer pointed
- * by the ptr field in the slot.
- *
- * NS_MOREFRAG Part of a multi-segment frame. The last (or only)
- * segment must not have this flag.
- * Only supported on VALE ports.
- *
- * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the
- * destination port for the VALE switch, overriding
- * the lookup table.
*/
-
struct netmap_slot {
uint32_t buf_idx; /* buffer index */
- uint16_t len; /* packet length */
+ uint16_t len; /* length for this slot */
uint16_t flags; /* buf changed, etc. */
+ uint64_t ptr; /* pointer for indirect buffers */
+};
+
+/*
+ * The following flags control how the slot is used
+ */
+
#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */
-#define NS_REPORT 0x0002 /* ask the hardware to report results
- * e.g. by generating an interrupt
- */
-#define NS_FORWARD 0x0004 /* pass packet to the other endpoint
- * (host stack or device)
- */
-#define NS_NO_LEARN 0x0008
-#define NS_INDIRECT 0x0010
-#define NS_MOREFRAG 0x0020
+ /*
+ * must be set whenever buf_idx is changed (as it might be
+ * necessary to recompute the physical address and mapping)
+ */
+
+#define NS_REPORT 0x0002 /* ask the hardware to report results */
+ /*
+ * Request notification when slot is used by the hardware.
+ * Normally transmit completions are handled lazily and
+ * may be unreported. This flag lets us know when a slot
+ * has been sent (e.g. to terminate the sender).
+ */
+
+#define NS_FORWARD 0x0004 /* pass packet 'forward' */
+ /*
+ * (Only for physical ports, rx rings with NR_FORWARD set).
+ * Slot released to the kernel (i.e. before ring->head) with
+ * this flag set are passed to the peer ring (host/NIC),
+ * thus restoring the host-NIC connection for these slots.
+ * This supports efficient traffic monitoring or firewalling.
+ */
+
+#define NS_NO_LEARN 0x0008 /* disable bridge learning */
+ /*
+ * On a VALE switch, do not 'learn' the source port for
+ * this buffer.
+ */
+
+#define NS_INDIRECT 0x0010 /* userspace buffer */
+ /*
+ * (VALE tx rings only) data is in a userspace buffer,
+ * whose address is in the 'ptr' field in the slot.
+ */
+
+#define NS_MOREFRAG 0x0020 /* packet has more fragments */
+ /*
+ * (VALE ports only)
+ * Set on all but the last slot of a multi-segment packet.
+ * The 'len' field refers to the individual fragment.
+ */
+
#define NS_PORT_SHIFT 8
#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
- /*
- * in rx rings, the high 8 bits
- * are the number of fragments.
- */
+ /*
+ * The high 8 bits of the flag, if not zero, indicate the
+ * destination port for the VALE switch, overriding
+ * the lookup table.
+ */
+
#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
- uint64_t ptr; /* pointer for indirect buffers */
-};
+ /*
+ * (VALE rx rings only) the high 8 bits
+ * are the number of fragments.
+ */
+
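For instance, a userspace sender on a VALE port could split one packet across two slots, setting NS_MOREFRAG on all but the last. A minimal sketch (NETMAP_BUF() and nm_ring_next() are the netmap_user.h helpers; error handling and <string.h> boilerplate omitted):

/* Sketch: queue a two-fragment packet on a VALE tx ring. */
static void
sketch_send_two_frags(struct netmap_ring *ring,
    const char *p1, uint16_t l1, const char *p2, uint16_t l2)
{
	uint32_t i = ring->cur;

	memcpy(NETMAP_BUF(ring, ring->slot[i].buf_idx), p1, l1);
	ring->slot[i].len = l1;
	ring->slot[i].flags = NS_MOREFRAG;	/* more fragments follow */
	i = nm_ring_next(ring, i);

	memcpy(NETMAP_BUF(ring, ring->slot[i].buf_idx), p2, l2);
	ring->slot[i].len = l2;
	ring->slot[i].flags = 0;		/* last fragment */

	ring->head = ring->cur = nm_ring_next(ring, i);
	/* a subsequent NIOCTXSYNC or poll(POLLOUT) pushes it out */
}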
/*
* struct netmap_ring
*
* Netmap representation of a TX or RX ring (also known as "queue").
* This is a queue implemented as a fixed-size circular array.
- * At the software level, two fields are important: avail and cur.
+ * At the software level the important fields are: head, cur, tail.
*
* In TX rings:
*
- * avail tells how many slots are available for transmission.
- * It is updated by the kernel in each netmap system call.
- * It MUST BE decremented by the user when it
- * adds a new packet to send.
+ * head first slot available for transmission.
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
*
- * cur indicates the slot to use for the next packet
- * to send (i.e. the "tail" of the queue).
- * It MUST BE incremented by the user before
- * netmap system calls to reflect the number of newly
- * sent packets.
- * It is checked by the kernel on netmap system calls
- * (normally unmodified by the kernel unless invalid).
+ * [head .. tail-1] can be used for new packets to send;
+ * 'head' and 'cur' must be incremented as slots are filled
+ * with new packets to be sent;
+ * 'cur' can be moved further ahead if we need more space
+ * for new transmissions.
*
* In RX rings:
*
- * avail is the number of packets available (possibly 0).
- * It is updated by the kernel in each netmap system call.
- * It MUST BE decremented by the user when it
- * consumes a packet.
- *
- * cur indicates the first slot that contains a packet not
- * yet processed (the "head" of the queue).
- * It MUST BE incremented by the user when it consumes
- * a packet.
- *
- * reserved indicates the number of buffers before 'cur'
- * that the user has not released yet. Normally 0,
- * it MUST BE incremented by the user when it
- * does not return the buffer immediately, and decremented
- * when the buffer is finally freed.
+ * head first valid received packet
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
*
+ * [head .. tail-1] contain received packets;
+ * 'head' and 'cur' must be incremented as slots are consumed
+ * and can be returned to the kernel;
+ * 'cur' can be moved further ahead if we want to wait for
+ * new packets without returning the previous ones.
*
* DATA OWNERSHIP/LOCKING:
- * The netmap_ring, all slots, and buffers in the range
- * [reserved-cur , cur+avail[ are owned by the user program,
- * and the kernel only touches them in the same thread context
- * during a system call.
- * Other buffers are reserved for use by the NIC's DMA engines.
- *
- * FLAGS
- * NR_TIMESTAMP updates the 'ts' field on each syscall. This is
- * a global timestamp for all packets.
- * NR_RX_TSTMP if set, the last 64 byte in each buffer will
- * contain a timestamp for the frame supplied by
- * the hardware (if supported)
- * NR_FORWARD if set, the NS_FORWARD flag in each slot of the
- * RX ring is checked, and if set the packet is
- * passed to the other side (host stack or device,
- * respectively). This permits bpf-like behaviour
- * or transparency for selected packets.
+ * The netmap_ring, and all slots and buffers in the range
+ * [head .. tail-1] are owned by the user program;
+ * the kernel only accesses them during a netmap system call
+ * and in the user thread context.
+ *
+ * Other slots and buffers are reserved for use by the kernel
*/
struct netmap_ring {
/*
@@ -216,19 +219,22 @@ struct netmap_ring {
* It contains the offset of the buffer region from this
* descriptor.
*/
- const ssize_t buf_ofs;
+ const int64_t buf_ofs;
const uint32_t num_slots; /* number of slots in the ring. */
- uint32_t avail; /* number of usable slots */
- uint32_t cur; /* 'current' r/w position */
- uint32_t reserved; /* not refilled before current */
+ const uint32_t nr_buf_size;
+ const uint16_t ringid;
+ const uint16_t dir; /* 0: tx, 1: rx */
- const uint16_t nr_buf_size;
- uint16_t flags;
-#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
-#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
-#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */
+ uint32_t head; /* (u) first user slot */
+ uint32_t cur; /* (u) wakeup point */
+ uint32_t tail; /* (k) first kernel slot */
- struct timeval ts; /* time of last *sync() */
+ uint32_t flags;
+
+ struct timeval ts; /* (k) time of last *sync() */
+
+ /* opaque room for a mutex or similar object */
+ uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN)));
/* the slots follow. This struct has variable size */
struct netmap_slot slot[0]; /* array of slots. */
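
To make the head/cur/tail contract concrete, a minimal userspace transmit batch might look like the sketch below (nm_ring_space(), nm_ring_next() and NETMAP_BUF() are the netmap_user.h helpers; <string.h> and <sys/ioctl.h> are needed; the payload arguments are hypothetical):

/* Sketch: fill slots starting at head, advance head and cur together,
 * then ask the kernel to push the new packets out. */
static void
sketch_tx_batch(int fd, struct netmap_ring *ring,
    const char *pkt, uint16_t len, u_int n)
{
	u_int i, avail = nm_ring_space(ring);	/* tail - cur, modulo ring size */

	if (n > avail)
		n = avail;
	for (i = 0; i < n; i++) {
		struct netmap_slot *slot = &ring->slot[ring->head];

		memcpy(NETMAP_BUF(ring, slot->buf_idx), pkt, len);
		slot->len = len;
		ring->head = ring->cur = nm_ring_next(ring, ring->head);
	}
	ioctl(fd, NIOCTXSYNC, NULL);		/* or wait for POLLOUT instead */
}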
@@ -236,6 +242,22 @@ struct netmap_ring {
/*
+ * RING FLAGS
+ */
+#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
+ /*
+ * updates the 'ts' field on each netmap syscall. This saves
+	 * a separate gettimeofday(), and is not much worse than
+ * software timestamps generated in the interrupt handler.
+ */
+
+#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
+ /*
+ * Enables the NS_FORWARD slot flag for the ring.
+ */
+
+
+/*
* Netmap representation of an interface and its queue(s).
* This is initialized by the kernel when binding a file
* descriptor to a port, and should be considered as readonly
@@ -252,81 +274,109 @@ struct netmap_if {
const uint32_t ni_flags; /* properties */
#define NI_PRIV_MEM 0x1 /* private memory region */
- const uint32_t ni_rx_rings; /* number of rx rings */
- const uint32_t ni_tx_rings; /* number of tx rings */
+ /*
+ * The number of packet rings available in netmap mode.
+ * Physical NICs can have different numbers of tx and rx rings.
+ * Physical NICs also have a 'host' ring pair.
+ * Additionally, clients can request additional ring pairs to
+ * be used for internal communication.
+ */
+ const uint32_t ni_tx_rings; /* number of HW tx rings */
+ const uint32_t ni_rx_rings; /* number of HW rx rings */
+
+ const uint32_t ni_extra_tx_rings;
+ const uint32_t ni_extra_rx_rings;
/*
* The following array contains the offset of each netmap ring
- * from this structure. The first ni_tx_rings+1 entries refer
- * to the tx rings, the next ni_rx_rings+1 refer to the rx rings
- * (the last entry in each block refers to the host stack rings).
+ * from this structure, in the following order:
+ * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
+	 * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
+ *
* The area is filled up by the kernel on NIOCREGIF,
* and then only read by userspace code.
*/
const ssize_t ring_ofs[0];
};
+
#ifndef NIOCREGIF
/*
* ioctl names and related fields
*
+ * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
+ * whose identity is set in NIOCREGIF through nr_ringid.
+ * These are non blocking and take no argument.
+ *
* NIOCGINFO takes a struct ifreq, the interface name is the input,
* the outputs are number of queues and number of descriptor
* for each queue (useful to set number of threads etc.).
* The info returned is only advisory and may change before
* the interface is bound to a file descriptor.
*
- * NIOCREGIF takes an interface name within a struct ifreq,
+ * NIOCREGIF takes an interface name within a struct nmreq,
* and activates netmap mode on the interface (if possible).
*
- * nr_name is the name of the interface
+ * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
+ * can pass it down to other NIC-related ioctls.
*
- * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings
- * indicate the configuration of the port on return.
+ * The actual argument (struct nmreq) has a number of options to request
+ * different functions.
*
- * On input, non-zero values for nr_tx_rings, nr_tx_slots and the
- * rx counterparts may be used to reconfigure the port according
- * to the requested values, but this is not guaranteed.
- * The actual values are returned on completion of the ioctl().
+ * nr_name (in)
+ * The name of the port (em0, valeXXX:YYY, etc.)
+ * limited to IFNAMSIZ for backward compatibility.
*
- * nr_ringid
- * indicates how rings should be bound to the file descriptors.
- * The default (0) means all physical rings of a NIC are bound.
- * NETMAP_HW_RING plus a ring number lets you bind just
- * a single ring pair.
- * NETMAP_SW_RING binds only the host tx/rx rings
- * NETMAP_NO_TX_POLL prevents select()/poll() from pushing
- * out packets on the tx ring unless POLLOUT is specified.
+ * nr_version (in/out)
+ * Must match NETMAP_API as used in the kernel, error otherwise.
+ * Always returns the desired value on output.
*
- * NETMAP_PRIV_MEM is a return value used to indicate that
- * this ring is in a private memory region hence buffer
- * swapping cannot be used
+ * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out)
+ * On input, non-zero values may be used to reconfigure the port
+ * according to the requested values, but this is not guaranteed.
+ * On output the actual values in use are reported.
*
- * nr_cmd is used to configure NICs attached to a VALE switch,
- * or to dump the configuration of a VALE switch.
+ * nr_ringid (in)
+ * Indicates how rings should be bound to the file descriptors.
+ * 0 (default) binds all physical rings
+ * NETMAP_HW_RING | ring number binds a single ring pair
+ * NETMAP_SW_RING binds only the host tx/rx rings
*
- * nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname
- * attaches the NIC to the switch, with nr_ringid specifying
- * which rings to use
+ * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
+ * packets on tx rings only if POLLOUT is set.
+ * The default is to push any pending packet.
*
- * nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname
- * disconnects a previously attached NIC
+ * NETMAP_PRIV_MEM is set on return for ports that use private
+ * memory regions and cannot use buffer swapping.
*
- * nr_cmd = NETMAP_BDG_LIST is used to list the configuration
- * of VALE switches, with additional arguments.
+ * nr_cmd (in) if non-zero indicates a special command:
+ * NETMAP_BDG_ATTACH and nr_name = vale*:ifname
+ * attaches the NIC to the switch; nr_ringid specifies
+ * which rings to use. Used by vale-ctl -a ...
+ * nr_arg1 = NETMAP_BDG_HOST also attaches the host port
+ * as in vale-ctl -h ...
*
- * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
- * whose identity is set in NIOCREGIF through nr_ringid
+ * NETMAP_BDG_DETACH and nr_name = vale*:ifname
+ * disconnects a previously attached NIC.
+ * Used by vale-ctl -d ...
+ *
+ * NETMAP_BDG_LIST
+ * list the configuration of VALE switches.
+ *
+ * NETMAP_BDG_OFFSET XXX ?
+ * Set the offset of data in packets. Used with VALE
+ * switches where the clients use the vhost header.
+ *
+ * nr_arg1, nr_arg2 (in/out) command specific
*
- * NETMAP_API is the API version.
*/
+
/*
* struct nmreq overlays a struct ifreq
*/
struct nmreq {
char nr_name[IFNAMSIZ];
uint32_t nr_version; /* API version */
-#define NETMAP_API 5 /* current version */
uint32_t nr_offset; /* nifp offset in the shared region */
uint32_t nr_memsize; /* size of the shared region */
uint32_t nr_tx_slots; /* slots in tx rings */
@@ -339,19 +389,23 @@ struct nmreq {
#define NETMAP_SW_RING 0x2000 /* process the sw ring */
#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */
#define NETMAP_RING_MASK 0xfff /* the ring number */
+
uint16_t nr_cmd;
#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
#define NETMAP_BDG_DETACH 2 /* detach the NIC */
#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */
#define NETMAP_BDG_LIST 4 /* get bridge's info */
#define NETMAP_BDG_OFFSET 5 /* set the port offset */
+
uint16_t nr_arg1;
#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */
#define NETMAP_BDG_MAX_OFFSET 12
+
uint16_t nr_arg2;
uint32_t spare2[3];
};
+
/*
* FreeBSD uses the size value embedded in the _IOWR to determine
* how much to copy in/out. So we need it to match the actual
@@ -360,9 +414,22 @@ struct nmreq {
*/
#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */
#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */
-#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */
#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */
#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */
#endif /* !NIOCREGIF */
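
As a concrete illustration of the NIOCREGIF flow documented above, a minimal open/register/mmap sequence could look like this sketch (error checking omitted; needs <fcntl.h>, <string.h>, <sys/ioctl.h>, <sys/mman.h>, <net/netmap.h> and <net/netmap_user.h>; the function name is illustrative):

/* Sketch: put a port in netmap mode and map the shared memory region. */
static struct netmap_if *
sketch_open_port(const char *name, int *fdp)
{
	struct nmreq req;
	void *mem;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&req, 0, sizeof(req));
	strncpy(req.nr_name, name, sizeof(req.nr_name));  /* "em0", "valeA:1", ... */
	req.nr_version = NETMAP_API;
	req.nr_ringid = 0;			/* bind all hardware ring pairs */
	ioctl(fd, NIOCREGIF, &req);
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	*fdp = fd;
	return NETMAP_IF(mem, req.nr_offset);
}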
+
+/*
+ * Helper functions for kernel and userspace
+ */
+
+/*
+ * check whether the ring is empty (no slots available between cur and tail).
+ */
+static inline int
+nm_ring_empty(struct netmap_ring *ring)
+{
+ return (ring->cur == ring->tail);
+}
+
#endif /* _NET_NETMAP_H_ */
diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h
index 3f2858304caf..bd6fe0db22ae 100644
--- a/sys/net/netmap_user.h
+++ b/sys/net/netmap_user.h
@@ -1,6 +1,5 @@
/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
- * Copyright (C) 2013 Universita` di Pisa
+ * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -28,8 +27,8 @@
/*
* $FreeBSD$
*
- * This header contains the macros used to manipulate netmap structures
- * and packets in userspace. See netmap(4) for more information.
+ * Functions and macros to manipulate netmap structures and packets
+ * in userspace. See netmap(4) for more information.
*
* The address of the struct netmap_if, say nifp, is computed from the
* value returned from ioctl(.., NIOCREG, ...) and the mmap region:
@@ -44,17 +43,20 @@
* we can access ring->nr_cur, ring->nr_avail, ring->nr_flags
*
* ring->slot[i] gives us the i-th slot (we can access
- * directly plen, flags, bufindex)
+ * directly len, flags, buf_idx)
*
* char *buf = NETMAP_BUF(ring, x) returns a pointer to
* the buffer numbered x
*
- * Since rings are circular, we have macros to compute the next index
- * i = NETMAP_RING_NEXT(ring, i);
+ * All ring indexes (head, cur, tail) should always move forward.
+ * To compute the next index in a circular ring you can use
+ * i = nm_ring_next(ring, i);
*
 * To ease porting apps from pcap to netmap we supply a few functions
- * that can be called to open, close and read from netmap in a way
- * similar to libpcap.
+ * that can be called to open, close, read and write on netmap in a way
+ * similar to libpcap. Note that the read/write functions depend on
+ * an ioctl()/select()/poll() being issued to refill rings or push
+ * packets out.
*
* In order to use these, include #define NETMAP_WITH_LIBS
* in the source file that invokes these functions.
@@ -65,12 +67,19 @@
#include <stdint.h>
#include <net/if.h> /* IFNAMSIZ */
+
+#ifndef likely
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif /* likely and unlikely */
+
#include <net/netmap.h>
+/* helper macro */
#define _NETMAP_OFFSET(type, ptr, offset) \
((type)(void *)((char *)(ptr) + (offset)))
-#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o)
+#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs)
#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
nifp, (nifp)->ring_ofs[index] )
@@ -85,18 +94,34 @@
( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
(ring)->nr_buf_size )
-#define NETMAP_RING_NEXT(r, i) \
- ((i)+1 == (r)->num_slots ? 0 : (i) + 1 )
-#define NETMAP_RING_FIRST_RESERVED(r) \
- ( (r)->cur < (r)->reserved ? \
- (r)->cur + (r)->num_slots - (r)->reserved : \
- (r)->cur - (r)->reserved )
+static inline uint32_t
+nm_ring_next(struct netmap_ring *r, uint32_t i)
+{
+ return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1);
+}
+
/*
- * Return 1 if the given tx ring is empty.
+ * Return 1 if we have pending transmissions in the tx ring.
+ * When everything is complete ring->cur = ring->tail + 1 (modulo ring size)
*/
-#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1)
+static inline int
+nm_tx_pending(struct netmap_ring *r)
+{
+ return nm_ring_next(r, r->tail) != r->cur;
+}
+
+
+static inline uint32_t
+nm_ring_space(struct netmap_ring *ring)
+{
+ int ret = ring->tail - ring->cur;
+ if (ret < 0)
+ ret += ring->num_slots;
+ return ret;
+}
+
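As a hedged illustration of the head/cur/tail convention these helpers support (it replaces the old avail counter; the same pattern is used by send_packets() and pcap_inject() later in this patch), a transmit loop might look like this, assuming the usual <net/netmap_user.h> and <sys/ioctl.h> includes and a hypothetical fill_frame() that writes a packet and returns its length:

    static void
    tx_burst(int fd, struct netmap_ring *ring, u_int count)
    {
            u_int n = nm_ring_space(ring);        /* free slots between cur and tail */
            u_int cur = ring->cur;

            if (count > n)
                    count = n;
            while (count-- > 0) {
                    struct netmap_slot *slot = &ring->slot[cur];
                    char *buf = NETMAP_BUF(ring, slot->buf_idx);

                    slot->len = fill_frame(buf);  /* hypothetical frame builder */
                    cur = nm_ring_next(ring, cur);
            }
            ring->head = ring->cur = cur;         /* publish the filled slots */
            ioctl(fd, NIOCTXSYNC, NULL);          /* ask the kernel to transmit them */
    }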
#ifdef NETMAP_WITH_LIBS
/*
@@ -113,7 +138,12 @@
#include <sys/ioctl.h>
#include <sys/errno.h> /* EINVAL */
#include <fcntl.h> /* O_RDWR */
-#include <malloc.h>
+#include <unistd.h> /* close() */
+#ifdef __FreeBSD__
+#include <stdlib.h>
+#else
+#include <malloc.h> /* on FreeBSD it is stdlib.h */
+#endif
struct nm_hdr_t { /* same as pcap_pkthdr */
struct timeval ts;
@@ -139,30 +169,73 @@ struct nm_desc_t {
#define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d))
#define NETMAP_FD(d) (P2NMD(d)->fd)
+
+/*
+ * this is a slightly optimized copy routine which rounds
+ * up to a multiple of 64 bytes and is often faster than dealing
+ * with other odd sizes. We assume there is enough room
+ * in the source and destination buffers.
+ *
+ * XXX copies in 64-byte chunks; buffers must not overlap.
+ */
+static inline void
+pkt_copy(const void *_src, void *_dst, int l)
+{
+ const uint64_t *src = _src;
+ uint64_t *dst = _dst;
+ if (unlikely(l >= 1024)) {
+ memcpy(dst, src, l);
+ return;
+ }
+ for (; likely(l > 0); l-=64) {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
+}
+
+
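A hedged usage note: callers pass the exact frame length, as nm_inject() does below; because the loop above copies whole 64-byte chunks it may write up to 63 bytes past that length, which is harmless inside a netmap buffer (ring->nr_buf_size, 2048 bytes by default):

    char *dst = NETMAP_BUF(ring, slot->buf_idx);
    pkt_copy(frame, dst, len);      /* len is the true frame length */
    slot->len = len;                /* the slot still records the exact size */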
/*
* The callback, invoked on each received packet. Same as libpcap
*/
typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d);
/*
- * The open routine accepts an ifname (netmap:foo or vale:foo) and
- * optionally a second (string) argument indicating the ring number
+ *--- the pcap-like API ---
+ *
+ * nm_open() opens a file descriptor, binds to a port and maps memory.
+ *
+ * ifname (netmap:foo or vale:foo) is the port name
+ * flags can be NETMAP_SW_RING or NETMAP_HW_RING etc.
+ * ring_no is used only if NETMAP_HW_RING is specified; it is interpreted
+ * as a string or integer indicating the ring number to open
+ * ring_flags is stored in all ring flags (e.g. for transparent mode)
 * If successful, nm_open() opens the fd and maps the memory.
*/
+
static struct nm_desc_t *nm_open(const char *ifname,
const char *ring_no, int flags, int ring_flags);
/*
- * nm_dispatch() is the same as pcap_dispatch()
- * nm_next() is the same as pcap_next()
+ * nm_close() closes and restores the port to its previous state
*/
-static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *);
-static u_char *nm_next(struct nm_desc_t *, struct nm_hdr_t *);
+
+static int nm_close(struct nm_desc_t *);
/*
- * unmap memory, close file descriptor and free the descriptor.
+ * nm_inject() is the same as pcap_inject()
+ * nm_dispatch() is the same as pcap_dispatch()
+ * nm_nextpkt() is the same as pcap_next()
*/
-static int nm_close(struct nm_desc_t *);
+
+static int nm_inject(struct nm_desc_t *, const void *, size_t);
+static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *);
+static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *);
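A minimal sketch of the pcap-like workflow under the assumptions above (the port name and packet budget are illustrative; as noted, a poll() on NETMAP_FD() is needed to refill the rx rings between calls):

    #define NETMAP_WITH_LIBS
    #include <net/netmap_user.h>
    #include <poll.h>

    static void
    count_pkt(u_char *arg, const struct nm_hdr_t *hdr, const u_char *buf)
    {
            (*(unsigned long *)arg)++;            /* just count received packets */
            (void)hdr; (void)buf;
    }

    int
    main(void)
    {
            unsigned long count = 0;
            struct nm_desc_t *d = nm_open("netmap:em0", "0", 0, 0);
            struct pollfd pfd = { .fd = NETMAP_FD(d), .events = POLLIN };

            while (count < 1000000) {
                    poll(&pfd, 1, 1000);          /* refill the rx rings */
                    nm_dispatch(d, -1, count_pkt, (u_char *)&count);
            }
            return nm_close(d);
    }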
/*
@@ -240,6 +313,12 @@ fail:
static int
nm_close(struct nm_desc_t *d)
{
+ /*
+ * ugly trick to avoid unused warnings
+ */
+ static void *__xxzt[] __attribute__ ((unused)) =
+ { nm_open, nm_inject, nm_dispatch, nm_nextpkt } ;
+
if (d == NULL || d->self != d)
return EINVAL;
if (d->mem)
@@ -253,9 +332,45 @@ nm_close(struct nm_desc_t *d)
/*
+ * Same prototype as pcap_inject(), only need to cast.
+ */
+static int
+nm_inject(struct nm_desc_t *d, const void *buf, size_t size)
+{
+ u_int c, n = d->last_ring - d->first_ring + 1;
+
+ if (0) fprintf(stderr, "%s rings %d %d %d\n", __FUNCTION__,
+ d->first_ring, d->cur_ring, d->last_ring);
+ for (c = 0; c < n ; c++) {
+ /* compute current ring to use */
+ struct netmap_ring *ring;
+ uint32_t i, idx;
+ uint32_t ri = d->cur_ring + c;
+
+ if (ri > d->last_ring)
+ ri = d->first_ring;
+ ring = NETMAP_TXRING(d->nifp, ri);
+ if (nm_ring_empty(ring)) {
+ if (0) fprintf(stderr, "%s ring %d cur %d tail %d\n",
+ __FUNCTION__,
+ ri, ring->cur, ring->tail);
+ continue;
+ }
+ i = ring->cur;
+ idx = ring->slot[i].buf_idx;
+ ring->slot[i].len = size;
+ pkt_copy(buf, NETMAP_BUF(ring, idx), size);
+ d->cur_ring = ri;
+ ring->head = ring->cur = nm_ring_next(ring, i);
+ return size;
+ }
+ return 0; /* fail */
+}
+
+
+/*
* Same prototype as pcap_dispatch(), only need to cast.
*/
-inline /* not really, but disable unused warnings */
static int
nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
{
@@ -276,7 +391,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
if (ri > d->last_ring)
ri = d->first_ring;
ring = NETMAP_RXRING(d->nifp, ri);
- for ( ; ring->avail > 0 && cnt != got; got++) {
+ for ( ; !nm_ring_empty(ring) && cnt != got; got++) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
@@ -285,24 +400,22 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
d->hdr.len = d->hdr.caplen = ring->slot[i].len;
d->hdr.ts = ring->ts;
cb(arg, &d->hdr, buf);
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
+ ring->head = ring->cur = nm_ring_next(ring, i);
}
}
d->cur_ring = ri;
return got;
}
-inline /* not really, but disable unused warnings */
static u_char *
-nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr)
+nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr)
{
int ri = d->cur_ring;
do {
/* compute current ring to use */
struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri);
- if (ring->avail > 0) {
+ if (!nm_ring_empty(ring)) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
@@ -310,8 +423,12 @@ nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr)
// prefetch(buf);
hdr->ts = ring->ts;
hdr->len = hdr->caplen = ring->slot[i].len;
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
+ ring->cur = nm_ring_next(ring, i);
+ /* we could postpone advancing head if we want
+ * to hold the buffer. This can be supported in
+ * the future.
+ */
+ ring->head = ring->cur;
d->cur_ring = ri;
return buf;
}
diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c
index 0aca44d448d6..6dc77e438273 100644
--- a/tools/tools/netmap/bridge.c
+++ b/tools/tools/netmap/bridge.c
@@ -1,5 +1,5 @@
/*
- * (C) 2011 Luigi Rizzo, Matteo Landi
+ * (C) 2011-2014 Luigi Rizzo, Matteo Landi
*
* BSD license
*
@@ -42,10 +42,12 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
msg, rxring->flags, txring->flags);
j = rxring->cur; /* RX */
k = txring->cur; /* TX */
- if (rxring->avail < limit)
- limit = rxring->avail;
- if (txring->avail < limit)
- limit = txring->avail;
+ m = nm_ring_space(rxring);
+ if (m < limit)
+ limit = m;
+ m = nm_ring_space(txring);
+ if (m < limit)
+ limit = m;
m = limit;
while (limit-- > 0) {
struct netmap_slot *rs = &rxring->slot[j];
@@ -81,13 +83,11 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
ts->flags |= NS_BUF_CHANGED;
rs->flags |= NS_BUF_CHANGED;
#endif /* NO_SWAP */
- j = NETMAP_RING_NEXT(rxring, j);
- k = NETMAP_RING_NEXT(txring, k);
+ j = nm_ring_next(rxring, j);
+ k = nm_ring_next(txring, k);
}
- rxring->avail -= m;
- txring->avail -= m;
- rxring->cur = j;
- txring->cur = k;
+ rxring->head = rxring->cur = j;
+ txring->head = txring->cur = k;
if (verbose && m > 0)
D("%s sent %d packets to %p", msg, m, txring);
@@ -107,11 +107,11 @@ move(struct my_ring *src, struct my_ring *dst, u_int limit)
rxring = NETMAP_RXRING(src->nifp, si);
txring = NETMAP_TXRING(dst->nifp, di);
ND("txring %p rxring %p", txring, rxring);
- if (rxring->avail == 0) {
+ if (nm_ring_empty(rxring)) {
si++;
continue;
}
- if (txring->avail == 0) {
+ if (nm_ring_empty(txring)) {
di++;
continue;
}
@@ -133,7 +133,7 @@ pkt_queued(struct my_ring *me, int tx)
for (i = me->begin; i < me->end; i++) {
struct netmap_ring *ring = tx ?
NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i);
- tot += ring->avail;
+ tot += nm_ring_space(ring);
}
if (0 && verbose && tot && !tx)
D("ring %s %s %s has %d avail at %d",
@@ -288,12 +288,12 @@ main(int argc, char **argv)
if (ret < 0)
continue;
if (pollfd[0].revents & POLLERR) {
- D("error on fd0, rxcur %d@%d",
- me[0].rx->avail, me[0].rx->cur);
+ D("error on fd0, rx [%d,%d)",
+ me[0].rx->cur, me[0].rx->tail);
}
if (pollfd[1].revents & POLLERR) {
- D("error on fd1, rxcur %d@%d",
- me[1].rx->avail, me[1].rx->cur);
+ D("error on fd1, rx [%d,%d)",
+ me[1].rx->cur, me[1].rx->tail);
}
if (pollfd[0].revents & POLLOUT) {
move(me + 1, me, burst);
diff --git a/tools/tools/netmap/nm_util.c b/tools/tools/netmap/nm_util.c
index 195b68776c3b..1268840cd868 100644
--- a/tools/tools/netmap/nm_util.c
+++ b/tools/tools/netmap/nm_util.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2012-2013 Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -232,7 +232,7 @@ pkt_queued(struct my_ring *me, int tx)
for (i = me->begin; i < me->end; i++) {
struct netmap_ring *ring = tx ?
NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i);
- tot += ring->avail;
+ tot += nm_ring_space(ring);
}
if (0 && verbose && tot && !tx)
D("ring %s %s %s has %d avail at %d",
@@ -242,3 +242,90 @@ pkt_queued(struct my_ring *me, int tx)
tot, NETMAP_TXRING(me->nifp, me->begin)->cur);
return tot;
}
+
+#if 0
+
+/*
+ *
+
+Helper routines for multiple readers from the same queue
+
+- all readers open the device in 'passive' mode (NETMAP_PRIV_RING set).
+ In this mode a thread that loses the race on a poll() just continues
+ without calling *xsync()
+
+- all readers share an extra 'ring' which contains the sync information.
+ In particular we have shared head+tail pointers that work
+ together with cur and available
+ ON RETURN FROM THE SYSCALL:
+ shadow->head = ring->cur
+ shadow->tail = ring->tail
+ shadow->link[i] = i for all slots // mark invalid
+
+ */
+
+struct nm_q_arg {
+ u_int want; /* Input */
+ u_int have; /* Output, 0 on error */
+ u_int head;
+ u_int tail;
+ struct netmap_ring *ring;
+};
+
+/*
+ * grab a number of slots from the queue.
+ */
+struct nm_q_arg
+my_grab(struct nm_q_arg q)
+{
+ const u_int ns = q.ring->num_slots;
+
+ for (;;) {
+
+ q.head = (volatile u_int)q.ring->head;
+ q.have = ns + q.head - (volatile u_int)q.ring->tail;
+ if (q.have >= ns)
+ q.have -= ns;
+ if (q.have == 0) /* no space */
+ break;
+ if (q.want < q.have)
+ q.have = q.want;
+ q.tail = q.head + q.have;
+ if (q.tail >= ns)
+ q.tail -= ns;
+ if (atomic_cmpset_int(&q.ring->head, q.head, q.tail))
+ break; /* success */
+ }
+ D("returns %d out of %d at %d,%d",
+ q.have, q.want, q.head, q.tail);
+ /* the last one can clear avail ? */
+ return q;
+}
+
+
+int
+my_release(struct nm_q_arg q)
+{
+ u_int head = q.head, tail = q.tail, i;
+ struct netmap_ring *r = q.ring;
+
+ /* link the block to the next one.
+ * there is no race here because the location is mine.
+ */
+ r->slot[head].ptr = tail; /* this is mine */
+ // memory barrier
+ if (r->head != head)
+ return 0; /* not my turn to release */
+ for (;;) {
+ // advance head
+ r->head = head = r->slot[head].ptr;
+ // barrier ?
+ if (head == r->slot[head].ptr)
+ break; // stop here
+ }
+ /* we have advanced from q.head to head (r->head might be
+ * further down).
+ */
+ // do an ioctl/poll to flush.
+ return 0;
+}
+#endif /* unused */
diff --git a/tools/tools/netmap/nm_util.h b/tools/tools/netmap/nm_util.h
index 0d64f131f289..d8f8f94fd162 100644
--- a/tools/tools/netmap/nm_util.h
+++ b/tools/tools/netmap/nm_util.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2012 Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -32,6 +32,9 @@
#ifndef _NM_UTIL_H
#define _NM_UTIL_H
+
+#define _GNU_SOURCE /* for CPU_SET() */
+
#include <errno.h>
#include <signal.h> /* signal */
#include <stdlib.h>
@@ -79,6 +82,9 @@ struct pcap_pkthdr;
#include <pthread.h> /* pthread_* */
#ifdef linux
+
+#define cpuset_t cpu_set_t
+
#define ifr_flagshigh ifr_flags
#define ifr_curcap ifr_flags
#define ifr_reqcap ifr_flags
diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c
index f30f57bf804a..dd87c4a1b00e 100644
--- a/tools/tools/netmap/pcap.c
+++ b/tools/tools/netmap/pcap.c
@@ -1,5 +1,5 @@
/*
- * (C) 2011-2012 Luigi Rizzo
+ * (C) 2011-2014 Luigi Rizzo
*
* BSD license
*
@@ -499,15 +499,14 @@ pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
/* scan all rings */
for (si = me->begin; si < me->end; si++) {
struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si);
- ND("ring has %d pkts", ring->avail);
- if (ring->avail == 0)
+ if (nm_ring_empty(ring))
continue;
pme->hdr.ts = ring->ts;
/*
* XXX a proper prefetch should be done as
* prefetch(i); callback(i-1); ...
*/
- while ((cnt == -1 || cnt != got) && ring->avail > 0) {
+ while ((cnt == -1 || cnt != got) && !nm_ring_empty(ring)) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
if (idx < 2) {
@@ -520,8 +519,7 @@ pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
pme->hdr.len = pme->hdr.caplen = ring->slot[i].len;
// D("call %p len %d", p, me->hdr.len);
callback(user, &pme->hdr, buf);
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
+ ring->head = ring->cur = nm_ring_next(ring, i);
got++;
}
}
@@ -540,8 +538,7 @@ pcap_inject(pcap_t *p, const void *buf, size_t size)
for (si = me->begin; si < me->end; si++) {
struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si);
- ND("ring has %d pkts", ring->avail);
- if (ring->avail == 0)
+ if (nm_ring_empty(ring))
continue;
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
@@ -553,9 +550,8 @@ pcap_inject(pcap_t *p, const void *buf, size_t size)
u_char *dst = (u_char *)NETMAP_BUF(ring, idx);
ring->slot[i].len = size;
pkt_copy(buf, dst, size);
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
- // if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL);
+ ring->head = ring->cur = nm_ring_next(ring, i);
+ // if (ring->cur == ring->tail) ioctl(me->fd, NIOCTXSYNC, NULL);
return size;
}
errno = ENOBUFS;
diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c
index a6e5eeb227f6..c1d084028d93 100644
--- a/tools/tools/netmap/pkt-gen.c
+++ b/tools/tools/netmap/pkt-gen.c
@@ -1,5 +1,6 @@
/*
- * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -52,7 +53,16 @@ int verbose = 0;
#define SKIP_PAYLOAD 1 /* do not check payload. */
+
+#define VIRT_HDR_1 10 /* length of a base vnet-hdr */
+#define VIRT_HDR_2 12 /* length of the extended vnet-hdr */
+#define VIRT_HDR_MAX VIRT_HDR_2
+struct virt_header {
+ uint8_t fields[VIRT_HDR_MAX];
+};
+
struct pkt {
+ struct virt_header vh;
struct ether_header eh;
struct ip ip;
struct udphdr udp;
@@ -109,6 +119,8 @@ struct glob_arg {
char *ifname;
char *nmr_config;
int dummy_send;
+ int virt_header; /* send also the virt_header */
+ int host_ring;
};
enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
@@ -146,7 +158,8 @@ extract_ip_range(struct ip_range *r)
char *ap, *pp;
struct in_addr a;
- D("extract IP range from %s", r->name);
+ if (verbose)
+ D("extract IP range from %s", r->name);
r->port0 = r->port1 = 0;
r->start = r->end = 0;
@@ -192,7 +205,8 @@ extract_ip_range(struct ip_range *r)
a.s_addr = htonl(r->end);
strncpy(buf1, inet_ntoa(a), sizeof(buf1));
a.s_addr = htonl(r->start);
- D("range is %s:%d to %s:%d",
+ if (1)
+ D("range is %s:%d to %s:%d",
inet_ntoa(a), r->port0, buf1, r->port1);
}
}
@@ -200,7 +214,8 @@ extract_ip_range(struct ip_range *r)
static void
extract_mac_range(struct mac_range *r)
{
- D("extract MAC range from %s", r->name);
+ if (verbose)
+ D("extract MAC range from %s", r->name);
bcopy(ether_aton(r->name), &r->start, 6);
bcopy(ether_aton(r->name), &r->end, 6);
#if 0
@@ -215,7 +230,8 @@ extract_mac_range(struct mac_range *r)
if (p)
targ->dst_mac_range = atoi(p+1);
#endif
- D("%s starts at %s", r->name, ether_ntoa(&r->start));
+ if (verbose)
+ D("%s starts at %s", r->name, ether_ntoa(&r->start));
}
static struct targ *targs;
@@ -281,7 +297,7 @@ system_ncpus(void)
* Missing numbers or zeroes stand for default values.
* As an additional convenience, if exactly one number
* is specified, then this is assigned to both #tx-slots and #rx-slots.
- * If there is no 4th number, then the 3rd is assigned to both #tx-rings
+ * If there is no 4th number, then the 3rd is assigned to both #tx-rings
* and #rx-rings.
*/
void parse_nmr_config(const char* conf, struct nmreq *nmr)
@@ -362,7 +378,7 @@ source_hwaddr(const char *ifname, char *buf)
static int
setaffinity(pthread_t me, int i)
{
-#ifdef __FreeBSD__
+#if 1 // def __FreeBSD__
cpuset_t cpumask;
if (i == -1)
@@ -373,7 +389,7 @@ setaffinity(pthread_t me, int i)
CPU_SET(i, &cpumask);
if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
- D("Unable to set affinity");
+ D("Unable to set affinity: %s", strerror(errno));
return 1;
}
#else
@@ -559,6 +575,8 @@ initialize_packet(struct targ *targ)
bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
eh->ether_type = htons(ETHERTYPE_IP);
+
+ bzero(&pkt->vh, sizeof(pkt->vh));
// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
}
@@ -570,18 +588,19 @@ initialize_packet(struct targ *targ)
* an interrupt when done.
*/
static int
-send_packets(struct netmap_ring *ring, struct pkt *pkt,
- struct glob_arg *g, u_int count, int options, u_int nfrags)
+send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
+ int size, struct glob_arg *g, u_int count, int options,
+ u_int nfrags)
{
- u_int sent, cur = ring->cur;
+ u_int n, sent, cur = ring->cur;
int fcnt;
- int size = g->pkt_size;
- if (ring->avail < count)
- count = ring->avail;
+ n = nm_ring_space(ring);
+ if (n < count)
+ count = n;
if (count < nfrags) {
D("truncating packet, no room for frags %d %d",
- count, nfrags);
+ count, nfrags);
}
#if 0
if (options & (OPT_COPY | OPT_PREFETCH) ) {
@@ -590,7 +609,7 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt,
char *p = NETMAP_BUF(ring, slot->buf_idx);
prefetch(p);
- cur = NETMAP_RING_NEXT(ring, cur);
+ cur = nm_ring_next(ring, cur);
}
cur = ring->cur;
}
@@ -602,13 +621,13 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt,
slot->flags = 0;
if (options & OPT_INDIRECT) {
slot->flags |= NS_INDIRECT;
- slot->ptr = (uint64_t)pkt;
+ slot->ptr = (uint64_t)frame;
} else if (options & OPT_COPY) {
- pkt_copy(pkt, p, size);
+ pkt_copy(frame, p, size);
if (fcnt == 1)
update_addresses(pkt, g);
} else if (options & OPT_MEMCPY) {
- memcpy(p, pkt, size);
+ memcpy(p, frame, size);
if (fcnt == 1)
update_addresses(pkt, g);
} else if (options & OPT_PREFETCH) {
@@ -625,10 +644,9 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt,
slot->flags &= ~NS_MOREFRAG;
slot->flags |= NS_REPORT;
}
- cur = NETMAP_RING_NEXT(ring, cur);
+ cur = nm_ring_next(ring, cur);
}
- ring->avail -= sent;
- ring->cur = cur;
+ ring->head = ring->cur = cur;
return (sent);
}
@@ -647,6 +665,12 @@ pinger_body(void *data)
struct pollfd fds[1];
struct netmap_if *nifp = targ->nifp;
int i, rx = 0, n = targ->g->npackets;
+ void *frame;
+ int size;
+
+ frame = &targ->pkt;
+ frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
+ size = targ->g->pkt_size + targ->g->virt_header;
fds[0].fd = targ->fd;
fds[0].events = (POLLIN);
@@ -660,36 +684,37 @@ pinger_body(void *data)
}
clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
+ now = last_print;
while (n == 0 || (int)sent < n) {
struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
struct netmap_slot *slot;
char *p;
- for (i = 0; i < 1; i++) {
+ for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */
slot = &ring->slot[ring->cur];
- slot->len = targ->g->pkt_size;
+ slot->len = size;
p = NETMAP_BUF(ring, slot->buf_idx);
- if (ring->avail == 0) {
+ if (nm_ring_empty(ring)) {
D("-- ouch, cannot send");
} else {
- pkt_copy(&targ->pkt, p, targ->g->pkt_size);
+ pkt_copy(frame, p, size);
clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
bcopy(&sent, p+42, sizeof(sent));
bcopy(&ts, p+46, sizeof(ts));
sent++;
- ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
- ring->avail--;
+ ring->head = ring->cur = nm_ring_next(ring, ring->cur);
}
}
/* should use a parameter to decide how often to send */
if (poll(fds, 1, 3000) <= 0) {
- D("poll error/timeout on queue %d", targ->me);
+ D("poll error/timeout on queue %d: %s", targ->me,
+ strerror(errno));
continue;
}
/* see what we got back */
for (i = targ->qfirst; i < targ->qlast; i++) {
ring = NETMAP_RXRING(nifp, i);
- while (ring->avail > 0) {
+ while (!nm_ring_empty(ring)) {
uint32_t seq;
slot = &ring->slot[ring->cur];
p = NETMAP_BUF(ring, slot->buf_idx);
@@ -709,8 +734,7 @@ pinger_body(void *data)
min = ts.tv_nsec;
count ++;
av += ts.tv_nsec;
- ring->avail--;
- ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
+ ring->head = ring->cur = nm_ring_next(ring, ring->cur);
rx++;
}
}
@@ -761,25 +785,25 @@ ponger_body(void *data)
ioctl(fds[0].fd, NIOCRXSYNC, NULL);
#else
if (poll(fds, 1, 1000) <= 0) {
- D("poll error/timeout on queue %d", targ->me);
+ D("poll error/timeout on queue %d: %s", targ->me,
+ strerror(errno));
continue;
}
#endif
txring = NETMAP_TXRING(nifp, 0);
txcur = txring->cur;
- txavail = txring->avail;
+ txavail = nm_ring_space(txring);
/* see what we got back */
for (i = targ->qfirst; i < targ->qlast; i++) {
rxring = NETMAP_RXRING(nifp, i);
- while (rxring->avail > 0) {
+ while (!nm_ring_empty(rxring)) {
uint16_t *spkt, *dpkt;
uint32_t cur = rxring->cur;
struct netmap_slot *slot = &rxring->slot[cur];
char *src, *dst;
src = NETMAP_BUF(rxring, slot->buf_idx);
//D("got pkt %p of size %d", src, slot->len);
- rxring->avail--;
- rxring->cur = NETMAP_RING_NEXT(rxring, cur);
+ rxring->head = rxring->cur = nm_ring_next(rxring, cur);
rx++;
if (txavail == 0)
continue;
@@ -797,13 +821,12 @@ ponger_body(void *data)
dpkt[5] = spkt[2];
txring->slot[txcur].len = slot->len;
/* XXX swap src dst mac */
- txcur = NETMAP_RING_NEXT(txring, txcur);
+ txcur = nm_ring_next(txring, txcur);
txavail--;
sent++;
}
}
- txring->cur = txcur;
- txring->avail = txavail;
+ txring->head = txring->cur = txcur;
targ->count = sent;
#ifdef BUSYWAIT
ioctl(fds[0].fd, NIOCTXSYNC, NULL);
@@ -847,43 +870,47 @@ timespec2val(const struct timespec *a)
}
-static int
-wait_time(struct timespec ts, struct timespec *wakeup_ts, long long *waited)
+static __inline struct timespec
+timespec_add(struct timespec a, struct timespec b)
{
- struct timespec curtime;
-
- curtime.tv_sec = 0;
- curtime.tv_nsec = 0;
-
- if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) {
- D("clock_gettime: %s", strerror(errno));
- return (-1);
- }
- while (timespec_ge(&ts, &curtime)) {
- if (waited != NULL)
- (*waited)++;
- if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) {
- D("clock_gettime");
- return (-1);
- }
+ struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
+ if (ret.tv_nsec >= 1000000000) {
+ ret.tv_sec++;
+ ret.tv_nsec -= 1000000000;
}
- if (wakeup_ts != NULL)
- *wakeup_ts = curtime;
- return (0);
+ return ret;
}
-static __inline void
-timespec_add(struct timespec *tsa, struct timespec *tsb)
+static __inline struct timespec
+timespec_sub(struct timespec a, struct timespec b)
{
- tsa->tv_sec += tsb->tv_sec;
- tsa->tv_nsec += tsb->tv_nsec;
- if (tsa->tv_nsec >= 1000000000) {
- tsa->tv_sec++;
- tsa->tv_nsec -= 1000000000;
+ struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
+ if (ret.tv_nsec < 0) {
+ ret.tv_sec--;
+ ret.tv_nsec += 1000000000;
}
+ return ret;
}
+/*
+ * wait until ts, either busy or sleeping if more than 1ms.
+ * Return wakeup time.
+ */
+static struct timespec
+wait_time(struct timespec ts)
+{
+ for (;;) {
+ struct timespec w, cur;
+ clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
+ w = timespec_sub(ts, cur);
+ if (w.tv_sec < 0)
+ return cur;
+ else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
+ poll(NULL, 0, 1);
+ }
+}
+
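These helpers drive the rate limiting in sender_body() below; a hedged sketch of the pacing pattern (g.burst and g.tx_period come from the command line options):

    struct timespec nexttime;

    clock_gettime(CLOCK_REALTIME_PRECISE, &nexttime);
    for (;;) {
            nexttime = timespec_add(nexttime, g.tx_period);
            wait_time(nexttime);    /* sleep if the deadline is >1 ms away, else busy wait */
            /* ... queue one burst of g.burst packets and sync the tx ring ... */
    }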
static void *
sender_body(void *data)
{
@@ -894,9 +921,15 @@ sender_body(void *data)
struct netmap_ring *txring;
int i, n = targ->g->npackets / targ->g->nthreads, sent = 0;
int options = targ->g->options | OPT_COPY;
- struct timespec tmptime, nexttime = { 0, 0}; // XXX silence compiler
+ struct timespec nexttime = { 0, 0}; // XXX silence compiler
int rate_limit = targ->g->tx_rate;
- long long waited = 0;
+ struct pkt *pkt = &targ->pkt;
+ void *frame;
+ int size;
+
+ frame = pkt;
+ frame += sizeof(pkt->vh) - targ->g->virt_header;
+ size = targ->g->pkt_size + targ->g->virt_header;
D("start");
if (setaffinity(targ->thread, targ->affinity))
@@ -909,23 +942,16 @@ sender_body(void *data)
/* main loop.*/
clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
if (rate_limit) {
- tmptime.tv_sec = 2;
- tmptime.tv_nsec = 0;
- timespec_add(&targ->tic, &tmptime);
+ targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
targ->tic.tv_nsec = 0;
- if (wait_time(targ->tic, NULL, NULL) == -1) {
- D("wait_time: %s", strerror(errno));
- goto quit;
- }
+ wait_time(targ->tic);
nexttime = targ->tic;
}
if (targ->g->dev_type == DEV_PCAP) {
- int size = targ->g->pkt_size;
- void *pkt = &targ->pkt;
pcap_t *p = targ->g->p;
for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
- if (pcap_inject(p, pkt, size) != -1)
+ if (pcap_inject(p, frame, size) != -1)
sent++;
update_addresses(pkt, targ->g);
if (i > 10000) {
@@ -934,12 +960,10 @@ sender_body(void *data)
}
}
} else if (targ->g->dev_type == DEV_TAP) { /* tap */
- int size = targ->g->pkt_size;
- void *pkt = &targ->pkt;
D("writing to file desc %d", targ->g->main_fd);
for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
- if (write(targ->g->main_fd, pkt, size) != -1)
+ if (write(targ->g->main_fd, frame, size) != -1)
sent++;
update_addresses(pkt, targ->g);
if (i > 10000) {
@@ -955,11 +979,8 @@ sender_body(void *data)
if (rate_limit && tosend <= 0) {
tosend = targ->g->burst;
- timespec_add(&nexttime, &targ->g->tx_period);
- if (wait_time(nexttime, &tmptime, &waited) == -1) {
- D("wait_time");
- goto quit;
- }
+ nexttime = timespec_add(nexttime, targ->g->tx_period);
+ wait_time(nexttime);
}
/*
@@ -968,7 +989,12 @@ sender_body(void *data)
if (poll(fds, 1, 2000) <= 0) {
if (targ->cancel)
break;
- D("poll error/timeout on queue %d", targ->me);
+ D("poll error/timeout on queue %d: %s", targ->me,
+ strerror(errno));
+ goto quit;
+ }
+ if (fds[0].revents & POLLERR) {
+ D("poll error");
goto quit;
}
/*
@@ -983,12 +1009,12 @@ sender_body(void *data)
if (n > 0 && n - sent < limit)
limit = n - sent;
txring = NETMAP_TXRING(nifp, i);
- if (txring->avail == 0)
+ if (nm_ring_empty(txring))
continue;
if (frags > 1)
limit = ((limit + frags - 1) / frags) * frags;
- m = send_packets(txring, &targ->pkt, targ->g,
+ m = send_packets(txring, pkt, frame, size, targ->g,
limit, options, frags);
ND("limit %d avail %d frags %d m %d",
limit, txring->avail, frags, m);
@@ -1007,7 +1033,7 @@ sender_body(void *data)
/* final part: wait all the TX queues to be empty. */
for (i = targ->qfirst; i < targ->qlast; i++) {
txring = NETMAP_TXRING(nifp, i);
- while (!NETMAP_TX_RING_EMPTY(txring)) {
+ while (nm_tx_pending(txring)) {
ioctl(fds[0].fd, NIOCTXSYNC, NULL);
usleep(1); /* wait 1 tick */
}
@@ -1039,11 +1065,12 @@ receive_pcap(u_char *user, const struct pcap_pkthdr * h,
static int
receive_packets(struct netmap_ring *ring, u_int limit, int dump)
{
- u_int cur, rx;
+ u_int cur, rx, n;
cur = ring->cur;
- if (ring->avail < limit)
- limit = ring->avail;
+ n = nm_ring_space(ring);
+ if (n < limit)
+ limit = n;
for (rx = 0; rx < limit; rx++) {
struct netmap_slot *slot = &ring->slot[cur];
char *p = NETMAP_BUF(ring, slot->buf_idx);
@@ -1051,10 +1078,9 @@ receive_packets(struct netmap_ring *ring, u_int limit, int dump)
if (dump)
dump_payload(p, slot->len, ring, cur);
- cur = NETMAP_RING_NEXT(ring, cur);
+ cur = nm_ring_next(ring, cur);
}
- ring->avail -= rx;
- ring->cur = cur;
+ ring->head = ring->cur = cur;
return (rx);
}
@@ -1082,7 +1108,7 @@ receiver_body(void *data)
i = poll(fds, 1, 1000);
if (i > 0 && !(fds[0].revents & POLLERR))
break;
- D("waiting for initial packets, poll returns %d %d", i, fds[0].revents);
+ RD(1, "waiting for initial packets, poll returns %d %d", i, fds[0].revents);
}
/* main loop, exit after 1s silence */
@@ -1111,11 +1137,16 @@ receiver_body(void *data)
break;
}
+ if (fds[0].revents & POLLERR) {
+ D("poll err");
+ goto quit;
+ }
+
for (i = targ->qfirst; i < targ->qlast; i++) {
int m;
rxring = NETMAP_RXRING(nifp, i);
- if (rxring->avail == 0)
+ if (nm_ring_empty(rxring))
continue;
m = receive_packets(rxring, targ->g->burst, dump);
@@ -1215,6 +1246,8 @@ usage(void)
"\t-w wait_for_link_time in seconds\n"
"\t-R rate in packets per second\n"
"\t-X dump payload\n"
+ "\t-H len add empty virtio-net-header with size 'len'\n"
+ "\t-h use host ring\n"
"",
cmd);
@@ -1243,7 +1276,7 @@ start_threads(struct glob_arg *g)
/* register interface. */
tfd = open("/dev/netmap", O_RDWR);
if (tfd == -1) {
- D("Unable to open /dev/netmap");
+ D("Unable to open /dev/netmap: %s", strerror(errno));
continue;
}
targs[i].fd = tfd;
@@ -1251,7 +1284,11 @@ start_threads(struct glob_arg *g)
bzero(&tifreq, sizeof(tifreq));
strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name));
tifreq.nr_version = NETMAP_API;
- tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0;
+ if (g->host_ring) {
+ tifreq.nr_ringid = NETMAP_SW_RING;
+ } else {
+ tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0;
+ }
parse_nmr_config(g->nmr_config, &tifreq);
/*
@@ -1264,7 +1301,7 @@ start_threads(struct glob_arg *g)
}
if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) {
- D("Unable to register %s", g->ifname);
+ D("Unable to register %s: %s", g->ifname, strerror(errno));
continue;
}
D("memsize is %d MB", tifreq.nr_memsize >> 20);
@@ -1272,9 +1309,14 @@ start_threads(struct glob_arg *g)
targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset);
D("nifp flags 0x%x", targs[i].nifp->ni_flags);
/* start threads. */
- targs[i].qfirst = (g->nthreads > 1) ? i : 0;
- targs[i].qlast = (g->nthreads > 1) ? i+1 :
- (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
+ if (g->host_ring) {
+ targs[i].qfirst = (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
+ targs[i].qlast = targs[i].qfirst + 1;
+ } else {
+ targs[i].qfirst = (g->nthreads > 1) ? i : 0;
+ targs[i].qlast = (g->nthreads > 1) ? i+1 :
+ (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
+ }
} else {
targs[i].fd = g->main_fd;
}
@@ -1292,7 +1334,7 @@ start_threads(struct glob_arg *g)
if (pthread_create(&targs[i].thread, NULL, g->td_body,
&targs[i]) == -1) {
- D("Unable to create thread %d", i);
+ D("Unable to create thread %d: %s", i, strerror(errno));
targs[i].used = 0;
}
}
@@ -1439,7 +1481,7 @@ tap_alloc(char *dev)
/* try to create the device */
if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
- D("failed to to a TUNSETIFF");
+ D("failed to to a TUNSETIFF: %s", strerror(errno));
close(fd);
return err;
}
@@ -1488,9 +1530,10 @@ main(int arc, char **argv)
g.tx_rate = 0;
g.frags = 1;
g.nmr_config = "";
+ g.virt_header = 0;
while ( (ch = getopt(arc, argv,
- "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:")) != -1) {
+ "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:H:h")) != -1) {
struct sf *fn;
switch(ch) {
@@ -1613,6 +1656,11 @@ main(int arc, char **argv)
break;
case 'C':
g.nmr_config = strdup(optarg);
+ break;
+ case 'H':
+ g.virt_header = atoi(optarg);
+ break;
+ case 'h':
+ g.host_ring = 1;
+ break;
}
}
@@ -1649,6 +1697,12 @@ main(int arc, char **argv)
extract_mac_range(&g.src_mac);
extract_mac_range(&g.dst_mac);
+ if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
+ && g.virt_header != VIRT_HDR_2) {
+ D("bad virtio-net-header length");
+ usage();
+ }
+
if (g.dev_type == DEV_TAP) {
D("want to use tap %s", g.ifname);
g.main_fd = tap_alloc(g.ifname);
@@ -1682,7 +1736,7 @@ main(int arc, char **argv)
*/
g.main_fd = open("/dev/netmap", O_RDWR);
if (g.main_fd == -1) {
- D("Unable to open /dev/netmap");
+ D("Unable to open /dev/netmap: %s", strerror(errno));
// fail later
}
/*
@@ -1696,22 +1750,16 @@ main(int arc, char **argv)
bzero(&nmr, sizeof(nmr));
nmr.nr_version = NETMAP_API;
strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name));
- nmr.nr_version = NETMAP_API;
parse_nmr_config(g.nmr_config, &nmr);
if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) {
- D("Unable to register interface %s", g.ifname);
+ D("Unable to register interface %s: %s", g.ifname, strerror(errno));
//continue, fail later
}
ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname,
nmr.nr_tx_rings, nmr.nr_tx_slots,
nmr.nr_rx_rings, nmr.nr_rx_slots);
- //if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) {
- // D("Unable to get if info without name");
- //} else {
- // D("map size is %d Kb", nmr.nr_memsize >> 10);
- //}
if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) {
- D("Unable to get if info for %s", g.ifname);
+ D("Unable to get if info for %s: %s", g.ifname, strerror(errno));
}
devqueues = nmr.nr_rx_rings;
@@ -1732,7 +1780,7 @@ main(int arc, char **argv)
PROT_WRITE | PROT_READ,
MAP_SHARED, g.main_fd, 0);
if (g.mmap_addr == MAP_FAILED) {
- D("Unable to mmap %d KB", nmr.nr_memsize >> 10);
+ D("Unable to mmap %d KB: %s", nmr.nr_memsize >> 10, strerror(errno));
// continue, fail later
}
@@ -1772,14 +1820,17 @@ main(int arc, char **argv)
g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
if (g.tx_rate > 0) {
/* try to have at least something every second,
- * reducing the burst size to 0.5s worth of data
+ * reducing the burst size to a few milliseconds' worth of data
* (but no less than one full set of fragments)
*/
- if (g.burst > g.tx_rate/2)
- g.burst = g.tx_rate/2;
+ uint64_t x;
+ int lim = (g.tx_rate)/300;
+ if (g.burst > lim)
+ g.burst = lim;
if (g.burst < g.frags)
g.burst = g.frags;
- g.tx_period.tv_nsec = (1e9 / g.tx_rate) * g.burst;
+ x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
+ g.tx_period.tv_nsec = x;
g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
}
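As a worked example of the arithmetic above: with -R 1000 (1000 packets per second) the limit is 1000/300 = 3, so the burst is reduced to 3 packets (or to the fragment count, if larger) and tx_period.tv_nsec = 1000000000 * 3 / 1000 = 3,000,000 ns, i.e. one small burst every 3 ms. At higher rates the configured burst may already be below the limit and is left unchanged.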
diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c
index 0a478ba08b8f..c0cf574986b6 100644
--- a/tools/tools/netmap/vale-ctl.c
+++ b/tools/tools/netmap/vale-ctl.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 Michio Honda. All rights reserved.
+ * Copyright (C) 2013-2014 Michio Honda. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -118,7 +118,7 @@ main(int argc, char *argv[])
const char *command = basename(argv[0]);
char *name = NULL;
- if (argc != 3 && argc != 1 /* list all */ ) {
+ if (argc > 3) {
usage:
fprintf(stderr,
"Usage:\n"
@@ -127,12 +127,13 @@ usage:
"\t-d interface interface name to be detached\n"
"\t-a interface interface name to be attached\n"
"\t-h interface interface name to be attached with the host stack\n"
- "\t-l list all or specified bridge's interfaces\n"
+ "\t-l list all or specified bridge's interfaces (default)\n"
"", command);
return 0;
}
- while ((ch = getopt(argc, argv, "d:a:h:g:l:")) != -1) {
+ while ((ch = getopt(argc, argv, "d:a:h:g:l")) != -1) {
+ name = optarg; /* default */
switch (ch) {
default:
fprintf(stderr, "bad option %c %s", ch, optarg);
@@ -152,9 +153,14 @@ usage:
break;
case 'l':
nr_cmd = NETMAP_BDG_LIST;
+ if (optind < argc && argv[optind][0] == '-')
+ name = NULL;
break;
}
- name = optarg;
+ if (optind != argc) {
+ // fprintf(stderr, "optind %d argc %d\n", optind, argc);
+ goto usage;
+ }
}
if (argc == 1)
nr_cmd = NETMAP_BDG_LIST;