27 files changed, 3118 insertions, 2213 deletions
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4 index 7975572b1e8a..523d8ddb8e5b 100644 --- a/share/man/man4/netmap.4 +++ b/share/man/man4/netmap.4 @@ -1,4 +1,4 @@ -.\" Copyright (c) 2011-2013 Matteo Landi, Luigi Rizzo, Universita` di Pisa +.\" Copyright (c) 2011-2014 Matteo Landi, Luigi Rizzo, Universita` di Pisa .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without @@ -27,434 +27,546 @@ .\" .\" $FreeBSD$ .\" -.Dd October 18, 2013 +.Dd January 4, 2014 .Dt NETMAP 4 .Os .Sh NAME .Nm netmap .Nd a framework for fast packet I/O +.br +.Nm VALE +.Nd a fast VirtuAl Local Ethernet using the netmap API .Sh SYNOPSIS .Cd device netmap .Sh DESCRIPTION .Nm is a framework for extremely fast and efficient packet I/O -(reaching 14.88 Mpps with a single core at less than 1 GHz) for both userspace and kernel clients. -Userspace clients can use the netmap API -to send and receive raw packets through physical interfaces -or ports of the -.Xr VALE 4 -switch. +It runs on FreeBSD and Linux, +and includes +.Nm VALE , +a very fast and modular in-kernel software switch/dataplane. .Pp +.Nm +and .Nm VALE -is a very fast (reaching 20 Mpps per port) -and modular software switch, -implemented within the kernel, which can interconnect -virtual ports, physical devices, and the native host stack. +are one order of magnitude faster than sockets, bpf or +native switches based on +.Xr tun/tap 4 , +reaching 14.88 Mpps with much less than one core on a 10 Gbit NIC, +and 20 Mpps per core for VALE ports. .Pp +Userspace clients can dynamically switch NICs into .Nm -uses a memory mapped region to share packet buffers, -descriptors and queues with the kernel. -Simple -.Pa ioctl()s -are used to bind interfaces/ports to file descriptors and -implement non-blocking I/O, whereas blocking I/O uses -.Pa select()/poll() . -.Nm -can exploit the parallelism in multiqueue devices and -multicore systems. +mode and send and receive raw packets through +memory mapped buffers. +A selectable file descriptor supports +synchronization and blocking I/O. +.Pp +Similarly, +.Nm VALE +can dynamically create switch instances and ports, +providing high speed packet I/O between processes, +virtual machines, NICs and the host stack. .Pp -For the best performance, +For best performance, .Nm requires explicit support in device drivers; -a generic emulation layer is available to implement the +however, the .Nm -API on top of unmodified device drivers, +API can be emulated on top of unmodified device drivers, at the price of reduced performance -(but still better than what can be achieved with -sockets or BPF/pcap). +(but still better than sockets or BPF/pcap). .Pp -For a list of devices with native +In the rest of this (long) manual page we document +various aspects of the .Nm -support, see the end of this manual page. -.Sh OPERATION - THE NETMAP API +and +.Nm VALE +architecture, features and usage. +.Pp +.Sh ARCHITECTURE .Nm -clients must first -.Pa open("/dev/netmap") , -and then issue an -.Pa ioctl(fd, NIOCREGIF, (struct nmreq *)arg) -to bind the file descriptor to a specific interface or port. +supports raw packet I/O through a +.Em port , +which can be connected to a physical interface +.Em ( NIC ) , +to the host stack, +or to a +.Nm VALE +switch). +Ports use preallocated circular queues of buffers +.Em ( rings ) +residing in an mmapped region. +There is one ring for each transmit/receive queue of a +NIC or virtual port. +An additional ring pair connects to the host stack. 
+.Pp +After binding a file descriptor to a port, a .Nm -has multiple modes of operation controlled by the -content of the -.Pa struct nmreq -passed to the -.Pa ioctl() . -In particular, the -.Em nr_name -field specifies whether the client operates on a physical network -interface or on a port of a +client can send or receive packets in batches through +the rings, and possibly implement zero-copy forwarding +between ports. +.Pp +All NICs operating in +.Nm +mode use the same memory region, +accessible to all processes who own +.Nm /dev/netmap +file descriptors bound to NICs. .Nm VALE -switch, as indicated below. Additional fields in the -.Pa struct nmreq -control the details of operation. +ports instead use separate memory regions. +.Pp +.Sh ENTERING AND EXITING NETMAP MODE +Ports and rings are created and controlled through a file descriptor, +created by opening a special device +.Dl fd = open("/dev/netmap"); +and then bound to a specific port with an +.Dl ioctl(fd, NIOCREGIF, (struct nmreq *)arg); +.Pp +.Nm +has multiple modes of operation controlled by the +.Vt struct nmreq +argument. +.Va arg.nr_name +specifies the port name, as follows: .Bl -tag -width XXXX -.It Dv Interface name (e.g. 'em0', 'eth1', ... ) -The data path of the interface is disconnected from the host stack. -Depending on additional arguments, -the file descriptor is bound to the NIC (one or all queues), -or to the host stack. +.It Dv OS network interface name (e.g. 'em0', 'eth1', ... ) +the data path of the NIC is disconnected from the host stack, +and the file descriptor is bound to the NIC (one or all queues), +or to the host stack; .It Dv valeXXX:YYY (arbitrary XXX and YYY) -The file descriptor is bound to port YYY of a VALE switch called XXX, -where XXX and YYY are arbitrary alphanumeric strings. +the file descriptor is bound to port YYY of a VALE switch called XXX, +both dynamically created if necessary. The string cannot exceed IFNAMSIZ characters, and YYY cannot -matching the name of any existing interface. -.Pp -The switch and the port are created if not existing. -.It Dv valeXXX:ifname (ifname is an existing interface) -Flags in the argument control whether the physical interface -(and optionally the corrisponding host stack endpoint) -are connected or disconnected from the VALE switch named XXX. -.Pp -In this case the -.Pa ioctl() -is used only for configuring the VALE switch, typically through the -.Nm vale-ctl -command. -The file descriptor cannot be used for I/O, and should be -.Pa close()d -after issuing the -.Pa ioctl(). +be the name of any existing OS network interface. .El .Pp -The binding can be removed (and the interface returns to -regular operation, or the virtual port destroyed) with a -.Pa close() -on the file descriptor. +On return, +.Va arg +indicates the size of the shared memory region, +and the number, size and location of all the +.Nm +data structures, which can be accessed by mmapping the memory +.Dl char *mem = mmap(0, arg.nr_memsize, fd); .Pp -The processes owning the file descriptor can then -.Pa mmap() -the memory region that contains pre-allocated -buffers, descriptors and queues, and use them to -read/write raw packets. Non blocking I/O is done with special -.Pa ioctl()'s , -whereas the file descriptor can be passed to -.Pa select()/poll() -to be notified about incoming packet or available transmit buffers. -.Ss DATA STRUCTURES -The data structures in the mmapped memory are described below -(see -.Xr sys/net/netmap.h -for reference). 
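A minimal C sketch of the open/NIOCREGIF/mmap sequence just described, assuming a hypothetical NIC name passed in by the caller and omitting all error handling; the mmap(2) arguments are spelled out in full, whereas the .Dl line above abbreviates them:

/*
 * Sketch only: bind a port to a fresh /dev/netmap descriptor and map
 * the shared region holding rings and buffers. Error checks omitted.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static void *
netmap_bind(const char *port, struct nmreq *nmr, int *pfd)
{
        int fd = open("/dev/netmap", O_RDWR);

        memset(nmr, 0, sizeof(*nmr));
        strncpy(nmr->nr_name, port, sizeof(nmr->nr_name) - 1);
        nmr->nr_version = NETMAP_API;
        ioctl(fd, NIOCREGIF, nmr);      /* the port enters netmap mode */
        *pfd = fd;
        /* map the shared region; its size was returned in nr_memsize */
        return mmap(NULL, nmr->nr_memsize, PROT_READ | PROT_WRITE,
            MAP_SHARED, fd, 0);
}

The returned pointer, together with nmr->nr_offset, is what NETMAP_IF() expects in the DATA STRUCTURES section below.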
-All physical devices operating in +.Xr ioctl 2 +.Xr select 2 +and +.Xr poll 2 +on the file descriptor permit blocking I/O. +.Xr epoll 2 +and +.Xr kqueue 2 +are not supported on .Nm -mode use the same memory region, -shared by the kernel and all processes who own -.Pa /dev/netmap -descriptors bound to those devices -(NOTE: visibility may be restricted in future implementations). -Virtual ports instead use separate memory regions, -shared only with the kernel. -.Pp -All references between the shared data structure -are relative (offsets or indexes). Some macros help converting -them into actual pointers. +file descriptors. +.Pp +While a NIC is in +.Nm +mode, the OS will still believe the interface is up and running. +OS-generated packets for that NIC end up into a +.Nm +ring, and another ring is used to send packets into the OS network stack. +A +.Xr close 2 +on the file descriptor removes the binding, +and returns the NIC to normal mode (reconnecting the data path +to the host stack), or destroys the virtual port. +.Pp +.Sh DATA STRUCTURES +The data structures in the mmapped memory region are detailed in +.Xr sys/net/netmap.h , +which is the ultimate reference for the +.Nm +API. The main structures and fields are indicated below: .Bl -tag -width XXX .It Dv struct netmap_if (one per interface) -indicates the number of rings supported by an interface, their -sizes, and the offsets of the -.Pa netmap_rings -associated to the interface. -.Pp -.Pa struct netmap_if -is at offset -.Pa nr_offset -in the shared memory region is indicated by the -field in the structure returned by the -.Pa NIOCREGIF -(see below). .Bd -literal struct netmap_if { - char ni_name[IFNAMSIZ]; /* name of the interface. */ - const u_int ni_version; /* API version */ - const u_int ni_rx_rings; /* number of rx ring pairs */ - const u_int ni_tx_rings; /* if 0, same as ni_rx_rings */ - const ssize_t ring_ofs[]; /* offset of tx and rx rings */ + ... + const uint32_t ni_flags; /* properties */ + ... + const uint32_t ni_tx_rings; /* NIC tx rings */ + const uint32_t ni_rx_rings; /* NIC rx rings */ + const uint32_t ni_extra_tx_rings; /* extra tx rings */ + const uint32_t ni_extra_rx_rings; /* extra rx rings */ + ... }; .Ed +.Pp +Indicates the number of available rings +.Pa ( struct netmap_rings ) +and their position in the mmapped region. +The number of tx and rx rings +.Pa ( ni_tx_rings , ni_rx_rings ) +normally depends on the hardware. +NICs also have an extra tx/rx ring pair connected to the host stack. +.Em NIOCREGIF +can request additional tx/rx rings, +to be used between multiple processes/threads +accessing the same +.Nm +port. .It Dv struct netmap_ring (one per ring) -Contains the positions in the transmit and receive rings to -synchronize the kernel and the application, -and an array of -.Pa slots -describing the buffers. -'reserved' is used in receive rings to tell the kernel the -number of slots after 'cur' that are still in usr -indicates how many slots starting from 'cur' -the -.Pp -Each physical interface has one -.Pa netmap_ring -for each hardware transmit and receive ring, -plus one extra transmit and one receive structure -that connect to the host stack. 
.Bd -literal struct netmap_ring { - const ssize_t buf_ofs; /* see details */ - const uint32_t num_slots; /* number of slots in the ring */ - uint32_t avail; /* number of usable slots */ - uint32_t cur; /* 'current' read/write index */ - uint32_t reserved; /* not refilled before current */ - - const uint16_t nr_buf_size; - uint16_t flags; -#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ -#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ -#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */ - struct timeval ts; - struct netmap_slot slot[0]; /* array of slots */ + ... + const uint32_t num_slots; /* slots in each ring */ + const uint32_t nr_buf_size; /* size of each buffer */ + ... + uint32_t head; /* (u) first buf owned by user */ + uint32_t cur; /* (u) wakeup position */ + const uint32_t tail; /* (k) first buf owned by kernel */ + ... + uint32_t flags; + struct timeval ts; /* (k) time of last rxsync() */ + ... + struct netmap_slot slot[0]; /* array of slots */ } .Ed .Pp -In transmit rings, after a system call 'cur' indicates -the first slot that can be used for transmissions, -and 'avail' reports how many of them are available. -Before the next netmap-related system call on the file -descriptor, the application should fill buffers and -slots with data, and update 'cur' and 'avail' -accordingly, as shown in the figure below: +Implements transmit and receive rings, with read/write +pointers, metadata and and an array of +.Pa slots +describing the buffers. +.Pp +.It Dv struct netmap_slot (one per buffer) .Bd -literal - - cur - |----- avail ---| (after syscall) - v - TX [*****aaaaaaaaaaaaaaaaa**] - TX [*****TTTTTaaaaaaaaaaaa**] - ^ - |-- avail --| (before syscall) - cur +struct netmap_slot { + uint32_t buf_idx; /* buffer index */ + uint16_t len; /* packet length */ + uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* address for indirect buffers */ +}; .Ed -In receive rings, after a system call 'cur' indicates -the first slot that contains a valid packet, -and 'avail' reports how many of them are available. -Before the next netmap-related system call on the file -descriptor, the application can process buffers and -release them to the kernel updating -'cur' and 'avail' accordingly, as shown in the figure below. -Receive rings have an additional field called 'reserved' -to indicate how many buffers before 'cur' are still -under processing and cannot be released. +.Pp +Describes a packet buffer, which normally is identified by +an index and resides in the mmapped region. +.It Dv packet buffers +Fixed size (normally 2 KB) packet buffers allocated by the kernel. +.El +.Pp +The offset of the +.Pa struct netmap_if +in the mmapped region is indicated by the +.Pa nr_offset +field in the structure returned by +.Pa NIOCREGIF . +From there, all other objects are reachable through +relative references (offsets or indexes). +Macros and functions in <net/netmap_user.h> +help converting them into actual pointers: +.Pp +.Dl struct netmap_if *nifp = NETMAP_IF(mem, arg.nr_offset); +.Dl struct netmap_ring *txr = NETMAP_TXRING(nifp, ring_index); +.Dl struct netmap_ring *rxr = NETMAP_RXRING(nifp, ring_index); +.Pp +.Dl char *buf = NETMAP_BUF(ring, buffer_index); +.Sh RINGS, BUFFERS AND DATA I/O +.Va Rings +are circular queues of packets with three indexes/pointers +.Va ( head , cur , tail ) ; +one slot is always kept empty. +The ring size +.Va ( num_slots ) +should not be assumed to be a power of two. 
+.br +(NOTE: older versions of netmap used head/count format to indicate +the content of a ring). +.Pp +.Va head +is the first slot available to userspace; +.br +.Va cur +is the wakeup point: +select/poll will unblock when +.Va tail +passes +.Va cur ; +.br +.Va tail +is the first slot reserved to the kernel. +.Pp +Slot indexes MUST only move forward; +for convenience, the function +.Dl nm_ring_next(ring, index) +returns the next index modulo the ring size. +.Pp +.Va head +and +.Va cur +are only modified by the user program; +.Va tail +is only modified by the kernel. +The kernel only reads/writes the +.Vt struct netmap_ring +slots and buffers +during the execution of a netmap-related system call. +The only exception are slots (and buffers) in the range +.Va tail\ . . . head-1 , +that are explicitly assigned to the kernel. +.Pp +.Ss TRANSMIT RINGS +On transmit rings, after a +.Nm +system call, slots in the range +.Va head\ . . . tail-1 +are available for transmission. +User code should fill the slots sequentially +and advance +.Va head +and +.Va cur +past slots ready to transmit. +.Va cur +may be moved further ahead if the user code needs +more slots before further transmissions (see +.Sx SCATTER GATHER I/O ) . +.Pp +At the next NIOCTXSYNC/select()/poll(), +slots up to +.Va head-1 +are pushed to the port, and +.Va tail +may advance if further slots have become available. +Below is an example of the evolution of a TX ring: +.Pp .Bd -literal - cur - |-res-|-- avail --| (after syscall) - v - RX [**rrrrrrRRRRRRRRRRRR******] - RX [**...........rrrrRRR******] - |res|--|<avail (before syscall) - ^ - cur + after the syscall, slots between cur and tail are (a)vailable + head=cur tail + | | + v v + TX [.....aaaaaaaaaaa.............] + user creates new packets to (T)ransmit + head=cur tail + | | + v v + TX [.....TTTTTaaaaaa.............] + + NIOCTXSYNC/poll()/select() sends packets and reports new slots + head=cur tail + | | + v v + TX [..........aaaaaaaaaaa........] .Ed -.It Dv struct netmap_slot (one per packet) -contains the metadata for a packet: +.Pp +select() and poll() wlll block if there is no space in the ring, i.e. +.Dl ring->cur == ring->tail +and return when new slots have become available. +.Pp +High speed applications may want to amortize the cost of system calls +by preparing as many packets as possible before issuing them. +.Pp +A transmit ring with pending transmissions has +.Dl ring->head != ring->tail + 1 (modulo the ring size). +The function +.Va int nm_tx_pending(ring) +implements this test. +.Pp +.Ss RECEIVE RINGS +On receive rings, after a +.Nm +system call, the slots in the range +.Va head\& . . . tail-1 +contain received packets. +User code should process them and advance +.Va head +and +.Va cur +past slots it wants to return to the kernel. +.Va cur +may be moved further ahead if the user code wants to +wait for more packets +without returning all the previous slots to the kernel. +.Pp +At the next NIOCRXSYNC/select()/poll(), +slots up to +.Va head-1 +are returned to the kernel for further receives, and +.Va tail +may advance to report new incoming packets. +.br +Below is an example of the evolution of an RX ring: .Bd -literal -struct netmap_slot { - uint32_t buf_idx; /* buffer index */ - uint16_t len; /* packet length */ - uint16_t flags; /* buf changed, etc. */ -#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */ -#define NS_REPORT 0x0002 /* tell hw to report results - * e.g. 
by generating an interrupt - */ -#define NS_FORWARD 0x0004 /* pass packet to the other endpoint - * (host stack or device) - */ -#define NS_NO_LEARN 0x0008 -#define NS_INDIRECT 0x0010 -#define NS_MOREFRAG 0x0020 -#define NS_PORT_SHIFT 8 -#define NS_PORT_MASK (0xff << NS_PORT_SHIFT) -#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) - uint64_t ptr; /* buffer address (indirect buffers) */ -}; + after the syscall, there are some (h)eld and some (R)eceived slots + head cur tail + | | | + v v v + RX [..hhhhhhRRRRRRRR..........] + + user advances head and cur, releasing some slots and holding others + head cur tail + | | | + v v v + RX [..*****hhhRRRRRR...........] + + NICRXSYNC/poll()/select() recovers slots and reports new packets + head cur tail + | | | + v v v + RX [.......hhhRRRRRRRRRRRR....] .Ed -The flags control how the the buffer associated to the slot -should be managed. -.It Dv packet buffers -are normally fixed size (2 Kbyte) buffers allocated by the kernel -that contain packet data. Buffers addresses are computed through -macros. -.El -.Bl -tag -width XXX -Some macros support the access to objects in the shared memory -region. In particular, -.It NETMAP_TXRING(nifp, i) -.It NETMAP_RXRING(nifp, i) -return the address of the i-th transmit and receive ring, -respectively, whereas -.It NETMAP_BUF(ring, buf_idx) -returns the address of the buffer with index buf_idx -(which can be part of any ring for the given interface). -.El .Pp -Normally, buffers are associated to slots when interfaces are bound, -and one packet is fully contained in a single buffer. -Clients can however modify the mapping using the -following flags: -.Ss FLAGS +.Sh SLOTS AND PACKET BUFFERS +Normally, packets should be stored in the netmap-allocated buffers +assigned to slots when ports are bound to a file descriptor. +One packet is fully contained in a single buffer. +.Pp +The following flags affect slot and buffer processing: .Bl -tag -width XXX .It NS_BUF_CHANGED -indicates that the buf_idx in the slot has changed. -This can be useful if the client wants to implement -some form of zero-copy forwarding (e.g. by passing buffers -from an input interface to an output interface), or -needs to process packets out of order. +it MUST be used when the buf_idx in the slot is changed. +This can be used to implement +zero-copy forwarding, see +.Sx ZERO-COPY FORWARDING . .Pp -The flag MUST be used whenever the buffer index is changed. .It NS_REPORT -indicates that we want to be woken up when this buffer -has been transmitted. This reduces performance but insures -a prompt notification when a buffer has been sent. +reports when this buffer has been transmitted. Normally, .Nm notifies transmit completions in batches, hence signals -can be delayed indefinitely. However, we need such notifications -before closing a descriptor. +can be delayed indefinitely. This flag helps detecting +when packets have been send and a file descriptor can be closed. .It NS_FORWARD -When the device is open in 'transparent' mode, -the client can mark slots in receive rings with this flag. -For all marked slots, marked packets are forwarded to -the other endpoint at the next system call, thus restoring -(in a selective way) the connection between the NIC and the -host stack. +When a ring is in 'transparent' mode (see +.Sx TRANSPARENT MODE ) , +packets marked with this flags are forwarded to the other endpoint +at the next system call, thus restoring (in a selective way) +the connection between a NIC and the host stack. 
.It NS_NO_LEARN tells the forwarding code that the SRC MAC address for this -packet should not be used in the learning bridge +packet must not be used in the learning bridge code. .It NS_INDIRECT -indicates that the packet's payload is not in the netmap -supplied buffer, but in a user-supplied buffer whose -user virtual address is in the 'ptr' field of the slot. +indicates that the packet's payload is in a user-supplied buffer, +whose user virtual address is in the 'ptr' field of the slot. The size can reach 65535 bytes. -.Em This is only supported on the transmit ring of virtual ports +.br +This is only supported on the transmit ring of +.Nm VALE +ports, and it helps reducing data copies in the interconnection +of virtual machines. .It NS_MOREFRAG indicates that the packet continues with subsequent buffers; the last buffer in a packet must have the flag clear. +.El +.Sh SCATTER GATHER I/O +Packets can span multiple slots if the +.Va NS_MOREFRAG +flag is set in all but the last slot. The maximum length of a chain is 64 buffers. -.Em This is only supported on virtual ports -.It NS_RFRAGS(slot) -on receive rings, returns the number of remaining buffers -in a packet, including this one. -Slots with a value greater than 1 also have NS_MOREFRAG set. -The length refers to the individual buffer, there is no -field for the total length. +This is normally used with +.Nm VALE +ports when connecting virtual machines, as they generate large +TSO segments that are not split unless they reach a physical device. .Pp -On transmit rings, if NS_DST is set, it is passed to the lookup -function, which can use it e.g. as the index of the destination -port instead of doing an address lookup. -.El +NOTE: The length field always refers to the individual +fragment; there is no place with the total length of a packet. +.Pp +On receive rings the macro +.Va NS_RFRAGS(slot) +indicates the remaining number of slots for this packet, +including the current one. +Slots with a value greater than 1 also have NS_MOREFRAG set. .Sh IOCTLS .Nm -supports some ioctl() to synchronize the state of the rings -between the kernel and the user processes, plus some -to query and configure the interface. -The former do not require any argument, whereas the latter -use a -.Pa struct nmreq -defined as follows: +uses two ioctls (NIOCTXSYNC, NIOCRXSYNC) +for non-blocking I/O. They take no argument. 
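The following hedged sketch combines the two pieces just described: it fills free slots according to the head/cur/tail protocol of TRANSMIT RINGS, then issues NIOCTXSYNC. The fd and ring are assumed to come from an earlier bind, and build_packet() is a hypothetical application routine, not part of the netmap API:

/*
 * Sketch only: queue up to npkts frames and push them out without
 * blocking. Stops early if the ring fills up (cur would reach tail).
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <net/netmap_user.h>

extern unsigned int build_packet(char *buf, unsigned int maxlen); /* placeholder */

static void
tx_burst(int fd, struct netmap_ring *ring, int npkts)
{
        uint32_t i = ring->cur;

        while (npkts-- > 0 && i != ring->tail) {
                struct netmap_slot *slot = &ring->slot[i];
                char *buf = NETMAP_BUF(ring, slot->buf_idx);

                slot->len = (uint16_t)build_packet(buf, ring->nr_buf_size);
                i = nm_ring_next(ring, i);
        }
        ring->head = ring->cur = i;     /* hand the filled slots to the kernel */
        ioctl(fd, NIOCTXSYNC, NULL);    /* no argument, as noted above */
}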
+Two more ioctls (NIOCGINFO, NIOCREGIF) are used +to query and configure ports, with the following argument: .Bd -literal struct nmreq { - char nr_name[IFNAMSIZ]; - uint32_t nr_version; /* API version */ -#define NETMAP_API 4 /* current version */ - uint32_t nr_offset; /* nifp offset in the shared region */ - uint32_t nr_memsize; /* size of the shared region */ - uint32_t nr_tx_slots; /* slots in tx rings */ - uint32_t nr_rx_slots; /* slots in rx rings */ - uint16_t nr_tx_rings; /* number of tx rings */ - uint16_t nr_rx_rings; /* number of tx rings */ - uint16_t nr_ringid; /* ring(s) we care about */ -#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ -#define NETMAP_SW_RING 0x2000 /* we process the sw ring */ -#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */ -#define NETMAP_RING_MASK 0xfff /* the actual ring number */ - uint16_t nr_cmd; -#define NETMAP_BDG_ATTACH 1 /* attach the NIC */ -#define NETMAP_BDG_DETACH 2 /* detach the NIC */ -#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ -#define NETMAP_BDG_LIST 4 /* get bridge's info */ - uint16_t nr_arg1; - uint16_t nr_arg2; - uint32_t spare2[3]; + char nr_name[IFNAMSIZ]; /* (i) port name */ + uint32_t nr_version; /* (i) API version */ + uint32_t nr_offset; /* (o) nifp offset in mmap region */ + uint32_t nr_memsize; /* (o) size of the mmap region */ + uint32_t nr_tx_slots; /* (o) slots in tx rings */ + uint32_t nr_rx_slots; /* (o) slots in rx rings */ + uint16_t nr_tx_rings; /* (o) number of tx rings */ + uint16_t nr_rx_rings; /* (o) number of tx rings */ + uint16_t nr_ringid; /* (i) ring(s) we care about */ + uint16_t nr_cmd; /* (i) special command */ + uint16_t nr_arg1; /* (i) extra arguments */ + uint16_t nr_arg2; /* (i) extra arguments */ + ... }; - .Ed -A device descriptor obtained through +.Pp +A file descriptor obtained through .Pa /dev/netmap -also supports the ioctl supported by network devices. +also supports the ioctl supported by network devices, see +.Xr netintro 4 . .Pp -The netmap-specific -.Xr ioctl 2 -command codes below are defined in -.In net/netmap.h -and are: .Bl -tag -width XXXX .It Dv NIOCGINFO -returns EINVAL if the named device does not support netmap. +returns EINVAL if the named port does not support netmap. Otherwise, it returns 0 and (advisory) information -about the interface. +about the port. Note that all the information below can change before the interface is actually put in netmap mode. .Pp -.Pa nr_memsize -indicates the size of the netmap -memory region. Physical devices all share the same memory region, -whereas VALE ports may have independent regions for each port. -These sizes can be set through system-wise sysctl variables. -.Pa nr_tx_slots, nr_rx_slots +.Bl -tag -width XX +.It Pa nr_memsize +indicates the size of the +.Nm +memory region. NICs in +.Nm +mode all share the same memory region, +whereas +.Nm VALE +ports have independent regions for each port. +.It Pa nr_tx_slots , nr_rx_slots indicate the size of transmit and receive rings. -.Pa nr_tx_rings, nr_rx_rings +.It Pa nr_tx_rings , nr_rx_rings indicate the number of transmit and receive rings. Both ring number and sizes may be configured at runtime using interface-specific functions (e.g. -.Pa sysctl -or -.Pa ethtool . +.Xr ethtool +). +.El .It Dv NIOCREGIF -puts the interface named in nr_name into netmap mode, disconnecting -it from the host stack, and/or defines which rings are controlled -through this file descriptor. +binds the port named in +.Va nr_name +to the file descriptor. 
For a physical device this also switches it into +.Nm +mode, disconnecting +it from the host stack. +Multiple file descriptors can be bound to the same port, +with proper synchronization left to the user. +.Pp On return, it gives the same info as NIOCGINFO, and nr_ringid indicates the identity of the rings controlled through the file descriptor. .Pp -Possible values for nr_ringid are +.Va nr_ringid +selects which rings are controlled through this file descriptor. +Possible values are: .Bl -tag -width XXXXX .It 0 -default, all hardware rings +(default) all hardware rings .It NETMAP_SW_RING -the ``host rings'' connecting to the host stack -.It NETMAP_HW_RING + i -the i-th hardware ring +the ``host rings'', connecting to the host stack. +.It NETMAP_HW_RING | i +the i-th hardware ring . .El +.Pp By default, a -.Nm poll +.Xr poll 2 or -.Nm select +.Xr select 2 call pushes out any pending packets on the transmit ring, even if no write events are specified. The feature can be disabled by or-ing -.Nm NETMAP_NO_TX_SYNC -to nr_ringid. -But normally you should keep this feature unless you are using -separate file descriptors for the send and receive rings, because -otherwise packets are pushed out only if NETMAP_TXSYNC is called, -or the send queue is full. -.Pp -.Pa NIOCREGIF -can be used multiple times to change the association of a -file descriptor to a ring pair, always within the same device. +.Va NETMAP_NO_TX_SYNC +to the value written to +.Va nr_ringid. +When this feature is used, +packets are transmitted only on +.Va ioctl(NIOCTXSYNC) +or select()/poll() are called with a write event (POLLOUT/wfdset) or a full ring. .Pp When registering a virtual interface that is dynamically created to a .Xr vale 4 @@ -467,6 +579,164 @@ number of slots available for transmission. tells the hardware of consumed packets, and asks for newly available packets. .El +.Sh SELECT AND POLL +.Xr select 2 +and +.Xr poll 2 +on a +.Nm +file descriptor process rings as indicated in +.Sx TRANSMIT RINGS +and +.Sx RECEIVE RINGS +when write (POLLOUT) and read (POLLIN) events are requested. +.Pp +Both block if no slots are available in the ring ( +.Va ring->cur == ring->tail ) +.Pp +Packets in transmit rings are normally pushed out even without +requesting write events. Passing the NETMAP_NO_TX_SYNC flag to +.Em NIOCREGIF +disables this feature. +.Sh LIBRARIES +The +.Nm +API is supposed to be used directly, both because of its simplicity and +for efficient integration with applications. +.Pp +For conveniency, the +.Va <net/netmap_user.h> +header provides a few macros and functions to ease creating +a file descriptor and doing I/O with a +.Nm +port. These are loosely modeled after the +.Xr pcap 3 +API, to ease porting of libpcap-based applications to +.Nm . +To use these extra functions, programs should +.Dl #define NETMAP_WITH_LIBS +before +.Dl #include <net/netmap_user.h> +.Pp +The following functions are available: +.Bl -tag -width XXXXX +.It Va struct nm_desc_t * nm_open(const char *ifname, const char *ring_name, int flags, int ring_flags) +similar to +.Xr pcap_open , +binds a file descriptor to a port. +.Bl -tag -width XX +.It Va ifname +is a port name, in the form "netmap:XXX" for a NIC and "valeXXX:YYY" for a +.Nm VALE +port. +.It Va flags +can be set to +.Va NETMAP_SW_RING +to bind to the host ring pair, +or to NETMAP_HW_RING to bind to a specific ring. +.Va ring_name +with NETMAP_HW_RING, +is interpreted as a string or an integer indicating the ring to use. 
+.It Va ring_flags +is copied directly into the ring flags, to specify additional parameters +such as NR_TIMESTAMP or NR_FORWARD. +.El +.It Va int nm_close(struct nm_desc_t *d) +closes the file descriptor, unmaps memory, frees resources. +.It Va int nm_inject(struct nm_desc_t *d, const void *buf, size_t size) +similar to pcap_inject(), pushes a packet to a ring, returns the size +of the packet is successful, or 0 on error; +.It Va int nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) +similar to pcap_dispatch(), applies a callback to incoming packets +.It Va u_char * nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr) +similar to pcap_next(), fetches the next packet +.Pp +.El +.Sh SUPPORTED DEVICES +.Nm +natively supports the following devices: +.Pp +On FreeBSD: +.Xr em 4 , +.Xr igb 4 , +.Xr ixgbe 4 , +.Xr lem 4 , +.Xr re 4 . +.Pp +On Linux +.Xr e1000 4 , +.Xr e1000e 4 , +.Xr igb 4 , +.Xr ixgbe 4 , +.Xr mlx4 4 , +.Xr forcedeth 4 , +.Xr r8169 4 . +.Pp +NICs without native support can still be used in +.Nm +mode through emulation. Performance is inferior to native netmap +mode but still significantly higher than sockets, and approaching +that of in-kernel solutions such as Linux's +.Xr pktgen . +.Pp +Emulation is also available for devices with native netmap support, +which can be used for testing or performance comparison. +The sysctl variable +.Va dev.netmap.admode +globally controls how netmap mode is implemented. +.Sh SYSCTL VARIABLES AND MODULE PARAMETERS +Some aspect of the operation of +.Nm +are controlled through sysctl variables on FreeBSD +.Em ( dev.netmap.* ) +and module parameters on Linux +.Em ( /sys/module/netmap_lin/parameters/* ) : +.Pp +.Bl -tag -width indent +.It Va dev.netmap.admode: 0 +Controls the use of native or emulated adapter mode. +0 uses the best available option, 1 forces native and +fails if not available, 2 forces emulated hence never fails. +.It Va dev.netmap.generic_ringsize: 1024 +Ring size used for emulated netmap mode +.It Va dev.netmap.generic_mit: 100000 +Controls interrupt moderation for emulated mode +.It Va dev.netmap.mmap_unreg: 0 +.It Va dev.netmap.fwd: 0 +Forces NS_FORWARD mode +.It Va dev.netmap.flags: 0 +.It Va dev.netmap.txsync_retry: 2 +.It Va dev.netmap.no_pendintr: 1 +Forces recovery of transmit buffers on system calls +.It Va dev.netmap.mitigate: 1 +Propagates interrupt mitigation to user processes +.It Va dev.netmap.no_timestamp: 0 +Disables the update of the timestamp in the netmap ring +.It Va dev.netmap.verbose: 0 +Verbose kernel messages +.It Va dev.netmap.buf_num: 163840 +.It Va dev.netmap.buf_size: 2048 +.It Va dev.netmap.ring_num: 200 +.It Va dev.netmap.ring_size: 36864 +.It Va dev.netmap.if_num: 100 +.It Va dev.netmap.if_size: 1024 +Sizes and number of objects (netmap_if, netmap_ring, buffers) +for the global memory region. The only parameter worth modifying is +.Va dev.netmap.buf_num +as it impacts the total amount of memory used by netmap. +.It Va dev.netmap.buf_curr_num: 0 +.It Va dev.netmap.buf_curr_size: 0 +.It Va dev.netmap.ring_curr_num: 0 +.It Va dev.netmap.ring_curr_size: 0 +.It Va dev.netmap.if_curr_num: 0 +.It Va dev.netmap.if_curr_size: 0 +Actual values in use. +.It Va dev.netmap.bridge_batch: 1024 +Batch size used when moving packets across a +.Nm VALE +switch. Values above 64 generally guarantee good +performance. +.El .Sh SYSTEM CALLS .Nm uses @@ -476,6 +746,9 @@ and to wake up processes when significant events occur, and .Xr mmap 2 to map memory. 
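As a counterpart to the nm_nextpkt() receiver shown later in EXAMPLES, a sender built on the helper functions documented above might look like the hedged sketch below; the port name "netmap:ix0" and the 60-byte frame are purely illustrative:

/*
 * Sketch only: open a port through the pcap-like helpers and inject
 * one frame. nm_inject() returns the frame length, or 0 on failure.
 */
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>
#include <string.h>

static void
send_one_frame(void)
{
        struct nm_desc_t *d;
        char frame[60];

        memset(frame, 0, sizeof(frame));
        /* ... fill in Ethernet header and payload ... */
        d = nm_open("netmap:ix0", NULL, 0, 0);
        if (d == NULL)
                return;
        if (nm_inject(d, frame, sizeof(frame)) == 0) {
                /* not queued: a real sender would poll() for POLLOUT
                 * on NETMAP_FD(d) and retry */
        }
        nm_close(d);
}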
+.Xr ioctl 2 +is used to configure ports and +.Nm VALE switches . .Pp Applications may need to create threads and bind them to specific cores to improve performance, using standard @@ -484,47 +757,176 @@ OS primitives, see In particular, .Xr pthread_setaffinity_np 3 may be of use. +.Sh CAVEATS +No matter how fast the CPU and OS are, +achieving line rate on 10G and faster interfaces +requires hardware with sufficient performance. +Several NICs are unable to sustain line rate with +small packet sizes. Insufficient PCIe or memory bandwidth +can also cause reduced performance. +.Pp +Another frequent reason for low performance is the use +of flow control on the link: a slow receiver can limit +the transmit speed. +Be sure to disable flow control when running high +speed experiments. +.Pp +.Ss SPECIAL NIC FEATURES +.Nm +is orthogonal to some NIC features such as +multiqueue, schedulers, packet filters. +.Pp +Multiple transmit and receive rings are supported natively +and can be configured with ordinary OS tools, +such as +.Xr ethtool +or +device-specific sysctl variables. +The same goes for Receive Packet Steering (RPS) +and filtering of incoming traffic. +.Pp +.Nm +.Em does not use +features such as +.Em checksum offloading , TCP segmentation offloading , +.Em encryption , VLAN encapsulation/decapsulation , +etc. . +When using netmap to exchange packets with the host stack, +make sure to disable these features. .Sh EXAMPLES +.Ss TEST PROGRAMS +.Nm +comes with a few programs that can be used for testing or +simple applications. +See the +.Va examples/ +directory in +.Nm +distributions, or +.Va tools/tools/netmap/ +directory in FreeBSD distributions. +.Pp +.Xr pkt-gen +is a general purpose traffic source/sink. +.Pp +As an example +.Dl pkt-gen -i ix0 -f tx -l 60 +can generate an infinite stream of minimum size packets, and +.Dl pkt-gen -i ix0 -f rx +is a traffic sink. +Both print traffic statistics, to help monitor +how the system performs. +.Pp +.Xr pkt-gen +has many options can be uses to set packet sizes, addresses, +rates, and use multiple send/receive threads and cores. +.Pp +.Xr bridge +is another test program which interconnects two +.Nm +ports. It can be used for transparent forwarding between +interfaces, as in +.Dl bridge -i ix0 -i ix1 +or even connect the NIC to the host stack using netmap +.Dl bridge -i ix0 -i ix0 +.Ss USING THE NATIVE API The following code implements a traffic generator .Pp .Bd -literal -compact -#include <net/netmap.h> #include <net/netmap_user.h> -struct netmap_if *nifp; -struct netmap_ring *ring; -struct nmreq nmr; +... +void sender(void) +{ + struct netmap_if *nifp; + struct netmap_ring *ring; + struct nmreq nmr; + struct pollfd fds; -fd = open("/dev/netmap", O_RDWR); -bzero(&nmr, sizeof(nmr)); -strcpy(nmr.nr_name, "ix0"); -nmr.nm_version = NETMAP_API; -ioctl(fd, NIOCREGIF, &nmr); -p = mmap(0, nmr.nr_memsize, fd); -nifp = NETMAP_IF(p, nmr.nr_offset); -ring = NETMAP_TXRING(nifp, 0); -fds.fd = fd; -fds.events = POLLOUT; -for (;;) { - poll(list, 1, -1); - for ( ; ring->avail > 0 ; ring->avail--) { - i = ring->cur; - buf = NETMAP_BUF(ring, ring->slot[i].buf_index); - ... prepare packet in buf ... - ring->slot[i].len = ... packet length ... 
- ring->cur = NETMAP_RING_NEXT(ring, i); + fd = open("/dev/netmap", O_RDWR); + bzero(&nmr, sizeof(nmr)); + strcpy(nmr.nr_name, "ix0"); + nmr.nm_version = NETMAP_API; + ioctl(fd, NIOCREGIF, &nmr); + p = mmap(0, nmr.nr_memsize, fd); + nifp = NETMAP_IF(p, nmr.nr_offset); + ring = NETMAP_TXRING(nifp, 0); + fds.fd = fd; + fds.events = POLLOUT; + for (;;) { + poll(&fds, 1, -1); + while (!nm_ring_empty(ring)) { + i = ring->cur; + buf = NETMAP_BUF(ring, ring->slot[i].buf_index); + ... prepare packet in buf ... + ring->slot[i].len = ... packet length ... + ring->head = ring->cur = nm_ring_next(ring, i); + } } } .Ed -.Sh SUPPORTED INTERFACES +.Ss HELPER FUNCTIONS +A simple receiver can be implemented using the helper functions +.Bd -literal -compact +#define NETMAP_WITH_LIBS +#include <net/netmap_user.h> +... +void receiver(void) +{ + struct nm_desc_t *d; + struct pollfd fds; + u_char *buf; + struct nm_hdr_t h; + ... + d = nm_open("netmap:ix0", NULL, 0, 0); + fds.fd = NETMAP_FD(d); + fds.events = POLLIN; + for (;;) { + poll(&fds, 1, -1); + while ( (buf = nm_nextpkt(d, &h)) ) + consume_pkt(buf, h->len); + } + nm_close(d); +} +.Ed +.Ss ZERO-COPY FORWARDING +Since physical interfaces share the same memory region, +it is possible to do packet forwarding between ports +swapping buffers. The buffer from the transmit ring is used +to replenish the receive ring: +.Bd -literal -compact + uint32_t tmp; + struct netmap_slot *src, *dst; + ... + src = &src_ring->slot[rxr->cur]; + dst = &dst_ring->slot[txr->cur]; + tmp = dst->buf_idx; + dst->buf_idx = src->buf_idx; + dst->len = src->len; + dst->flags = NS_BUF_CHANGED; + src->buf_idx = tmp; + src->flags = NS_BUF_CHANGED; + rxr->head = rxr->cur = nm_ring_next(rxr, rxr->cur); + txr->head = txr->cur = nm_ring_next(txr, txr->cur); + ... +.Ed +.Ss ACCESSING THE HOST STACK +.Ss VALE SWITCH +A simple way to test the performance of a +.Nm VALE +switch is to attach a sender and a receiver to it, +e.g. running the following in two different terminals: +.Dl pkt-gen -i vale1:a -f rx # receiver +.Dl pkt-gen -i vale1:b -f tx # sender +.Pp +The following command attaches an interface and the host stack +to a switch: +.Dl vale-ctl -h vale2:em0 +Other .Nm -supports the following interfaces: -.Xr em 4 , -.Xr igb 4 , -.Xr ixgbe 4 , -.Xr lem 4 , -.Xr re 4 +clients attached to the same switch can now communicate +with the network card or the host. +.Pp .Sh SEE ALSO -.Xr vale 4 .Pp http://info.iet.unipi.it/~luigi/netmap/ .Pp @@ -551,3 +953,20 @@ and .Nm VALE have been funded by the European Commission within FP7 Projects CHANGE (257422) and OPENLAB (287581). +.Pp +.Ss SPECIAL MODES +When the device name has the form +.Dl valeXXX:ifname (ifname is an existing interface) +the physical interface +(and optionally the corrisponding host stack endpoint) +are connected or disconnected from the +.Nm VALE +switch named XXX. +In this case the +.Pa ioctl() +is only used only for configuration, typically through the +.Xr vale-ctl +command. +The file descriptor cannot be used for I/O, and should be +closed after issuing the +.Pa ioctl() . diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index 580407a529fd..428612a4a695 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -4352,7 +4352,7 @@ em_initialize_receive_unit(struct adapter *adapter) * preserve the rx buffers passed to userspace. 
*/ if (ifp->if_capenable & IFCAP_NETMAP) - rdt -= NA(adapter->ifp)->rx_rings[i].nr_hwavail; + rdt -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[i]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(hw, E1000_RDT(i), rdt); } diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index 57e4f893ab35..2134e29625cc 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -4630,13 +4630,13 @@ igb_initialize_receive_units(struct adapter *adapter) * an init() while a netmap client is active must * preserve the rx buffers passed to userspace. * In this driver it means we adjust RDT to - * somthing different from next_to_refresh + * something different from next_to_refresh * (which is not used in netmap mode). */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = rxr->next_to_refresh - kring->nr_hwavail; + int t = rxr->next_to_refresh - nm_kr_rxspace(kring); if (t >= adapter->num_rx_desc) t -= adapter->num_rx_desc; diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c index a3da50c176ed..8014a0f9fde7 100644 --- a/sys/dev/e1000/if_lem.c +++ b/sys/dev/e1000/if_lem.c @@ -3367,7 +3367,7 @@ lem_initialize_receive_unit(struct adapter *adapter) #ifdef DEV_NETMAP /* preserve buffers already made available to clients */ if (ifp->if_capenable & IFCAP_NETMAP) - rctl -= NA(adapter->ifp)->rx_rings[0].nr_hwavail; + rctl -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[0]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), rctl); diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c index 740f7709e5b2..6dfec02cc8d9 100644 --- a/sys/dev/ixgbe/ixgbe.c +++ b/sys/dev/ixgbe/ixgbe.c @@ -1245,7 +1245,7 @@ ixgbe_init_locked(struct adapter *adapter) if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = na->num_rx_desc - 1 - kring->nr_hwavail; + int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t); } else diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index dbbee4222407..17b4c4fd2e14 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -120,9 +120,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; @@ -130,9 +130,6 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -141,9 +138,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -175,9 +172,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, @@ -190,26 +185,20 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { - int delta; - + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; } - delta = nic_i - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. 
*/ - if (delta < 0) - delta += kring->nkr_num_slots; + if (nic_i != txr->next_to_clean) { txr->next_to_clean = nic_i; - kring->nr_hwavail += delta; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -226,16 +215,16 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -251,7 +240,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i = rxr->next_to_check; nm_i = netmap_idx_n2k(kring, nic_i); - for (n = 0; ; n++) { + for (n = 0; ; n++) { // XXX no need to count struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->status); @@ -268,7 +257,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) } if (n) { /* update the state variables */ rxr->next_to_check = nic_i; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } @@ -277,9 +266,9 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -302,8 +291,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); @@ -311,12 +299,12 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h index b91d0baba06f..e1929f0918e2 100644 --- a/sys/dev/netmap/if_igb_netmap.h +++ b/sys/dev/netmap/if_igb_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -88,9 +88,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; @@ -101,9 +101,6 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) u32 olinfo_status = (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -112,9 +109,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -155,9 +152,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* Set the watchdog XXX ? */ txr->queue_status = IGB_QUEUE_WORKING; @@ -174,26 +169,18 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { - int delta; - + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; } - delta = nic_i - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. 
*/ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = nic_i; - kring->nr_hwavail += delta; - } + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -210,16 +197,16 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -250,7 +237,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) } if (n) { /* update the state variables */ rxr->next_to_check = nic_i; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } @@ -259,9 +246,9 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -284,8 +271,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); @@ -293,12 +279,12 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 8ad3b7a2a352..4fce5c988d09 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -91,18 +91,14 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; /* device-specific */ struct adapter *adapter = ifp->if_softc; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -111,9 +107,9 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -145,9 +141,7 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, @@ -160,26 +154,19 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { - int delta; - + if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + kring->last_reclaim = ticks; /* record completed transmissions using TDH */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; } - delta = nic_i - adapter->next_tx_to_clean; - if (delta) { - /* some completed, increment hwavail. 
*/ - if (delta < 0) - delta += kring->nkr_num_slots; - adapter->next_tx_to_clean = nic_i; - kring->nr_hwavail += delta; - } + adapter->next_tx_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -196,15 +183,15 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -241,9 +228,14 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ + ND("%d new packets at nic %d nm %d tail %d", + n, + adapter->next_rx_desc_to_check, + netmap_idx_n2k(kring, adapter->next_rx_desc_to_check), + kring->nr_hwtail); adapter->next_rx_desc_to_check = nic_i; // ifp->if_ipackets += n; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } @@ -252,9 +244,9 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -277,20 +269,19 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h index 2c7ba060cffd..10abe4f49f83 100644 --- a/sys/dev/netmap/if_re_netmap.h +++ b/sys/dev/netmap/if_re_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -72,17 +72,14 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* device-specific */ struct rl_softc *sc = ifp->if_softc; struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ? @@ -91,11 +88,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * First part: process new packets to send. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = sc->rl_ldata.rl_tx_prodidx; // XXX or netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -132,9 +129,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i = nm_next(nic_i, lim); } sc->rl_ldata.rl_tx_prodidx = nic_i; - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwcur = cur; /* the saved ring->cur */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, @@ -148,7 +143,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { nic_i = sc->rl_ldata.rl_tx_considx; for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx; n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) { @@ -160,11 +155,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) if (n > 0) { sc->rl_ldata.rl_tx_considx = nic_i; sc->rl_ldata.rl_tx_free += n; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -181,16 +176,16 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct rl_softc *sc = ifp->if_softc; struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, @@ -202,16 +197,17 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * * This device uses all the buffers in the ring, so we need * another termination condition in addition to RL_RDESC_STAT_OWN - * cleared (all buffers could have it cleared. 
The easiest one - * is to limit the amount of data reported up to 'lim' + * cleared (all buffers could have it cleared). The easiest one + * is to stop right before nm_hwcur. */ if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + uint32_t stop_i = nm_prev(kring->nr_hwcur, lim); nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ nm_i = netmap_idx_n2k(kring, nic_i); - for (n = kring->nr_hwavail; n < lim ; n++) { + while (nm_i != stop_i) { struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i]; uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); uint32_t total_len; @@ -226,14 +222,12 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* sync was in re_newbuf() */ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD); + // sc->rl_ifp->if_ipackets++; nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - if (n != kring->nr_hwavail) { - sc->rl_ldata.rl_rx_prodidx = nic_i; - sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; - kring->nr_hwavail = n; - } + sc->rl_ldata.rl_rx_prodidx = nic_i; + kring->nr_hwtail = nm_i; kring->nr_kflags &= ~NKR_PENDINTR; } @@ -241,9 +235,9 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -272,8 +266,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, @@ -281,7 +274,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; @@ -336,36 +329,35 @@ re_netmap_rx_init(struct rl_softc *sc) struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); struct rl_desc *desc = sc->rl_ldata.rl_rx_list; uint32_t cmdstat; - int i, n, max_avail; + uint32_t nic_i, max_avail; + uint32_t const n = sc->rl_ldata.rl_rx_desc_cnt; if (!slot) return; - n = sc->rl_ldata.rl_rx_desc_cnt; /* - * Userspace owned hwavail packets before the reset, - * so the NIC that last hwavail descriptors of the ring - * are still owned by the driver (and keep one empty). + * Do not release the slots owned by userspace, + * and also keep one empty. 
*/ - max_avail = n - 1 - na->rx_rings[0].nr_hwavail; - for (i = 0; i < n; i++) { + max_avail = n - 1 - nm_kr_rxspace(&na->rx_rings[0]); + for (nic_i = 0; nic_i < n; nic_i++) { void *addr; uint64_t paddr; - int l = netmap_idx_n2k(&na->rx_rings[0], i); + uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i); - addr = PNMB(slot + l, &paddr); + addr = PNMB(slot + nm_i, &paddr); netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, addr); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, BUS_DMASYNC_PREREAD); - desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); - desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD); + desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); cmdstat = NETMAP_BUF_SIZE; - if (i == n - 1) /* mark the end of ring */ + if (nic_i == n - 1) /* mark the end of ring */ cmdstat |= RL_RDESC_CMD_EOR; - if (i < max_avail) + if (nic_i < max_avail) cmdstat |= RL_RDESC_CMD_OWN; - desc[i].rl_cmdstat = htole32(cmdstat); + desc[nic_i].rl_cmdstat = htole32(cmdstat); } } diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index 4dea6639d325..a617cc4c2429 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -141,14 +141,13 @@ ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) /* * Reconcile kernel and user view of the transmit ring. * - * Userspace wants to send packets up to the one before ring->cur, + * All information is in the kring. + * Userspace wants to send packets up to the one before kring->rhead, * kernel knows kring->nr_hwcur is the first unsent packet. * * Here we push packets out (as many as possible), and possibly * reclaim buffers from previously completed transmission. * - * ring->avail is not used on input, but it is updated on return. - * * The caller (netmap) guarantees that there is only one instance * running at any time. Any interference with other driver * methods should be handled by the individual drivers. 
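For reference, the lem, re and ixgbe txsync conversions above and below all converge on the same skeleton once nr_hwavail is gone: push slots from nr_hwcur up to kring->rhead, then move nr_hwtail forward from the hardware's completion point. The driver-neutral sketch below is not part of this change; hw_load_slot() and hw_read_completed() are hypothetical placeholders for the device-specific steps (programming descriptors, reading the hardware head pointer).

/*
 * Illustrative sketch only, not code from this patch: the common
 * txsync pattern the converted drivers follow in the head/tail model.
 * hw_load_slot() and hw_read_completed() are hypothetical stand-ins
 * for the device-specific work.
 */
static void hw_load_slot(u_int nic_i, struct netmap_slot *slot);	/* hypothetical */
static u_int hw_read_completed(void);	/* hypothetical: NIC index of next descriptor to clean */

static int
sketch_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_ring *ring = kring->ring;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;	/* validated by nm_txsync_prologue() */
	u_int nm_i, nic_i;

	/* First part: push out slots from nr_hwcur up to (excluding) head. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		nic_i = netmap_idx_k2n(kring, nm_i);
		while (nm_i != head) {
			hw_load_slot(nic_i, &ring->slot[nm_i]);
			nm_i = nm_next(nm_i, lim);
			nic_i = nm_next(nic_i, lim);
		}
		kring->nr_hwcur = head;
	}

	/* Second part: reclaim completed transmissions into nr_hwtail. */
	if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
		nic_i = hw_read_completed();
		kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
	}

	nm_txsync_finalize(kring);	/* exports nr_hwtail as ring->tail */
	return 0;
}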
@@ -161,9 +160,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* * interrupts on every tx packet are expensive so request * them every half ring, or where NS_REPORT is set @@ -175,9 +174,6 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct tx_ring *txr = &adapter->tx_rings[ring_nr]; int reclaim_tx; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -199,7 +195,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ /* - * If we have packets to send (kring->nr_hwcur != ring->cur) + * If we have packets to send (kring->nr_hwcur != kring->rhead) * iterate over the netmap ring, fetch length and update * the corresponding slot in the NIC ring. Some drivers also * need to update the buffer's physical address in the NIC slot @@ -217,13 +213,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); __builtin_prefetch(&ring->slot[nm_i]); __builtin_prefetch(&txr->tx_buffers[nic_i]); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -262,9 +258,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, @@ -281,7 +275,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ if (flags & NAF_FORCE_RECLAIM) { reclaim_tx = 1; /* forced reclaim */ - } else if (kring->nr_hwavail > 0) { + } else if (!nm_kr_txempty(kring)) { reclaim_tx = 0; /* have buffers, no reclaim */ } else { /* @@ -321,21 +315,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i -= kring->nkr_num_slots; } if (nic_i != txr->next_to_clean) { - n = (nic_i + lim + 1) - txr->next_to_clean; - if (n > lim) - n -= lim + 1; /* some tx completed, increment avail */ txr->next_to_clean = nic_i; - kring->nr_hwavail += n; - if (kring->nr_hwavail > lim) { - RD(5, "bad hwavail %d", - kring->nr_hwavail); - return netmap_ring_reinit(kring); - } + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -347,14 +333,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * The caller guarantees a single invocations, but races against * the rest of the driver should be handled here. * - * When called, userspace has released buffers up to - * ring->cur - ring->reserved (last one excluded). - * - * The last interrupt reported kring->nr_hwavail slots available - * after kring->nr_hwcur. 
- * We must subtract the newly consumed slots (cur - nr_hwcur) - * from nr_hwavail, make the descriptors available for the next reads, - * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail. + * On call, kring->rhead is the first packet that userspace wants + * to keep, and kring->rcur is the wakeup point. + * The kernel has previously reported packets up to kring->rtail. * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt. @@ -367,16 +348,16 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -391,8 +372,8 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * - * nm_i = (kring->nr_hwcur + kring->nr_hwavail) % ring_size * nic_i = rxr->next_to_check; + * nm_i = kring->nr_hwtail (previous) * and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * @@ -402,7 +383,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) int crclen = ix_crcstrip ? 0 : 4; uint16_t slot_flags = kring->nkr_slot_flags; - nic_i = rxr->next_to_check; + nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail) nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { @@ -425,23 +406,23 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) ix_rx_miss_bufs += n; } rxr->next_to_check = nic_i; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } /* * Second part: skip past packets that userspace has released. - * (kring->nr_hwcur to ring->cur - ring->reserved excluded), + * (kring->nr_hwcur to kring->rhead excluded), * and make the buffers available for reception. * As usual nm_i is the index in the netmap ring, * nic_i is the index in the NIC ring, and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -464,8 +445,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); @@ -473,12 +453,12 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? 
lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 478d9374937f..358d4693dcb3 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -151,7 +151,6 @@ ports attached to the switch) #include <machine/bus.h> /* bus_dmamap_* */ #include <sys/endian.h> #include <sys/refcount.h> -#include <sys/jail.h> /* reduce conditional code */ @@ -226,9 +225,6 @@ enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ NETMAP_ADMODE_NATIVE, /* either native or none */ NETMAP_ADMODE_GENERIC, /* force generic */ NETMAP_ADMODE_LAST }; -#define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */ -#define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */ -#define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */ static int netmap_admode = NETMAP_ADMODE_BEST; int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ @@ -252,6 +248,10 @@ nm_kr_get(struct netmap_kring *kr) } +/* + * mark the ring as stopped, and run through the locks + * to make sure other users get to see it. + */ void netmap_disable_ring(struct netmap_kring *kr) { @@ -380,7 +380,6 @@ nm_dump_buf(char *p, int len, int lim, char *dst) } - /* * Fetch configuration from the device, to cope with dynamic * reconfigurations after loading the module. @@ -432,6 +431,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail u_int i, len, ndesc; struct netmap_kring *kring; + // XXX additional space for extra rings ? len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); @@ -441,19 +441,23 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail } na->rx_rings = na->tx_rings + ntx; + /* + * All fields in krings are 0 except the one initialized below. + * but better be explicit on important kring fields. + */ ndesc = na->num_tx_desc; for (i = 0; i < ntx; i++) { /* Transmit rings */ kring = &na->tx_rings[i]; bzero(kring, sizeof(*kring)); kring->na = na; + kring->ring_id = i; kring->nkr_num_slots = ndesc; /* - * IMPORTANT: - * Always keep one slot empty, so we can detect new - * transmissions comparing cur and nr_hwcur (they are - * the same only if there are no new transmissions). + * IMPORTANT: Always keep one slot empty. 
*/ - kring->nr_hwavail = ndesc - 1; + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = ndesc - 1; + snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -463,7 +467,11 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail kring = &na->rx_rings[i]; bzero(kring, sizeof(*kring)); kring->na = na; + kring->ring_id = i; kring->nkr_num_slots = ndesc; + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = 0; + snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -473,10 +481,10 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail na->tailroom = na->rx_rings + nrx; return 0; - } +/* XXX check boundaries */ void netmap_krings_delete(struct netmap_adapter *na) { @@ -493,6 +501,23 @@ netmap_krings_delete(struct netmap_adapter *na) } +/* + * Destructor for NIC ports. They also have an mbuf queue + * on the rings connected to the host so we need to purge + * them first. + */ +static void +netmap_hw_krings_delete(struct netmap_adapter *na) +{ + struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; + + ND("destroy sw mbq with len %d", mbq_len(q)); + mbq_purge(q); + mbq_safe_destroy(q); + netmap_krings_delete(na); +} + + static struct netmap_if* netmap_if_new(const char *ifname, struct netmap_adapter *na) { @@ -721,6 +746,7 @@ netmap_dtor(void *data) /* * pass a chain of buffers to the host stack as coming from 'dst' + * We do not need to lock because the queue is private. */ static void netmap_send_up(struct ifnet *dst, struct mbq *q) @@ -739,39 +765,30 @@ netmap_send_up(struct ifnet *dst, struct mbq *q) /* * put a copy of the buffers marked NS_FORWARD into an mbuf chain. - * Run from hwcur to cur - reserved + * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) + * and pass them up. Drop remaining packets in the unlikely event + * of an mbuf shortage. */ static void netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) { - /* Take packets from hwcur to cur-reserved and pass them up. - * In case of no buffers we give up. At the end of the loop, - * the queue is drained in all cases. - * XXX handle reserved - */ - u_int lim = kring->nkr_num_slots - 1; - struct mbuf *m; - u_int k = kring->ring->cur, n = kring->ring->reserved; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->ring->head; + u_int n; struct netmap_adapter *na = kring->na; - /* compute the final position, ring->cur - ring->reserved */ - if (n > 0) { - if (k < n) - k += kring->nkr_num_slots; - k += n; - } - for (n = kring->nr_hwcur; n != k;) { + for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { + struct mbuf *m; struct netmap_slot *slot = &kring->ring->slot[n]; - n = nm_next(n, lim); if ((slot->flags & NS_FORWARD) == 0 && !force) continue; if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { - D("bad pkt at %d len %d", n, slot->len); + RD(5, "bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? 
- /* XXX adapt to the case of a multisegment packet */ + /* XXX TODO: adapt to the case of a multisegment packet */ m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) @@ -782,69 +799,54 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) /* - * The host ring has packets from nr_hwcur to (cur - reserved) - * to be sent down to the NIC. - * We need to use the queue lock on the source (host RX ring) - * to protect against netmap_transmit. - * If the user is well behaved we do not need to acquire locks - * on the destination(s), - * so we only need to make sure that there are no panics because - * of user errors. - * XXX verify - * - * We scan the tx rings, which have just been - * flushed so nr_hwcur == cur. Pushing packets down means - * increment cur and decrement avail. - * XXX to be verified + * Send to the NIC rings packets marked NS_FORWARD between + * kring->nr_hwcur and kring->rhead + * Called under kring->rx_queue.lock on the sw rx ring, */ -static void +static u_int netmap_sw_to_nic(struct netmap_adapter *na) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; - struct netmap_kring *k1 = &na->tx_rings[0]; - u_int i, howmany, src_lim, dst_lim; - - /* XXX we should also check that the carrier is on */ - if (kring->nkr_stopped) - return; + struct netmap_slot *rxslot = kring->ring->slot; + u_int i, rxcur = kring->nr_hwcur; + u_int const head = kring->rhead; + u_int const src_lim = kring->nkr_num_slots - 1; + u_int sent = 0; + + /* scan rings to find space, then fill as much as possible */ + for (i = 0; i < na->num_tx_rings; i++) { + struct netmap_kring *kdst = &na->tx_rings[i]; + struct netmap_ring *rdst = kdst->ring; + u_int const dst_lim = kdst->nkr_num_slots - 1; + + /* XXX do we trust ring or kring->rcur,rtail ? */ + for (; rxcur != head && !nm_ring_empty(rdst); + rxcur = nm_next(rxcur, src_lim) ) { + struct netmap_slot *src, *dst, tmp; + u_int dst_cur = rdst->cur; - mtx_lock(&kring->q_lock); + src = &rxslot[rxcur]; + if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) + continue; - if (kring->nkr_stopped) - goto out; + sent++; - howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ + dst = &rdst->slot[dst_cur]; - src_lim = kring->nkr_num_slots - 1; - for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { - ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); - dst_lim = k1->nkr_num_slots - 1; - while (howmany > 0 && k1->ring->avail > 0) { - struct netmap_slot *src, *dst, tmp; - src = &kring->ring->slot[kring->nr_hwcur]; - dst = &k1->ring->slot[k1->ring->cur]; tmp = *src; + src->buf_idx = dst->buf_idx; src->flags = NS_BUF_CHANGED; dst->buf_idx = tmp.buf_idx; dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - ND("out len %d buf %d from %d to %d", - dst->len, dst->buf_idx, - kring->nr_hwcur, k1->ring->cur); - - kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); - howmany--; - kring->nr_hwavail--; - k1->ring->cur = nm_next(k1->ring->cur, dst_lim); - k1->ring->avail--; + + rdst->cur = nm_next(dst_cur, dst_lim); } - kring->ring->cur = kring->nr_hwcur; // XXX - k1++; // XXX why? + /* if (sent) XXX txsync ? 
*/ } -out: - mtx_unlock(&kring->q_lock); + return sent; } @@ -859,7 +861,8 @@ netmap_txsync_to_host(struct netmap_adapter *na) { struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; - u_int k, lim = kring->nkr_num_slots - 1; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_txsync_prologue(kring); struct mbq q; int error; @@ -869,22 +872,27 @@ netmap_txsync_to_host(struct netmap_adapter *na) D("ring %p busy (user error)", kring); return; } - k = ring->cur; - if (k > lim) { + if (head > lim) { D("invalid ring index in stack TX kring %p", kring); netmap_ring_reinit(kring); nm_kr_put(kring); return; } - /* Take packets from hwcur to cur and pass them up. + /* Take packets from hwcur to head and pass them up. + * force head = cur since netmap_grab_packets() stops at head * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. */ mbq_init(&q); - netmap_grab_packets(kring, &q, 1); - kring->nr_hwcur = k; - kring->nr_hwavail = ring->avail = lim; + ring->cur = head; + netmap_grab_packets(kring, &q, 1 /* force */); + ND("have %d pkts in queue", mbq_len(&q)); + kring->nr_hwcur = head; + kring->nr_hwtail = head + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + nm_txsync_finalize(kring); nm_kr_put(kring); netmap_send_up(na->ifp, &q); @@ -893,60 +901,89 @@ netmap_txsync_to_host(struct netmap_adapter *na) /* * rxsync backend for packets coming from the host stack. - * They have been put in the queue by netmap_transmit() so we - * need to protect access to the kring using a lock. + * They have been put in kring->rx_queue by netmap_transmit(). + * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler * (we know because td != NULL). * * NOTE: on linux, selrecord() is defined as a macro and uses pwait * as an additional hidden argument. + * returns the number of packets delivered to tx queues in + * transparent mode, or a negative value if error */ -static void +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_ring *ring = kring->ring; - u_int j, n, lim = kring->nkr_num_slots; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i, n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int ret = 0; + struct mbq *q = &kring->rx_queue; (void)pwait; /* disable unused warnings */ - if (kring->nkr_stopped) /* check a first time without lock */ - return; + if (head > lim) { + netmap_ring_reinit(kring); + return EINVAL; + } - mtx_lock(&kring->q_lock); + if (kring->nkr_stopped) /* check a first time without lock */ + return EBUSY; - if (kring->nkr_stopped) /* check again with lock held */ - goto unlock_out; + mtx_lock(&q->lock); - if (k >= lim) { - netmap_ring_reinit(kring); + if (kring->nkr_stopped) { /* check again with lock held */ + ret = EBUSY; goto unlock_out; } - /* new packets are already set in nr_hwavail */ - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... 
+ + /* First part: import newly received packets */ + n = mbq_len(q); + if (n) { /* grab packets from the queue */ + struct mbuf *m; + uint32_t stop_i; + + nm_i = kring->nr_hwtail; + stop_i = nm_prev(nm_i, lim); + while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { + int len = MBUF_LEN(m); + struct netmap_slot *slot = &ring->slot[nm_i]; + + m_copydata(m, 0, len, BDG_NMB(na, slot)); + ND("nm %d len %d", nm_i, len); + if (netmap_verbose) + D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL)); + + slot->len = len; + slot->flags = kring->nkr_slot_flags; + nm_i = nm_next(nm_i, lim); } - k = (k >= resvd) ? k - resvd : k + lim - resvd; + kring->nr_hwtail = nm_i; } - if (j != k) { - n = k >= j ? k - j : k + lim - j; - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* something was released */ + if (netmap_fwd || kring->ring->flags & NR_FORWARD) + ret = netmap_sw_to_nic(na); + kring->nr_hwcur = head; } - k = ring->avail = kring->nr_hwavail - resvd; - if (k == 0 && td) + + nm_rxsync_finalize(kring); + + /* access copies of cur,tail in the kring */ + if (kring->rcur == kring->rtail && td) /* no bufs available */ selrecord(td, &kring->si); - if (k && (netmap_verbose & NM_VERB_HOST)) - D("%d pkts from stack", k); + unlock_out: - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); + return ret; } @@ -1042,7 +1079,7 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) // XXX add a refcount ? netmap_adapter_get(prev_na); } - D("Created generic NA %p (prev %p)", gna, gna->prev); + ND("Created generic NA %p (prev %p)", gna, gna->prev); return 0; } @@ -1113,154 +1150,167 @@ out: /* * validate parameters on entry for *_txsync() * Returns ring->cur if ok, or something >= kring->nkr_num_slots - * in case of error. The extra argument is a pointer to - * 'new_bufs'. XXX this may be deprecated at some point. + * in case of error. * - * Below is a correct configuration on input. ring->cur - * must be in the region covered by kring->hwavail, - * and ring->avail and kring->avail should end at the same slot. + * rhead, rcur and rtail=hwtail are stored from previous round. + * hwcur is the next packet to send to the ring. * - * +-hwcur - * | - * v<--hwres-->|<-----hwavail----> - * ------+------------------------------+-------- ring - * | - * |<---avail---> - * +--cur + * We want + * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail * + * hwcur, rhead, rtail and hwtail are reliable */ u_int -nm_txsync_prologue(struct netmap_kring *kring, u_int *new_slots) +nm_txsync_prologue(struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; + u_int head = ring->head; /* read only once */ u_int cur = ring->cur; /* read only once */ - u_int avail = ring->avail; /* read only once */ u_int n = kring->nkr_num_slots; - u_int kstart, kend, a; -#if 1 /* kernel sanity checks */ - if (kring->nr_hwcur >= n || - kring->nr_hwreserved >= n || kring->nr_hwavail >= n || - kring->nr_hwreserved + kring->nr_hwavail >= n) + ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); +#if 1 /* kernel sanity checks; but we can trust the kring. 
*/ + if (kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n) goto error; #endif /* kernel sanity checks */ - kstart = kring->nr_hwcur + kring->nr_hwreserved; - if (kstart >= n) - kstart -= n; - kend = kstart + kring->nr_hwavail; - /* user sanity checks. a is the expected avail */ - if (cur < kstart) { - /* too low, but maybe wraparound */ - if (cur + n > kend) + /* + * user sanity checks. We only use 'cur', + * A, B, ... are possible positions for cur: + * + * 0 A cur B tail C n-1 + * 0 D tail E cur F n-1 + * + * B, F, D are valid. A, C, E are wrong + */ + if (kring->rtail >= kring->rhead) { + /* want rhead <= head <= rtail */ + if (head < kring->rhead || head > kring->rtail) goto error; - *new_slots = cur + n - kstart; - a = kend - cur - n; - } else { - if (cur > kend) + /* and also head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* here rtail < rhead */ + /* we need head outside rtail .. rhead */ + if (head > kring->rtail && head < kring->rhead) goto error; - *new_slots = cur - kstart; - a = kend - cur; + + /* two cases now: head <= rtail or head >= rhead */ + if (head <= kring->rtail) { + /* want head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* head >= rhead */ + /* cur must be outside rtail..head */ + if (cur > kring->rtail && cur < head) + goto error; + } } - if (a != avail) { - RD(5, "wrong but fixable avail have %d need %d", - avail, a); - ring->avail = avail = a; + if (ring->tail != kring->rtail) { + RD(5, "tail overwritten was %d need %d", + ring->tail, kring->rtail); + ring->tail = kring->rtail; } - return cur; + kring->rhead = head; + kring->rcur = cur; + return head; error: - RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d", + RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", + kring->name, kring->nr_hwcur, - kring->nr_hwreserved, kring->nr_hwavail, - cur, avail); + kring->rcur, kring->nr_hwtail, + cur, ring->tail); return n; } /* * validate parameters on entry for *_rxsync() - * Returns ring->cur - ring->reserved if ok, - * or something >= kring->nkr_num_slots - * in case of error. The extra argument is a pointer to - * 'resvd'. XXX this may be deprecated at some point. + * Returns ring->head if ok, kring->nkr_num_slots on error. * - * Below is a correct configuration on input. ring->cur and - * ring->reserved must be in the region covered by kring->hwavail, - * and ring->avail and kring->avail should end at the same slot. + * For a valid configuration, + * hwcur <= head <= cur <= tail <= hwtail * - * +-hwcur - * | - * v<-------hwavail----------> - * ---------+--------------------------+-------- ring - * |<--res-->| - * |<---avail---> - * +--cur + * We only consider head and cur. + * hwcur and hwtail are reliable. * */ u_int -nm_rxsync_prologue(struct netmap_kring *kring, u_int *resvd) +nm_rxsync_prologue(struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; - u_int cur = ring->cur; /* read only once */ - u_int avail = ring->avail; /* read only once */ - u_int res = ring->reserved; /* read only once */ - u_int n = kring->nkr_num_slots; - u_int kend = kring->nr_hwcur + kring->nr_hwavail; - u_int a; + uint32_t const n = kring->nkr_num_slots; + uint32_t head, cur; + ND("%s kc %d kt %d h %d c %d t %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); + /* + * Before storing the new values, we should check they do not + * move backwards. 
However: + * - head is not an issue because the previous value is hwcur; + * - cur could in principle go back, however it does not matter + * because we are processing a brand new rxsync() + */ + cur = kring->rcur = ring->cur; /* read only once */ + head = kring->rhead = ring->head; /* read only once */ #if 1 /* kernel sanity checks */ - if (kring->nr_hwcur >= n || kring->nr_hwavail >= n) + if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) goto error; #endif /* kernel sanity checks */ /* user sanity checks */ - if (res >= n) - goto error; - /* check that cur is valid, a is the expected value of avail */ - if (cur < kring->nr_hwcur) { - /* too low, but maybe wraparound */ - if (cur + n > kend) + if (kring->nr_hwtail >= kring->nr_hwcur) { + /* want hwcur <= rhead <= hwtail */ + if (head < kring->nr_hwcur || head > kring->nr_hwtail) goto error; - a = kend - (cur + n); - } else { - if (cur > kend) + /* and also rhead <= rcur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) goto error; - a = kend - cur; - } - if (a != avail) { - RD(5, "wrong but fixable avail have %d need %d", - avail, a); - ring->avail = avail = a; - } - if (res != 0) { - /* then repeat the check for cur + res */ - cur = (cur >= res) ? cur - res : n + cur - res; - if (cur < kring->nr_hwcur) { - /* too low, but maybe wraparound */ - if (cur + n > kend) - goto error; - } else if (cur > kend) { + } else { + /* we need rhead outside hwtail..hwcur */ + if (head < kring->nr_hwcur && head > kring->nr_hwtail) goto error; + /* two cases now: head <= hwtail or head >= hwcur */ + if (head <= kring->nr_hwtail) { + /* want head <= cur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* cur must be outside hwtail..head */ + if (cur < head && cur > kring->nr_hwtail) + goto error; } } - *resvd = res; - return cur; + if (ring->tail != kring->rtail) { + RD(5, "%s tail overwritten was %d need %d", + kring->name, + ring->tail, kring->rtail); + ring->tail = kring->rtail; + } + return head; error: - RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d res %d", + RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", kring->nr_hwcur, - kring->nr_hwreserved, kring->nr_hwavail, - ring->cur, avail, res); + kring->rcur, kring->nr_hwtail, + kring->rhead, kring->rcur, ring->tail); return n; } + /* * Error routine called when txsync/rxsync detects an error. - * Can't do much more than resetting cur = hwcur, avail = hwavail. + * Can't do much more than resetting head =cur = hwcur, tail = hwtail * Return 1 on reinit. * * This routine is only called by the upper half of the kernel. * It only reads hwcur (which is changed only by the upper half, too) - * and hwavail (which may be changed by the lower half, but only on + * and hwtail (which may be changed by the lower half, but only on * a tx ring and only to increase it, so any error will be recovered * on the next call). For the above, we don't strictly need to call * it under lock. 
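The branch-by-branch checks in nm_txsync_prologue() and nm_rxsync_prologue() above (one branch when the known interval wraps past slot n-1, one when it does not) enforce a single invariant: head and cur must lie on the circular path from the last known head position to the last known tail. The helper below is an illustration of that invariant, not code from this change; it is intended to be equivalent to the explicit case analysis.

/*
 * Illustration only, not part of this patch: membership test on the
 * circular index space of a ring with n slots. Returns nonzero when
 * x lies on the path lo, lo+1, ..., hi, indices taken modulo n.
 */
static inline int
nm_sketch_in_between(u_int lo, u_int x, u_int hi, u_int n)
{
	u_int dx = (x >= lo) ? x - lo : x + n - lo;	/* steps lo -> x  */
	u_int dh = (hi >= lo) ? hi - lo : hi + n - lo;	/* steps lo -> hi */

	return dx <= dh;
}

/*
 * With this helper the user sanity checks of nm_txsync_prologue()
 * reduce to
 *
 *	if (!nm_sketch_in_between(kring->rhead, head, kring->rtail, n) ||
 *	    !nm_sketch_in_between(head, cur, kring->rtail, n))
 *		goto error;
 *
 * and nm_rxsync_prologue() is the same check with nr_hwcur/nr_hwtail
 * in place of rhead/rtail.
 */

A quick worked check of the wrapped case: with n = 8, rhead = 6 and rtail = 2, head = 7 is accepted (7 lies on the path 6, 7, 0, 1, 2) while head = 4 is rejected, matching the "head outside rtail .. rhead" rule in the code above.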
@@ -1274,36 +1324,38 @@ netmap_ring_reinit(struct netmap_kring *kring) // XXX KASSERT nm_kr_tryget RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); + // XXX probably wrong to trust userspace + kring->rhead = ring->head; + kring->rcur = ring->cur; + kring->rtail = ring->tail; + if (ring->cur > lim) errors++; + if (ring->head > lim) + errors++; + if (ring->tail > lim) + errors++; for (i = 0; i <= lim; i++) { u_int idx = ring->slot[i].buf_idx; u_int len = ring->slot[i].len; if (idx < 2 || idx >= netmap_total_buffers) { - if (!errors++) - D("bad buffer at slot %d idx %d len %d ", i, idx, len); + RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); ring->slot[i].buf_idx = 0; ring->slot[i].len = 0; } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { ring->slot[i].len = 0; - if (!errors++) - D("bad len %d at slot %d idx %d", - len, i, idx); + RD(5, "bad len at slot %d idx %d len %d", i, idx, len); } } if (errors) { - int pos = kring - kring->na->tx_rings; - int n = kring->na->num_tx_rings + 1; - RD(10, "total %d errors", errors); - errors++; - RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", - NM_IFPNAME(kring->na->ifp), - pos < n ? "TX" : "RX", pos < n ? pos : pos - n, + RD(10, "%s reinit, cur %d -> %d tail %d -> %d", + kring->name, ring->cur, kring->nr_hwcur, - ring->avail, kring->nr_hwavail); - ring->cur = kring->nr_hwcur; - ring->avail = kring->nr_hwavail; + ring->tail, kring->nr_hwtail); + ring->head = kring->rhead = kring->nr_hwcur; + ring->cur = kring->rcur = kring->nr_hwcur; + ring->tail = kring->rtail = kring->nr_hwtail; } return (errors ? 1 : 0); } @@ -1436,7 +1488,6 @@ out: * - NIOCGINFO * - SIOCGIFADDR just for convenience * - NIOCREGIF - * - NIOCUNREGIF * - NIOCTXSYNC * - NIOCRXSYNC * @@ -1472,6 +1523,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } while (0) #endif /* linux */ + if (cmd == NIOCGINFO || cmd == NIOCREGIF) { + /* truncate name */ + nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; + if (nmr->nr_version != NETMAP_API) { + D("API mismatch for %s got %d need %d", + nmr->nr_name, + nmr->nr_version, NETMAP_API); + nmr->nr_version = NETMAP_API; + return EINVAL; + } + } CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); @@ -1482,16 +1544,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, return (error == ENOENT ? ENXIO : error); } - nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ switch (cmd) { case NIOCGINFO: /* return capabilities etc */ - if (nmr->nr_version != NETMAP_API) { - D("API mismatch got %d have %d", - nmr->nr_version, NETMAP_API); - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } if (nmr->nr_cmd == NETMAP_BDG_LIST) { error = netmap_bdg_ctl(nmr, NULL); break; @@ -1531,11 +1585,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; case NIOCREGIF: - if (nmr->nr_version != NETMAP_API) { - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH @@ -1593,12 +1642,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, NMG_UNLOCK(); break; - case NIOCUNREGIF: - // XXX we have no data here ? 
- D("deprecated, data is %p", nmr); - error = EINVAL; - break; - case NIOCTXSYNC: case NIOCRXSYNC: nifp = priv->np_nifp; @@ -1649,7 +1692,11 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, D("pre txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); - na->nm_txsync(na, i, NAF_FORCE_RECLAIM); + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); + } else { + na->nm_txsync(na, i, NAF_FORCE_RECLAIM); + } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, @@ -1726,8 +1773,8 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) struct ifnet *ifp; struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; - u_int lim_tx, lim_rx, host_forwarded = 0; - struct mbq q; + u_int lim_tx, lim_rx; + struct mbq q; /* packets from hw queues to host stack */ void *pwait = dev; /* linux compatibility */ /* @@ -1735,7 +1782,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * txsync and rxsync if we decide to do a selrecord(). * retry_tx (and retry_rx, later) prevent looping forever. */ - int retry_tx = 1; + int retry_tx = 1, retry_rx = 1; (void)pwait; mbq_init(&q); @@ -1769,6 +1816,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) lim_rx = na->num_rx_rings; if (priv->np_qfirst == NETMAP_SW_RING) { + // XXX locking ? /* handle the host stack ring */ if (priv->np_txpoll || want_tx) { /* push any packets up, then we are always ready */ @@ -1777,29 +1825,15 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } if (want_rx) { kring = &na->rx_rings[lim_rx]; - if (kring->ring->avail == 0) + /* XXX replace with rxprologue etc. */ + if (nm_ring_empty(kring->ring)) netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) { + if (!nm_ring_empty(kring->ring)) revents |= want_rx; - } } return (revents); } - /* - * If we are in transparent mode, check also the host rx ring - * XXX Transparent mode at the moment requires to bind all - * rings to a single file descriptor. - */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && want_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { - if (kring->ring->avail == 0) - netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) - revents |= want_rx; - } /* * check_all_{tx|rx} are set if the card has more than one queue AND @@ -1825,81 +1859,71 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * We start with a lock free round which is cheap if we have * slots available. If this fails, then lock and call the sync * routines. - * XXX rather than ring->avail >0 should check that - * ring->cur has not reached hwcur+hwavail */ for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { kring = &na->rx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_rx; want_rx = 0; /* also breaks the loop */ } } for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { kring = &na->tx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_tx; want_tx = 0; /* also breaks the loop */ } } /* - * If we to push packets out (priv->np_txpoll) or want_tx is - * still set, we do need to run the txsync calls (on all rings, - * to avoid that the tx rings stall). 
+ * If we want to push packets out (priv->np_txpoll) or + * want_tx is still set, we must issue txsync calls + * (on all rings, to avoid that the tx rings stall). * XXX should also check cur != hwcur on the tx rings. * Fortunately, normal tx mode has np_txpoll set. */ if (priv->np_txpoll || want_tx) { - /* If we really want to be woken up (want_tx), - * do a selrecord, either on the global or on - * the private structure. Then issue the txsync - * so there is no race in the selrecord/selwait + /* + * The first round checks if anyone is ready, if not + * do a selrecord and another round to handle races. + * want_tx goes to 0 if any space is found, and is + * used to skip rings with no pending transmissions. */ flush_tx: for (i = priv->np_qfirst; i < lim_tx; i++) { + int found = 0; + kring = &na->tx_rings[i]; - /* - * Skip this ring if want_tx == 0 - * (we have already done a successful sync on - * a previous ring) AND kring->cur == kring->hwcur - * (there are no pending transmissions for this ring). - */ if (!want_tx && kring->ring->cur == kring->nr_hwcur) continue; - /* make sure only one user thread is doing this */ + /* only one thread does txsync */ if (nm_kr_tryget(kring)) { - ND("ring %p busy is %d", - kring, (int)kring->nr_busy); - revents |= POLLERR; - goto out; + D("%p lost race on txring %d, ok", priv, i); + continue; } - - if (netmap_verbose & NM_VERB_TXSYNC) - D("send %d on %s %d", - kring->ring->cur, NM_IFPNAME(ifp), i); - if (na->nm_txsync(na, i, 0)) + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); revents |= POLLERR; + } else { + if (na->nm_txsync(na, i, 0)) + revents |= POLLERR; + } - /* Check avail and call selrecord only if - * called with POLLOUT and run out of bufs. - * XXX Note, we cannot trust much ring->avail - * as it is exposed to userspace (even though - * just updated by txsync). We should really - * check kring->nr_hwavail or better have - * txsync set a flag telling if we need - * to do a selrecord(). + /* + * If we found new slots, notify potential + * listeners on the same ring. + * Since we just did a txsync, look at the copies + * of cur,tail in the kring. */ - if (want_tx) { - if (kring->ring->avail > 0) { - /* stop at the first ring. We don't risk - * starvation. - */ - revents |= want_tx; - want_tx = 0; - } - } + found = kring->rcur != kring->rtail; nm_kr_put(kring); + if (found) { /* notify other listeners */ + revents |= want_tx; + want_tx = 0; + na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY); + } } if (want_tx && retry_tx) { selrecord(td, check_all_tx ? @@ -1910,21 +1934,27 @@ flush_tx: } /* - * now if want_rx is still set we need to lock and rxsync. + * If want_rx is still set scan receive rings. * Do it on all rings because otherwise we starve. */ if (want_rx) { - int retry_rx = 1; + int send_down = 0; /* transparent mode */ + /* two rounds here to for race avoidance */ do_retry_rx: for (i = priv->np_qfirst; i < lim_rx; i++) { + int found = 0; + kring = &na->rx_rings[i]; if (nm_kr_tryget(kring)) { - revents |= POLLERR; - goto out; + D("%p lost race on rxring %d, ok", priv, i); + continue; } - /* XXX NR_FORWARD should only be read on + /* + * transparent mode support: collect packets + * from the rxring(s). 
+ * XXX NR_FORWARD should only be read on * physical or NIC ports */ if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { @@ -1939,49 +1969,65 @@ do_retry_rx: kring->ring->flags & NR_TIMESTAMP) { microtime(&kring->ring->ts); } - - if (kring->ring->avail > 0) { + /* after an rxsync we can use kring->rcur, rtail */ + found = kring->rcur != kring->rtail; + nm_kr_put(kring); + if (found) { revents |= want_rx; retry_rx = 0; + na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY); } - nm_kr_put(kring); } - if (retry_rx) { - retry_rx = 0; + + /* transparent mode XXX only during first pass ? */ + kring = &na->rx_rings[lim_rx]; + if (check_all_rx + && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { + /* XXX fix to use kring fields */ + if (nm_ring_empty(kring->ring)) + send_down = netmap_rxsync_from_host(na, td, dev); + if (!nm_ring_empty(kring->ring)) + revents |= want_rx; + } + + if (retry_rx) selrecord(td, check_all_rx ? &na->rx_si : &na->rx_rings[priv->np_qfirst].si); - goto do_retry_rx; + if (send_down > 0 || retry_rx) { + retry_rx = 0; + if (send_down) + goto flush_tx; /* and retry_rx */ + else + goto do_retry_rx; } } - /* forward host to the netmap ring. - * I am accessing nr_hwavail without lock, but netmap_transmit - * can only increment it, so the operation is safe. + /* + * Transparent mode: marked bufs on rx rings between + * kring->nr_hwcur and ring->head + * are passed to the other endpoint. + * + * In this mode we also scan the sw rxring, which in + * turn passes packets up. + * + * XXX Transparent mode at the moment requires to bind all + * rings to a single file descriptor. */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && (netmap_fwd || kring->ring->flags & NR_FORWARD) - && kring->nr_hwavail > 0 && !host_forwarded) { - netmap_sw_to_nic(na); - host_forwarded = 1; /* prevent another pass */ - want_rx = 0; - goto flush_tx; - } if (q.head) netmap_send_up(na->ifp, &q); -out: - return (revents); } -/*------- driver support routines ------*/ + +/*-------------------- driver support routines -------------------*/ static int netmap_hw_krings_create(struct netmap_adapter *); static int -netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags) +netmap_notify(struct netmap_adapter *na, u_int n_ring, + enum txrx tx, int flags) { struct netmap_kring *kring; @@ -2012,10 +2058,18 @@ netmap_attach_common(struct netmap_adapter *na) return EINVAL; } WNA(ifp) = na; + + /* the following is only needed for na that use the host port. + * XXX do we have something similar for linux ? + */ +#ifdef __FreeBSD__ + na->if_input = ifp->if_input; /* for netmap_send_up */ +#endif /* __FreeBSD__ */ + NETMAP_SET_CAPABLE(ifp); if (na->nm_krings_create == NULL) { na->nm_krings_create = netmap_hw_krings_create; - na->nm_krings_delete = netmap_krings_delete; + na->nm_krings_delete = netmap_hw_krings_delete; } if (na->nm_notify == NULL) na->nm_notify = netmap_notify; @@ -2051,12 +2105,8 @@ netmap_detach_common(struct netmap_adapter *na) * of hardware rings): * krings 0..N-1 are for the hardware queues. * kring N is for the host stack queue - * kring N+1 is only used for the selinfo for all queues. + * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. - * - * By default the receive and transmit adapter ring counts are both initialized - * to num_queues. na->num_tx_rings can be set for cards with different tx/rx - * setups. 
*/ int netmap_attach(struct netmap_adapter *arg) @@ -2132,8 +2182,14 @@ NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) int netmap_hw_krings_create(struct netmap_adapter *na) { - return netmap_krings_create(na, + int ret = netmap_krings_create(na, na->num_tx_rings + 1, na->num_rx_rings + 1, 0); + if (ret == 0) { + /* initialize the mbq for the sw rx ring */ + mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); + ND("initialized sw rx queue %d", na->num_rx_rings); + } + return ret; } @@ -2162,6 +2218,10 @@ netmap_detach(struct ifnet *ifp) /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. + * + * We only store packets in a bounded mbq and then copy them + * in the relevant rxsync routine. + * * We rely on the OS to make sure that the ifp and na do not go * away (typically the caller checks for IFF_DRV_RUNNING or the like). * In nm_register() or whenever there is a reinitialization, @@ -2172,63 +2232,60 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring; - u_int i, len = MBUF_LEN(m); - u_int error = EBUSY, lim; - struct netmap_slot *slot; + u_int len = MBUF_LEN(m); + u_int error = ENOBUFS; + struct mbq *q; + int space; // XXX [Linux] we do not need this lock // if we follow the down/configure/up protocol -gl // mtx_lock(&na->core_lock); + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { - /* interface not in netmap mode anymore */ + D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); error = ENXIO; goto done; } kring = &na->rx_rings[na->num_rx_rings]; - lim = kring->nkr_num_slots - 1; - if (netmap_verbose & NM_VERB_HOST) - D("%s packet %d len %d from the stack", NM_IFPNAME(ifp), - kring->nr_hwcur + kring->nr_hwavail, len); + q = &kring->rx_queue; + // XXX reconsider long packets if we handle fragments if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); goto done; } - /* protect against other instances of netmap_transmit, - * and userspace invocations of rxsync(). + + /* protect against rxsync_from_host(), netmap_sw_to_nic() + * and maybe other instances of netmap_transmit (the latter + * not possible on Linux). + * Also avoid overflowing the queue. 
*/ - // XXX [Linux] there can be no other instances of netmap_transmit - // on this same ring, but we still need this lock to protect - // concurrent access from netmap_sw_to_nic() -gl - mtx_lock(&kring->q_lock); - if (kring->nr_hwavail >= lim) { - if (netmap_verbose) - D("stack ring %s full\n", NM_IFPNAME(ifp)); + mtx_lock(&q->lock); + + space = kring->nr_hwtail - kring->nr_hwcur; + if (space < 0) + space += kring->nkr_num_slots; + if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX + RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", + NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), + len, m); } else { - /* compute the insert position */ - i = nm_kr_rxpos(kring); - slot = &kring->ring->slot[i]; - m_copydata(m, 0, (int)len, BDG_NMB(na, slot)); - slot->len = len; - slot->flags = kring->nkr_slot_flags; - kring->nr_hwavail++; - if (netmap_verbose & NM_VERB_HOST) - D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings); - na->nm_notify(na, na->num_rx_rings, NR_RX, 0); + mbq_enqueue(q, m); + ND(10, "%s %d bufs in queue len %d m %p", + NM_IFPNAME(ifp), mbq_len(q), len, m); + /* notify outside the lock */ + m = NULL; error = 0; } - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); done: - // mtx_unlock(&na->core_lock); - - /* release the mbuf in either cases of success or failure. As an - * alternative, put the mbuf in a free list and free the list - * only when really necessary. - */ - m_freem(m); + if (m) + m_freem(m); + /* unconditionally wake up listeners */ + na->nm_notify(na, na->num_rx_rings, NR_RX, 0); return (error); } @@ -2267,27 +2324,32 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, if (n >= na->num_tx_rings) return NULL; kring = na->tx_rings + n; + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; - new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; if (new_hwofs > lim) new_hwofs -= lim + 1; /* Always set the new offset value and realign the ring. */ - D("%s hwofs %d -> %d, hwavail %d -> %d", - tx == NR_TX ? "TX" : "RX", + if (netmap_verbose) + D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", + NM_IFPNAME(na->ifp), + tx == NR_TX ? "TX" : "RX", n, kring->nkr_hwofs, new_hwofs, - kring->nr_hwavail, - tx == NR_TX ? lim : kring->nr_hwavail); + kring->nr_hwtail, + tx == NR_TX ? lim : kring->nr_hwtail); kring->nkr_hwofs = new_hwofs; - if (tx == NR_TX) - kring->nr_hwavail = lim; - kring->nr_hwreserved = 0; + if (tx == NR_TX) { + kring->nr_hwtail = kring->nr_hwcur + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + } #if 0 // def linux /* XXX check that the mappings are correct */ @@ -2351,6 +2413,7 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) } } + /* * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. @@ -2397,6 +2460,7 @@ netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) static struct cdev *netmap_dev; /* /dev/netmap character device. 
*/ extern struct cdevsw netmap_cdevsw; + void netmap_fini(void) { @@ -2408,6 +2472,7 @@ netmap_fini(void) printf("netmap: unloaded module.\n"); } + int netmap_init(void) { diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index c2814146d2ef..6716168526dc 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -86,21 +86,31 @@ netmap_catch_rx(struct netmap_adapter *na, int intercept) return 0; } + /* * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. * Second argument is non-zero to intercept, 0 to restore. * + * actually we also need to redirect the if_transmit ? + * * XXX see if FreeBSD has such a mechanism */ void -netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable) +netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) { + struct netmap_adapter *na = &gna->up.up; + struct ifnet *ifp = na->ifp; + if (enable) { + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; } else { + ifp->if_transmit = na->if_transmit; } } + /* Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) @@ -126,16 +136,16 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, // copy data to the mbuf m_copyback(m, 0, len, addr); - // inc refcount. We are alone, so we can skip the atomic atomic_fetchadd_int(m->m_ext.ref_cnt, 1); m->m_flags |= M_FLOWID; m->m_pkthdr.flowid = ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ - ret = ifp->if_transmit(ifp, m); + ret = NA(ifp)->if_transmit(ifp, m); return ret; } + /* * The following two functions are empty until we have a generic * way to extract the info from the ifp @@ -147,6 +157,7 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) return 0; } + void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { @@ -155,6 +166,7 @@ generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) *rxq = 1; } + void netmap_mitigation_init(struct netmap_generic_adapter *na) { ND("called"); @@ -167,22 +179,26 @@ void netmap_mitigation_start(struct netmap_generic_adapter *na) ND("called"); } + void netmap_mitigation_restart(struct netmap_generic_adapter *na) { ND("called"); } + int netmap_mitigation_active(struct netmap_generic_adapter *na) { ND("called"); return 0; } + void netmap_mitigation_cleanup(struct netmap_generic_adapter *na) { ND("called"); } + /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and @@ -194,6 +210,7 @@ struct netmap_vm_handle_t { struct netmap_priv_d *priv; }; + static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) @@ -218,6 +235,7 @@ netmap_dev_pager_dtor(void *handle) dev_rel(dev); } + static int netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index 2c42db3f8862..109a734cac9f 100644 --- 
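The netmap_catch_tx() hunk above swaps ifp->if_transmit with netmap_transmit() and keeps the driver's original entry point in na->if_transmit, which generic_xmit_frame() then calls directly. Below is a minimal userspace sketch of this interposition pattern; all identifiers (fake_if, driver_transmit, netmap_transmit_stub) are made up for illustration.

#include <stdio.h>

struct fake_if {
	int (*if_transmit)(struct fake_if *, const char *);    /* current handler */
	int (*saved_transmit)(struct fake_if *, const char *); /* na->if_transmit */
};

static int
driver_transmit(struct fake_if *ifp, const char *m)
{
	(void)ifp;
	printf("driver xmit: %s\n", m);
	return 0;
}

static int
netmap_transmit_stub(struct fake_if *ifp, const char *m)
{
	(void)ifp;
	printf("intercepted by netmap: %s\n", m); /* would go to the host rx ring */
	return 0;
}

static void
catch_tx(struct fake_if *ifp, int enable)
{
	if (enable) {
		ifp->saved_transmit = ifp->if_transmit; /* remember the driver entry */
		ifp->if_transmit = netmap_transmit_stub;
	} else {
		ifp->if_transmit = ifp->saved_transmit; /* restore on unregister */
	}
}

int
main(void)
{
	struct fake_if ifp = { driver_transmit, NULL };

	catch_tx(&ifp, 1);
	ifp.if_transmit(&ifp, "stack packet");       /* intercepted */
	ifp.saved_transmit(&ifp, "netmap tx frame"); /* generic_xmit_frame() path */
	catch_tx(&ifp, 0);
	ifp.if_transmit(&ifp, "stack packet again"); /* back to the driver */
	return 0;
}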
a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -82,7 +82,7 @@ __FBSDID("$FreeBSD$"); #include <dev/netmap/netmap_mem2.h> #define rtnl_lock() D("rtnl_lock called"); -#define rtnl_unlock() D("rtnl_lock called"); +#define rtnl_unlock() D("rtnl_unlock called"); #define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define smp_mb() @@ -101,9 +101,9 @@ __FBSDID("$FreeBSD$"); * (or reinstall the buffer ?) */ #define SET_MBUF_DESTRUCTOR(m, fn) do { \ - (m)->m_ext.ext_free = (void *)fn; \ - (m)->m_ext.ext_type = EXT_EXTREF; \ - } while (0) + (m)->m_ext.ext_free = (void *)fn; \ + (m)->m_ext.ext_type = EXT_EXTREF; \ +} while (0) #define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1) @@ -137,43 +137,43 @@ __FBSDID("$FreeBSD$"); #ifdef RATE #define IFRATE(x) x struct rate_stats { - unsigned long txpkt; - unsigned long txsync; - unsigned long txirq; - unsigned long rxpkt; - unsigned long rxirq; - unsigned long rxsync; + unsigned long txpkt; + unsigned long txsync; + unsigned long txirq; + unsigned long rxpkt; + unsigned long rxirq; + unsigned long rxsync; }; struct rate_context { - unsigned refcount; - struct timer_list timer; - struct rate_stats new; - struct rate_stats old; + unsigned refcount; + struct timer_list timer; + struct rate_stats new; + struct rate_stats old; }; #define RATE_PRINTK(_NAME_) \ - printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD); + printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD); #define RATE_PERIOD 2 static void rate_callback(unsigned long arg) { - struct rate_context * ctx = (struct rate_context *)arg; - struct rate_stats cur = ctx->new; - int r; - - RATE_PRINTK(txpkt); - RATE_PRINTK(txsync); - RATE_PRINTK(txirq); - RATE_PRINTK(rxpkt); - RATE_PRINTK(rxsync); - RATE_PRINTK(rxirq); - printk("\n"); - - ctx->old = cur; - r = mod_timer(&ctx->timer, jiffies + - msecs_to_jiffies(RATE_PERIOD * 1000)); - if (unlikely(r)) - D("[v1000] Error: mod_timer()"); + struct rate_context * ctx = (struct rate_context *)arg; + struct rate_stats cur = ctx->new; + int r; + + RATE_PRINTK(txpkt); + RATE_PRINTK(txsync); + RATE_PRINTK(txirq); + RATE_PRINTK(rxpkt); + RATE_PRINTK(rxsync); + RATE_PRINTK(rxirq); + printk("\n"); + + ctx->old = cur; + r = mod_timer(&ctx->timer, jiffies + + msecs_to_jiffies(RATE_PERIOD * 1000)); + if (unlikely(r)) + D("[v1000] Error: mod_timer()"); } static struct rate_context rate_ctx; @@ -197,150 +197,150 @@ netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP))) return; - netmap_common_irq(ifp, q, work_done); + netmap_common_irq(ifp, q, work_done); } /* Enable/disable netmap mode for a generic network interface. 
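For reference, the RATE instrumentation above simply snapshots the counters every RATE_PERIOD seconds and prints the per-second deltas. A trivial userspace rendition of that arithmetic, with hypothetical counter values, is:

#include <stdio.h>

#define RATE_PERIOD 2   /* seconds, as in the driver */

struct rate_snapshot {
	unsigned long txpkt, txsync, rxpkt, rxsync;
};

static void
print_rates(const struct rate_snapshot *old, const struct rate_snapshot *cur)
{
	printf("txpkt  = %lu Hz\n", (cur->txpkt  - old->txpkt)  / RATE_PERIOD);
	printf("txsync = %lu Hz\n", (cur->txsync - old->txsync) / RATE_PERIOD);
	printf("rxpkt  = %lu Hz\n", (cur->rxpkt  - old->rxpkt)  / RATE_PERIOD);
	printf("rxsync = %lu Hz\n", (cur->rxsync - old->rxsync) / RATE_PERIOD);
}

int
main(void)
{
	struct rate_snapshot old = { 0, 0, 0, 0 };
	struct rate_snapshot cur = { 29760000, 20000, 29760000, 20000 };

	print_rates(&old, &cur);   /* ~14.88 Mpps per direction over 2 s */
	old = cur;                 /* keep the snapshot for the next period */
	return 0;
}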
*/ -int generic_netmap_register(struct netmap_adapter *na, int enable) +static int +generic_netmap_register(struct netmap_adapter *na, int enable) { - struct ifnet *ifp = na->ifp; - struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; - struct mbuf *m; - int error; - int i, r; + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct mbuf *m; + int error; + int i, r; - if (!na) - return EINVAL; + if (!na) + return EINVAL; #ifdef REG_RESET - error = ifp->netdev_ops->ndo_stop(ifp); - if (error) { - return error; - } + error = ifp->netdev_ops->ndo_stop(ifp); + if (error) { + return error; + } #endif /* REG_RESET */ - if (enable) { /* Enable netmap mode. */ - /* Initialize the rx queue, as generic_rx_handler() can - * be called as soon as netmap_catch_rx() returns. - */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_init(&na->rx_rings[r].rx_queue); - na->rx_rings[r].nr_ntc = 0; - } - - /* Init the mitigation timer. */ - netmap_mitigation_init(gna); - - /* - * Preallocate packet buffers for the tx rings. - */ - for (r=0; r<na->num_tx_rings; r++) { - na->tx_rings[r].nr_ntc = 0; - na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), - M_DEVBUF, M_NOWAIT | M_ZERO); - if (!na->tx_rings[r].tx_pool) { - D("tx_pool allocation failed"); - error = ENOMEM; - goto free_tx_pool; - } - for (i=0; i<na->num_tx_desc; i++) { - m = netmap_get_mbuf(GENERIC_BUF_SIZE); - if (!m) { - D("tx_pool[%d] allocation failed", i); - error = ENOMEM; - goto free_mbufs; - } - na->tx_rings[r].tx_pool[i] = m; - } - } - rtnl_lock(); - /* Prepare to intercept incoming traffic. */ - error = netmap_catch_rx(na, 1); - if (error) { - D("netdev_rx_handler_register() failed"); - goto register_handler; - } - ifp->if_capenable |= IFCAP_NETMAP; - - /* Make netmap control the packet steering. */ - netmap_catch_packet_steering(gna, 1); - - rtnl_unlock(); + if (enable) { /* Enable netmap mode. */ + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as netmap_catch_rx() returns. + */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_init(&na->rx_rings[r].rx_queue); + } + + /* Init the mitigation timer. */ + netmap_mitigation_init(gna); + + /* + * Preallocate packet buffers for the tx rings. + */ + for (r=0; r<na->num_tx_rings; r++) + na->tx_rings[r].tx_pool = NULL; + for (r=0; r<na->num_tx_rings; r++) { + na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!na->tx_rings[r].tx_pool) { + D("tx_pool allocation failed"); + error = ENOMEM; + goto free_tx_pools; + } + for (i=0; i<na->num_tx_desc; i++) + na->tx_rings[r].tx_pool[i] = NULL; + for (i=0; i<na->num_tx_desc; i++) { + m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (!m) { + D("tx_pool[%d] allocation failed", i); + error = ENOMEM; + goto free_tx_pools; + } + na->tx_rings[r].tx_pool[i] = m; + } + } + rtnl_lock(); + /* Prepare to intercept incoming traffic. */ + error = netmap_catch_rx(na, 1); + if (error) { + D("netdev_rx_handler_register() failed"); + goto register_handler; + } + ifp->if_capenable |= IFCAP_NETMAP; + + /* Make netmap control the packet steering. 
*/ + netmap_catch_tx(gna, 1); + + rtnl_unlock(); #ifdef RATE - if (rate_ctx.refcount == 0) { - D("setup_timer()"); - memset(&rate_ctx, 0, sizeof(rate_ctx)); - setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx); - if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) { - D("Error: mod_timer()"); - } - } - rate_ctx.refcount++; + if (rate_ctx.refcount == 0) { + D("setup_timer()"); + memset(&rate_ctx, 0, sizeof(rate_ctx)); + setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx); + if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) { + D("Error: mod_timer()"); + } + } + rate_ctx.refcount++; #endif /* RATE */ - } else { /* Disable netmap mode. */ - rtnl_lock(); + } else { /* Disable netmap mode. */ + rtnl_lock(); - ifp->if_capenable &= ~IFCAP_NETMAP; + ifp->if_capenable &= ~IFCAP_NETMAP; - /* Release packet steering control. */ - netmap_catch_packet_steering(gna, 0); + /* Release packet steering control. */ + netmap_catch_tx(gna, 0); - /* Do not intercept packets on the rx path. */ - netmap_catch_rx(na, 0); + /* Do not intercept packets on the rx path. */ + netmap_catch_rx(na, 0); - rtnl_unlock(); + rtnl_unlock(); - /* Free the mbufs going to the netmap rings */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_purge(&na->rx_rings[r].rx_queue); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); - } + /* Free the mbufs going to the netmap rings */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_purge(&na->rx_rings[r].rx_queue); + mbq_safe_destroy(&na->rx_rings[r].rx_queue); + } - netmap_mitigation_cleanup(gna); + netmap_mitigation_cleanup(gna); - for (r=0; r<na->num_tx_rings; r++) { - for (i=0; i<na->num_tx_desc; i++) { - m_freem(na->tx_rings[r].tx_pool[i]); - } - free(na->tx_rings[r].tx_pool, M_DEVBUF); - } + for (r=0; r<na->num_tx_rings; r++) { + for (i=0; i<na->num_tx_desc; i++) { + m_freem(na->tx_rings[r].tx_pool[i]); + } + free(na->tx_rings[r].tx_pool, M_DEVBUF); + } #ifdef RATE - if (--rate_ctx.refcount == 0) { - D("del_timer()"); - del_timer(&rate_ctx.timer); - } + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } #endif - } + } #ifdef REG_RESET - error = ifp->netdev_ops->ndo_open(ifp); - if (error) { - goto alloc_tx_pool; - } + error = ifp->netdev_ops->ndo_open(ifp); + if (error) { + goto alloc_tx_pool; + } #endif - return 0; + return 0; register_handler: - rtnl_unlock(); -free_tx_pool: - r--; - i = na->num_tx_desc; /* Useless, but just to stay safe. 
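The rewritten enable path zeroes every tx_pool pointer (and every slot inside each pool) before allocating, so a single free_tx_pools label can later release exactly what was obtained. A standalone sketch of that allocate-then-unified-cleanup pattern, simplified to a fixed-size array and plain malloc()/free() instead of netmap_get_mbuf(), is shown here:

#include <stdio.h>
#include <stdlib.h>

#define NRINGS 4
#define NDESC  8

int
main(void)
{
	void *pool[NRINGS][NDESC];
	int r, i, failed = 0;

	for (r = 0; r < NRINGS; r++)              /* zero everything first */
		for (i = 0; i < NDESC; i++)
			pool[r][i] = NULL;

	for (r = 0; r < NRINGS && !failed; r++) {
		for (i = 0; i < NDESC; i++) {
			pool[r][i] = malloc(64);  /* stands in for netmap_get_mbuf() */
			if (pool[r][i] == NULL) {
				failed = 1;       /* goto free_tx_pools in the driver */
				break;
			}
		}
	}

	/* unified cleanup: freeing NULL entries is harmless */
	for (r = 0; r < NRINGS; r++)
		for (i = 0; i < NDESC; i++)
			free(pool[r][i]);

	printf("%s\n", failed ? "allocation failed, cleaned up" : "all pools allocated");
	return failed;
}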
*/ -free_mbufs: - i--; - for (; r>=0; r--) { - for (; i>=0; i--) { - m_freem(na->tx_rings[r].tx_pool[i]); - } - free(na->tx_rings[r].tx_pool, M_DEVBUF); - i = na->num_tx_desc - 1; - } - - return error; + rtnl_unlock(); +free_tx_pools: + for (r=0; r<na->num_tx_rings; r++) { + if (na->tx_rings[r].tx_pool == NULL) + continue; + for (i=0; i<na->num_tx_desc; i++) + if (na->tx_rings[r].tx_pool[i]) + m_freem(na->tx_rings[r].tx_pool[i]); + free(na->tx_rings[r].tx_pool, M_DEVBUF); + } + + return error; } /* @@ -351,93 +351,88 @@ free_mbufs: static void generic_mbuf_destructor(struct mbuf *m) { - if (netmap_verbose) - D("Tx irq (%p) queue %d", m, MBUF_TXQ(m)); - netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); + if (netmap_verbose) + D("Tx irq (%p) queue %d", m, MBUF_TXQ(m)); + netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); #ifdef __FreeBSD__ - m->m_ext.ext_type = EXT_PACKET; - m->m_ext.ext_free = NULL; - if (*(m->m_ext.ref_cnt) == 0) - *(m->m_ext.ref_cnt) = 1; - uma_zfree(zone_pack, m); + m->m_ext.ext_type = EXT_PACKET; + m->m_ext.ext_free = NULL; + if (*(m->m_ext.ref_cnt) == 0) + *(m->m_ext.ref_cnt) = 1; + uma_zfree(zone_pack, m); #endif /* __FreeBSD__ */ - IFRATE(rate_ctx.new.txirq++); + IFRATE(rate_ctx.new.txirq++); } -/* Record completed transmissions and update hwavail. +/* Record completed transmissions and update hwtail. * - * nr_ntc is the oldest tx buffer not yet completed - * (same as nr_hwavail + nr_hwcur + 1), + * The oldest tx buffer not yet completed is at nr_hwtail + 1, * nr_hwcur is the first unsent buffer. - * When cleaning, we try to recover buffers between nr_ntc and nr_hwcur. */ -static int +static u_int generic_netmap_tx_clean(struct netmap_kring *kring) { - u_int num_slots = kring->nkr_num_slots; - u_int ntc = kring->nr_ntc; - u_int hwcur = kring->nr_hwcur; - u_int n = 0; - struct mbuf **tx_pool = kring->tx_pool; - - while (ntc != hwcur) { /* buffers not completed */ - struct mbuf *m = tx_pool[ntc]; - - if (unlikely(m == NULL)) { - /* try to replenish the entry */ - tx_pool[ntc] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); - if (unlikely(m == NULL)) { - D("mbuf allocation failed, XXX error"); - // XXX how do we proceed ? break ? - return -ENOMEM; - } - } else if (GET_MBUF_REFCNT(m) != 1) { - break; /* This mbuf is still busy: its refcnt is 2. */ + u_int const lim = kring->nkr_num_slots - 1; + u_int nm_i = nm_next(kring->nr_hwtail, lim); + u_int hwcur = kring->nr_hwcur; + u_int n = 0; + struct mbuf **tx_pool = kring->tx_pool; + + while (nm_i != hwcur) { /* buffers not completed */ + struct mbuf *m = tx_pool[nm_i]; + + if (unlikely(m == NULL)) { + /* this is done, try to replenish the entry */ + tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed, XXX error"); + // XXX how do we proceed ? break ? + return -ENOMEM; + } + } else if (GET_MBUF_REFCNT(m) != 1) { + break; /* This mbuf is still busy: its refcnt is 2. */ + } + n++; + nm_i = nm_next(nm_i, lim); } - if (unlikely(++ntc == num_slots)) { - ntc = 0; - } - n++; - } - kring->nr_ntc = ntc; - kring->nr_hwavail += n; - ND("tx completed [%d] -> hwavail %d", n, kring->nr_hwavail); - - return n; + kring->nr_hwtail = nm_prev(nm_i, lim); + ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail); + + return n; } /* - * We have pending packets in the driver between nr_ntc and j. + * We have pending packets in the driver between nr_hwtail +1 and hwcur. * Compute a position in the middle, to be used to generate * a notification. 
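A userspace model of the reclaim logic in generic_netmap_tx_clean() above: walk from the slot after nr_hwtail towards nr_hwcur, stop at the first mbuf whose reference count shows the driver still owns it, and move hwtail just behind the stopping point. The refcnt array stands in for GET_MBUF_REFCNT() and the mbuf-replenish case is omitted:

#include <stdio.h>
#include <stdint.h>

static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return i == lim ? 0 : i + 1; }
static inline uint32_t nm_prev(uint32_t i, uint32_t lim) { return i == 0 ? lim : i - 1; }

int
main(void)
{
	uint32_t lim = 7;                       /* nkr_num_slots - 1 */
	uint32_t hwtail = 5, hwcur = 3;
	/* refcount of the mbuf in each slot: 1 means the driver is done with it */
	int refcnt[8] = { 2, 2, 2, 1, 1, 1, 1, 1 };
	uint32_t nm_i = nm_next(hwtail, lim);   /* oldest buffer not yet reclaimed */
	uint32_t n = 0;

	while (nm_i != hwcur && refcnt[nm_i] == 1) {
		n++;                            /* completed, slot can be reused */
		nm_i = nm_next(nm_i, lim);
	}
	hwtail = nm_prev(nm_i, lim);
	printf("reclaimed %u slots, new hwtail %u\n", n, hwtail);
	return 0;
}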
*/ static inline u_int generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) { - u_int n = kring->nkr_num_slots; - u_int ntc = kring->nr_ntc; - u_int e; - - if (hwcur >= ntc) { - e = (hwcur + ntc) / 2; - } else { /* wrap around */ - e = (hwcur + n + ntc) / 2; - if (e >= n) { - e -= n; - } - } - - if (unlikely(e >= n)) { - D("This cannot happen"); - e = 0; - } - - return e; + u_int n = kring->nkr_num_slots; + u_int ntc = nm_next(kring->nr_hwtail, n-1); + u_int e; + + if (hwcur >= ntc) { + e = (hwcur + ntc) / 2; + } else { /* wrap around */ + e = (hwcur + n + ntc) / 2; + if (e >= n) { + e -= n; + } + } + + if (unlikely(e >= n)) { + D("This cannot happen"); + e = 0; + } + + return e; } /* - * We have pending packets in the driver between nr_ntc and hwcur. + * We have pending packets in the driver between nr_hwtail+1 and hwcur. * Schedule a notification approximately in the middle of the two. * There is a race but this is only called within txsync which does * a double check. @@ -445,28 +440,28 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) static void generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) { - struct mbuf *m; - u_int e; - - if (kring->nr_ntc == hwcur) { - return; - } - e = generic_tx_event_middle(kring, hwcur); - - m = kring->tx_pool[e]; - if (m == NULL) { - /* This can happen if there is already an event on the netmap - slot 'e': There is nothing to do. */ - return; - } - ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m)); - kring->tx_pool[e] = NULL; - SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); - - // XXX wmb() ? - /* Decrement the refcount an free it if we have the last one. */ - m_freem(m); - smp_mb(); + struct mbuf *m; + u_int e; + + if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) { + return; /* all buffers are free */ + } + e = generic_tx_event_middle(kring, hwcur); + + m = kring->tx_pool[e]; + if (m == NULL) { + /* This can happen if there is already an event on the netmap + slot 'e': There is nothing to do. */ + return; + } + ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m)); + kring->tx_pool[e] = NULL; + SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); + + // XXX wmb() ? + /* Decrement the refcount an free it if we have the last one. */ + m_freem(m); + smp_mb(); } @@ -480,133 +475,108 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) static int generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k, num_slots = kring->nkr_num_slots; - int new_slots, ntx; - - IFRATE(rate_ctx.new.txsync++); - - // TODO: handle the case of mbuf allocation failure - /* first, reclaim completed buffers */ - generic_netmap_tx_clean(kring); - - /* Take a copy of ring->cur now, and never read it again. */ - k = ring->cur; - if (unlikely(k >= num_slots)) { - return netmap_ring_reinit(kring); - } - - rmb(); - j = kring->nr_hwcur; - /* - * 'new_slots' counts how many new slots have been added: - * everything from hwcur to cur, excluding reserved ones, if any. - * nr_hwreserved start from hwcur and counts how many slots were - * not sent to the NIC from the previous round. - */ - new_slots = k - j - kring->nr_hwreserved; - if (new_slots < 0) { - new_slots += num_slots; - } - ntx = 0; - if (j != k) { - /* Process new packets to send: - * j is the current index in the netmap ring. 
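The midpoint computation in generic_tx_event_middle() can be checked in isolation: given the first pending slot (the one after nr_hwtail) and hwcur, it returns a slot roughly halfway between them modulo the ring size. A small self-contained version, for illustration only:

#include <stdio.h>
#include <stdint.h>

static uint32_t
tx_event_middle(uint32_t ntc, uint32_t hwcur, uint32_t n)
{
	uint32_t e;

	if (hwcur >= ntc) {
		e = (hwcur + ntc) / 2;
	} else {                 /* pending region wraps around the ring end */
		e = (hwcur + n + ntc) / 2;
		if (e >= n)
			e -= n;
	}
	return e;
}

int
main(void)
{
	/* no wrap: pending slots 2..9 on a 16-slot ring -> event near slot 6 */
	printf("e = %u\n", tx_event_middle(2, 10, 16));
	/* wrap: pending slots 12..15,0..3 -> event near slot 0 */
	printf("e = %u\n", tx_event_middle(12, 4, 16));
	return 0;
}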
+ struct ifnet *ifp = na->ifp; + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ // j + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + + IFRATE(rate_ctx.new.txsync++); + + // TODO: handle the case of mbuf allocation failure + + rmb(); + + /* + * First part: process new packets to send. */ - while (j != k) { - struct netmap_slot *slot = &ring->slot[j]; /* Current slot in the netmap ring */ - void *addr = NMB(slot); - u_int len = slot->len; - struct mbuf *m; - int tx_ret; - - if (unlikely(addr == netmap_buffer_base || len > NETMAP_BUF_SIZE)) { - return netmap_ring_reinit(kring); - } - /* Tale a mbuf from the tx pool and copy in the user packet. */ - m = kring->tx_pool[j]; - if (unlikely(!m)) { - RD(5, "This should never happen"); - kring->tx_pool[j] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); - if (unlikely(m == NULL)) { - D("mbuf allocation failed"); - break; - } - } - /* XXX we should ask notifications when NS_REPORT is set, - * or roughly every half frame. We can optimize this - * by lazily requesting notifications only when a - * transmission fails. Probably the best way is to - * break on failures and set notifications when - * ring->avail == 0 || j != k - */ - tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); - if (unlikely(tx_ret)) { - RD(5, "start_xmit failed: err %d [%u,%u,%u,%u]", - tx_ret, kring->nr_ntc, j, k, kring->nr_hwavail); - /* - * No room for this mbuf in the device driver. - * Request a notification FOR A PREVIOUS MBUF, - * then call generic_netmap_tx_clean(kring) to do the - * double check and see if we can free more buffers. - * If there is space continue, else break; - * NOTE: the double check is necessary if the problem - * occurs in the txsync call after selrecord(). - * Also, we need some way to tell the caller that not - * all buffers were queued onto the device (this was - * not a problem with native netmap driver where space - * is preallocated). The bridge has a similar problem - * and we solve it there by dropping the excess packets. - */ - generic_set_tx_event(kring, j); - if (generic_netmap_tx_clean(kring)) { /* space now available */ - continue; - } else { - break; - } - } - slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); - if (unlikely(++j == num_slots)) - j = 0; - ntx++; - } - - /* Update hwcur to the next slot to transmit. */ - kring->nr_hwcur = j; - - /* - * Report all new slots as unavailable, even those not sent. - * We account for them with with hwreserved, so that - * nr_hwreserved =:= cur - nr_hwcur + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + while (nm_i != head) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + void *addr = NMB(slot); + + /* device-specific */ + struct mbuf *m; + int tx_ret; + + NM_CHECK_ADDR_LEN(addr, len); + + /* Tale a mbuf from the tx pool and copy in the user packet. */ + m = kring->tx_pool[nm_i]; + if (unlikely(!m)) { + RD(5, "This should never happen"); + kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed"); + break; + } + } + /* XXX we should ask notifications when NS_REPORT is set, + * or roughly every half frame. We can optimize this + * by lazily requesting notifications only when a + * transmission fails. 
Probably the best way is to + * break on failures and set notifications when + * ring->cur == ring->tail || nm_i != cur + */ + tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); + if (unlikely(tx_ret)) { + RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]", + tx_ret, nm_i, head, kring->nr_hwtail); + /* + * No room for this mbuf in the device driver. + * Request a notification FOR A PREVIOUS MBUF, + * then call generic_netmap_tx_clean(kring) to do the + * double check and see if we can free more buffers. + * If there is space continue, else break; + * NOTE: the double check is necessary if the problem + * occurs in the txsync call after selrecord(). + * Also, we need some way to tell the caller that not + * all buffers were queued onto the device (this was + * not a problem with native netmap driver where space + * is preallocated). The bridge has a similar problem + * and we solve it there by dropping the excess packets. + */ + generic_set_tx_event(kring, nm_i); + if (generic_netmap_tx_clean(kring)) { /* space now available */ + continue; + } else { + break; + } + } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + nm_i = nm_next(nm_i, lim); + } + + /* Update hwcur to the next slot to transmit. */ + kring->nr_hwcur = nm_i; /* not head, we could break early */ + + IFRATE(rate_ctx.new.txpkt += ntx); + } + + /* + * Second, reclaim completed buffers */ - kring->nr_hwavail -= new_slots; - kring->nr_hwreserved = k - j; - if (kring->nr_hwreserved < 0) { - kring->nr_hwreserved += num_slots; - } - - IFRATE(rate_ctx.new.txpkt += ntx); - - if (!kring->nr_hwavail) { - /* No more available slots? Set a notification event - * on a netmap slot that will be cleaned in the future. - * No doublecheck is performed, since txsync() will be - * called twice by netmap_poll(). - */ - generic_set_tx_event(kring, j); - } - ND("tx #%d, hwavail = %d", n, kring->nr_hwavail); - } - - /* Synchronize the user's view to the kernel view. */ - ring->avail = kring->nr_hwavail; - ring->reserved = kring->nr_hwreserved; - - return 0; + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + /* No more available slots? Set a notification event + * on a netmap slot that will be cleaned in the future. + * No doublecheck is performed, since txsync() will be + * called twice by netmap_poll(). + */ + generic_set_tx_event(kring, nm_i); + } + ND("tx #%d, hwtail = %d", n, kring->nr_hwtail); + + generic_netmap_tx_clean(kring); + + nm_txsync_finalize(kring); + + return 0; } + /* * This handler is registered (through netmap_catch_rx()) * within the attached network interface @@ -615,38 +585,38 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Stolen packets are put in a queue where the * generic_netmap_rxsync() callback can extract them. 
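Summarizing the restructured txsync above: the first phase walks slots nr_hwcur..head-1, hands each one to the driver and stops early if the driver reports no room (leaving hwcur at the first unsent slot); the second phase reclaims completed buffers and publishes the result through nm_txsync_finalize(). A toy userspace walk of the first phase, with a stub transmit routine standing in for generic_xmit_frame(), is:

#include <stdio.h>
#include <stdint.h>

static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return i == lim ? 0 : i + 1; }

static int
xmit_stub(uint32_t slot)
{
	return slot == 6 ? -1 : 0;      /* pretend the driver queue fills at slot 6 */
}

int
main(void)
{
	uint32_t lim = 7;               /* 8-slot ring */
	uint32_t hwcur = 3, head = 7;   /* user filled slots 3..6 */
	uint32_t nm_i = hwcur;

	while (nm_i != head) {
		if (xmit_stub(nm_i) != 0) {
			/* no room: the driver would set a tx event and retry later */
			printf("stopped at slot %u\n", nm_i);
			break;
		}
		nm_i = nm_next(nm_i, lim);
	}
	hwcur = nm_i;                   /* first unsent slot, possibly != head */
	printf("new hwcur %u (head %u)\n", hwcur, head);
	return 0;
}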
*/ -void generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +void +generic_rx_handler(struct ifnet *ifp, struct mbuf *m) { - struct netmap_adapter *na = NA(ifp); - struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; - u_int work_done; - u_int rr = 0; // receive ring number - - ND("called"); - /* limit the size of the queue */ - if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { - m_freem(m); - } else { - mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); - } - - if (netmap_generic_mit < 32768) { - /* no rx mitigation, pass notification up */ - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); - } else { - /* same as send combining, filter notification if there is a - * pending timer, otherwise pass it up and start a timer. - */ - if (likely(netmap_mitigation_active(gna))) { - /* Record that there is some pending work. */ - gna->mit_pending = 1; - } else { - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); - netmap_mitigation_start(gna); - } - } + struct netmap_adapter *na = NA(ifp); + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + u_int work_done; + u_int rr = 0; // receive ring number + + /* limit the size of the queue */ + if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { + m_freem(m); + } else { + mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); + } + + if (netmap_generic_mit < 32768) { + /* no rx mitigation, pass notification up */ + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + } else { + /* same as send combining, filter notification if there is a + * pending timer, otherwise pass it up and start a timer. + */ + if (likely(netmap_mitigation_active(gna))) { + /* Record that there is some pending work. */ + gna->mit_pending = 1; + } else { + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + netmap_mitigation_start(gna); + } + } } /* @@ -658,105 +628,99 @@ void generic_rx_handler(struct ifnet *ifp, struct mbuf *m) static int generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, n, lim = kring->nkr_num_slots - 1; - int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k, resvd = ring->reserved; - - if (ring->cur > lim) - return netmap_ring_reinit(kring); - - /* Import newly received packets into the netmap ring. */ - if (netmap_no_pendintr || force_update) { - uint16_t slot_flags = kring->nkr_slot_flags; - struct mbuf *m; - - n = 0; - j = kring->nr_ntc; /* first empty slot in the receive ring */ - /* extract buffers from the rx queue, stop at most one - * slot before nr_hwcur (index k) + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ //j, + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + if (head > lim) + return netmap_ring_reinit(kring); + + /* + * First part: import newly received packets. 
+ */ + if (netmap_no_pendintr || force_update) { + /* extract buffers from the rx queue, stop at most one + * slot before nr_hwcur (stop_i) + */ + uint16_t slot_flags = kring->nkr_slot_flags; + u_int stop_i = nm_prev(kring->nr_hwcur, lim); + + nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ + for (n = 0; nm_i != stop_i; n++) { + int len; + void *addr = NMB(&ring->slot[nm_i]); + struct mbuf *m; + + /* we only check the address here on generic rx rings */ + if (addr == netmap_buffer_base) { /* Bad buffer */ + return netmap_ring_reinit(kring); + } + /* + * Call the locked version of the function. + * XXX Ideally we could grab a batch of mbufs at once + * and save some locking overhead. + */ + m = mbq_safe_dequeue(&kring->rx_queue); + if (!m) /* no more data */ + break; + len = MBUF_LEN(m); + m_copydata(m, 0, len, addr); + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; + m_freem(m); + nm_i = nm_next(nm_i, lim); + n++; + } + if (n) { + kring->nr_hwtail = nm_i; + IFRATE(rate_ctx.new.rxpkt += n); + } + kring->nr_kflags &= ~NKR_PENDINTR; + } + + // XXX should we invert the order ? + /* + * Second part: skip past packets that userspace has released. */ - k = (kring->nr_hwcur) ? kring->nr_hwcur-1 : lim; - while (j != k) { - int len; - void *addr = NMB(&ring->slot[j]); - - if (addr == netmap_buffer_base) { /* Bad buffer */ - return netmap_ring_reinit(kring); - } - /* - * Call the locked version of the function. - * XXX Ideally we could grab a batch of mbufs at once, - * by changing rx_queue into a ring. - */ - m = mbq_safe_dequeue(&kring->rx_queue); - if (!m) - break; - len = MBUF_LEN(m); - m_copydata(m, 0, len, addr); - ring->slot[j].len = len; - ring->slot[j].flags = slot_flags; - m_freem(m); - if (unlikely(j++ == lim)) - j = 0; - n++; - } - if (n) { - kring->nr_ntc = j; - kring->nr_hwavail += n; - IFRATE(rate_ctx.new.rxpkt += n); - } - kring->nr_kflags &= ~NKR_PENDINTR; - } - - // XXX should we invert the order ? - /* Skip past packets that userspace has released */ - j = kring->nr_hwcur; - k = ring->cur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { - /* Userspace has released some packets. */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - - slot->flags &= ~NS_BUF_CHANGED; - if (unlikely(j++ == lim)) - j = 0; - } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - /* Tell userspace that there are new packets. */ - ring->avail = kring->nr_hwavail - resvd; - IFRATE(rate_ctx.new.rxsync++); - - return 0; + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* Userspace has released some packets. */ + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + + slot->flags &= ~NS_BUF_CHANGED; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwcur = head; + } + /* tell userspace that there might be new packets. 
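The import loop above copies queued mbufs into ring slots starting at nr_hwtail and deliberately stops one slot before nr_hwcur, so the ring never appears completely full. A compact userspace model of that boundary condition (the mbq is reduced to a counter):

#include <stdio.h>
#include <stdint.h>

static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return i == lim ? 0 : i + 1; }
static inline uint32_t nm_prev(uint32_t i, uint32_t lim) { return i == 0 ? lim : i - 1; }

int
main(void)
{
	uint32_t lim = 7;                         /* 8-slot ring */
	uint32_t hwcur = 2, hwtail = 5;
	uint32_t queued = 10;                     /* mbufs waiting in rx_queue */
	uint32_t stop_i = nm_prev(hwcur, lim);    /* never fill the last slot */
	uint32_t nm_i = hwtail, n = 0;

	while (nm_i != stop_i && queued > 0) {
		queued--;                         /* mbq_safe_dequeue() + m_copydata() */
		n++;
		nm_i = nm_next(nm_i, lim);
	}
	if (n)
		hwtail = nm_i;
	printf("imported %u packets, hwtail %u, %u still queued\n", n, hwtail, queued);
	return 0;
}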
*/ + nm_rxsync_finalize(kring); + IFRATE(rate_ctx.new.rxsync++); + + return 0; } static void generic_netmap_dtor(struct netmap_adapter *na) { - struct ifnet *ifp = na->ifp; - struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; - struct netmap_adapter *prev_na = gna->prev; - - if (prev_na != NULL) { - D("Released generic NA %p", gna); - if_rele(na->ifp); - netmap_adapter_put(prev_na); - } - if (ifp != NULL) { - WNA(ifp) = prev_na; - D("Restored native NA %p", prev_na); - na->ifp = NULL; - } + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; + struct netmap_adapter *prev_na = gna->prev; + + if (prev_na != NULL) { + D("Released generic NA %p", gna); + if_rele(na->ifp); + netmap_adapter_put(prev_na); + } + if (ifp != NULL) { + WNA(ifp) = prev_na; + D("Restored native NA %p", prev_na); + na->ifp = NULL; + } } /* @@ -773,46 +737,46 @@ generic_netmap_dtor(struct netmap_adapter *na) int generic_netmap_attach(struct ifnet *ifp) { - struct netmap_adapter *na; - struct netmap_generic_adapter *gna; - int retval; - u_int num_tx_desc, num_rx_desc; - - num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ - - generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); - ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); - - gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); - if (gna == NULL) { - D("no memory on attach, give up"); - return ENOMEM; - } - na = (struct netmap_adapter *)gna; - na->ifp = ifp; - na->num_tx_desc = num_tx_desc; - na->num_rx_desc = num_rx_desc; - na->nm_register = &generic_netmap_register; - na->nm_txsync = &generic_netmap_txsync; - na->nm_rxsync = &generic_netmap_rxsync; - na->nm_dtor = &generic_netmap_dtor; - /* when using generic, IFCAP_NETMAP is set so we force - * NAF_SKIP_INTR to use the regular interrupt handler - */ - na->na_flags = NAF_SKIP_INTR; - - ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", - ifp->num_tx_queues, ifp->real_num_tx_queues, - ifp->tx_queue_len); - ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", - ifp->num_rx_queues, ifp->real_num_rx_queues); - - generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); - - retval = netmap_attach_common(na); - if (retval) { - free(gna, M_DEVBUF); - } - - return retval; + struct netmap_adapter *na; + struct netmap_generic_adapter *gna; + int retval; + u_int num_tx_desc, num_rx_desc; + + num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ + + generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); + ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); + + gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (gna == NULL) { + D("no memory on attach, give up"); + return ENOMEM; + } + na = (struct netmap_adapter *)gna; + na->ifp = ifp; + na->num_tx_desc = num_tx_desc; + na->num_rx_desc = num_rx_desc; + na->nm_register = &generic_netmap_register; + na->nm_txsync = &generic_netmap_txsync; + na->nm_rxsync = &generic_netmap_rxsync; + na->nm_dtor = &generic_netmap_dtor; + /* when using generic, IFCAP_NETMAP is set so we force + * NAF_SKIP_INTR to use the regular interrupt handler + */ + na->na_flags = NAF_SKIP_INTR; + + ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", + ifp->num_tx_queues, ifp->real_num_tx_queues, + ifp->tx_queue_len); + ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", + ifp->num_rx_queues, ifp->real_num_rx_queues); + + generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + + retval = 
netmap_attach_common(na); + if (retval) { + free(gna, M_DEVBUF); + } + + return retval; } diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 9381cd4cedd3..74a46297ff3d 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,6 +1,6 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -53,7 +53,7 @@ #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) #define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) -#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) +#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m) #define NM_ATOMIC_T volatile int // XXX ? /* atomic operations */ @@ -76,7 +76,11 @@ struct hrtimer { #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) #define MBUF_IFP(m) ((m)->dev) -#define NM_SEND_UP(ifp, m) netif_rx(m) +#define NM_SEND_UP(ifp, m) \ + do { \ + m->priority = NM_MAGIC_PRIORITY; \ + netif_rx(m); \ + } while (0) #define NM_ATOMIC_T volatile long unsigned int @@ -125,9 +129,9 @@ struct hrtimer { do { \ struct timeval __xxts; \ microtime(&__xxts); \ - printf("%03d.%06d %s [%d] " format "\n", \ + printf("%03d.%06d [%4d] %-25s " format "\n", \ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* rate limited, lps indicates how many per second */ @@ -158,15 +162,23 @@ extern NMG_LOCK_T netmap_global_lock; * a ring across system calls. * * nr_hwcur index of the next buffer to refill. - * It corresponds to ring->cur - ring->reserved + * It corresponds to ring->head + * at the time the system call returns. * - * nr_hwavail the number of slots "owned" by userspace. - * nr_hwavail =:= ring->avail + ring->reserved + * nr_hwtail index of the first buffer owned by the kernel. + * On RX, hwcur->hwtail are receive buffers + * not yet released. hwcur is advanced following + * ring->head, hwtail is advanced on incoming packets, + * and a wakeup is generated when hwtail passes ring->cur + * On TX, hwcur->rcur have been filled by the sender + * but not sent yet to the NIC; rcur->hwtail are available + * for new transmissions, and hwtail->hwcur-1 are pending + * transmissions not yet acknowledged. * * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots. * This is so that, on a reset, buffers owned by userspace are not * modified by the kernel. In particular: - * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with + * RX rings: the next empty buffer (hwtail + hwofs) coincides with * the next empty buffer as known by the hardware (next_to_check or so). * TX rings: hwcur + hwofs coincides with next_to_send * @@ -184,44 +196,76 @@ extern NMG_LOCK_T netmap_global_lock; * from nr_hwlease, advances it, then does the * copy outside the lock. 
* In RX rings (used for VALE ports), - * nkr_hwcur + nkr_hwavail <= nkr_hwlease < nkr_hwcur+N-1 + * nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1 * In TX rings (used for NIC or host stack ports) - * nkr_hwcur <= nkr_hwlease < nkr_hwcur+ nkr_hwavail + * nkr_hwcur <= nkr_hwlease < nkr_hwtail * nkr_leases array of nkr_num_slots where writers can report * completion of their block. NR_NOSLOT (~0) indicates * that the writer has not finished yet * nkr_lease_idx index of next free slot in nr_leases, to be assigned * * The kring is manipulated by txsync/rxsync and generic netmap function. - * q_lock is used to arbitrate access to the kring from within the netmap - * code, and this and other protections guarantee that there is never - * more than 1 concurrent call to txsync or rxsync. So we are free - * to manipulate the kring from within txsync/rxsync without any extra - * locks. + * + * Concurrent rxsync or txsync on the same ring are prevented through + * by nm_kr_lock() which in turn uses nr_busy. This is all we need + * for NIC rings, and for TX rings attached to the host stack. + * + * RX rings attached to the host stack use an mbq (rx_queue) on both + * rxsync_from_host() and netmap_transmit(). The mbq is protected + * by its internal lock. + * + * RX rings attached to the VALE switch are accessed by both sender + * and receiver. They are protected through the q_lock on the RX ring. */ struct netmap_kring { - struct netmap_ring *ring; - uint32_t nr_hwcur; - uint32_t nr_hwavail; - uint32_t nr_kflags; /* private driver flags */ - int32_t nr_hwreserved; -#define NKR_PENDINTR 0x1 // Pending interrupt. - uint32_t nkr_num_slots; - int32_t nkr_hwofs; /* offset between NIC and netmap ring */ + struct netmap_ring *ring; + + uint32_t nr_hwcur; + uint32_t nr_hwtail; + + /* + * Copies of values in user rings, so we do not need to look + * at the ring (which could be modified). These are set in the + * *sync_prologue()/finalize() routines. + */ + uint32_t rhead; + uint32_t rcur; + uint32_t rtail; + + uint32_t nr_kflags; /* private driver flags */ +#define NKR_PENDINTR 0x1 // Pending interrupt. + uint32_t nkr_num_slots; + + /* + * On a NIC reset, the NIC ring indexes may be reset but the + * indexes in the netmap rings remain the same. nkr_hwofs + * keeps track of the offset between the two. + */ + int32_t nkr_hwofs; uint16_t nkr_slot_flags; /* initial value for flags */ + + /* last_reclaim is opaque marker to help reduce the frequency + * of operations such as reclaiming tx buffers. A possible use + * is set it to ticks and do the reclaim only once per tick. + */ + uint64_t last_reclaim; + + + NM_SELINFO_T si; /* poll/select wait queue */ + NM_LOCK_T q_lock; /* protects kring and ring. */ + NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ + struct netmap_adapter *na; - struct nm_bdg_fwd *nkr_ft; - uint32_t *nkr_leases; -#define NR_NOSLOT ((uint32_t)~0) - uint32_t nkr_hwlease; - uint32_t nkr_lease_idx; - NM_SELINFO_T si; /* poll/select wait queue */ - NM_LOCK_T q_lock; /* protects kring and ring. */ - NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ + /* The folloiwing fields are for VALE switch support */ + struct nm_bdg_fwd *nkr_ft; + uint32_t *nkr_leases; +#define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */ + uint32_t nkr_hwlease; + uint32_t nkr_lease_idx; - volatile int nkr_stopped; + volatile int nkr_stopped; // XXX what for ? /* support for adapters without native netmap support. 
* On tx rings we preallocate an array of tx buffers @@ -230,8 +274,11 @@ struct netmap_kring { * XXX who writes to the rx queue ? */ struct mbuf **tx_pool; - u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ - struct mbq rx_queue; /* A queue for intercepted rx mbufs. */ + // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ + struct mbq rx_queue; /* intercepted rx mbufs. */ + + uint32_t ring_id; /* debugging */ + char name[64]; /* diagnostic */ } __attribute__((__aligned__(64))); @@ -243,6 +290,15 @@ nm_next(uint32_t i, uint32_t lim) return unlikely (i == lim) ? 0 : i + 1; } + +/* return the previous index, with wraparound */ +static inline uint32_t +nm_prev(uint32_t i, uint32_t lim) +{ + return unlikely (i == 0) ? lim : i - 1; +} + + /* * * Here is the layout for the Rx and Tx rings. @@ -253,36 +309,36 @@ nm_next(uint32_t i, uint32_t lim) | | | | |XXX free slot XXX| |XXX free slot XXX| +-----------------+ +-----------------+ - | |<-hwcur | |<-hwcur - | reserved h | | (ready | - +----------- w -+ | to be | - cur->| a | | sent) h | - | v | +---------- w | - | a | cur->| (being a | - | i | | prepared) v | - | avail l | | a | - +-----------------+ + a ------ i + - | | ... | v l |<-hwlease - | (being | ... | a | ... - | prepared) | ... | i | ... - +-----------------+ ... | l | ... - | |<-hwlease +-----------------+ +head->| owned by user |<-hwcur | not sent to nic |<-hwcur + | | | yet | + +-----------------+ | | + cur->| available to | | | + | user, not read | +-----------------+ + | yet | cur->| (being | + | | | prepared) | | | | | + +-----------------+ + ------ + +tail->| |<-hwtail | |<-hwlease + | (being | ... | | ... + | prepared) | ... | | ... + +-----------------+ ... | | ... + | |<-hwlease +-----------------+ + | | tail->| |<-hwtail | | | | | | | | | | | | +-----------------+ +-----------------+ - * The cur/avail (user view) and hwcur/hwavail (kernel view) + * The cur/tail (user view) and hwcur/hwtail (kernel view) * are used in the normal operation of the card. * * When a ring is the output of a switch port (Rx ring for * a VALE port, Tx ring for the host stack or NIC), slots * are reserved in blocks through 'hwlease' which points * to the next unused slot. - * On an Rx ring, hwlease is always after hwavail, - * and completions cause avail to advance. - * On a Tx ring, hwlease is always between cur and hwavail, + * On an Rx ring, hwlease is always after hwtail, + * and completions cause hwtail to advance. + * On a Tx ring, hwlease is always between cur and hwtail, * and completions cause cur to advance. * * nm_kr_space() returns the maximum number of slots that @@ -294,7 +350,6 @@ nm_next(uint32_t i, uint32_t lim) - enum txrx { NR_RX = 0, NR_TX = 1 }; /* @@ -349,6 +404,7 @@ struct netmap_adapter { */ struct netmap_kring *tx_rings; /* array of TX rings. */ struct netmap_kring *rx_rings; /* array of RX rings. */ + void *tailroom; /* space below the rings array */ /* (used for leases) */ @@ -360,11 +416,38 @@ struct netmap_adapter { */ int (*if_transmit)(struct ifnet *, struct mbuf *); + /* copy of if_input for netmap_send_up() */ + void (*if_input)(struct ifnet *, struct mbuf *); + /* references to the ifnet and device routines, used by * the generic netmap functions. */ struct ifnet *ifp; /* adapter is ifp->if_softc */ + /*---- callbacks for this netmap adapter -----*/ + /* + * nm_dtor() is the cleanup routine called when destroying + * the adapter. 
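Given the new head/cur/tail layout documented above, a userspace receiver owns the slots between head and tail, advances head (and cur) as it consumes them, and the kernel moves tail on the next rxsync. A minimal sketch of such a consumer loop, using a simplified stand-in for struct netmap_ring:

#include <stdio.h>
#include <stdint.h>

static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return i == lim ? 0 : i + 1; }

struct toy_ring {
	uint32_t head, cur, tail;     /* user writes head/cur, kernel writes tail */
	uint32_t num_slots;
};

int
main(void)
{
	struct toy_ring r = { .head = 2, .cur = 2, .tail = 6, .num_slots = 8 };
	uint32_t lim = r.num_slots - 1;

	while (r.head != r.tail) {            /* slots 2..5 are owned by the user */
		printf("process slot %u\n", r.head);
		r.head = nm_next(r.head, lim);
	}
	r.cur = r.head;                        /* release everything we consumed */
	printf("released up to %u, wait for tail to move past it\n", r.head);
	return 0;
}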
+ * + * nm_register() is called on NIOCREGIF and close() to enter + * or exit netmap mode on the NIC + * + * nm_txsync() pushes packets to the underlying hw/switch + * + * nm_rxsync() collects packets from the underlying hw/switch + * + * nm_config() returns configuration information from the OS + * + * nm_krings_create() XXX + * + * nm_krings_delete() XXX + * + * nm_notify() is used to act after data have become available. + * For hw devices this is typically a selwakeup(), + * but for NIC/host ports attached to a switch (or vice-versa) + * we also need to invoke the 'txsync' code downstream. + */ + /* private cleanup */ void (*nm_dtor)(struct netmap_adapter *); @@ -403,6 +486,7 @@ struct netmap_adapter { void *na_private; }; + /* * If the NIC is owned by the kernel * (i.e., bridge), neither another bridge nor user can use it; @@ -433,13 +517,15 @@ struct netmap_vp_adapter { /* VALE software port */ u_int offset; /* Offset of ethernet header for each packet. */ }; + struct netmap_hw_adapter { /* physical device */ struct netmap_adapter up; struct net_device_ops nm_ndo; // XXX linux only }; -struct netmap_generic_adapter { /* non-native device */ + +struct netmap_generic_adapter { /* emulated device */ struct netmap_hw_adapter up; /* Pointer to a previously used netmap adapter. */ @@ -455,16 +541,20 @@ struct netmap_generic_adapter { /* non-native device */ struct hrtimer mit_timer; int mit_pending; +#ifdef linux + netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); +#endif }; #ifdef WITH_VALE -/* bridge wrapper for non VALE ports. It is used to connect real devices to the bridge. +/* + * Bridge wrapper for non VALE ports attached to a VALE switch. * - * The real device must already have its own netmap adapter (hwna). The - * bridge wrapper and the hwna adapter share the same set of netmap rings and - * buffers, but they have two separate sets of krings descriptors, with tx/rx - * meanings swapped: + * The real device must already have its own netmap adapter (hwna). + * The bridge wrapper and the hwna adapter share the same set of + * netmap rings and buffers, but they have two separate sets of + * krings descriptors, with tx/rx meanings swapped: * * netmap * bwrap krings rings krings hwna @@ -478,23 +568,28 @@ struct netmap_generic_adapter { /* non-native device */ * | | +------+ +-----+ +------+ | | * +------+ +------+ * - * - packets coming from the bridge go to the brwap rx rings, which are also the - * hwna tx rings. The bwrap notify callback will then complete the hwna tx - * (see netmap_bwrap_notify). - * - packets coming from the outside go to the hwna rx rings, which are also the - * bwrap tx rings. The (overwritten) hwna notify method will then complete - * the bridge tx (see netmap_bwrap_intr_notify). + * - packets coming from the bridge go to the brwap rx rings, + * which are also the hwna tx rings. The bwrap notify callback + * will then complete the hwna tx (see netmap_bwrap_notify). * - * The bridge wrapper may optionally connect the hwna 'host' rings to the - * bridge. This is done by using a second port in the bridge and connecting it - * to the 'host' netmap_vp_adapter contained in the netmap_bwrap_adapter. - * The brwap host adapter cross-links the hwna host rings in the same way as shown above. + * - packets coming from the outside go to the hwna rx rings, + * which are also the bwrap tx rings. The (overwritten) hwna + * notify method will then complete the bridge tx + * (see netmap_bwrap_intr_notify). 
* - * - packets coming from the bridge and directed to host stack are handled by the - * bwrap host notify callback (see netmap_bwrap_host_notify) - * - packets coming from the host stack are still handled by the overwritten - * hwna notify callback (netmap_bwrap_intr_notify), but are diverted to the - * host adapter depending on the ring number. + * The bridge wrapper may optionally connect the hwna 'host' rings + * to the bridge. This is done by using a second port in the + * bridge and connecting it to the 'host' netmap_vp_adapter + * contained in the netmap_bwrap_adapter. The brwap host adapter + * cross-links the hwna host rings in the same way as shown above. + * + * - packets coming from the bridge and directed to the host stack + * are handled by the bwrap host notify callback + * (see netmap_bwrap_host_notify) + * + * - packets coming from the host stack are still handled by the + * overwritten hwna notify callback (netmap_bwrap_intr_notify), + * but are diverted to the host adapter depending on the ring number. * */ struct netmap_bwrap_adapter { @@ -505,103 +600,39 @@ struct netmap_bwrap_adapter { /* backup of the hwna notify callback */ int (*save_notify)(struct netmap_adapter *, u_int ring, enum txrx, int flags); - /* When we attach a physical interface to the bridge, we + + /* + * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need * a place to store the netmap_priv_d data structure. - * This is only done when physical interfaces are attached to a bridge. + * This is only done when physical interfaces + * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; }; -/* - * Available space in the ring. Only used in VALE code - */ -static inline uint32_t -nm_kr_space(struct netmap_kring *k, int is_rx) -{ - int space; - - if (is_rx) { - int busy = k->nkr_hwlease - k->nr_hwcur + k->nr_hwreserved; - if (busy < 0) - busy += k->nkr_num_slots; - space = k->nkr_num_slots - 1 - busy; - } else { - space = k->nr_hwcur + k->nr_hwavail - k->nkr_hwlease; - if (space < 0) - space += k->nkr_num_slots; - } -#if 0 - // sanity check - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - busy < 0 || - busy >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif - return space; -} - - +#endif /* WITH_VALE */ -/* make a lease on the kring for N positions. 
return the - * lease index - */ +/* return slots reserved to rx clients; used in drivers */ static inline uint32_t -nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +nm_kr_rxspace(struct netmap_kring *k) { - uint32_t lim = k->nkr_num_slots - 1; - uint32_t lease_idx = k->nkr_lease_idx; - - k->nkr_leases[lease_idx] = NR_NOSLOT; - k->nkr_lease_idx = nm_next(lease_idx, lim); + int space = k->nr_hwtail - k->nr_hwcur; + if (space < 0) + space += k->nkr_num_slots; + ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); - if (n > nm_kr_space(k, is_rx)) { - D("invalid request for %d slots", n); - panic("x"); - } - /* XXX verify that there are n slots */ - k->nkr_hwlease += n; - if (k->nkr_hwlease > lim) - k->nkr_hwlease -= lim + 1; - - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring %s, cur %d avail %d lease %d lease_idx %d lim %d", - k->na->ifp->if_xname, - k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } - return lease_idx; + return space; } -#endif /* WITH_VALE */ -/* return update position */ -static inline uint32_t -nm_kr_rxpos(struct netmap_kring *k) +/* True if no space in the tx ring. only valid after txsync_prologue */ +static inline int +nm_kr_txempty(struct netmap_kring *kring) { - uint32_t pos = k->nr_hwcur + k->nr_hwavail; - if (pos >= k->nkr_num_slots) - pos -= k->nkr_num_slots; -#if 0 - if (pos >= k->nkr_num_slots || - k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif - return pos; + return kring->rcur == kring->nr_hwtail; } @@ -613,11 +644,13 @@ nm_kr_rxpos(struct netmap_kring *k) #define NM_KR_BUSY 1 #define NM_KR_STOPPED 2 + static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } + static __inline int nm_kr_tryget(struct netmap_kring *kr) { /* check a first time without taking the lock @@ -640,7 +673,7 @@ static __inline int nm_kr_tryget(struct netmap_kring *kr) /* - * The following are support routines used by individual drivers to + * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the @@ -666,7 +699,17 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); -/* set/clear native flags. XXX maybe also if_transmit ? 
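The two helpers just added can be exercised on their own: nm_kr_rxspace() counts the slots between nr_hwcur and nr_hwtail with wraparound, and nm_kr_txempty() is true when rcur has caught up with nr_hwtail, i.e. the user has no free tx slots left. A toy check (the struct below only mirrors the relevant kring fields):

#include <stdio.h>
#include <stdint.h>

struct toy_kring {
	uint32_t nr_hwcur, nr_hwtail, rcur, nkr_num_slots;
};

static uint32_t
kr_rxspace(const struct toy_kring *k)
{
	int space = (int)k->nr_hwtail - (int)k->nr_hwcur;

	if (space < 0)
		space += k->nkr_num_slots;
	return (uint32_t)space;
}

static int
kr_txempty(const struct toy_kring *k)
{
	/* no room left for new transmissions */
	return k->rcur == k->nr_hwtail;
}

int
main(void)
{
	struct toy_kring k = { .nr_hwcur = 6, .nr_hwtail = 2, .rcur = 2, .nkr_num_slots = 8 };

	printf("rx slots reserved to the client: %u\n", kr_rxspace(&k)); /* 4 */
	printf("tx ring out of space: %d\n", kr_txempty(&k));            /* 1 */
	return 0;
}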
*/ +/* default functions to handle rx/tx interrupts */ +int netmap_rx_irq(struct ifnet *, u_int, u_int *); +#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) +void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); + +void netmap_disable_all_rings(struct ifnet *); +void netmap_enable_all_rings(struct ifnet *); +void netmap_disable_ring(struct netmap_kring *kr); + + +/* set/clear native flags and if_transmit/netdev_ops */ static inline void nm_set_native_flags(struct netmap_adapter *na) { @@ -685,6 +728,7 @@ nm_set_native_flags(struct netmap_adapter *na) #endif } + static inline void nm_clear_native_flags(struct netmap_adapter *na) { @@ -701,36 +745,58 @@ nm_clear_native_flags(struct netmap_adapter *na) #endif } + /* - * validates parameters in the ring/kring, returns a value for cur, - * and the 'new_slots' value in the argument. - * If any error, returns cur > lim to force a reinit. + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size to force a reinit. */ -u_int nm_txsync_prologue(struct netmap_kring *, u_int *); +uint32_t nm_txsync_prologue(struct netmap_kring *); + /* - * validates parameters in the ring/kring, returns a value for cur, + * validates parameters in the ring/kring, returns a value for head, * and the 'reserved' value in the argument. - * If any error, returns cur > lim to force a reinit. + * If any error, returns ring_size lim to force a reinit. + */ +uint32_t nm_rxsync_prologue(struct netmap_kring *); + + +/* + * update kring and ring at the end of txsync. */ -u_int nm_rxsync_prologue(struct netmap_kring *, u_int *); +static inline void +nm_txsync_finalize(struct netmap_kring *kring) +{ + /* update ring head/tail to what the kernel knows */ + kring->ring->tail = kring->rtail = kring->nr_hwtail; + kring->ring->head = kring->rhead = kring->nr_hwcur; + + /* note, head/rhead/hwcur might be behind cur/rcur + * if no carrier + */ + ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", + kring->name, kring->nr_hwcur, kring->nr_hwtail, + kring->rhead, kring->rcur, kring->rtail); +} + /* - * update kring and ring at the end of txsync + * update kring and ring at the end of rxsync */ static inline void -nm_txsync_finalize(struct netmap_kring *kring, u_int cur) +nm_rxsync_finalize(struct netmap_kring *kring) { - /* recompute hwreserved */ - kring->nr_hwreserved = cur - kring->nr_hwcur; - if (kring->nr_hwreserved < 0) - kring->nr_hwreserved += kring->nkr_num_slots; - - /* update avail and reserved to what the kernel knows */ - kring->ring->avail = kring->nr_hwavail; - kring->ring->reserved = kring->nr_hwreserved; + /* tell userspace that there might be new packets */ + //struct netmap_ring *ring = kring->ring; + ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, + kring->nr_hwtail); + kring->ring->tail = kring->rtail = kring->nr_hwtail; + /* make a copy of the state for next round */ + kring->rhead = kring->ring->head; + kring->rcur = kring->ring->cur; } + /* check/fix address and len in tx rings */ #if 1 /* debug version */ #define NM_CHECK_ADDR_LEN(_a, _l) do { \ @@ -755,6 +821,8 @@ nm_txsync_finalize(struct netmap_kring *kring, u_int cur) int netmap_update_config(struct netmap_adapter *na); int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom); void netmap_krings_delete(struct netmap_adapter *na); +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); + struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, 
struct netmap_adapter *na, @@ -766,10 +834,13 @@ u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); + #ifdef WITH_VALE /* - * The following bridge-related interfaces are used by other kernel modules - * In the version that only supports unicast or broadcast, the lookup + * The following bridge-related functions are used by other + * kernel modules. + * + * VALE only supports unicast or broadcast. The lookup * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports, * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown. * XXX in practice "unknown" might be handled same as broadcast. @@ -799,8 +870,6 @@ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); /* Various prototypes */ int netmap_poll(struct cdev *dev, int events, struct thread *td); - - int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); @@ -811,7 +880,8 @@ int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct t /* netmap_adapter creation/destruction */ #define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie") -#define NM_DEBUG_PUTGET 1 + +// #define NM_DEBUG_PUTGET 1 #ifdef NM_DEBUG_PUTGET @@ -844,12 +914,15 @@ int netmap_adapter_put(struct netmap_adapter *na); #endif /* !NM_DEBUG_PUTGET */ +/* + * module variables + */ extern u_int netmap_buf_size; #define NETMAP_BUF_SIZE netmap_buf_size // XXX remove -extern int netmap_mitigate; +extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; -extern u_int netmap_total_buffers; -extern char *netmap_buffer_base; +extern u_int netmap_total_buffers; // global allocator +extern char *netmap_buffer_base; // global allocator extern int netmap_verbose; // XXX debugging enum { /* verbose flags */ NM_VERB_ON = 1, /* generic verbose */ @@ -908,7 +981,7 @@ extern int netmap_generic_ringsize; #ifdef __FreeBSD__ -/* Callback invoked by the dma machinery after a successfull dmamap_load */ +/* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { @@ -1053,31 +1126,27 @@ BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot) lut[0].vaddr : lut[i].vaddr; } -/* default functions to handle rx/tx interrupts */ -int netmap_rx_irq(struct ifnet *, u_int, u_int *); -#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) -void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); void netmap_txsync_to_host(struct netmap_adapter *na); -void netmap_disable_all_rings(struct ifnet *); -void netmap_enable_all_rings(struct ifnet *); -void netmap_disable_ring(struct netmap_kring *kr); -/* Structure associated to each thread which registered an interface. +/* + * Structure associated to each thread which registered an interface. * * The first 4 fields of this structure are written by NIOCREGIF and * read by poll() and NIOC?XSYNC. - * There is low contention among writers (actually, a correct user program - * should have no contention among writers) and among writers and readers, - * so we use a single global lock to protect the structure initialization. - * Since initialization involves the allocation of memory, we reuse the memory - * allocator lock. 
+ * + * There is low contention among writers (a correct user program + * should have none) and among writers and readers, so we use a + * single global lock to protect the structure initialization; + * since initialization involves the allocation of memory, + * we reuse the memory allocator lock. + * * Read access to the structure is lock free. Readers must check that * np_nifp is not NULL before using the other fields. - * If np_nifp is NULL initialization has not been performed, so they should - * return an error to userlevel. + * If np_nifp is NULL initialization has not been performed, + * so they should return an error to userspace. * * The ref_done field is used to regulate access to the refcount in the * memory allocator. The refcount must be incremented at most once for @@ -1091,38 +1160,29 @@ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; + int np_ringid; /* from the ioctl */ + u_int np_qfirst, np_qlast; /* range of rings to scan */ + uint16_t np_txpoll; struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ /* np_refcount is only used on FreeBSD */ - int np_refcount; /* use with NMG_LOCK held */ + int np_refcount; /* use with NMG_LOCK held */ }; /* * generic netmap emulation for devices that do not have * native netmap support. - * XXX generic_netmap_register() is only exported to implement - * nma_is_generic(). */ -int generic_netmap_register(struct netmap_adapter *na, int enable); int generic_netmap_attach(struct ifnet *ifp); int netmap_catch_rx(struct netmap_adapter *na, int intercept); void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; -void netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable); +void netmap_catch_tx(struct netmap_generic_adapter *na, int enable); int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr); int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); -static __inline int -nma_is_generic(struct netmap_adapter *na) -{ - return na->nm_register == generic_netmap_register; -} - /* * netmap_mitigation API. This is used by the generic adapter * to reduce the number of interrupt requests/selwakeup @@ -1134,6 +1194,4 @@ void netmap_mitigation_restart(struct netmap_generic_adapter *na); int netmap_mitigation_active(struct netmap_generic_adapter *na); void netmap_mitigation_cleanup(struct netmap_generic_adapter *na); -// int generic_timer_handler(struct hrtimer *t); - #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c index c8e581b69fe5..2606b13d48dc 100644 --- a/sys/dev/netmap/netmap_mbq.c +++ b/sys/dev/netmap/netmap_mbq.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. 
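With the nm_txsync_prologue()/nm_txsync_finalize() pair declared above, a driver txsync routine takes a fairly fixed shape. The sketch below is not part of the patch; the hw_*() helpers are hypothetical stand-ins for the device-specific descriptor work, while everything else uses fields and functions from netmap_kern.h as changed above.

static int
example_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->tx_rings[ring_nr];
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = nm_txsync_prologue(kring);	/* validate, get head */
	u_int nm_i;

	if (head > lim)
		return netmap_ring_reinit(kring);

	/* first part: push slots [nr_hwcur .. head-1] to the NIC */
	for (nm_i = kring->nr_hwcur; nm_i != head; nm_i = nm_next(nm_i, lim))
		hw_program_slot(na, ring_nr, nm_i);		/* hypothetical */
	kring->nr_hwcur = head;

	/* second part: reclaim completed buffers, moving nr_hwtail forward */
	kring->nr_hwtail = hw_reclaim_completed(na, ring_nr);	/* hypothetical */

	nm_txsync_finalize(kring);	/* publish hwcur/hwtail to the user ring */
	return 0;
}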
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -47,17 +47,20 @@ static inline void __mbq_init(struct mbq *q) q->count = 0; } + void mbq_safe_init(struct mbq *q) { mtx_init(&q->lock, "mbq", NULL, MTX_SPIN); __mbq_init(q); } + void mbq_init(struct mbq *q) { __mbq_init(q); } + static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m) { m->m_nextpkt = NULL; @@ -70,6 +73,7 @@ static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m) q->count++; } + void mbq_safe_enqueue(struct mbq *q, struct mbuf *m) { mtx_lock(&q->lock); @@ -77,11 +81,13 @@ void mbq_safe_enqueue(struct mbq *q, struct mbuf *m) mtx_unlock(&q->lock); } + void mbq_enqueue(struct mbq *q, struct mbuf *m) { __mbq_enqueue(q, m); } + static inline struct mbuf *__mbq_dequeue(struct mbq *q) { struct mbuf *ret = NULL; @@ -99,6 +105,7 @@ static inline struct mbuf *__mbq_dequeue(struct mbq *q) return ret; } + struct mbuf *mbq_safe_dequeue(struct mbq *q) { struct mbuf *ret; @@ -110,11 +117,13 @@ struct mbuf *mbq_safe_dequeue(struct mbq *q) return ret; } + struct mbuf *mbq_dequeue(struct mbq *q) { return __mbq_dequeue(q); } + /* XXX seems pointless to have a generic purge */ static void __mbq_purge(struct mbq *q, int safe) { @@ -130,16 +139,19 @@ static void __mbq_purge(struct mbq *q, int safe) } } + void mbq_purge(struct mbq *q) { __mbq_purge(q, 0); } + void mbq_safe_purge(struct mbq *q) { __mbq_purge(q, 1); } + void mbq_safe_destroy(struct mbq *q) { mtx_destroy(&q->lock); @@ -149,4 +161,3 @@ void mbq_safe_destroy(struct mbq *q) void mbq_destroy(struct mbq *q) { } - diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h index ad023b617a5d..d273d8a8fa23 100644 --- a/sys/dev/netmap/netmap_mbq.h +++ b/sys/dev/netmap/netmap_mbq.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index f28f2c04751a..b25f79cef3a4 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. 
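For context, the mbq routines touched above implement a small mbuf FIFO. A hedged usage sketch (not part of the patch; consume() is a placeholder): the *_safe_* variants take the internal spin lock and are meant for interrupt context, the plain ones are for callers that already serialize access.

struct mbq q;
struct mbuf *m;

mbq_safe_init(&q);			/* set up the spin lock and an empty list */

/* producer side, e.g. a receive interrupt handler */
mbq_safe_enqueue(&q, m);

/* consumer side, e.g. the rxsync path */
while ((m = mbq_safe_dequeue(&q)) != NULL)
	consume(m);			/* hypothetical */

mbq_safe_purge(&q);			/* drop anything still queued */
mbq_safe_destroy(&q);			/* tear down the spin lock */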
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -506,7 +506,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj p->r_objsize = objsize; #define MAX_CLUSTSIZE (1<<17) -#define LINE_ROUND 64 +#define LINE_ROUND NM_CACHE_ALIGN // 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); @@ -960,13 +960,15 @@ netmap_mem_rings_create(struct netmap_adapter *na) ND("txring[%d] at %p ofs %d", i, ring); kring->ring = ring; *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - ring->avail = kring->nr_hwavail; - ring->cur = kring->nr_hwcur; + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); ND("initializing slots for txring"); @@ -989,13 +991,15 @@ netmap_mem_rings_create(struct netmap_adapter *na) kring->ring = ring; *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - ring->cur = kring->nr_hwcur; - ring->avail = kring->nr_hwavail; + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); ND("initializing slots for rxring[%d]", i); diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index f492f9814b79..8e6c58cbc4ee 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index 32d6422de120..f988b84e78b2 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -251,44 +251,6 @@ struct nm_bridge nm_bridges[NM_BRIDGES]; /* - * A few function to tell which kind of port are we using. - * XXX should we hold a lock ? 
- * - * nma_is_vp() virtual port - * nma_is_host() port connected to the host stack - * nma_is_hw() port connected to a NIC - * nma_is_generic() generic netmap adapter XXX stop this madness - */ -static __inline int -nma_is_vp(struct netmap_adapter *na) -{ - return na->nm_register == bdg_netmap_reg; -} - - -static __inline int -nma_is_host(struct netmap_adapter *na) -{ - return na->nm_register == NULL; -} - - -static __inline int -nma_is_hw(struct netmap_adapter *na) -{ - /* In case of sw adapter, nm_register is NULL */ - return !nma_is_vp(na) && !nma_is_host(na) && !nma_is_generic(na); -} - -static __inline int -nma_is_bwrap(struct netmap_adapter *na) -{ - return na->nm_register == netmap_bwrap_register; -} - - - -/* * this is a slightly optimized copy routine which rounds * to multiple of 64 bytes and is often faster than dealing * with other odd sizes. We assume there is enough room @@ -318,7 +280,6 @@ pkt_copy(void *_src, void *_dst, int l) } - /* * locate a bridge among the existing ones. * MUST BE CALLED WITH NMG_LOCK() @@ -393,8 +354,8 @@ nm_free_bdgfwd(struct netmap_adapter *na) struct netmap_kring *kring; NMG_LOCK_ASSERT(); - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; + nrings = na->num_tx_rings; + kring = na->tx_rings; for (i = 0; i < nrings; i++) { if (kring[i].nkr_ft) { free(kring[i].nkr_ft, M_DEVBUF); @@ -502,6 +463,7 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) } } + static void netmap_adapter_vp_dtor(struct netmap_adapter *na) { @@ -520,6 +482,16 @@ netmap_adapter_vp_dtor(struct netmap_adapter *na) na->ifp = NULL; } + +/* Try to get a reference to a netmap adapter attached to a VALE switch. + * If the adapter is found (or is created), this function returns 0, a + * non NULL pointer is returned into *na, and the caller holds a + * reference to the adapter. + * If an adapter is not found, then no reference is grabbed and the + * function returns an error code, or 0 if there is just a VALE prefix + * mismatch. Therefore the caller holds a reference when + * (*na != NULL && return == 0). + */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { @@ -688,18 +660,12 @@ nm_bdg_attach(struct nmreq *nmr) return ENOMEM; NMG_LOCK(); /* XXX probably netmap_get_bdg_na() */ - error = netmap_get_na(nmr, &na, 1 /* create if not exists */); + error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */); if (error) /* no device, or another bridge or user owns the device */ goto unlock_exit; - /* netmap_get_na() sets na_bdg if this is a physical interface - * that we can attach to a switch. - */ - if (!nma_is_bwrap(na)) { - /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge prefix or wrong NIC name - */ + if (na == NULL) { /* VALE prefix missing */ error = EINVAL; - goto unref_exit; + goto unlock_exit; } if (na->active_fds > 0) { /* already registered */ @@ -727,6 +693,7 @@ unlock_exit: return error; } + static int nm_bdg_detach(struct nmreq *nmr) { @@ -736,17 +703,15 @@ nm_bdg_detach(struct nmreq *nmr) int last_instance; NMG_LOCK(); - error = netmap_get_na(nmr, &na, 0 /* don't create */); + error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */); if (error) { /* no device, or another bridge or user owns the device */ goto unlock_exit; } - if (!nma_is_bwrap(na)) { - /* got reference to a virtual port or direct access to a NIC. 
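The comment added above gives netmap_get_bdg_na() a three-way contract: an error with no reference taken, success with *na == NULL when the name has no VALE prefix, or success with a referenced adapter. A sketch of a caller honoring it (not part of the patch, body elided):

struct netmap_adapter *na = NULL;
int error;

error = netmap_get_bdg_na(nmr, &na, 1 /* create if missing */);
if (error)
	return error;		/* failure, no reference was taken */
if (na == NULL)
	return EINVAL;		/* the name had no VALE prefix */
/* ... use the port; a reference is held here ... */
netmap_adapter_put(na);		/* drop it when done */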
- * perhaps specified no bridge's prefix or wrong NIC's name - */ + if (na == NULL) { /* VALE prefix missing */ error = EINVAL; - goto unref_exit; + goto unlock_exit; } + bna = (struct netmap_bwrap_adapter *)na; if (na->active_fds == 0) { /* not registered */ @@ -890,12 +855,13 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) case NETMAP_BDG_OFFSET: NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0); - if (!error) { + if (na && !error) { vpna = (struct netmap_vp_adapter *)na; if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET) nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET; vpna->offset = nmr->nr_arg1; D("Using offset %d for %p", vpna->offset, vpna); + netmap_adapter_put(na); } NMG_UNLOCK(); break; @@ -947,6 +913,7 @@ netmap_vp_krings_create(struct netmap_adapter *na) return 0; } + static void netmap_vp_krings_delete(struct netmap_adapter *na) { @@ -1027,10 +994,6 @@ nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, } -/* - *---- support for virtual bridge ----- - */ - /* ----- FreeBSD if_bridge hash function ------- */ /* @@ -1052,6 +1015,7 @@ do { \ c -= a; c -= b; c ^= (b >> 15); \ } while (/*CONSTCOND*/0) + static __inline uint32_t nm_bridge_rthash(const uint8_t *addr) { @@ -1144,6 +1108,77 @@ netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, /* + * Available space in the ring. Only used in VALE code + * and only with is_rx = 1 + */ +static inline uint32_t +nm_kr_space(struct netmap_kring *k, int is_rx) +{ + int space; + + if (is_rx) { + int busy = k->nkr_hwlease - k->nr_hwcur; + if (busy < 0) + busy += k->nkr_num_slots; + space = k->nkr_num_slots - 1 - busy; + } else { + /* XXX never used in this branch */ + space = k->nr_hwtail - k->nkr_hwlease; + if (space < 0) + space += k->nkr_num_slots; + } +#if 0 + // sanity check + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_tail >= k->nkr_num_slots || + busy < 0 || + busy >= k->nkr_num_slots) { + D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return space; +} + + + + +/* make a lease on the kring for N positions. return the + * lease index + * XXX only used in VALE code and with is_rx = 1 + */ +static inline uint32_t +nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +{ + uint32_t lim = k->nkr_num_slots - 1; + uint32_t lease_idx = k->nkr_lease_idx; + + k->nkr_leases[lease_idx] = NR_NOSLOT; + k->nkr_lease_idx = nm_next(lease_idx, lim); + + if (n > nm_kr_space(k, is_rx)) { + D("invalid request for %d slots", n); + panic("x"); + } + /* XXX verify that there are n slots */ + k->nkr_hwlease += n; + if (k->nkr_hwlease > lim) + k->nkr_hwlease -= lim + 1; + + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwtail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", + k->na->ifp->if_xname, + k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } + return lease_idx; +} + +/* * This flush routine supports only unicast and broadcast but a large * number of ports, and lets us replace the learn and dispatch functions. 
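Before the flush routine that follows, a stand-alone illustration (not part of the patch, and ignoring the per-lease completion bookkeeping) of how nm_kr_lease() above hands out slot ranges: each caller receives the current nkr_hwlease as the start of its range and the pointer advances modulo the ring size, so concurrent senders never overlap.

#include <stdio.h>

static int num_slots = 64;
static int hwlease = 60;	/* next slot available for leasing */

static int
lease(int n)
{
	int start = hwlease;

	hwlease = (hwlease + n) % num_slots;	/* wrap inside the ring */
	return start;
}

int
main(void)
{
	printf("sender A writes slots starting at %d\n", lease(5));	/* 60 */
	printf("sender B writes slots starting at %d\n", lease(5));	/* 1, wrapped */
	return 0;
}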
*/ @@ -1357,28 +1392,30 @@ retry: dst = BDG_NMB(&dst_na->up, slot); if (unlikely(fix_mismatch)) { - if (na->offset > dst_na->offset) { - src += na->offset - dst_na->offset; - copy_len -= na->offset - dst_na->offset; - dst_len = copy_len; - } else { - bzero(dst, dst_na->offset - na->offset); - dst_len += dst_na->offset - na->offset; - dst += dst_na->offset - na->offset; - } - /* fix the first fragment only */ - fix_mismatch = 0; - /* completely skip an header only fragment */ - if (copy_len == 0) { - ft_p++; - continue; - } + /* We are processing the first fragment + * and there is a mismatch between source + * and destination offsets. Create a zeroed + * header for the destination, independently + * of the source header length and content. + */ + src += na->offset; + copy_len -= na->offset; + bzero(dst, dst_na->offset); + dst += dst_na->offset; + dst_len = dst_na->offset + copy_len; + /* fix the first fragment only */ + fix_mismatch = 0; + /* Here it could be copy_len == dst_len == 0, + * and so a zero length fragment is passed. + */ } + + ND("send [%d] %d(%d) bytes at %s:%d", + i, (int)copy_len, (int)dst_len, + NM_IFPNAME(dst_ifp), j); /* round to a multiple of 64 */ copy_len = (copy_len + 63) & ~63; - ND("send %d %d bytes at %s:%d", - i, ft_p->ft_len, NM_IFPNAME(dst_ifp), j); if (ft_p->ft_flags & NS_INDIRECT) { if (copyin(src, dst, copy_len)) { // invalid user pointer, pretend len is 0 @@ -1426,7 +1463,7 @@ retry: } p[lease_idx] = j; /* report I am done */ - update_pos = nm_kr_rxpos(kring); + update_pos = kring->nr_hwtail; if (my_start == update_pos) { /* all slots before my_start have been reported, @@ -1443,15 +1480,7 @@ retry: * means there are new buffers to report */ if (likely(j != my_start)) { - uint32_t old_avail = kring->nr_hwavail; - - kring->nr_hwavail = (j >= kring->nr_hwcur) ? - j - kring->nr_hwcur : - j + lim + 1 - kring->nr_hwcur; - if (kring->nr_hwavail < old_avail) { - D("avail shrink %d -> %d", - old_avail, kring->nr_hwavail); - } + kring->nr_hwtail = j; dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); still_locked = 0; mtx_unlock(&kring->q_lock); @@ -1471,35 +1500,32 @@ cleanup: return 0; } + static int netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) { struct netmap_kring *kring = &na->up.tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k, lim = kring->nkr_num_slots - 1; - - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + u_int done; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = kring->rcur; if (bridge_batch <= 0) { /* testing only */ - j = k; // used all + done = cur; // used all goto done; } if (bridge_batch > NM_BDG_BATCH) bridge_batch = NM_BDG_BATCH; - j = nm_bdg_preflush(na, ring_nr, kring, k); - if (j != k) - D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); - /* k-j modulo ring size is the number of slots processed */ - if (k < j) - k += kring->nkr_num_slots; - kring->nr_hwavail = lim - (k - j); - + done = nm_bdg_preflush(na, ring_nr, kring, cur); done: - kring->nr_hwcur = j; - ring->avail = kring->nr_hwavail; + if (done != cur) + D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail); + /* + * packets between 'done' and 'cur' are left unsent. 
+ */ + kring->nr_hwcur = done; + kring->nr_hwtail = nm_prev(done, lim); + nm_txsync_finalize(kring); if (netmap_verbose) D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags); return 0; @@ -1518,46 +1544,30 @@ bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) return netmap_vp_txsync(vpna, ring_nr, flags); } - -/* - * user process reading from a VALE switch. - * Already protected against concurrent calls from userspace, - * but we must acquire the queue's lock to protect against - * writers on the same queue. - */ static int -bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, lim = kring->nkr_num_slots - 1; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i, lim = kring->nkr_num_slots - 1; + u_int head = nm_rxsync_prologue(kring); int n; - mtx_lock(&kring->q_lock); - if (k > lim) { + if (head > lim) { D("ouch dangerous reset!!!"); n = netmap_ring_reinit(kring); goto done; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } + /* First part, import newly received packets. */ + /* actually nothing to do here, they are already in the kring */ - if (j != k) { /* userspace has released some packets. */ - n = k - j; - if (n < 0) - n += kring->nkr_num_slots; - ND("userspace releases %d packets", n); - for (n = 0; likely(j != k); n++) { - struct netmap_slot *slot = &ring->slot[j]; + /* Second part, skip past packets that userspace has released. */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* consistency check, but nothing really important here */ + for (n = 0; likely(nm_i != head); n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; void *addr = BDG_NMB(na, slot); if (addr == netmap_buffer_base) { /* bad buf */ @@ -1565,19 +1575,37 @@ bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) slot->buf_idx); } slot->flags &= ~NS_BUF_CHANGED; - j = nm_next(j, lim); + nm_i = nm_next(nm_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; } + /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); n = 0; done: + return n; +} + +/* + * user process reading from a VALE switch. + * Already protected against concurrent calls from userspace, + * but we must acquire the queue's lock to protect against + * writers on the same queue. + */ +static int +bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + int n; + + mtx_lock(&kring->q_lock); + n = netmap_vp_rxsync(na, ring_nr, flags); mtx_unlock(&kring->q_lock); return n; } + static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) { @@ -1627,6 +1655,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) return 0; } + static void netmap_bwrap_dtor(struct netmap_adapter *na) { @@ -1652,16 +1681,22 @@ netmap_bwrap_dtor(struct netmap_adapter *na) } + /* - * Pass packets from nic to the bridge. + * Intr callback for NICs connected to a bridge. + * Simply ignore tx interrupts (maybe we could try to recover space ?) 
+ * and pass received packets from nic to the bridge. + * * XXX TODO check locking: this is called from the interrupt * handler so we should make sure that the interface is not * disconnected while passing down an interrupt. * - * Note, no user process can access this NIC so we can ignore - * the info in the 'ring'. - */ -/* callback that overwrites the hwna notify callback. + * Note, no user process can access this NIC or the host stack. + * The only part of the ring that is significant are the slots, + * and head/cur/tail are set from the kring as needed + * (part as a receive ring, part as a transmit ring). + * + * callback that overwrites the hwna notify callback. * Packets come from the outside or from the host stack and are put on an hwna rx ring. * The bridge wrapper then sends the packets through the bridge. */ @@ -1677,21 +1712,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, struct netmap_vp_adapter *vpna = &bna->up; int error = 0; - ND("%s[%d] %s %x", NM_IFPNAME(ifp), ring_nr, (tx == NR_TX ? "TX" : "RX"), flags); + if (netmap_verbose) + D("%s %s%d 0x%x", NM_IFPNAME(ifp), + (tx == NR_TX ? "TX" : "RX"), ring_nr, flags); if (flags & NAF_DISABLE_NOTIFY) { kring = tx == NR_TX ? na->tx_rings : na->rx_rings; bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings; - if (kring->nkr_stopped) - netmap_disable_ring(bkring); + if (kring[ring_nr].nkr_stopped) + netmap_disable_ring(&bkring[ring_nr]); else - bkring->nkr_stopped = 0; + bkring[ring_nr].nkr_stopped = 0; return 0; } if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP)) return 0; + /* we only care about receive interrupts */ if (tx == NR_TX) return 0; @@ -1707,7 +1745,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, goto put_out; } + /* Here we expect ring->head = ring->cur = ring->tail + * because everything has been released from the previous round. + * However the ring is shared and we might have info from + * the wrong side (the tx ring). Hence we overwrite with + * the info from the rx kring. + */ + if (netmap_verbose) + D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp), + ring->head, ring->cur, ring->tail, + kring->rhead, kring->rcur, kring->rtail); + + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; + + /* simulate a user wakeup on the rx ring */ if (is_host_ring) { + netmap_rxsync_from_host(na, NULL, NULL); vpna = hostna; ring_nr = 0; } else { @@ -1718,23 +1773,46 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, if (error) goto put_out; } - if (kring->nr_hwavail == 0 && netmap_verbose) { + if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { D("how strange, interrupt with no packets on %s", NM_IFPNAME(ifp)); goto put_out; } - /* XXX avail ? */ - ring->cur = nm_kr_rxpos(kring); + + /* new packets are ring->cur to ring->tail, and the bkring + * had hwcur == ring->cur. So advance ring->cur to ring->tail + * to push all packets out. + */ + ring->head = ring->cur = ring->tail; + + /* also set tail to what the bwrap expects */ + bkring = &vpna->up.tx_rings[ring_nr]; + ring->tail = bkring->nr_hwtail; // rtail too ? + + /* pass packets to the switch */ + nm_txsync_prologue(bkring); // XXX error checking ? 
netmap_vp_txsync(vpna, ring_nr, flags); - if (!is_host_ring) + /* mark all buffers as released on this ring */ + ring->head = ring->cur = kring->nr_hwtail; + ring->tail = kring->rtail; + /* another call to actually release the buffers */ + if (!is_host_ring) { error = na->nm_rxsync(na, ring_nr, 0); + } else { + /* mark all packets as released, as in the + * second part of netmap_rxsync_from_host() + */ + kring->nr_hwcur = kring->nr_hwtail; + nm_rxsync_finalize(kring); + } put_out: nm_kr_put(kring); return error; } + static int netmap_bwrap_register(struct netmap_adapter *na, int onoff) { @@ -1744,7 +1822,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) struct netmap_vp_adapter *hostna = &bna->host; int error; - ND("%s %d", NM_IFPNAME(ifp), onoff); + ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off"); if (onoff) { int i; @@ -1788,6 +1866,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) return 0; } + static int netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd) @@ -1807,6 +1886,7 @@ netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, return 0; } + static int netmap_bwrap_krings_create(struct netmap_adapter *na) { @@ -1834,6 +1914,7 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) return 0; } + static void netmap_bwrap_krings_delete(struct netmap_adapter *na) { @@ -1847,6 +1928,7 @@ netmap_bwrap_krings_delete(struct netmap_adapter *na) netmap_vp_krings_delete(na); } + /* notify method for the bridge-->hwna direction */ static int netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) @@ -1856,7 +1938,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f struct netmap_adapter *hwna = bna->hwna; struct netmap_kring *kring, *hw_kring; struct netmap_ring *ring; - u_int lim, k; + u_int lim; int error = 0; if (tx == NR_TX) @@ -1865,35 +1947,49 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f kring = &na->rx_rings[ring_n]; hw_kring = &hwna->tx_rings[ring_n]; ring = kring->ring; - lim = kring->nkr_num_slots - 1; - k = nm_kr_rxpos(kring); if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) return 0; - ring->cur = k; - ND("%s[%d] PRE rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", + /* first step: simulate a user wakeup on the rx ring */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", NM_IFPNAME(na->ifp), ring_n, - kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, - ring->cur, ring->avail, ring->reserved, - hw_kring->nr_hwcur, hw_kring->nr_hwavail); + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); + /* second step: the simulated user consumes all new packets */ + ring->head = ring->cur = ring->tail; + + /* third step: the new packets are sent on the tx ring + * (which is actually the same ring) + */ + /* set tail to what the hw expects */ + ring->tail = hw_kring->rtail; if (ring_n == na->num_rx_rings) { netmap_txsync_to_host(hwna); } else { + nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? 
error = hwna->nm_txsync(hwna, ring_n, flags); } - kring->nr_hwcur = ring->cur; - kring->nr_hwavail = 0; - kring->nr_hwreserved = lim - ring->avail; - ND("%s[%d] PST rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", + + /* fourth step: now we are back the rx ring */ + /* claim ownership on all hw owned bufs */ + ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */ + ring->tail = kring->rtail; /* restore saved value of tail, for safety */ + + /* fifth step: the user goes to sleep again, causing another rxsync */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", NM_IFPNAME(na->ifp), ring_n, - kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, - ring->cur, ring->avail, ring->reserved, - hw_kring->nr_hwcur, hw_kring->nr_hwavail); + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); return error; } + static int netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) { @@ -1904,6 +2000,7 @@ netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags); } + /* attach a bridge wrapper to the 'real' device */ static int netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) @@ -1957,7 +2054,8 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) hostna->nm_mem = na->nm_mem; hostna->na_private = bna; - D("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname, + ND("%s<->%s txr %d txd %d rxr %d rxd %d", + fake->if_xname, real->if_xname, na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc); @@ -1970,6 +2068,7 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) return 0; } + void netmap_init_bridges(void) { diff --git a/sys/net/netmap.h b/sys/net/netmap.h index 50e230934dd0..a5ee9b55edc9 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,6 +39,16 @@ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ +#define NETMAP_API 10 /* current API version */ + +/* + * Some fields should be cache-aligned to reduce contention. + * The alignment is architecture and OS dependent, but rather than + * digging into OS headers to find the exact value we use an estimate + * that should cover most architectures. 
+ */ +#define NM_CACHE_ALIGN 128 + /* * --- Netmap data structures --- * @@ -52,23 +62,23 @@ ==================================================================== | USERSPACE | struct netmap_ring - +---->+--------------+ - / | cur | - struct netmap_if (nifp, 1 per fd) / | avail | - +---------------+ / | buf_ofs | - | ni_tx_rings | / +==============+ - | ni_rx_rings | / | buf_idx, len | slot[0] - | | / | flags, ptr | - | | / +--------------+ - +===============+ / | buf_idx, len | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | - | txring_ofs[1] | +--------------+ - (ni_tx_rings+1 entries) (num_slots entries) - | txring_ofs[t] | | buf_idx, len | slot[n-1] - +---------------+ | flags, ptr | - | rxring_ofs[0] | +--------------+ + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1+extra_tx entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | - (ni_rx_rings+1 entries) + (rx+1+extra_rx entries) | rxring_ofs[r] | +---------------+ @@ -93,122 +103,115 @@ /* * struct netmap_slot is a buffer descriptor - * - * buf_idx the index of the buffer associated to the slot. - * len the length of the payload - * flags control operation on the slot, as defined below - * - * NS_BUF_CHANGED must be set whenever userspace wants - * to change buf_idx (it might be necessary to - * reprogram the NIC) - * - * NS_REPORT must be set if we want the NIC to generate an interrupt - * when this slot is used. Leaving it to 0 improves - * performance. - * - * NS_FORWARD if set on a receive ring, and the device is in - * transparent mode, buffers released with the flag set - * will be forwarded to the 'other' side (host stack - * or NIC, respectively) on the next select() or ioctl() - * - * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for - * this packet. - * - * NS_INDIRECT (tx rings only) data is in a userspace buffer pointed - * by the ptr field in the slot. - * - * NS_MOREFRAG Part of a multi-segment frame. The last (or only) - * segment must not have this flag. - * Only supported on VALE ports. - * - * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the - * destination port for the VALE switch, overriding - * the lookup table. */ - struct netmap_slot { uint32_t buf_idx; /* buffer index */ - uint16_t len; /* packet length */ + uint16_t len; /* length for this slot */ uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* pointer for indirect buffers */ +}; + +/* + * The following flags control how the slot is used + */ + #define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ -#define NS_REPORT 0x0002 /* ask the hardware to report results - * e.g. by generating an interrupt - */ -#define NS_FORWARD 0x0004 /* pass packet to the other endpoint - * (host stack or device) - */ -#define NS_NO_LEARN 0x0008 -#define NS_INDIRECT 0x0010 -#define NS_MOREFRAG 0x0020 + /* + * must be set whenever buf_idx is changed (as it might be + * necessary to recompute the physical address and mapping) + */ + +#define NS_REPORT 0x0002 /* ask the hardware to report results */ + /* + * Request notification when slot is used by the hardware. 
+ * Normally transmit completions are handled lazily and + * may be unreported. This flag lets us know when a slot + * has been sent (e.g. to terminate the sender). + */ + +#define NS_FORWARD 0x0004 /* pass packet 'forward' */ + /* + * (Only for physical ports, rx rings with NR_FORWARD set). + * Slot released to the kernel (i.e. before ring->head) with + * this flag set are passed to the peer ring (host/NIC), + * thus restoring the host-NIC connection for these slots. + * This supports efficient traffic monitoring or firewalling. + */ + +#define NS_NO_LEARN 0x0008 /* disable bridge learning */ + /* + * On a VALE switch, do not 'learn' the source port for + * this buffer. + */ + +#define NS_INDIRECT 0x0010 /* userspace buffer */ + /* + * (VALE tx rings only) data is in a userspace buffer, + * whose address is in the 'ptr' field in the slot. + */ + +#define NS_MOREFRAG 0x0020 /* packet has more fragments */ + /* + * (VALE ports only) + * Set on all but the last slot of a multi-segment packet. + * The 'len' field refers to the individual fragment. + */ + #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) - /* - * in rx rings, the high 8 bits - * are the number of fragments. - */ + /* + * The high 8 bits of the flag, if not zero, indicate the + * destination port for the VALE switch, overriding + * the lookup table. + */ + #define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) - uint64_t ptr; /* pointer for indirect buffers */ -}; + /* + * (VALE rx rings only) the high 8 bits + * are the number of fragments. + */ + /* * struct netmap_ring * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. - * At the software level, two fields are important: avail and cur. + * At the software level the important fields are: head, cur, tail. * * In TX rings: * - * avail tells how many slots are available for transmission. - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * adds a new packet to send. + * head first slot available for transmission. + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * - * cur indicates the slot to use for the next packet - * to send (i.e. the "tail" of the queue). - * It MUST BE incremented by the user before - * netmap system calls to reflect the number of newly - * sent packets. - * It is checked by the kernel on netmap system calls - * (normally unmodified by the kernel unless invalid). + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. * * In RX rings: * - * avail is the number of packets available (possibly 0). - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * consumes a packet. - * - * cur indicates the first slot that contains a packet not - * yet processed (the "head" of the queue). - * It MUST BE incremented by the user when it consumes - * a packet. - * - * reserved indicates the number of buffers before 'cur' - * that the user has not released yet. Normally 0, - * it MUST BE incremented by the user when it - * does not return the buffer immediately, and decremented - * when the buffer is finally freed. + * head first valid received packet + * cur wakeup point. 
select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * + * [head .. tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. * * DATA OWNERSHIP/LOCKING: - * The netmap_ring, all slots, and buffers in the range - * [reserved-cur , cur+avail[ are owned by the user program, - * and the kernel only touches them in the same thread context - * during a system call. - * Other buffers are reserved for use by the NIC's DMA engines. - * - * FLAGS - * NR_TIMESTAMP updates the 'ts' field on each syscall. This is - * a global timestamp for all packets. - * NR_RX_TSTMP if set, the last 64 byte in each buffer will - * contain a timestamp for the frame supplied by - * the hardware (if supported) - * NR_FORWARD if set, the NS_FORWARD flag in each slot of the - * RX ring is checked, and if set the packet is - * passed to the other side (host stack or device, - * respectively). This permits bpf-like behaviour - * or transparency for selected packets. + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. + * + * Other slots and buffers are reserved for use by the kernel */ struct netmap_ring { /* @@ -216,19 +219,22 @@ struct netmap_ring { * It contains the offset of the buffer region from this * descriptor. */ - const ssize_t buf_ofs; + const int64_t buf_ofs; const uint32_t num_slots; /* number of slots in the ring. */ - uint32_t avail; /* number of usable slots */ - uint32_t cur; /* 'current' r/w position */ - uint32_t reserved; /* not refilled before current */ + const uint32_t nr_buf_size; + const uint16_t ringid; + const uint16_t dir; /* 0: tx, 1: rx */ - const uint16_t nr_buf_size; - uint16_t flags; -#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ -#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ -#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */ + uint32_t head; /* (u) first user slot */ + uint32_t cur; /* (u) wakeup point */ + uint32_t tail; /* (k) first kernel slot */ - struct timeval ts; /* time of last *sync() */ + uint32_t flags; + + struct timeval ts; /* (k) time of last *sync() */ + + /* opaque room for a mutex or similar object */ + uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN))); /* the slots follow. This struct has variable size */ struct netmap_slot slot[0]; /* array of slots. */ @@ -236,6 +242,22 @@ struct netmap_ring { /* + * RING FLAGS + */ +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ + /* + * updates the 'ts' field on each netmap syscall. This saves + * saves a separate gettimeofday(), and is not much worse than + * software timestamps generated in the interrupt handler. + */ + +#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ + /* + * Enables the NS_FORWARD slot flag for the ring. + */ + + +/* * Netmap representation of an interface and its queue(s). 
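A userspace sketch of the head/cur/tail protocol just described, applied to a tx ring (not part of the patch): fill_frame() is a hypothetical payload builder, fd is a descriptor already bound with NIOCREGIF, and NETMAP_TXRING()/NETMAP_BUF()/nm_ring_next() come from net/netmap_user.h later in this patch.

struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);	/* first hw tx ring */
uint32_t head = ring->head;

while (head != ring->tail) {		/* slots [head .. tail-1] are free */
	struct netmap_slot *slot = &ring->slot[head];
	char *buf = NETMAP_BUF(ring, slot->buf_idx);

	slot->len = fill_frame(buf);	/* hypothetical */
	slot->flags = 0;		/* or NS_REPORT on the last slot */
	head = nm_ring_next(ring, head);
}
ring->head = ring->cur = head;		/* hand the slots to the kernel */
ioctl(fd, NIOCTXSYNC, NULL);		/* or poll() with POLLOUT */

Moving cur further ahead of head, as the comment above allows, only matters when blocking in poll() while waiting for more space.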
* This is initialized by the kernel when binding a file * descriptor to a port, and should be considered as readonly @@ -252,81 +274,109 @@ struct netmap_if { const uint32_t ni_flags; /* properties */ #define NI_PRIV_MEM 0x1 /* private memory region */ - const uint32_t ni_rx_rings; /* number of rx rings */ - const uint32_t ni_tx_rings; /* number of tx rings */ + /* + * The number of packet rings available in netmap mode. + * Physical NICs can have different numbers of tx and rx rings. + * Physical NICs also have a 'host' ring pair. + * Additionally, clients can request additional ring pairs to + * be used for internal communication. + */ + const uint32_t ni_tx_rings; /* number of HW tx rings */ + const uint32_t ni_rx_rings; /* number of HW rx rings */ + + const uint32_t ni_extra_tx_rings; + const uint32_t ni_extra_rx_rings; /* * The following array contains the offset of each netmap ring - * from this structure. The first ni_tx_rings+1 entries refer - * to the tx rings, the next ni_rx_rings+1 refer to the rx rings - * (the last entry in each block refers to the host stack rings). + * from this structure, in the following order: + * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; + * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings. + * * The area is filled up by the kernel on NIOCREGIF, * and then only read by userspace code. */ const ssize_t ring_ofs[0]; }; + #ifndef NIOCREGIF /* * ioctl names and related fields * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid. + * These are non blocking and take no argument. + * * NIOCGINFO takes a struct ifreq, the interface name is the input, * the outputs are number of queues and number of descriptor * for each queue (useful to set number of threads etc.). * The info returned is only advisory and may change before * the interface is bound to a file descriptor. * - * NIOCREGIF takes an interface name within a struct ifreq, + * NIOCREGIF takes an interface name within a struct nmre, * and activates netmap mode on the interface (if possible). * - * nr_name is the name of the interface + * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we + * can pass it down to other NIC-related ioctls. * - * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings - * indicate the configuration of the port on return. + * The actual argument (struct nmreq) has a number of options to request + * different functions. * - * On input, non-zero values for nr_tx_rings, nr_tx_slots and the - * rx counterparts may be used to reconfigure the port according - * to the requested values, but this is not guaranteed. - * The actual values are returned on completion of the ioctl(). + * nr_name (in) + * The name of the port (em0, valeXXX:YYY, etc.) + * limited to IFNAMSIZ for backward compatibility. * - * nr_ringid - * indicates how rings should be bound to the file descriptors. - * The default (0) means all physical rings of a NIC are bound. - * NETMAP_HW_RING plus a ring number lets you bind just - * a single ring pair. - * NETMAP_SW_RING binds only the host tx/rx rings - * NETMAP_NO_TX_POLL prevents select()/poll() from pushing - * out packets on the tx ring unless POLLOUT is specified. + * nr_version (in/out) + * Must match NETMAP_API as used in the kernel, error otherwise. + * Always returns the desired value on output. 
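A sketch of the binding sequence that the description above implies (not part of the patch; em0 is an arbitrary interface name, error handling is abbreviated, and the usual fcntl/ioctl/mman includes are assumed):

struct nmreq req;
struct netmap_if *nifp;
void *mem;
int fd = open("/dev/netmap", O_RDWR);

bzero(&req, sizeof(req));
strncpy(req.nr_name, "em0", sizeof(req.nr_name));
req.nr_version = NETMAP_API;		/* must match the kernel */
if (ioctl(fd, NIOCREGIF, &req) < 0)
	err(1, "NIOCREGIF");
mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
    MAP_SHARED, fd, 0);
nifp = NETMAP_IF(mem, req.nr_offset);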
* - * NETMAP_PRIV_MEM is a return value used to indicate that - * this ring is in a private memory region hence buffer - * swapping cannot be used + * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out) + * On input, non-zero values may be used to reconfigure the port + * according to the requested values, but this is not guaranteed. + * On output the actual values in use are reported. * - * nr_cmd is used to configure NICs attached to a VALE switch, - * or to dump the configuration of a VALE switch. + * nr_ringid (in) + * Indicates how rings should be bound to the file descriptors. + * 0 (default) binds all physical rings + * NETMAP_HW_RING | ring number binds a single ring pair + * NETMAP_SW_RING binds only the host tx/rx rings * - * nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname - * attaches the NIC to the switch, with nr_ringid specifying - * which rings to use + * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push + * packets on tx rings only if POLLOUT is set. + * The default is to push any pending packet. * - * nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname - * disconnects a previously attached NIC + * NETMAP_PRIV_MEM is set on return for ports that use private + * memory regions and cannot use buffer swapping. * - * nr_cmd = NETMAP_BDG_LIST is used to list the configuration - * of VALE switches, with additional arguments. + * nr_cmd (in) if non-zero indicates a special command: + * NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch; nr_ringid specifies + * which rings to use. Used by vale-ctl -a ... + * nr_arg1 = NETMAP_BDG_HOST also attaches the host port + * as in vale-ctl -h ... * - * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, - * whose identity is set in NIOCREGIF through nr_ringid + * NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC. + * Used by vale-ctl -d ... + * + * NETMAP_BDG_LIST + * list the configuration of VALE switches. + * + * NETMAP_BDG_OFFSET XXX ? + * Set the offset of data in packets. Used with VALE + * switches where the clients use the vhost header. + * + * nr_arg1, nr_arg2 (in/out) command specific * - * NETMAP_API is the API version. */ + /* * struct nmreq overlays a struct ifreq */ struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ -#define NETMAP_API 5 /* current version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ @@ -339,19 +389,23 @@ struct nmreq { #define NETMAP_SW_RING 0x2000 /* process the sw ring */ #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ #define NETMAP_RING_MASK 0xfff /* the ring number */ + uint16_t nr_cmd; #define NETMAP_BDG_ATTACH 1 /* attach the NIC */ #define NETMAP_BDG_DETACH 2 /* detach the NIC */ #define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ #define NETMAP_BDG_LIST 4 /* get bridge's info */ #define NETMAP_BDG_OFFSET 5 /* set the port offset */ + uint16_t nr_arg1; #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ #define NETMAP_BDG_MAX_OFFSET 12 + uint16_t nr_arg2; uint32_t spare2[3]; }; + /* * FreeBSD uses the size value embedded in the _IOWR to determine * how much to copy in/out. 
So we need it to match the actual @@ -360,9 +414,22 @@ struct nmreq { */ #define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ #define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ -#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ #endif /* !NIOCREGIF */ + +/* + * Helper functions for kernel and userspace + */ + +/* + * check if space is available in the ring. + */ +static inline int +nm_ring_empty(struct netmap_ring *ring) +{ + return (ring->cur == ring->tail); +} + #endif /* _NET_NETMAP_H_ */ diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h index 3f2858304caf..bd6fe0db22ae 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -1,6 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013 Universita` di Pisa + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -28,8 +27,8 @@ /* * $FreeBSD$ * - * This header contains the macros used to manipulate netmap structures - * and packets in userspace. See netmap(4) for more information. + * Functions and macros to manipulate netmap structures and packets + * in userspace. See netmap(4) for more information. * * The address of the struct netmap_if, say nifp, is computed from the * value returned from ioctl(.., NIOCREG, ...) and the mmap region: @@ -44,17 +43,20 @@ * we can access ring->nr_cur, ring->nr_avail, ring->nr_flags * * ring->slot[i] gives us the i-th slot (we can access - * directly plen, flags, bufindex) + * directly len, flags, buf_idx) * * char *buf = NETMAP_BUF(ring, x) returns a pointer to * the buffer numbered x * - * Since rings are circular, we have macros to compute the next index - * i = NETMAP_RING_NEXT(ring, i); + * All ring indexes (head, cur, tail) should always move forward. + * To compute the next index in a circular ring you can use + * i = nm_ring_next(ring, i); * * To ease porting apps from pcap to netmap we supply a few fuctions - * that can be called to open, close and read from netmap in a way - * similar to libpcap. + * that can be called to open, close, read and write on netmap in a way + * similar to libpcap. Note that the read/write function depend on + * an ioctl()/select()/poll() being issued to refill rings or push + * packets out. * * In order to use these, include #define NETMAP_WITH_LIBS * in the source file that invokes these functions. @@ -65,12 +67,19 @@ #include <stdint.h> #include <net/if.h> /* IFNAMSIZ */ + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* likely and unlikely */ + #include <net/netmap.h> +/* helper macro */ #define _NETMAP_OFFSET(type, ptr, offset) \ ((type)(void *)((char *)(ptr) + (offset))) -#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o) +#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs) #define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ nifp, (nifp)->ring_ofs[index] ) @@ -85,18 +94,34 @@ ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \ (ring)->nr_buf_size ) -#define NETMAP_RING_NEXT(r, i) \ - ((i)+1 == (r)->num_slots ? 0 : (i) + 1 ) -#define NETMAP_RING_FIRST_RESERVED(r) \ - ( (r)->cur < (r)->reserved ? 
\ - (r)->cur + (r)->num_slots - (r)->reserved : \ - (r)->cur - (r)->reserved ) +static inline uint32_t +nm_ring_next(struct netmap_ring *r, uint32_t i) +{ + return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1); +} + /* - * Return 1 if the given tx ring is empty. + * Return 1 if we have pending transmissions in the tx ring. + * When everything is complete ring->cur = ring->tail + 1 (modulo ring size) */ -#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1) +static inline int +nm_tx_pending(struct netmap_ring *r) +{ + return nm_ring_next(r, r->tail) != r->cur; +} + + +static inline uint32_t +nm_ring_space(struct netmap_ring *ring) +{ + int ret = ring->tail - ring->cur; + if (ret < 0) + ret += ring->num_slots; + return ret; +} + #ifdef NETMAP_WITH_LIBS /* @@ -113,7 +138,12 @@ #include <sys/ioctl.h> #include <sys/errno.h> /* EINVAL */ #include <fcntl.h> /* O_RDWR */ -#include <malloc.h> +#include <unistd.h> /* close() */ +#ifdef __FreeBSD__ +#include <stdlib.h> +#else +#include <malloc.h> /* on FreeBSD it is stdlib.h */ +#endif struct nm_hdr_t { /* same as pcap_pkthdr */ struct timeval ts; @@ -139,30 +169,73 @@ struct nm_desc_t { #define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d)) #define NETMAP_FD(d) (P2NMD(d)->fd) + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ +static inline void +pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + + /* * The callback, invoked on each received packet. Same as libpcap */ typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d); /* - * The open routine accepts an ifname (netmap:foo or vale:foo) and - * optionally a second (string) argument indicating the ring number + *--- the pcap-like API --- + * + * nm_open() opens a file descriptor, binds to a port and maps memory. + * + * ifname (netmap:foo or vale:foo) is the port name + * flags can be NETMAP_SW_RING or NETMAP_HW_RING etc. + * ring_no only used if NETMAP_HW_RING is specified, is interpreted + * as a string or integer indicating the ring number + * ring_flags is stored in all ring flags (e.g. for transparent mode) * to open. If successful, t opens the fd and maps the memory. */ + static struct nm_desc_t *nm_open(const char *ifname, const char *ring_no, int flags, int ring_flags); /* - * nm_dispatch() is the same as pcap_dispatch() - * nm_next() is the same as pcap_next() + * nm_close() closes and restores the port to its previous state */ -static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *); -static u_char *nm_next(struct nm_desc_t *, struct nm_hdr_t *); + +static int nm_close(struct nm_desc_t *); /* - * unmap memory, close file descriptor and free the descriptor. 
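The nm_tx_pending() and nm_ring_space() helpers above make common loops short; for instance, a sketch of draining a tx ring before closing (not part of the patch, fd and ring as set up earlier):

while (nm_tx_pending(ring)) {
	ioctl(fd, NIOCTXSYNC, NULL);	/* let the kernel reclaim completed slots */
	usleep(1000);
}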
+ * nm_inject() is the same as pcap_inject() + * nm_dispatch() is the same as pcap_dispatch() + * nm_nextpkt() is the same as pcap_next() */ -static int nm_close(struct nm_desc_t *); + +static int nm_inject(struct nm_desc_t *, const void *, size_t); +static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *); +static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *); /* @@ -240,6 +313,12 @@ fail: static int nm_close(struct nm_desc_t *d) { + /* + * ugly trick to avoid unused warnings + */ + static void *__xxzt[] __attribute__ ((unused)) = + { nm_open, nm_inject, nm_dispatch, nm_nextpkt } ; + if (d == NULL || d->self != d) return EINVAL; if (d->mem) @@ -253,9 +332,45 @@ nm_close(struct nm_desc_t *d) /* + * Same prototype as pcap_inject(), only need to cast. + */ +static int +nm_inject(struct nm_desc_t *d, const void *buf, size_t size) +{ + u_int c, n = d->last_ring - d->first_ring + 1; + + if (0) fprintf(stderr, "%s rings %d %d %d\n", __FUNCTION__, + d->first_ring, d->cur_ring, d->last_ring); + for (c = 0; c < n ; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + uint32_t i, idx; + uint32_t ri = d->cur_ring + c; + + if (ri > d->last_ring) + ri = d->first_ring; + ring = NETMAP_TXRING(d->nifp, ri); + if (nm_ring_empty(ring)) { + if (0) fprintf(stderr, "%s ring %d cur %d tail %d\n", + __FUNCTION__, + ri, ring->cur, ring->tail); + continue; + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + ring->slot[i].len = size; + pkt_copy(buf, NETMAP_BUF(ring, idx), size); + d->cur_ring = ri; + ring->head = ring->cur = nm_ring_next(ring, i); + return size; + } + return 0; /* fail */ +} + + +/* * Same prototype as pcap_dispatch(), only need to cast. */ -inline /* not really, but disable unused warnings */ static int nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) { @@ -276,7 +391,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) if (ri > d->last_ring) ri = d->first_ring; ring = NETMAP_RXRING(d->nifp, ri); - for ( ; ring->avail > 0 && cnt != got; got++) { + for ( ; !nm_ring_empty(ring) && cnt != got; got++) { u_int i = ring->cur; u_int idx = ring->slot[i].buf_idx; u_char *buf = (u_char *)NETMAP_BUF(ring, idx); @@ -285,24 +400,22 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) d->hdr.len = d->hdr.caplen = ring->slot[i].len; d->hdr.ts = ring->ts; cb(arg, &d->hdr, buf); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; + ring->head = ring->cur = nm_ring_next(ring, i); } } d->cur_ring = ri; return got; } -inline /* not really, but disable unused warnings */ static u_char * -nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr) +nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr) { int ri = d->cur_ring; do { /* compute current ring to use */ struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri); - if (ring->avail > 0) { + if (!nm_ring_empty(ring)) { u_int i = ring->cur; u_int idx = ring->slot[i].buf_idx; u_char *buf = (u_char *)NETMAP_BUF(ring, idx); @@ -310,8 +423,12 @@ nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr) // prefetch(buf); hdr->ts = ring->ts; hdr->len = hdr->caplen = ring->slot[i].len; - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; + ring->cur = nm_ring_next(ring, i); + /* we could postpone advancing head if we want + * to hold the buffer. This can be supported in + * the future. 
+ */ + ring->head = ring->cur; d->cur_ring = ri; return buf; } diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c index 0aca44d448d6..6dc77e438273 100644 --- a/tools/tools/netmap/bridge.c +++ b/tools/tools/netmap/bridge.c @@ -1,5 +1,5 @@ /* - * (C) 2011 Luigi Rizzo, Matteo Landi + * (C) 2011-2014 Luigi Rizzo, Matteo Landi * * BSD license * @@ -42,10 +42,12 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, msg, rxring->flags, txring->flags); j = rxring->cur; /* RX */ k = txring->cur; /* TX */ - if (rxring->avail < limit) - limit = rxring->avail; - if (txring->avail < limit) - limit = txring->avail; + m = nm_ring_space(rxring); + if (m < limit) + limit = m; + m = nm_ring_space(txring); + if (m < limit) + limit = m; m = limit; while (limit-- > 0) { struct netmap_slot *rs = &rxring->slot[j]; @@ -81,13 +83,11 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, ts->flags |= NS_BUF_CHANGED; rs->flags |= NS_BUF_CHANGED; #endif /* NO_SWAP */ - j = NETMAP_RING_NEXT(rxring, j); - k = NETMAP_RING_NEXT(txring, k); + j = nm_ring_next(rxring, j); + k = nm_ring_next(txring, k); } - rxring->avail -= m; - txring->avail -= m; - rxring->cur = j; - txring->cur = k; + rxring->head = rxring->cur = j; + txring->head = txring->cur = k; if (verbose && m > 0) D("%s sent %d packets to %p", msg, m, txring); @@ -107,11 +107,11 @@ move(struct my_ring *src, struct my_ring *dst, u_int limit) rxring = NETMAP_RXRING(src->nifp, si); txring = NETMAP_TXRING(dst->nifp, di); ND("txring %p rxring %p", txring, rxring); - if (rxring->avail == 0) { + if (nm_ring_empty(rxring)) { si++; continue; } - if (txring->avail == 0) { + if (nm_ring_empty(txring)) { di++; continue; } @@ -133,7 +133,7 @@ pkt_queued(struct my_ring *me, int tx) for (i = me->begin; i < me->end; i++) { struct netmap_ring *ring = tx ? NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); - tot += ring->avail; + tot += nm_ring_space(ring); } if (0 && verbose && tot && !tx) D("ring %s %s %s has %d avail at %d", @@ -288,12 +288,12 @@ main(int argc, char **argv) if (ret < 0) continue; if (pollfd[0].revents & POLLERR) { - D("error on fd0, rxcur %d@%d", - me[0].rx->avail, me[0].rx->cur); + D("error on fd0, rx [%d,%d)", + me[0].rx->cur, me[0].rx->tail); } if (pollfd[1].revents & POLLERR) { - D("error on fd1, rxcur %d@%d", - me[1].rx->avail, me[1].rx->cur); + D("error on fd1, rx [%d,%d)", + me[1].rx->cur, me[1].rx->tail); } if (pollfd[0].revents & POLLOUT) { move(me + 1, me, burst); diff --git a/tools/tools/netmap/nm_util.c b/tools/tools/netmap/nm_util.c index 195b68776c3b..1268840cd868 100644 --- a/tools/tools/netmap/nm_util.c +++ b/tools/tools/netmap/nm_util.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Luigi Rizzo. All rights reserved. + * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -232,7 +232,7 @@ pkt_queued(struct my_ring *me, int tx) for (i = me->begin; i < me->end; i++) { struct netmap_ring *ring = tx ? 
NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); - tot += ring->avail; + tot += nm_ring_space(ring); } if (0 && verbose && tot && !tx) D("ring %s %s %s has %d avail at %d", @@ -242,3 +242,90 @@ pkt_queued(struct my_ring *me, int tx) tot, NETMAP_TXRING(me->nifp, me->begin)->cur); return tot; } + +#if 0 + +/* + * + +Helper routines for multiple readers from the same queue + +- all readers open the device in 'passive' mode (NETMAP_PRIV_RING set). + In this mode a thread that loses the race on a poll() just continues + without calling *xsync() + +- all readers share an extra 'ring' which contains the sync information. + In particular we have a shared head+tail pointers that work + together with cur and available + ON RETURN FROM THE SYSCALL: + shadow->head = ring->cur + shadow->tail = ring->tail + shadow->link[i] = i for all slots // mark invalid + + */ + +struct nm_q_arg { + u_int want; /* Input */ + u_int have; /* Output, 0 on error */ + u_int head; + u_int tail; + struct netmap_ring *ring; +}; + +/* + * grab a number of slots from the queue. + */ +struct nm_q_arg +my_grab(struct nm_q_arg q) +{ + const u_int ns = q.ring->num_slots; + + for (;;) { + + q.head = (volatile u_int)q.ring->head; + q.have = ns + q.head - (volatile u_int)q.ring->tail; + if (q.have >= ns) + q.have -= ns; + if (q.have == 0) /* no space */ + break; + if (q.want < q.have) + q.have = q.want; + q.tail = q.head + q.have; + if (q.tail >= ns) + q.tail -= ns; + if (atomic_cmpset_int(&q.ring->head, q.head, q.tail) + break; /* success */ + } + D("returns %d out of %d at %d,%d", + q.have, q.want, q.head, q.tail); + /* the last one can clear avail ? */ + return q; +} + + +int +my_release(struct nm_q_arg q) +{ + u_int head = q.head, tail = q.tail, i; + struct netmap_ring *r = q.ring; + + /* link the block to the next one. + * there is no race here because the location is mine. + */ + r->slot[head].ptr = tail; /* this is mine */ + // memory barrier + if (r->head != head) + return; /* not my turn to release */ + for (;;) { + // advance head + r->head = head = r->slot[head].ptr; + // barrier ? + if (head == r->slot[head].ptr) + break; // stop here + } + /* we have advanced from q.head to head (r.head might be + * further down. + */ + // do an ioctl/poll to flush. +} +#endif /* unused */ diff --git a/tools/tools/netmap/nm_util.h b/tools/tools/netmap/nm_util.h index 0d64f131f289..d8f8f94fd162 100644 --- a/tools/tools/netmap/nm_util.h +++ b/tools/tools/netmap/nm_util.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012 Luigi Rizzo. All rights reserved. + * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -32,6 +32,9 @@ #ifndef _NM_UTIL_H #define _NM_UTIL_H + +#define _GNU_SOURCE /* for CPU_SET() */ + #include <errno.h> #include <signal.h> /* signal */ #include <stdlib.h> @@ -79,6 +82,9 @@ struct pcap_pkthdr; #include <pthread.h> /* pthread_* */ #ifdef linux + +#define cpuset_t cpu_set_t + #define ifr_flagshigh ifr_flags #define ifr_curcap ifr_flags #define ifr_reqcap ifr_flags diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c index f30f57bf804a..dd87c4a1b00e 100644 --- a/tools/tools/netmap/pcap.c +++ b/tools/tools/netmap/pcap.c @@ -1,5 +1,5 @@ /* - * (C) 2011-2012 Luigi Rizzo + * (C) 2011-2014 Luigi Rizzo * * BSD license * @@ -499,15 +499,14 @@ pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user) /* scan all rings */ for (si = me->begin; si < me->end; si++) { struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si); - ND("ring has %d pkts", ring->avail); - if (ring->avail == 0) + if (nm_ring_empty(ring)) continue; pme->hdr.ts = ring->ts; /* * XXX a proper prefetch should be done as * prefetch(i); callback(i-1); ... */ - while ((cnt == -1 || cnt != got) && ring->avail > 0) { + while ((cnt == -1 || cnt != got) && !nm_ring_empty(ring)) { u_int i = ring->cur; u_int idx = ring->slot[i].buf_idx; if (idx < 2) { @@ -520,8 +519,7 @@ pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user) pme->hdr.len = pme->hdr.caplen = ring->slot[i].len; // D("call %p len %d", p, me->hdr.len); callback(user, &pme->hdr, buf); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; + ring->head = ring->cur = nm_ring_next(ring, i); got++; } } @@ -540,8 +538,7 @@ pcap_inject(pcap_t *p, const void *buf, size_t size) for (si = me->begin; si < me->end; si++) { struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si); - ND("ring has %d pkts", ring->avail); - if (ring->avail == 0) + if (nm_ring_empty(ring)) continue; u_int i = ring->cur; u_int idx = ring->slot[i].buf_idx; @@ -553,9 +550,8 @@ pcap_inject(pcap_t *p, const void *buf, size_t size) u_char *dst = (u_char *)NETMAP_BUF(ring, idx); ring->slot[i].len = size; pkt_copy(buf, dst, size); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; - // if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL); + ring->head = ring->cur = nm_ring_next(ring, i); + // if (ring->cur == ring->tail) ioctl(me->fd, NIOCTXSYNC, NULL); return size; } errno = ENOBUFS; diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index a6e5eeb227f6..c1d084028d93 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -52,7 +53,16 @@ int verbose = 0; #define SKIP_PAYLOAD 1 /* do not check payload. 
*/ + +#define VIRT_HDR_1 10 /* length of a base vnet-hdr */ +#define VIRT_HDR_2 12 /* length of the extenede vnet-hdr */ +#define VIRT_HDR_MAX VIRT_HDR_2 +struct virt_header { + uint8_t fields[VIRT_HDR_MAX]; +}; + struct pkt { + struct virt_header vh; struct ether_header eh; struct ip ip; struct udphdr udp; @@ -109,6 +119,8 @@ struct glob_arg { char *ifname; char *nmr_config; int dummy_send; + int virt_header; /* send also the virt_header */ + int host_ring; }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; @@ -146,7 +158,8 @@ extract_ip_range(struct ip_range *r) char *ap, *pp; struct in_addr a; - D("extract IP range from %s", r->name); + if (verbose) + D("extract IP range from %s", r->name); r->port0 = r->port1 = 0; r->start = r->end = 0; @@ -192,7 +205,8 @@ extract_ip_range(struct ip_range *r) a.s_addr = htonl(r->end); strncpy(buf1, inet_ntoa(a), sizeof(buf1)); a.s_addr = htonl(r->start); - D("range is %s:%d to %s:%d", + if (1) + D("range is %s:%d to %s:%d", inet_ntoa(a), r->port0, buf1, r->port1); } } @@ -200,7 +214,8 @@ extract_ip_range(struct ip_range *r) static void extract_mac_range(struct mac_range *r) { - D("extract MAC range from %s", r->name); + if (verbose) + D("extract MAC range from %s", r->name); bcopy(ether_aton(r->name), &r->start, 6); bcopy(ether_aton(r->name), &r->end, 6); #if 0 @@ -215,7 +230,8 @@ extract_mac_range(struct mac_range *r) if (p) targ->dst_mac_range = atoi(p+1); #endif - D("%s starts at %s", r->name, ether_ntoa(&r->start)); + if (verbose) + D("%s starts at %s", r->name, ether_ntoa(&r->start)); } static struct targ *targs; @@ -281,7 +297,7 @@ system_ncpus(void) * Missing numbers or zeroes stand for default values. * As an additional convenience, if exactly one number * is specified, then this is assigned to both #tx-slots and #rx-slots. - * If there is no 4th number, then the 3rd is assigned to both #tx-rings + * If there is no 4th number, then the 3rd is assigned to both #tx-rings * and #rx-rings. */ void parse_nmr_config(const char* conf, struct nmreq *nmr) @@ -362,7 +378,7 @@ source_hwaddr(const char *ifname, char *buf) static int setaffinity(pthread_t me, int i) { -#ifdef __FreeBSD__ +#if 1 // def __FreeBSD__ cpuset_t cpumask; if (i == -1) @@ -373,7 +389,7 @@ setaffinity(pthread_t me, int i) CPU_SET(i, &cpumask); if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) { - D("Unable to set affinity"); + D("Unable to set affinity: %s", strerror(errno)); return 1; } #else @@ -559,6 +575,8 @@ initialize_packet(struct targ *targ) bcopy(&targ->g->src_mac.start, eh->ether_shost, 6); bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6); eh->ether_type = htons(ETHERTYPE_IP); + + bzero(&pkt->vh, sizeof(pkt->vh)); // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } @@ -570,18 +588,19 @@ initialize_packet(struct targ *targ) * an interrupt when done. 
*/ static int -send_packets(struct netmap_ring *ring, struct pkt *pkt, - struct glob_arg *g, u_int count, int options, u_int nfrags) +send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, + int size, struct glob_arg *g, u_int count, int options, + u_int nfrags) { - u_int sent, cur = ring->cur; + u_int n, sent, cur = ring->cur; int fcnt; - int size = g->pkt_size; - if (ring->avail < count) - count = ring->avail; + n = nm_ring_space(ring); + if (n < count) + count = n; if (count < nfrags) { D("truncating packet, no room for frags %d %d", - count, nfrags); + count, nfrags); } #if 0 if (options & (OPT_COPY | OPT_PREFETCH) ) { @@ -590,7 +609,7 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, char *p = NETMAP_BUF(ring, slot->buf_idx); prefetch(p); - cur = NETMAP_RING_NEXT(ring, cur); + cur = nm_ring_next(ring, cur); } cur = ring->cur; } @@ -602,13 +621,13 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, slot->flags = 0; if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; - slot->ptr = (uint64_t)pkt; + slot->ptr = (uint64_t)frame; } else if (options & OPT_COPY) { - pkt_copy(pkt, p, size); + pkt_copy(frame, p, size); if (fcnt == 1) update_addresses(pkt, g); } else if (options & OPT_MEMCPY) { - memcpy(p, pkt, size); + memcpy(p, frame, size); if (fcnt == 1) update_addresses(pkt, g); } else if (options & OPT_PREFETCH) { @@ -625,10 +644,9 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, slot->flags &= ~NS_MOREFRAG; slot->flags |= NS_REPORT; } - cur = NETMAP_RING_NEXT(ring, cur); + cur = nm_ring_next(ring, cur); } - ring->avail -= sent; - ring->cur = cur; + ring->head = ring->cur = cur; return (sent); } @@ -647,6 +665,12 @@ pinger_body(void *data) struct pollfd fds[1]; struct netmap_if *nifp = targ->nifp; int i, rx = 0, n = targ->g->npackets; + void *frame; + int size; + + frame = &targ->pkt; + frame += sizeof(targ->pkt.vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; fds[0].fd = targ->fd; fds[0].events = (POLLIN); @@ -660,36 +684,37 @@ pinger_body(void *data) } clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); + now = last_print; while (n == 0 || (int)sent < n) { struct netmap_ring *ring = NETMAP_TXRING(nifp, 0); struct netmap_slot *slot; char *p; - for (i = 0; i < 1; i++) { + for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? 
*/ slot = &ring->slot[ring->cur]; - slot->len = targ->g->pkt_size; + slot->len = size; p = NETMAP_BUF(ring, slot->buf_idx); - if (ring->avail == 0) { + if (nm_ring_empty(ring)) { D("-- ouch, cannot send"); } else { - pkt_copy(&targ->pkt, p, targ->g->pkt_size); + pkt_copy(frame, p, size); clock_gettime(CLOCK_REALTIME_PRECISE, &ts); bcopy(&sent, p+42, sizeof(sent)); bcopy(&ts, p+46, sizeof(ts)); sent++; - ring->cur = NETMAP_RING_NEXT(ring, ring->cur); - ring->avail--; + ring->head = ring->cur = nm_ring_next(ring, ring->cur); } } /* should use a parameter to decide how often to send */ if (poll(fds, 1, 3000) <= 0) { - D("poll error/timeout on queue %d", targ->me); + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); continue; } /* see what we got back */ for (i = targ->qfirst; i < targ->qlast; i++) { ring = NETMAP_RXRING(nifp, i); - while (ring->avail > 0) { + while (!nm_ring_empty(ring)) { uint32_t seq; slot = &ring->slot[ring->cur]; p = NETMAP_BUF(ring, slot->buf_idx); @@ -709,8 +734,7 @@ pinger_body(void *data) min = ts.tv_nsec; count ++; av += ts.tv_nsec; - ring->avail--; - ring->cur = NETMAP_RING_NEXT(ring, ring->cur); + ring->head = ring->cur = nm_ring_next(ring, ring->cur); rx++; } } @@ -761,25 +785,25 @@ ponger_body(void *data) ioctl(fds[0].fd, NIOCRXSYNC, NULL); #else if (poll(fds, 1, 1000) <= 0) { - D("poll error/timeout on queue %d", targ->me); + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); continue; } #endif txring = NETMAP_TXRING(nifp, 0); txcur = txring->cur; - txavail = txring->avail; + txavail = nm_ring_space(txring); /* see what we got back */ for (i = targ->qfirst; i < targ->qlast; i++) { rxring = NETMAP_RXRING(nifp, i); - while (rxring->avail > 0) { + while (!nm_ring_empty(rxring)) { uint16_t *spkt, *dpkt; uint32_t cur = rxring->cur; struct netmap_slot *slot = &rxring->slot[cur]; char *src, *dst; src = NETMAP_BUF(rxring, slot->buf_idx); //D("got pkt %p of size %d", src, slot->len); - rxring->avail--; - rxring->cur = NETMAP_RING_NEXT(rxring, cur); + rxring->head = rxring->cur = nm_ring_next(rxring, cur); rx++; if (txavail == 0) continue; @@ -797,13 +821,12 @@ ponger_body(void *data) dpkt[5] = spkt[2]; txring->slot[txcur].len = slot->len; /* XXX swap src dst mac */ - txcur = NETMAP_RING_NEXT(txring, txcur); + txcur = nm_ring_next(txring, txcur); txavail--; sent++; } } - txring->cur = txcur; - txring->avail = txavail; + txring->head = txring->cur = txcur; targ->count = sent; #ifdef BUSYWAIT ioctl(fds[0].fd, NIOCTXSYNC, NULL); @@ -847,43 +870,47 @@ timespec2val(const struct timespec *a) } -static int -wait_time(struct timespec ts, struct timespec *wakeup_ts, long long *waited) +static __inline struct timespec +timespec_add(struct timespec a, struct timespec b) { - struct timespec curtime; - - curtime.tv_sec = 0; - curtime.tv_nsec = 0; - - if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) { - D("clock_gettime: %s", strerror(errno)); - return (-1); - } - while (timespec_ge(&ts, &curtime)) { - if (waited != NULL) - (*waited)++; - if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) { - D("clock_gettime"); - return (-1); - } + struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; + if (ret.tv_nsec >= 1000000000) { + ret.tv_sec++; + ret.tv_nsec -= 1000000000; } - if (wakeup_ts != NULL) - *wakeup_ts = curtime; - return (0); + return ret; } -static __inline void -timespec_add(struct timespec *tsa, struct timespec *tsb) +static __inline struct timespec +timespec_sub(struct timespec a, struct timespec b) { - 
tsa->tv_sec += tsb->tv_sec; - tsa->tv_nsec += tsb->tv_nsec; - if (tsa->tv_nsec >= 1000000000) { - tsa->tv_sec++; - tsa->tv_nsec -= 1000000000; + struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; + if (ret.tv_nsec < 0) { + ret.tv_sec--; + ret.tv_nsec += 1000000000; } + return ret; } +/* + * wait until ts, either busy or sleeping if more than 1ms. + * Return wakeup time. + */ +static struct timespec +wait_time(struct timespec ts) +{ + for (;;) { + struct timespec w, cur; + clock_gettime(CLOCK_REALTIME_PRECISE, &cur); + w = timespec_sub(ts, cur); + if (w.tv_sec < 0) + return cur; + else if (w.tv_sec > 0 || w.tv_nsec > 1000000) + poll(NULL, 0, 1); + } +} + static void * sender_body(void *data) { @@ -894,9 +921,15 @@ sender_body(void *data) struct netmap_ring *txring; int i, n = targ->g->npackets / targ->g->nthreads, sent = 0; int options = targ->g->options | OPT_COPY; - struct timespec tmptime, nexttime = { 0, 0}; // XXX silence compiler + struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; - long long waited = 0; + struct pkt *pkt = &targ->pkt; + void *frame; + int size; + + frame = pkt; + frame += sizeof(pkt->vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; D("start"); if (setaffinity(targ->thread, targ->affinity)) @@ -909,23 +942,16 @@ sender_body(void *data) /* main loop.*/ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (rate_limit) { - tmptime.tv_sec = 2; - tmptime.tv_nsec = 0; - timespec_add(&targ->tic, &tmptime); + targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); targ->tic.tv_nsec = 0; - if (wait_time(targ->tic, NULL, NULL) == -1) { - D("wait_time: %s", strerror(errno)); - goto quit; - } + wait_time(targ->tic); nexttime = targ->tic; } if (targ->g->dev_type == DEV_PCAP) { - int size = targ->g->pkt_size; - void *pkt = &targ->pkt; pcap_t *p = targ->g->p; for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { - if (pcap_inject(p, pkt, size) != -1) + if (pcap_inject(p, frame, size) != -1) sent++; update_addresses(pkt, targ->g); if (i > 10000) { @@ -934,12 +960,10 @@ sender_body(void *data) } } } else if (targ->g->dev_type == DEV_TAP) { /* tap */ - int size = targ->g->pkt_size; - void *pkt = &targ->pkt; D("writing to file desc %d", targ->g->main_fd); for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { - if (write(targ->g->main_fd, pkt, size) != -1) + if (write(targ->g->main_fd, frame, size) != -1) sent++; update_addresses(pkt, targ->g); if (i > 10000) { @@ -955,11 +979,8 @@ sender_body(void *data) if (rate_limit && tosend <= 0) { tosend = targ->g->burst; - timespec_add(&nexttime, &targ->g->tx_period); - if (wait_time(nexttime, &tmptime, &waited) == -1) { - D("wait_time"); - goto quit; - } + nexttime = timespec_add(nexttime, targ->g->tx_period); + wait_time(nexttime); } /* @@ -968,7 +989,12 @@ sender_body(void *data) if (poll(fds, 1, 2000) <= 0) { if (targ->cancel) break; - D("poll error/timeout on queue %d", targ->me); + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); + goto quit; + } + if (fds[0].revents & POLLERR) { + D("poll error"); goto quit; } /* @@ -983,12 +1009,12 @@ sender_body(void *data) if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); - if (txring->avail == 0) + if (nm_ring_empty(txring)) continue; if (frags > 1) limit = ((limit + frags - 1) / frags) * frags; - m = send_packets(txring, &targ->pkt, targ->g, + m = send_packets(txring, pkt, frame, size, targ->g, limit, options, frags); ND("limit %d avail %d 
frags %d m %d", limit, txring->avail, frags, m); @@ -1007,7 +1033,7 @@ sender_body(void *data) /* final part: wait all the TX queues to be empty. */ for (i = targ->qfirst; i < targ->qlast; i++) { txring = NETMAP_TXRING(nifp, i); - while (!NETMAP_TX_RING_EMPTY(txring)) { + while (nm_tx_pending(txring)) { ioctl(fds[0].fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } @@ -1039,11 +1065,12 @@ receive_pcap(u_char *user, const struct pcap_pkthdr * h, static int receive_packets(struct netmap_ring *ring, u_int limit, int dump) { - u_int cur, rx; + u_int cur, rx, n; cur = ring->cur; - if (ring->avail < limit) - limit = ring->avail; + n = nm_ring_space(ring); + if (n < limit) + limit = n; for (rx = 0; rx < limit; rx++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); @@ -1051,10 +1078,9 @@ receive_packets(struct netmap_ring *ring, u_int limit, int dump) if (dump) dump_payload(p, slot->len, ring, cur); - cur = NETMAP_RING_NEXT(ring, cur); + cur = nm_ring_next(ring, cur); } - ring->avail -= rx; - ring->cur = cur; + ring->head = ring->cur = cur; return (rx); } @@ -1082,7 +1108,7 @@ receiver_body(void *data) i = poll(fds, 1, 1000); if (i > 0 && !(fds[0].revents & POLLERR)) break; - D("waiting for initial packets, poll returns %d %d", i, fds[0].revents); + RD(1, "waiting for initial packets, poll returns %d %d", i, fds[0].revents); } /* main loop, exit after 1s silence */ @@ -1111,11 +1137,16 @@ receiver_body(void *data) break; } + if (fds[0].revents & POLLERR) { + D("poll err"); + goto quit; + } + for (i = targ->qfirst; i < targ->qlast; i++) { int m; rxring = NETMAP_RXRING(nifp, i); - if (rxring->avail == 0) + if (nm_ring_empty(rxring)) continue; m = receive_packets(rxring, targ->g->burst, dump); @@ -1215,6 +1246,8 @@ usage(void) "\t-w wait_for_link_time in seconds\n" "\t-R rate in packets per second\n" "\t-X dump payload\n" + "\t-H len add empty virtio-net-header with size 'len'\n" + "\t-h use host ring\n" "", cmd); @@ -1243,7 +1276,7 @@ start_threads(struct glob_arg *g) /* register interface. */ tfd = open("/dev/netmap", O_RDWR); if (tfd == -1) { - D("Unable to open /dev/netmap"); + D("Unable to open /dev/netmap: %s", strerror(errno)); continue; } targs[i].fd = tfd; @@ -1251,7 +1284,11 @@ start_threads(struct glob_arg *g) bzero(&tifreq, sizeof(tifreq)); strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name)); tifreq.nr_version = NETMAP_API; - tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0; + if (g->host_ring) { + tifreq.nr_ringid = NETMAP_SW_RING; + } else { + tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0; + } parse_nmr_config(g->nmr_config, &tifreq); /* @@ -1264,7 +1301,7 @@ start_threads(struct glob_arg *g) } if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) { - D("Unable to register %s", g->ifname); + D("Unable to register %s: %s", g->ifname, strerror(errno)); continue; } D("memsize is %d MB", tifreq.nr_memsize >> 20); @@ -1272,9 +1309,14 @@ start_threads(struct glob_arg *g) targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset); D("nifp flags 0x%x", targs[i].nifp->ni_flags); /* start threads. */ - targs[i].qfirst = (g->nthreads > 1) ? i : 0; - targs[i].qlast = (g->nthreads > 1) ? i+1 : - (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings); + if (g->host_ring) { + targs[i].qfirst = (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings); + targs[i].qlast = targs[i].qfirst + 1; + } else { + targs[i].qfirst = (g->nthreads > 1) ? i : 0; + targs[i].qlast = (g->nthreads > 1) ? 
i+1 : + (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings); + } } else { targs[i].fd = g->main_fd; } @@ -1292,7 +1334,7 @@ start_threads(struct glob_arg *g) if (pthread_create(&targs[i].thread, NULL, g->td_body, &targs[i]) == -1) { - D("Unable to create thread %d", i); + D("Unable to create thread %d: %s", i, strerror(errno)); targs[i].used = 0; } } @@ -1439,7 +1481,7 @@ tap_alloc(char *dev) /* try to create the device */ if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) { - D("failed to to a TUNSETIFF"); + D("failed to to a TUNSETIFF: %s", strerror(errno)); close(fd); return err; } @@ -1488,9 +1530,10 @@ main(int arc, char **argv) g.tx_rate = 0; g.frags = 1; g.nmr_config = ""; + g.virt_header = 0; while ( (ch = getopt(arc, argv, - "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:")) != -1) { + "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:H:h")) != -1) { struct sf *fn; switch(ch) { @@ -1613,6 +1656,11 @@ main(int arc, char **argv) break; case 'C': g.nmr_config = strdup(optarg); + break; + case 'H': + g.virt_header = atoi(optarg); + case 'h': + g.host_ring = 1; } } @@ -1649,6 +1697,12 @@ main(int arc, char **argv) extract_mac_range(&g.src_mac); extract_mac_range(&g.dst_mac); + if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1 + && g.virt_header != VIRT_HDR_2) { + D("bad virtio-net-header length"); + usage(); + } + if (g.dev_type == DEV_TAP) { D("want to use tap %s", g.ifname); g.main_fd = tap_alloc(g.ifname); @@ -1682,7 +1736,7 @@ main(int arc, char **argv) */ g.main_fd = open("/dev/netmap", O_RDWR); if (g.main_fd == -1) { - D("Unable to open /dev/netmap"); + D("Unable to open /dev/netmap: %s", strerror(errno)); // fail later } /* @@ -1696,22 +1750,16 @@ main(int arc, char **argv) bzero(&nmr, sizeof(nmr)); nmr.nr_version = NETMAP_API; strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name)); - nmr.nr_version = NETMAP_API; parse_nmr_config(g.nmr_config, &nmr); if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) { - D("Unable to register interface %s", g.ifname); + D("Unable to register interface %s: %s", g.ifname, strerror(errno)); //continue, fail later } ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname, nmr.nr_tx_rings, nmr.nr_tx_slots, nmr.nr_rx_rings, nmr.nr_rx_slots); - //if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { - // D("Unable to get if info without name"); - //} else { - // D("map size is %d Kb", nmr.nr_memsize >> 10); - //} if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { - D("Unable to get if info for %s", g.ifname); + D("Unable to get if info for %s: %s", g.ifname, strerror(errno)); } devqueues = nmr.nr_rx_rings; @@ -1732,7 +1780,7 @@ main(int arc, char **argv) PROT_WRITE | PROT_READ, MAP_SHARED, g.main_fd, 0); if (g.mmap_addr == MAP_FAILED) { - D("Unable to mmap %d KB", nmr.nr_memsize >> 10); + D("Unable to mmap %d KB: %s", nmr.nr_memsize >> 10, strerror(errno)); // continue, fail later } @@ -1772,14 +1820,17 @@ main(int arc, char **argv) g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; if (g.tx_rate > 0) { /* try to have at least something every second, - * reducing the burst size to 0.5s worth of data + * reducing the burst size to some 0.01s worth of data * (but no less than one full set of fragments) */ - if (g.burst > g.tx_rate/2) - g.burst = g.tx_rate/2; + uint64_t x; + int lim = (g.tx_rate)/300; + if (g.burst > lim) + g.burst = lim; if (g.burst < g.frags) g.burst = g.frags; - g.tx_period.tv_nsec = (1e9 / g.tx_rate) * g.burst; + x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate; + g.tx_period.tv_nsec = x; g.tx_period.tv_sec = 
g.tx_period.tv_nsec / 1000000000; g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; } diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c index 0a478ba08b8f..c0cf574986b6 100644 --- a/tools/tools/netmap/vale-ctl.c +++ b/tools/tools/netmap/vale-ctl.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Michio Honda. All rights reserved. + * Copyright (C) 2013-2014 Michio Honda. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -118,7 +118,7 @@ main(int argc, char *argv[]) const char *command = basename(argv[0]); char *name = NULL; - if (argc != 3 && argc != 1 /* list all */ ) { + if (argc > 3) { usage: fprintf(stderr, "Usage:\n" @@ -127,12 +127,13 @@ usage: "\t-d interface interface name to be detached\n" "\t-a interface interface name to be attached\n" "\t-h interface interface name to be attached with the host stack\n" - "\t-l list all or specified bridge's interfaces\n" + "\t-l list all or specified bridge's interfaces (default)\n" "", command); return 0; } - while ((ch = getopt(argc, argv, "d:a:h:g:l:")) != -1) { + while ((ch = getopt(argc, argv, "d:a:h:g:l")) != -1) { + name = optarg; /* default */ switch (ch) { default: fprintf(stderr, "bad option %c %s", ch, optarg); @@ -152,9 +153,14 @@ usage: break; case 'l': nr_cmd = NETMAP_BDG_LIST; + if (optind < argc && argv[optind][0] == '-') + name = NULL; break; } - name = optarg; + if (optind != argc) { + // fprintf(stderr, "optind %d argc %d\n", optind, argc); + goto usage; + } } if (argc == 1) nr_cmd = NETMAP_BDG_LIST; |
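
The changes above consistently replace the old ring->avail accounting with the head/cur/tail indexes and the nm_ring_space()/nm_ring_next()/nm_ring_empty() helpers from netmap_user.h. A minimal receive loop under the new convention could look like the sketch below; the single-ring focus and the byte counting are illustrative choices only, not part of the commit.

	#include <stdint.h>
	#include <net/netmap_user.h>

	/*
	 * Drain one RX ring under the new head/cur/tail convention and
	 * return the number of bytes seen.  `ring' is an RX ring obtained
	 * with NETMAP_RXRING(); a real consumer would parse each buffer
	 * instead of just summing slot lengths.
	 */
	static uint64_t
	drain_rx_ring(struct netmap_ring *ring)
	{
		uint32_t cur = ring->cur;
		uint32_t n = nm_ring_space(ring);	/* replaces the old ring->avail */
		uint64_t bytes = 0;

		while (n-- > 0) {
			struct netmap_slot *slot = &ring->slot[cur];
			char *buf = NETMAP_BUF(ring, slot->buf_idx);

			bytes += slot->len;
			(void)buf;			/* a real application looks at buf here */
			cur = nm_ring_next(ring, cur);
		}
		ring->head = ring->cur = cur;		/* hand the consumed slots back to the kernel */
		return bytes;
	}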
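
The pcap-like wrappers (nm_open(), nm_dispatch(), nm_inject(), nm_nextpkt(), nm_close()) become usable once NETMAP_WITH_LIBS is defined before including netmap_user.h. The following sketch is based only on the prototypes visible in this diff: the port name netmap:em0, the one-second poll() timeout and the ten-iteration loop are arbitrary placeholders, and struct nm_hdr_t is assumed to carry the usual pcap_pkthdr fields (ts, caplen, len).

	#define NETMAP_WITH_LIBS		/* pull in the pcap-like helpers */
	#include <sys/types.h>
	#include <net/netmap_user.h>
	#include <poll.h>
	#include <stdio.h>

	static void
	rx_handler(u_char *arg, const struct nm_hdr_t *h, const u_char *buf)
	{
		(void)arg;
		(void)buf;
		printf("got %u bytes\n", (unsigned)h->len);
	}

	int
	main(void)
	{
		struct nm_desc_t *d = nm_open("netmap:em0", NULL, 0, 0);
		struct pollfd pfd;
		int i;

		if (d == NULL)
			return 1;
		pfd.fd = NETMAP_FD(d);
		pfd.events = POLLIN;
		for (i = 0; i < 10; i++) {
			if (poll(&pfd, 1, 1000) <= 0)
				continue;			/* timeout or error, try again */
			nm_dispatch(d, -1, rx_handler, NULL);	/* handle whatever arrived */
		}
		return nm_close(d);
	}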
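
With avail gone, "has the TX queue been flushed?" is answered by nm_tx_pending(), which pkt-gen now uses when waiting for the last packets to leave the NIC. A restatement of that loop as a standalone helper, assuming fd is the netmap file descriptor bound to the port:

	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <net/netmap_user.h>

	/*
	 * Wait until every slot queued on a TX ring has been consumed by
	 * the hardware, i.e. until nm_tx_pending() returns 0.
	 */
	static void
	wait_tx_flush(int fd, struct netmap_ring *txring)
	{
		while (nm_tx_pending(txring)) {
			ioctl(fd, NIOCTXSYNC, NULL);	/* ask the kernel to reclaim completed slots */
			usleep(1);			/* then give the NIC a tick */
		}
	}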
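
The rewritten rate limiter in pkt-gen replaces the floating-point period computation with 64-bit integer arithmetic: the gap between bursts is burst * 10^9 / rate nanoseconds, with the burst clamped so batches go out at least a few hundred times per second. A compact restatement of that arithmetic (the function name and its standalone form are mine, not the commit's):

	#include <stdint.h>
	#include <time.h>

	/* Period between bursts for a target rate (packets/s) and burst size. */
	static struct timespec
	burst_period(uint64_t burst, uint64_t rate)
	{
		uint64_t ns = (1000000000ULL * burst) / rate;
		struct timespec t;

		t.tv_sec = ns / 1000000000ULL;
		t.tv_nsec = ns % 1000000000ULL;
		return t;
	}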
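
Finally, the new -H option works by reserving VIRT_HDR_MAX scratch bytes in front of the Ethernet header inside struct pkt and starting the transmitted frame virt_header bytes before it, so a 0, 10 or 12 byte virtio-net header can be selected at run time without copying. The sketch below mirrors only the front of that layout; struct pkt_template and frame_start() are illustrative names, and the real struct pkt carries the full Ethernet/IP/UDP headers and payload after vh.

	#include <stdint.h>

	#define VIRT_HDR_MAX	12		/* as in pkt-gen.c */

	struct pkt_template {			/* front of struct pkt in pkt-gen.c */
		uint8_t vh[VIRT_HDR_MAX];	/* optional virtio-net header space */
		/* struct ether_header eh; struct ip ip; struct udphdr udp; ... */
	};

	/* Start of the frame to transmit for a 0, 10 or 12 byte vnet header. */
	static char *
	frame_start(struct pkt_template *p, int virt_header)
	{
		return (char *)p + sizeof(p->vh) - virt_header;
	}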