diff options
Diffstat (limited to 'sys/netinet/tcp_usrreq.c')
-rw-r--r-- | sys/netinet/tcp_usrreq.c | 1319 |
1 files changed, 651 insertions, 668 deletions
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 198852cc8fac..3e705181d5e8 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -33,19 +33,14 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" -#include "opt_tcpdebug.h" #include <sys/param.h> #include <sys/systm.h> @@ -82,6 +77,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_pcb.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> +#include <netinet/ip.h> #include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> @@ -102,9 +98,6 @@ __FBSDID("$FreeBSD$"); #ifdef TCPPCAP #include <netinet/tcp_pcap.h> #endif -#ifdef TCPDEBUG -#include <netinet/tcp_debug.h> -#endif #ifdef TCP_OFFLOAD #include <netinet/tcp_offload.h> #endif @@ -121,39 +114,39 @@ __FBSDID("$FreeBSD$"); * TCP protocol interface to socket abstraction. */ #ifdef INET -static int tcp_connect(struct tcpcb *, struct sockaddr *, +static int tcp_connect(struct tcpcb *, struct sockaddr_in *, struct thread *td); #endif /* INET */ #ifdef INET6 -static int tcp6_connect(struct tcpcb *, struct sockaddr *, +static int tcp6_connect(struct tcpcb *, struct sockaddr_in6 *, struct thread *td); #endif /* INET6 */ static void tcp_disconnect(struct tcpcb *); static void tcp_usrclosed(struct tcpcb *); -static void tcp_fill_info(struct tcpcb *, struct tcp_info *); +static void tcp_fill_info(const struct tcpcb *, struct tcp_info *); static int tcp_pru_options_support(struct tcpcb *tp, int flags); -#ifdef TCPDEBUG -#define TCPDEBUG0 int ostate = 0 -#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 -#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ - tcp_trace(TA_USER, ostate, tp, 0, 0, req) -#else -#define TCPDEBUG0 -#define TCPDEBUG1() -#define TCPDEBUG2(req) -#endif +static void +tcp_bblog_pru(struct tcpcb *tp, uint32_t pru, int error) +{ + struct tcp_log_buffer *lgb; -/* - * tcp_require_unique port requires a globally-unique source port for each - * outgoing connection. The default is to require the 4-tuple to be unique. - */ -VNET_DEFINE(int, tcp_require_unique_port) = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, require_unique_port, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_require_unique_port), 0, - "Require globally-unique ephemeral port for outgoing connections"); -#define V_tcp_require_unique_port VNET(tcp_require_unique_port) + KASSERT(tp != NULL, ("tcp_bblog_pru: tp == NULL")); + INP_WLOCK_ASSERT(tptoinpcb(tp)); + if (tcp_bblogging_on(tp)) { + lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_PRU, error, + 0, NULL, false, NULL, NULL, 0, NULL); + } else { + lgb = NULL; + } + if (lgb != NULL) { + if (error >= 0) { + lgb->tlb_errno = (uint32_t)error; + } + lgb->tlb_flex1 = pru; + } +} /* * TCP attaches to socket via pru_attach(), reserving space, @@ -165,17 +158,13 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td) struct inpcb *inp; struct tcpcb *tp = NULL; int error; - TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); - TCPDEBUG1(); - if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); - if (error) - goto out; - } + error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); + if (error) + goto out; so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; @@ -183,28 +172,23 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td) if (error) goto out; inp = sotoinpcb(so); -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6PROTO) { - inp->inp_vflag |= INP_IPV6; - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) - inp->inp_vflag |= INP_IPV4; - inp->in6p_hops = -1; /* use kernel default */ - } - else -#endif - inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); if (tp == NULL) { error = ENOBUFS; - in_pcbdetach(inp); in_pcbfree(inp); goto out; } tp->t_state = TCPS_CLOSED; + /* Can we inherit anything from the listener? */ + if ((so->so_listen != NULL) && + (so->so_listen->so_pcb != NULL) && + (tp->t_fb->tfb_inherit != NULL)) { + (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(so->so_listen)); + } + tcp_bblog_pru(tp, PRU_ATTACH, error); INP_WUNLOCK(inp); TCPSTATES_INC(TCPS_CLOSED); out: - TCPDEBUG2(PRU_ATTACH); TCP_PROBE2(debug__user, tp, PRU_ATTACH); return (error); } @@ -229,66 +213,12 @@ tcp_usr_detach(struct socket *so) tp = intotcpcb(inp); - if (inp->inp_flags & INP_TIMEWAIT) { - /* - * There are two cases to handle: one in which the time wait - * state is being discarded (INP_DROPPED), and one in which - * this connection will remain in timewait. In the former, - * it is time to discard all state (except tcptw, which has - * already been discarded by the timewait close code, which - * should be further up the call stack somewhere). In the - * latter case, we detach from the socket, but leave the pcb - * present until timewait ends. - * - * XXXRW: Would it be cleaner to free the tcptw here? - * - * Astute question indeed, from twtcp perspective there are - * four cases to consider: - * - * #1 tcp_usr_detach is called at tcptw creation time by - * tcp_twstart, then do not discard the newly created tcptw - * and leave inpcb present until timewait ends - * #2 tcp_usr_detach is called at tcptw creation time by - * tcp_twstart, but connection is local and tw will be - * discarded immediately - * #3 tcp_usr_detach is called at timewait end (or reuse) by - * tcp_twclose, then the tcptw has already been discarded - * (or reused) and inpcb is freed here - * #4 tcp_usr_detach is called() after timewait ends (or reuse) - * (e.g. by soclose), then tcptw has already been discarded - * (or reused) and inpcb is freed here - * - * In all three cases the tcptw should not be freed here. - */ - if (inp->inp_flags & INP_DROPPED) { - KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " - "INP_DROPPED && tp != NULL")); - in_pcbdetach(inp); - in_pcbfree(inp); - } else { - in_pcbdetach(inp); - INP_WUNLOCK(inp); - } - } else { - /* - * If the connection is not in timewait, we consider two - * two conditions: one in which no further processing is - * necessary (dropped || embryonic), and one in which TCP is - * not yet done, but no longer requires the socket, so the - * pcb will persist for the time being. - * - * XXXRW: Does the second case still occur? - */ - if (inp->inp_flags & INP_DROPPED || - tp->t_state < TCPS_SYN_SENT) { - tcp_discardcb(tp); - in_pcbdetach(inp); - in_pcbfree(inp); - } else { - in_pcbdetach(inp); - INP_WUNLOCK(inp); - } - } + KASSERT(inp->inp_flags & INP_DROPPED || + tp->t_state < TCPS_SYN_SENT, + ("%s: inp %p not dropped or embryonic", __func__, inp)); + + tcp_discardcb(tp); + in_pcbfree(inp); } #ifdef INET @@ -300,9 +230,18 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; struct sockaddr_in *sinp; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (EINVAL); + } + tp = intotcpcb(inp); + sinp = (struct sockaddr_in *)nam; if (nam->sa_family != AF_INET) { /* @@ -310,35 +249,29 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) */ if (nam->sa_family != AF_UNSPEC || nam->sa_len < offsetof(struct sockaddr_in, sin_zero) || - sinp->sin_addr.s_addr != INADDR_ANY) - return (EAFNOSUPPORT); + sinp->sin_addr.s_addr != INADDR_ANY) { + error = EAFNOSUPPORT; + goto out; + } nam->sa_family = AF_INET; } - if (nam->sa_len != sizeof(*sinp)) - return (EINVAL); - + if (nam->sa_len != sizeof(*sinp)) { + error = EINVAL; + goto out; + } /* * Must check for multicast addresses and disallow binding * to them. */ - if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) - return (EAFNOSUPPORT); - - TCPDEBUG0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; + if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; goto out; } - tp = intotcpcb(inp); - TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); - error = in_pcbbind(inp, nam, td->td_ucred); + error = in_pcbbind(inp, sinp, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: - TCPDEBUG2(PRU_BIND); + tcp_bblog_pru(tp, PRU_BIND, error); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); @@ -352,34 +285,39 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; struct sockaddr_in6 *sin6; u_char vflagsav; - sin6 = (struct sockaddr_in6 *)nam; - if (nam->sa_family != AF_INET6) - return (EAFNOSUPPORT); - if (nam->sa_len != sizeof(*sin6)) + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); return (EINVAL); + } + tp = intotcpcb(inp); + + vflagsav = inp->inp_vflag; + sin6 = (struct sockaddr_in6 *)nam; + if (nam->sa_family != AF_INET6) { + error = EAFNOSUPPORT; + goto out; + } + if (nam->sa_len != sizeof(*sin6)) { + error = EINVAL; + goto out; + } /* * Must check for multicast addresses and disallow binding * to them. */ - if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) - return (EAFNOSUPPORT); - - TCPDEBUG0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); - INP_WLOCK(inp); - vflagsav = inp->inp_vflag; - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; + if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { + error = EAFNOSUPPORT; goto out; } - tp = intotcpcb(inp); - TCPDEBUG1(); + INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; @@ -398,19 +336,18 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; - error = in_pcbbind(inp, (struct sockaddr *)&sin, - td->td_ucred); + error = in_pcbbind(inp, &sin, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif - error = in6_pcbbind(inp, nam, td->td_ucred); + error = in6_pcbbind(inp, sin6, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: if (error != 0) inp->inp_vflag = vflagsav; - TCPDEBUG2(PRU_BIND); + tcp_bblog_pru(tp, PRU_BIND, error); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); @@ -426,18 +363,17 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; - TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (EINVAL); } tp = intotcpcb(inp); - TCPDEBUG1(); + SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) { @@ -461,11 +397,11 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) } SOCK_UNLOCK(so); - if (IS_FASTOPEN(tp->t_flags)) + if (tp->t_flags & TF_FASTOPEN) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); out: - TCPDEBUG2(PRU_LISTEN); + tcp_bblog_pru(tp, PRU_LISTEN, error); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); @@ -478,20 +414,20 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; u_char vflagsav; - TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (EINVAL); } - vflagsav = inp->inp_vflag; tp = intotcpcb(inp); - TCPDEBUG1(); + + vflagsav = inp->inp_vflag; + SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) { @@ -518,14 +454,14 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) } SOCK_UNLOCK(so); - if (IS_FASTOPEN(tp->t_flags)) + if (tp->t_flags & TF_FASTOPEN) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); if (error != 0) inp->inp_vflag = vflagsav; out: - TCPDEBUG2(PRU_LISTEN); + tcp_bblog_pru(tp, PRU_LISTEN, error); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); @@ -546,45 +482,46 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) struct epoch_tracker et; int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; struct sockaddr_in *sinp; - sinp = (struct sockaddr_in *)nam; - if (nam->sa_family != AF_INET) - return (EAFNOSUPPORT); - if (nam->sa_len != sizeof (*sinp)) - return (EINVAL); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNREFUSED); + } + tp = intotcpcb(inp); + sinp = (struct sockaddr_in *)nam; + if (nam->sa_family != AF_INET) { + error = EAFNOSUPPORT; + goto out; + } + if (nam->sa_len != sizeof (*sinp)) { + error = EINVAL; + goto out; + } /* * Must disallow TCP ``connections'' to multicast addresses. */ - if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) - return (EAFNOSUPPORT); - if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) - return (EACCES); - if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) - return (error); - - TCPDEBUG0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); - INP_WLOCK(inp); - if (inp->inp_flags & INP_TIMEWAIT) { - error = EADDRINUSE; + if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; goto out; } - if (inp->inp_flags & INP_DROPPED) { - error = ECONNREFUSED; + if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) { + error = EACCES; goto out; } + if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) + goto out; if (SOLISTENING(so)) { error = EOPNOTSUPP; goto out; } - tp = intotcpcb(inp); - TCPDEBUG1(); NET_EPOCH_ENTER(et); - if ((error = tcp_connect(tp, nam, td)) != 0) + if ((error = tcp_connect(tp, sinp, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && @@ -593,11 +530,13 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out_in_epoch; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - error = tp->t_fb->tfb_tcp_output(tp); + error = tcp_output(tp); + KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()" + ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error)); out_in_epoch: NET_EPOCH_EXIT(et); out: - TCPDEBUG2(PRU_CONNECT); + tcp_bblog_pru(tp, PRU_CONNECT, error); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); @@ -611,44 +550,43 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) struct epoch_tracker et; int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; struct sockaddr_in6 *sin6; u_int8_t incflagsav; u_char vflagsav; - TCPDEBUG0; - - sin6 = (struct sockaddr_in6 *)nam; - if (nam->sa_family != AF_INET6) - return (EAFNOSUPPORT); - if (nam->sa_len != sizeof (*sin6)) - return (EINVAL); - - /* - * Must disallow TCP ``connections'' to multicast addresses. - */ - if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) - return (EAFNOSUPPORT); - inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNREFUSED); + } + tp = intotcpcb(inp); + vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; - if (inp->inp_flags & INP_TIMEWAIT) { - error = EADDRINUSE; + + sin6 = (struct sockaddr_in6 *)nam; + if (nam->sa_family != AF_INET6) { + error = EAFNOSUPPORT; goto out; } - if (inp->inp_flags & INP_DROPPED) { - error = ECONNREFUSED; + if (nam->sa_len != sizeof (*sin6)) { + error = EINVAL; + goto out; + } + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { + error = EAFNOSUPPORT; goto out; } if (SOLISTENING(so)) { error = EINVAL; goto out; } - tp = intotcpcb(inp); - TCPDEBUG1(); #ifdef INET /* * XXXRW: Some confusion: V4/V6 flags relate to binding, and @@ -682,7 +620,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; NET_EPOCH_ENTER(et); - if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) + if ((error = tcp_connect(tp, &sin, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && @@ -690,7 +628,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif - error = tp->t_fb->tfb_tcp_output(tp); + error = tcp_output(tp); goto out_in_epoch; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { @@ -704,22 +642,22 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; - if ((error = tcp6_connect(tp, nam, td)) != 0) - goto out; + NET_EPOCH_ENTER(et); + if ((error = tcp6_connect(tp, sin6, td)) != 0) + goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) - goto out; + goto out_in_epoch; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - NET_EPOCH_ENTER(et); - error = tp->t_fb->tfb_tcp_output(tp); -#ifdef INET + error = tcp_output(tp); out_in_epoch: -#endif NET_EPOCH_EXIT(et); out: + KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()" + ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error)); /* * If the implicit bind in the connect call fails, restore * the flags we modified. @@ -729,7 +667,7 @@ out: inp->inp_inc.inc_flags = incflagsav; } - TCPDEBUG2(PRU_CONNECT); + tcp_bblog_pru(tp, PRU_CONNECT, error); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); @@ -755,22 +693,22 @@ tcp_usr_disconnect(struct socket *so) struct epoch_tracker et; int error = 0; - TCPDEBUG0; NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & INP_TIMEWAIT) - goto out; if (inp->inp_flags & INP_DROPPED) { - error = ECONNRESET; - goto out; + INP_WUNLOCK(inp); + NET_EPOCH_EXIT(et); + return (ECONNRESET); } tp = intotcpcb(inp); - TCPDEBUG1(); + + if (tp->t_state == TCPS_TIME_WAIT) + goto out; tcp_disconnect(tp); out: - TCPDEBUG2(PRU_DISCONNECT); + tcp_bblog_pru(tp, PRU_DISCONNECT, error); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); @@ -783,100 +721,83 @@ out: * just return the address of the peer, storing through addr. */ static int -tcp_usr_accept(struct socket *so, struct sockaddr **nam) +tcp_usr_accept(struct socket *so, struct sockaddr *sa) { + struct inpcb *inp; + struct tcpcb *tp; int error = 0; - struct inpcb *inp = NULL; - struct tcpcb *tp = NULL; - struct in_addr addr; - in_port_t port = 0; - TCPDEBUG0; - - if (so->so_state & SS_ISDISCONNECTED) - return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNABORTED; - goto out; + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNABORTED); } tp = intotcpcb(inp); - TCPDEBUG1(); - - /* - * We inline in_getpeeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. - */ - port = inp->inp_fport; - addr = inp->inp_faddr; -out: - TCPDEBUG2(PRU_ACCEPT); + if (so->so_state & SS_ISDISCONNECTED) + error = ECONNABORTED; + else + *(struct sockaddr_in *)sa = (struct sockaddr_in ){ + .sin_family = AF_INET, + .sin_len = sizeof(struct sockaddr_in), + .sin_port = inp->inp_fport, + .sin_addr = inp->inp_faddr, + }; + tcp_bblog_pru(tp, PRU_ACCEPT, error); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); - if (error == 0) - *nam = in_sockaddr(port, &addr); - return error; + + return (error); } #endif /* INET */ #ifdef INET6 static int -tcp6_usr_accept(struct socket *so, struct sockaddr **nam) +tcp6_usr_accept(struct socket *so, struct sockaddr *sa) { - struct inpcb *inp = NULL; + struct inpcb *inp; + struct tcpcb *tp; int error = 0; - struct tcpcb *tp = NULL; - struct in_addr addr; - struct in6_addr addr6; - struct epoch_tracker et; - in_port_t port = 0; - int v4 = 0; - TCPDEBUG0; - - if (so->so_state & SS_ISDISCONNECTED) - return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); - NET_EPOCH_ENTER(et); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNABORTED; - goto out; + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNABORTED); } tp = intotcpcb(inp); - TCPDEBUG1(); - /* - * We inline in6_mapped_peeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. - */ - if (inp->inp_vflag & INP_IPV4) { - v4 = 1; - port = inp->inp_fport; - addr = inp->inp_faddr; + if (so->so_state & SS_ISDISCONNECTED) { + error = ECONNABORTED; } else { - port = inp->inp_fport; - addr6 = inp->in6p_faddr; + if (inp->inp_vflag & INP_IPV4) { + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_len = sizeof(struct sockaddr_in), + .sin_port = inp->inp_fport, + .sin_addr = inp->inp_faddr, + }; + in6_sin_2_v4mapsin6(&sin, (struct sockaddr_in6 *)sa); + } else { + *(struct sockaddr_in6 *)sa = (struct sockaddr_in6 ){ + .sin6_family = AF_INET6, + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_port = inp->inp_fport, + .sin6_addr = inp->in6p_faddr, + }; + /* XXX: should catch errors */ + (void)sa6_recoverscope((struct sockaddr_in6 *)sa); + } } -out: - TCPDEBUG2(PRU_ACCEPT); + tcp_bblog_pru(tp, PRU_ACCEPT, error); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); - NET_EPOCH_EXIT(et); - if (error == 0) { - if (v4) - *nam = in6_v4mapsin6_sockaddr(port, &addr); - else - *nam = in6_sockaddr(port, &addr6); - } - return error; + + return (error); } #endif /* INET6 */ @@ -884,34 +805,56 @@ out: * Mark the connection as being incapable of further output. */ static int -tcp_usr_shutdown(struct socket *so) +tcp_usr_shutdown(struct socket *so, enum shutdown_how how) { - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; struct epoch_tracker et; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + int error = 0; - TCPDEBUG0; - NET_EPOCH_ENTER(et); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("inp == NULL")); - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + if (how != SHUT_WR) { + so->so_error = ECONNABORTED; + solisten_wakeup(so); /* unlocks so */ + } else + SOCK_UNLOCK(so); + return (ENOTCONN); + } else if ((so->so_state & + (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { + SOCK_UNLOCK(so); + return (ENOTCONN); } - tp = intotcpcb(inp); - TCPDEBUG1(); - socantsendmore(so); - tcp_usrclosed(tp); - if (!(inp->inp_flags & INP_DROPPED)) - error = tp->t_fb->tfb_tcp_output(tp); + SOCK_UNLOCK(so); -out: - TCPDEBUG2(PRU_SHUTDOWN); - TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); - INP_WUNLOCK(inp); - NET_EPOCH_EXIT(et); + switch (how) { + case SHUT_RD: + sorflush(so); + break; + case SHUT_RDWR: + sorflush(so); + /* FALLTHROUGH */ + case SHUT_WR: + /* + * XXXGL: mimicing old soshutdown() here. But shouldn't we + * return ECONNRESEST for SHUT_RD as well? + */ + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } + + socantsendmore(so); + NET_EPOCH_ENTER(et); + tcp_usrclosed(tp); + error = tcp_output_nodrop(tp); + tcp_bblog_pru(tp, PRU_SHUTDOWN, error); + TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); + error = tcp_unlock_or_drop(tp, error); + NET_EPOCH_EXIT(et); + } + wakeup(&so->so_timeo); return (error); } @@ -924,19 +867,19 @@ tcp_usr_rcvd(struct socket *so, int flags) { struct epoch_tracker et; struct inpcb *inp; - struct tcpcb *tp = NULL; - int error = 0; + struct tcpcb *tp; + int outrv = 0, error = 0; - TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNRESET); } tp = intotcpcb(inp); - TCPDEBUG1(); + + NET_EPOCH_ENTER(et); /* * For passively-created TFO connections, don't attempt a window * update while still in SYN_RECEIVED as this may trigger an early @@ -944,21 +887,19 @@ tcp_usr_rcvd(struct socket *so, int flags) * application response data, or failing that, when the DELACK timer * expires. */ - if (IS_FASTOPEN(tp->t_flags) && - (tp->t_state == TCPS_SYN_RECEIVED)) + if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) goto out; - NET_EPOCH_ENTER(et); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif - tp->t_fb->tfb_tcp_output(tp); - NET_EPOCH_EXIT(et); + outrv = tcp_output_nodrop(tp); out: - TCPDEBUG2(PRU_RCVD); + tcp_bblog_pru(tp, PRU_RCVD, error); TCP_PROBE2(debug__user, tp, PRU_RCVD); - INP_WUNLOCK(inp); + (void) tcp_unlock_or_drop(tp, outrv); + NET_EPOCH_EXIT(et); return (error); } @@ -976,7 +917,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct epoch_tracker et; int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; #ifdef INET #ifdef INET6 struct sockaddr_in sin; @@ -984,46 +925,43 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr_in *sinp; #endif #ifdef INET6 + struct sockaddr_in6 *sin6; int isipv6; #endif u_int8_t incflagsav; u_char vflagsav; bool restoreflags; - TCPDEBUG0; - /* - * We require the pcbinfo "read lock" if we will close the socket - * as part of this call. - */ - NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + if (m != NULL && (flags & PRUS_NOTREADY) == 0) + m_freem(m); + INP_WUNLOCK(inp); + return (ECONNRESET); + } + tp = intotcpcb(inp); + vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; restoreflags = false; - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - if (control) - m_freem(control); - error = ECONNRESET; - goto out; - } + + NET_EPOCH_ENTER(et); if (control != NULL) { /* TCP doesn't do control messages (rights, creds, etc) */ - if (control->m_len) { + if (control->m_len > 0) { m_freem(control); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ - control = NULL; } - tp = intotcpcb(inp); + if ((flags & PRUS_OOB) != 0 && (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0) goto out; - TCPDEBUG1(); if (nam != NULL && tp->t_state < TCPS_SYN_SENT) { if (tp->t_state == TCPS_LISTEN) { error = EINVAL; @@ -1059,9 +997,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #endif /* INET */ #ifdef INET6 case AF_INET6: - { - struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)nam; if (sin6->sin6_len != sizeof(*sin6)) { error = EINVAL; @@ -1116,7 +1051,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, isipv6 = 1; } break; - } #endif /* INET6 */ default: error = EAFNOSUPPORT; @@ -1124,6 +1058,8 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, } } if (!(flags & PRUS_OOB)) { + if (tp->t_acktime == 0) + tp->t_acktime = ticks; sbappendstream(&so->so_snd, m, flags); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { @@ -1137,14 +1073,13 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, */ #ifdef INET6 if (isipv6) - error = tcp6_connect(tp, nam, td); + error = tcp6_connect(tp, sin6, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET - error = tcp_connect(tp, - (struct sockaddr *)sinp, td); + error = tcp_connect(tp, sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We @@ -1159,7 +1094,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, sbflush(&so->so_snd); goto out; } - if (IS_FASTOPEN(tp->t_flags)) + if (tp->t_flags & TF_FASTOPEN) tcp_fastopen_connect(tp); else { tp->snd_wnd = TTCP_CLIENT_SND_WND; @@ -1188,7 +1123,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tp->t_fb->tfb_tcp_output(tp); + error = tcp_output_nodrop(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -1210,6 +1145,8 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * of data past the urgent section. * Otherwise, snd_up should be one lower. */ + if (tp->t_acktime == 0) + tp->t_acktime = ticks; sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); m = NULL; @@ -1223,18 +1160,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, /* * Not going to contemplate SYN|URG */ - if (IS_FASTOPEN(tp->t_flags)) + if (tp->t_flags & TF_FASTOPEN) tp->t_flags &= ~TF_FASTOPEN; #ifdef INET6 if (isipv6) - error = tcp6_connect(tp, nam, td); + error = tcp6_connect(tp, sin6, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET - error = tcp_connect(tp, - (struct sockaddr *)sinp, td); + error = tcp_connect(tp, sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We @@ -1255,7 +1191,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, tp->snd_up = tp->snd_una + sbavail(&so->so_snd); if ((flags & PRUS_NOTREADY) == 0) { tp->t_flags |= TF_FORCEDATA; - error = tp->t_fb->tfb_tcp_output(tp); + error = tcp_output_nodrop(tp); tp->t_flags &= ~TF_FORCEDATA; } } @@ -1281,11 +1217,11 @@ out: inp->inp_vflag = vflagsav; inp->inp_inc.inc_flags = incflagsav; } - TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : - ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); + tcp_bblog_pru(tp, (flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), error); TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); - INP_WUNLOCK(inp); + error = tcp_unlock_or_drop(tp, error); NET_EPOCH_EXIT(et); return (error); } @@ -1300,7 +1236,7 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count) inp = sotoinpcb(so); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); mb_free_notready(m, count); return (ECONNRESET); @@ -1310,12 +1246,13 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count) SOCKBUF_LOCK(&so->so_snd); error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); - if (error == 0) { - NET_EPOCH_ENTER(et); - error = tp->t_fb->tfb_tcp_output(tp); - NET_EPOCH_EXIT(et); + if (error) { + INP_WUNLOCK(inp); + return (error); } - INP_WUNLOCK(inp); + NET_EPOCH_ENTER(et); + error = tcp_output_unlock(tp); + NET_EPOCH_EXIT(et); return (error); } @@ -1327,9 +1264,8 @@ static void tcp_usr_abort(struct socket *so) { struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; struct epoch_tracker et; - TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); @@ -1342,20 +1278,16 @@ tcp_usr_abort(struct socket *so) /* * If we still have full TCP state, and we're not dropped, drop. */ - if (!(inp->inp_flags & INP_TIMEWAIT) && - !(inp->inp_flags & INP_DROPPED)) { + if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); - TCPDEBUG1(); tp = tcp_drop(tp, ECONNABORTED); if (tp == NULL) goto dropped; - TCPDEBUG2(PRU_ABORT); + tcp_bblog_pru(tp, PRU_ABORT, 0); TCP_PROBE2(debug__user, tp, PRU_ABORT); } if (!(inp->inp_flags & INP_DROPPED)) { - SOCK_LOCK(so); - so->so_state |= SS_PROTOREF; - SOCK_UNLOCK(so); + soref(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); @@ -1370,9 +1302,8 @@ static void tcp_usr_close(struct socket *so) { struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; struct epoch_tracker et; - TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); @@ -1383,21 +1314,20 @@ tcp_usr_close(struct socket *so) ("tcp_usr_close: inp_socket == NULL")); /* - * If we still have full TCP state, and we're not dropped, initiate + * If we are still connected and we're not dropped, initiate * a disconnect. */ - if (!(inp->inp_flags & INP_TIMEWAIT) && - !(inp->inp_flags & INP_DROPPED)) { + if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); - TCPDEBUG1(); - tcp_disconnect(tp); - TCPDEBUG2(PRU_CLOSE); - TCP_PROBE2(debug__user, tp, PRU_CLOSE); + if (tp->t_state != TCPS_TIME_WAIT) { + tp->t_flags |= TF_CLOSED; + tcp_disconnect(tp); + tcp_bblog_pru(tp, PRU_CLOSE, 0); + TCP_PROBE2(debug__user, tp, PRU_CLOSE); + } } if (!(inp->inp_flags & INP_DROPPED)) { - SOCK_LOCK(so); - so->so_state |= SS_PROTOREF; - SOCK_UNLOCK(so); + soref(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); @@ -1430,22 +1360,21 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; - TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNRESET); } tp = intotcpcb(inp); + error = tcp_pru_options_support(tp, PRUS_OOB); if (error) { goto out; } - TCPDEBUG1(); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || @@ -1463,115 +1392,95 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: - TCPDEBUG2(PRU_RCVOOB); + tcp_bblog_pru(tp, PRU_RCVOOB, error); TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } #ifdef INET -struct pr_usrreqs tcp_usrreqs = { - .pru_abort = tcp_usr_abort, - .pru_accept = tcp_usr_accept, - .pru_attach = tcp_usr_attach, - .pru_bind = tcp_usr_bind, - .pru_connect = tcp_usr_connect, - .pru_control = in_control, - .pru_detach = tcp_usr_detach, - .pru_disconnect = tcp_usr_disconnect, - .pru_listen = tcp_usr_listen, - .pru_peeraddr = in_getpeeraddr, - .pru_rcvd = tcp_usr_rcvd, - .pru_rcvoob = tcp_usr_rcvoob, - .pru_send = tcp_usr_send, - .pru_ready = tcp_usr_ready, - .pru_shutdown = tcp_usr_shutdown, - .pru_sockaddr = in_getsockaddr, - .pru_sosetlabel = in_pcbsosetlabel, - .pru_close = tcp_usr_close, +struct protosw tcp_protosw = { + .pr_type = SOCK_STREAM, + .pr_protocol = IPPROTO_TCP, + .pr_flags = PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD | + PR_CAPATTACH, + .pr_ctloutput = tcp_ctloutput, + .pr_abort = tcp_usr_abort, + .pr_accept = tcp_usr_accept, + .pr_attach = tcp_usr_attach, + .pr_bind = tcp_usr_bind, + .pr_connect = tcp_usr_connect, + .pr_control = in_control, + .pr_detach = tcp_usr_detach, + .pr_disconnect = tcp_usr_disconnect, + .pr_listen = tcp_usr_listen, + .pr_peeraddr = in_getpeeraddr, + .pr_rcvd = tcp_usr_rcvd, + .pr_rcvoob = tcp_usr_rcvoob, + .pr_send = tcp_usr_send, + .pr_ready = tcp_usr_ready, + .pr_shutdown = tcp_usr_shutdown, + .pr_sockaddr = in_getsockaddr, + .pr_sosetlabel = in_pcbsosetlabel, + .pr_close = tcp_usr_close, }; #endif /* INET */ #ifdef INET6 -struct pr_usrreqs tcp6_usrreqs = { - .pru_abort = tcp_usr_abort, - .pru_accept = tcp6_usr_accept, - .pru_attach = tcp_usr_attach, - .pru_bind = tcp6_usr_bind, - .pru_connect = tcp6_usr_connect, - .pru_control = in6_control, - .pru_detach = tcp_usr_detach, - .pru_disconnect = tcp_usr_disconnect, - .pru_listen = tcp6_usr_listen, - .pru_peeraddr = in6_mapped_peeraddr, - .pru_rcvd = tcp_usr_rcvd, - .pru_rcvoob = tcp_usr_rcvoob, - .pru_send = tcp_usr_send, - .pru_ready = tcp_usr_ready, - .pru_shutdown = tcp_usr_shutdown, - .pru_sockaddr = in6_mapped_sockaddr, - .pru_sosetlabel = in_pcbsosetlabel, - .pru_close = tcp_usr_close, +struct protosw tcp6_protosw = { + .pr_type = SOCK_STREAM, + .pr_protocol = IPPROTO_TCP, + .pr_flags = PR_CONNREQUIRED | PR_IMPLOPCL |PR_WANTRCVD | + PR_CAPATTACH, + .pr_ctloutput = tcp_ctloutput, + .pr_abort = tcp_usr_abort, + .pr_accept = tcp6_usr_accept, + .pr_attach = tcp_usr_attach, + .pr_bind = tcp6_usr_bind, + .pr_connect = tcp6_usr_connect, + .pr_control = in6_control, + .pr_detach = tcp_usr_detach, + .pr_disconnect = tcp_usr_disconnect, + .pr_listen = tcp6_usr_listen, + .pr_peeraddr = in6_mapped_peeraddr, + .pr_rcvd = tcp_usr_rcvd, + .pr_rcvoob = tcp_usr_rcvoob, + .pr_send = tcp_usr_send, + .pr_ready = tcp_usr_ready, + .pr_shutdown = tcp_usr_shutdown, + .pr_sockaddr = in6_mapped_sockaddr, + .pr_sosetlabel = in_pcbsosetlabel, + .pr_close = tcp_usr_close, }; #endif /* INET6 */ #ifdef INET /* * Common subroutine to open a TCP connection to remote host specified - * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local - * port number if needed. Call in_pcbconnect_setup to do the routing and - * to choose a local host address (interface). If there is an existing - * incarnation of the same connection in TIME-WAIT state and if the remote - * host was sending CC options and if the connection duration was < MSL, then - * truncate the previous TIME-WAIT state and proceed. + * by struct sockaddr_in. Call in_pcbconnect() to choose local host address + * and assign a local port number and install the inpcb into the hash. * Initialize connection parameters and enter SYN-SENT state. */ static int -tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) +tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td) { - struct inpcb *inp = tp->t_inpcb, *oinp; - struct socket *so = inp->inp_socket; - struct in_addr laddr; - u_short lport; + struct inpcb *inp = tptoinpcb(tp); + struct socket *so = tptosocket(tp); int error; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); - INP_HASH_WLOCK(&V_tcbinfo); - if (V_tcp_require_unique_port && inp->inp_lport == 0) { - error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error) - goto out; - } + if (__predict_false((so->so_state & + (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | + SS_ISDISCONNECTED)) != 0)) + return (EISCONN); - /* - * Cannot simply call in_pcbconnect, because there might be an - * earlier incarnation of this same connection still in - * TIME_WAIT state, creating an ADDRINUSE error. - */ - laddr = inp->inp_laddr; - lport = inp->inp_lport; - error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, - &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); - if (error && oinp == NULL) - goto out; - if (oinp) { - error = EADDRINUSE; - goto out; - } - /* Handle initial bind if it hadn't been done in advance. */ - if (inp->inp_lport == 0) { - inp->inp_lport = lport; - if (in_pcbinshash(inp) != 0) { - inp->inp_lport = 0; - error = EAGAIN; - goto out; - } - } - inp->inp_laddr = laddr; - in_pcbrehash(inp); + INP_HASH_WLOCK(&V_tcbinfo); + error = in_pcbconnect(inp, sin, td->td_ucred, true); INP_HASH_WUNLOCK(&V_tcbinfo); + if (error != 0) + return (error); /* * Compute window scaling to request: @@ -1590,40 +1499,37 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); - return 0; - -out: - INP_HASH_WUNLOCK(&V_tcbinfo); - return (error); + return (0); } #endif /* INET */ #ifdef INET6 static int -tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) +tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td) { - struct inpcb *inp = tp->t_inpcb; + struct inpcb *inp = tptoinpcb(tp); + struct socket *so = tptosocket(tp); int error; + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); - INP_HASH_WLOCK(&V_tcbinfo); - if (V_tcp_require_unique_port && inp->inp_lport == 0) { - error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error) - goto out; - } - error = in6_pcbconnect(inp, nam, td->td_ucred); - if (error != 0) - goto out; + if (__predict_false((so->so_state & + (SS_ISCONNECTING | SS_ISCONNECTED)) != 0)) + return (EISCONN); + + INP_HASH_WLOCK(&V_tcbinfo); + error = in6_pcbconnect(inp, sin6, td->td_ucred, true); INP_HASH_WUNLOCK(&V_tcbinfo); + if (error != 0) + return (error); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; - soisconnecting(inp->inp_socket); + soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(&inp->inp_inc); @@ -1631,11 +1537,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); - return 0; - -out: - INP_HASH_WUNLOCK(&V_tcbinfo); - return error; + return (0); } #endif /* INET6 */ @@ -1646,11 +1548,11 @@ out: * constants -- for example, the numeric values for tcpi_state will differ * from Linux. */ -static void -tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) +void +tcp_fill_info(const struct tcpcb *tp, struct tcp_info *ti) { - INP_WLOCK_ASSERT(tp->t_inpcb); + INP_LOCK_ASSERT(tptoinpcb(tp)); bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; @@ -1663,8 +1565,20 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } - if (tp->t_flags2 & TF2_ECN_PERMIT) - ti->tcpi_options |= TCPI_OPT_ECN; + switch (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { + case TF2_ECN_PERMIT: + ti->tcpi_options |= TCPI_OPT_ECN; + break; + case TF2_ACE_PERMIT: + /* FALLTHROUGH */ + case TF2_ECN_PERMIT | TF2_ACE_PERMIT: + ti->tcpi_options |= TCPI_OPT_ACE; + break; + default: + break; + } + if (tp->t_flags & TF_FASTOPEN) + ti->tcpi_options |= TCPI_OPT_TFO; ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; @@ -1687,12 +1601,31 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; + ti->tcpi_snd_una = tp->snd_una; + ti->tcpi_snd_max = tp->snd_max; + ti->tcpi_rcv_numsacks = tp->rcv_numsacks; + ti->tcpi_rcv_adv = tp->rcv_adv; + ti->tcpi_dupacks = tp->t_dupacks; + ti->tcpi_rttmin = tp->t_rttlow; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { ti->tcpi_options |= TCPI_OPT_TOE; tcp_offload_tcp_info(tp, ti); } #endif + /* + * AccECN related counters. + */ + if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) == + (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) + /* + * Internal counter starts at 5 for AccECN + * but 0 for RFC3168 ECN. + */ + ti->tcpi_delivered_ce = tp->t_scep - 5; + else + ti->tcpi_delivered_ce = tp->t_scep; + ti->tcpi_received_ce = tp->t_rcep; } /* @@ -1703,7 +1636,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) */ #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ INP_WLOCK(inp); \ - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ + if (inp->inp_flags & INP_DROPPED) { \ INP_WUNLOCK(inp); \ cleanup; \ return (ECONNRESET); \ @@ -1712,24 +1645,30 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) } while(0) #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) -static int +int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) { - struct tcpcb *tp = intotcpcb(inp); + struct socket *so = inp->inp_socket; + struct tcpcb *tp = intotcpcb(inp); int error = 0; MPASS(sopt->sopt_dir == SOPT_SET); + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & INP_DROPPED) == 0, + ("inp_flags == %x", inp->inp_flags)); + KASSERT(so != NULL, ("inp_socket == NULL")); if (sopt->sopt_level != IPPROTO_TCP) { + INP_WUNLOCK(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) - error = ip6_ctloutput(inp->inp_socket, sopt); + error = ip6_ctloutput(so, sopt); #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET - error = ip_ctloutput(inp->inp_socket, sopt); + error = ip_ctloutput(so, sopt); #endif /* * When an IP-level socket option affects TCP, pass control @@ -1757,6 +1696,8 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) case IPPROTO_IP: switch (sopt->sopt_name) { case IP_TOS: + inp->inp_ip_tos &= ~IPTOS_ECN_MASK; + break; case IP_TTL: /* Notify tcp stacks that care (e.g. RACK). */ break; @@ -1768,6 +1709,11 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) default: return (error); } + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } } else if (sopt->sopt_name == TCP_FUNCTION_BLK) { /* * Protect the TCP option TCP_FUNCTION_BLK so @@ -1775,16 +1721,14 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) */ struct tcp_function_set fsn; struct tcp_function_block *blk; + void *ptr = NULL; + INP_WUNLOCK(inp); error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn); if (error) return (error); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } tp = intotcpcb(inp); blk = find_and_ref_tcp_functions(&fsn); @@ -1825,10 +1769,33 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) return (ENOENT); } /* - * Release the old refcnt, the - * lookup acquired a ref on the - * new one already. + * Ensure the new stack takes ownership with a + * clean slate on peak rate threshold. */ + if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) + tp->t_fb->tfb_tcp_timer_stop_all(tp); + if (blk->tfb_tcp_fb_init) { + error = (*blk->tfb_tcp_fb_init)(tp, &ptr); + if (error) { + /* + * Release the ref count the lookup + * acquired. + */ + refcount_release(&blk->tfb_refcnt); + /* + * Now there is a chance that the + * init() function mucked with some + * things before it failed, such as + * hpts or inp_flags2 or timer granularity. + * It should not of, but lets give the old + * stack a chance to reset to a known good state. + */ + if (tp->t_fb->tfb_switch_failed) { + (*tp->t_fb->tfb_switch_failed)(tp); + } + goto err_out; + } + } if (tp->t_fb->tfb_tcp_fb_fini) { struct epoch_tracker et; /* @@ -1839,27 +1806,17 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); NET_EPOCH_EXIT(et); } -#ifdef TCPHPTS - /* Assure that we are not on any hpts */ - tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL); -#endif - if (blk->tfb_tcp_fb_init) { - error = (*blk->tfb_tcp_fb_init)(tp); - if (error) { - refcount_release(&blk->tfb_refcnt); - if (tp->t_fb->tfb_tcp_fb_init) { - if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) { - /* Fall back failed, drop the connection */ - INP_WUNLOCK(inp); - soabort(inp->inp_socket); - return(error); - } - } - goto err_out; - } - } + /* + * Release the old refcnt, the + * lookup acquired a ref on the + * new one already. + */ refcount_release(&tp->t_fb->tfb_refcnt); + /* + * Set in the new stack. + */ tp->t_fb = blk; + tp->t_fb_ptr = ptr; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, @@ -1869,46 +1826,40 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) err_out: INP_WUNLOCK(inp); return (error); - } - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(inp); - return (ECONNRESET); } - tp = intotcpcb(inp); - /* Pass in the INP locked, caller must unlock it. */ - return (tp->t_fb->tfb_tcp_ctloutput(inp->inp_socket, sopt, inp, tp)); + /* Pass in the INP locked, callee must unlock it. */ + return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt)); } static int tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt) { - int error = 0; - struct tcpcb *tp; + struct socket *so = inp->inp_socket; + struct tcpcb *tp = intotcpcb(inp); + int error = 0; MPASS(sopt->sopt_dir == SOPT_GET); + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & INP_DROPPED) == 0, + ("inp_flags == %x", inp->inp_flags)); + KASSERT(so != NULL, ("inp_socket == NULL")); if (sopt->sopt_level != IPPROTO_TCP) { + INP_WUNLOCK(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) - error = ip6_ctloutput(inp->inp_socket, sopt); + error = ip6_ctloutput(so, sopt); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET - error = ip_ctloutput(inp->inp_socket, sopt); + error = ip_ctloutput(so, sopt); #endif return (error); } - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } - tp = intotcpcb(inp); if (((sopt->sopt_name == TCP_FUNCTION_BLK) || (sopt->sopt_name == TCP_FUNCTION_ALIAS))) { struct tcp_function_set fsn; @@ -1928,20 +1879,23 @@ tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt) return (error); } - /* Pass in the INP locked, caller must unlock it. */ - return (tp->t_fb->tfb_tcp_ctloutput(inp->inp_socket, sopt, inp, tp)); + /* Pass in the INP locked, callee must unlock it. */ + return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt)); } int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { - int error; struct inpcb *inp; - error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } if (sopt->sopt_dir == SOPT_SET) return (tcp_ctloutput_set(inp, sopt)); else if (sopt->sopt_dir == SOPT_GET) @@ -1959,44 +1913,14 @@ CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN); CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN); #endif -#ifdef KERN_TLS -static int -copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls) -{ - struct tls_enable_v0 tls_v0; - int error; - - if (sopt->sopt_valsize == sizeof(tls_v0)) { - error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0), - sizeof(tls_v0)); - if (error) - return (error); - memset(tls, 0, sizeof(*tls)); - tls->cipher_key = tls_v0.cipher_key; - tls->iv = tls_v0.iv; - tls->auth_key = tls_v0.auth_key; - tls->cipher_algorithm = tls_v0.cipher_algorithm; - tls->cipher_key_len = tls_v0.cipher_key_len; - tls->iv_len = tls_v0.iv_len; - tls->auth_algorithm = tls_v0.auth_algorithm; - tls->auth_key_len = tls_v0.auth_key_len; - tls->flags = tls_v0.flags; - tls->tls_vmajor = tls_v0.tls_vmajor; - tls->tls_vminor = tls_v0.tls_vminor; - return (0); - } - - return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls))); -} -#endif - extern struct cc_algo newreno_cc_algo; static int -tcp_congestion(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) +tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt) { struct cc_algo *algo; void *ptr = NULL; + struct tcpcb *tp; struct cc_var cc_mem; char buf[TCP_CA_NAME_MAX]; size_t mem_sz; @@ -2008,7 +1932,7 @@ tcp_congestion(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struc return(error); buf[sopt->sopt_valsize] = '\0'; CC_LIST_RLOCK(); - STAILQ_FOREACH(algo, &cc_list, entries) + STAILQ_FOREACH(algo, &cc_list, entries) { if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) { if (algo->flags & CC_MODULE_BEING_REMOVED) { @@ -2017,30 +1941,24 @@ tcp_congestion(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struc } break; } + } if (algo == NULL) { CC_LIST_RUNLOCK(); return(ESRCH); } -do_over: + /* + * With a reference the algorithm cannot be removed + * so we hold a reference through the change process. + */ + cc_refer(algo); + CC_LIST_RUNLOCK(); if (algo->cb_init != NULL) { /* We can now pre-get the memory for the CC */ mem_sz = (*algo->cc_data_sz)(); if (mem_sz == 0) { goto no_mem_needed; } - CC_LIST_RUNLOCK(); ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK); - CC_LIST_RLOCK(); - STAILQ_FOREACH(algo, &cc_list, entries) - if (strncmp(buf, algo->name, - TCP_CA_NAME_MAX) == 0) - break; - if (algo == NULL) { - if (ptr) - free(ptr, M_CC_MEM); - CC_LIST_RUNLOCK(); - return(ESRCH); - } } else { no_mem_needed: mem_sz = 0; @@ -2051,22 +1969,20 @@ no_mem_needed: * back the inplock. */ memset(&cc_mem, 0, sizeof(cc_mem)); - if (mem_sz != (*algo->cc_data_sz)()) { - if (ptr) - free(ptr, M_CC_MEM); - goto do_over; - } INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); + if (ptr) + free(ptr, M_CC_MEM); + /* Release our temp reference */ + CC_LIST_RLOCK(); + cc_release(algo); CC_LIST_RUNLOCK(); - free(ptr, M_CC_MEM); return (ECONNRESET); } tp = intotcpcb(inp); if (ptr != NULL) memset(ptr, 0, mem_sz); - CC_LIST_RUNLOCK(); cc_mem.ccvc.tcp = tp; /* * We once again hold a write lock over the tcb so it's @@ -2090,28 +2006,38 @@ no_mem_needed: * the old ones cleanup (if any). */ if (CC_ALGO(tp)->cb_destroy != NULL) - CC_ALGO(tp)->cb_destroy(tp->ccv); - memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var)); - tp->cc_algo = algo; + CC_ALGO(tp)->cb_destroy(&tp->t_ccv); + /* Detach the old CC from the tcpcb */ + cc_detach(tp); + /* Copy in our temp memory that was inited */ + memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var)); + /* Now attach the new, which takes a reference */ + cc_attach(tp, algo); /* Ok now are we where we have gotten past any conn_init? */ if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) { /* Yep run the connection init for the new CC */ - CC_ALGO(tp)->conn_init(tp->ccv); + CC_ALGO(tp)->conn_init(&tp->t_ccv); } } else if (ptr) free(ptr, M_CC_MEM); INP_WUNLOCK(inp); + /* Now lets release our temp reference */ + CC_LIST_RLOCK(); + cc_release(algo); + CC_LIST_RUNLOCK(); return (error); } int -tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) +tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt) { + struct inpcb *inp = tptoinpcb(tp); int error, opt, optval; u_int ui; struct tcp_info ti; #ifdef KERN_TLS struct tls_enable tls; + struct socket *so = inp->inp_socket; #endif char *pbuf, buf[TCP_LOG_ID_LEN]; #ifdef STATS @@ -2120,6 +2046,9 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp size_t len; INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & INP_DROPPED) == 0, + ("inp_flags == %x", inp->inp_flags)); + KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL")); switch (sopt->sopt_level) { #ifdef INET6 @@ -2158,7 +2087,7 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp } INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); if (CC_ALGO(tp)->ctl_output != NULL) - error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf); + error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, sopt, pbuf); else error = ENOENT; INP_WUNLOCK(inp); @@ -2173,19 +2102,18 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: - if (!TCPMD5_ENABLED()) { - INP_WUNLOCK(inp); + INP_WUNLOCK(inp); + if (!TCPMD5_ENABLED()) return (ENOPROTOOPT); - } error = TCPMD5_PCBCTL(inp, sopt); if (error) return (error); + INP_WLOCK_RECHECK(inp); goto unlock_and_done; #endif /* IPSEC */ case TCP_NODELAY: case TCP_NOOPT: - case TCP_LRD: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -2200,9 +2128,6 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp case TCP_NOOPT: opt = TF_NOOPT; break; - case TCP_LRD: - opt = TF_LRD; - break; default: opt = 0; /* dead code to fool gcc */ break; @@ -2238,7 +2163,7 @@ unlock_and_done: struct epoch_tracker et; NET_EPOCH_ENTER(et); - error = tp->t_fb->tfb_tcp_output(tp); + error = tcp_output_nodrop(tp); NET_EPOCH_EXIT(et); } } @@ -2278,9 +2203,19 @@ unlock_and_done: INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && - optval + 40 >= V_tcp_minmss) + optval + 40 >= V_tcp_minmss) { tp->t_maxseg = optval; - else + if (tp->t_maxseg < V_tcp_mssdflt) { + /* + * The MSS is so small we should not process incoming + * SACK's since we are subject to attack in such a + * case. + */ + tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; + } else { + tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; + } + } else error = EINVAL; goto unlock_and_done; @@ -2319,7 +2254,7 @@ unlock_and_done: break; case TCP_CONGESTION: - error = tcp_congestion(so, sopt, inp, tp); + error = tcp_set_cc_mod(inp, sopt); break; case TCP_REUSPORT_LB_NUMA: @@ -2335,15 +2270,16 @@ unlock_and_done: #ifdef KERN_TLS case TCP_TXTLS_ENABLE: INP_WUNLOCK(inp); - error = copyin_tls_enable(sopt, &tls); - if (error) + error = ktls_copyin_tls_enable(sopt, &tls); + if (error != 0) break; error = ktls_enable_tx(so, &tls); + ktls_cleanup_tls_enable(&tls); break; case TCP_TXTLS_MODE: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); - if (error) + if (error != 0) return (error); INP_WLOCK_RECHECK(inp); @@ -2352,14 +2288,14 @@ unlock_and_done: break; case TCP_RXTLS_ENABLE: INP_WUNLOCK(inp); - error = sooptcopyin(sopt, &tls, sizeof(tls), - sizeof(tls)); - if (error) + error = ktls_copyin_tls_enable(sopt, &tls); + if (error != 0) break; error = ktls_enable_rx(so, &tls); + ktls_cleanup_tls_enable(&tls); break; #endif - + case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: @@ -2376,6 +2312,10 @@ unlock_and_done: INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { + case TCP_MAXUNACKTIME: + tp->t_maxunacktime = ui; + break; + case TCP_KEEPIDLE: tp->t_keepidle = ui; /* @@ -2429,7 +2369,8 @@ unlock_and_done: INP_WLOCK_RECHECK(inp); if (optval >= 0) - tcp_pcap_set_sock_max(TCP_PCAP_OUT ? + tcp_pcap_set_sock_max( + (sopt->sopt_name == TCP_PCAP_OUT) ? &(tp->t_outpkts) : &(tp->t_inpkts), optval); else @@ -2551,10 +2492,9 @@ unlock_and_done: switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: - if (!TCPMD5_ENABLED()) { - INP_WUNLOCK(inp); + INP_WUNLOCK(inp); + if (!TCPMD5_ENABLED()) return (ENOPROTOOPT); - } error = TCPMD5_PCBCTL(inp, sopt); break; #endif @@ -2643,11 +2583,15 @@ unhold: INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; + case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { + case TCP_MAXUNACKTIME: + ui = TP_MAXUNACKTIME(tp) / hz; + break; case TCP_KEEPIDLE: ui = TP_KEEPIDLE(tp) / hz; break; @@ -2667,7 +2611,8 @@ unhold: #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: - optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? + optval = tcp_pcap_get_sock_max( + (sopt->sopt_name == TCP_PCAP_OUT) ? &(tp->t_outpkts) : &(tp->t_inpkts)); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); @@ -2680,7 +2625,7 @@ unhold: break; #ifdef TCP_BLACKBOX case TCP_LOG: - optval = tp->t_logstate; + optval = tcp_get_bblog_state(tp); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; @@ -2715,11 +2660,6 @@ unhold: sizeof(optval)); break; #endif - case TCP_LRD: - optval = tp->t_flags & TF_LRD; - INP_WUNLOCK(inp); - error = sooptcopyout(sopt, &optval, sizeof optval); - break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -2743,8 +2683,8 @@ unhold: static void tcp_disconnect(struct tcpcb *tp) { - struct inpcb *inp = tp->t_inpcb; - struct socket *so = inp->inp_socket; + struct inpcb *inp = tptoinpcb(tp); + struct socket *so = tptosocket(tp); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); @@ -2754,7 +2694,7 @@ tcp_disconnect(struct tcpcb *tp) * socket is still open. */ if (tp->t_state < TCPS_ESTABLISHED && - !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) { + !(tp->t_state > TCPS_LISTEN && (tp->t_flags & TF_FASTOPEN))) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); @@ -2767,7 +2707,8 @@ tcp_disconnect(struct tcpcb *tp) sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - tp->t_fb->tfb_tcp_output(tp); + /* Ignore stack's drop request, we already at it. */ + (void)tcp_output_nodrop(tp); } } @@ -2786,7 +2727,7 @@ tcp_usrclosed(struct tcpcb *tp) { NET_EPOCH_ASSERT(); - INP_WLOCK_ASSERT(tp->t_inpcb); + INP_WLOCK_ASSERT(tptoinpcb(tp)); switch (tp->t_state) { case TCPS_LISTEN: @@ -2818,8 +2759,11 @@ tcp_usrclosed(struct tcpcb *tp) tcp_state_change(tp, TCPS_LAST_ACK); break; } + if (tp->t_acktime == 0) + tp->t_acktime = ticks; if (tp->t_state >= TCPS_FIN_WAIT_2) { - soisdisconnected(tp->t_inpcb->inp_socket); + tcp_free_sackholes(tp); + soisdisconnected(tptosocket(tp)); /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) { int timeout; @@ -2954,12 +2898,16 @@ db_print_tflags(u_int t_flags) db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_PREVVALID) { + db_printf("%sTF_PREVVALID", comma ? ", " : ""); + comma = 1; + } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; } - if (t_flags & TF_LQ_OVERFLOW) { - db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); + if (t_flags & TF_SONOTCONN) { + db_printf("%sTF_SONOTCONN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LASTIDLE) { @@ -2982,6 +2930,10 @@ db_print_tflags(u_int t_flags) db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_WASCRECOVERY) { + db_printf("%sTF_WASCRECOVERY", comma ? ", " : ""); + comma = 1; + } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; @@ -3006,10 +2958,46 @@ db_print_tflags2(u_int t_flags2) int comma; comma = 0; + if (t_flags2 & TF2_PLPMTU_BLACKHOLE) { + db_printf("%sTF2_PLPMTU_BLACKHOLE", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_PLPMTU_PMTUD) { + db_printf("%sTF2_PLPMTU_PMTUD", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_PLPMTU_MAXSEGSNT) { + db_printf("%sTF2_PLPMTU_MAXSEGSNT", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_LOG_AUTO) { + db_printf("%sTF2_LOG_AUTO", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_DROP_AF_DATA) { + db_printf("%sTF2_DROP_AF_DATA", comma ? ", " : ""); + comma = 1; + } if (t_flags2 & TF2_ECN_PERMIT) { db_printf("%sTF2_ECN_PERMIT", comma ? ", " : ""); comma = 1; } + if (t_flags2 & TF2_ECN_SND_CWR) { + db_printf("%sTF2_ECN_SND_CWR", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_ECN_SND_ECE) { + db_printf("%sTF2_ECN_SND_ECE", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_ACE_PERMIT) { + db_printf("%sTF2_ACE_PERMIT", comma ? ", " : ""); + comma = 1; + } + if (t_flags2 & TF2_FBYTES_COMPLETE) { + db_printf("%sTF2_FBYTES_COMPLETE", comma ? ", " : ""); + comma = 1; + } } static void @@ -3042,12 +3030,8 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); - db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", - &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); - - db_print_indent(indent); - db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, - &tp->t_timers->tt_delack, tp->t_inpcb); + db_printf("t_callout: %p t_timers: %p\n", + &tp->t_callout, &tp->t_timers); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); @@ -3065,7 +3049,7 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) db_printf(")\n"); db_print_indent(indent); - db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", + db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: 0x%08x\n", tp->snd_una, tp->snd_max, tp->snd_nxt); db_print_indent(indent); @@ -3101,12 +3085,11 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); - db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " - "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, - tp->t_rttbest); + db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u\n", + tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin); db_print_indent(indent); - db_printf("t_rttupdated: %lu max_sndwnd: %u t_softerror: %d\n", + db_printf("t_rttupdated: %u max_sndwnd: %u t_softerror: %d\n", tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); db_print_indent(indent); |