Diffstat (limited to 'sys')
 sys/dev/sdio/sdiob.c                     |   7
 sys/fs/fuse/fuse_vnops.c                 |  34
 sys/kern/kern_racct.c                    | 307
 sys/net/if_bridge.c                      |  84
 sys/net/if_bridgevar.h                   |  11
 sys/net/if_epair.c                       |  35
 sys/net/pfvar.h                          |   4
 sys/netinet/tcp_sack.c                   |  32
 sys/netinet/tcp_stacks/rack_bbr_common.c |   6
 sys/netpfil/pf/pf.c                      |  19
 sys/netpfil/pf/pf.h                      |   4
 sys/netpfil/pf/pf_ioctl.c                |   3
 sys/netpfil/pf/pf_nl.c                   | 132
 sys/netpfil/pf/pf_nl.h                   |  19
 sys/netpfil/pf/pf_norm.c                 |   2
 sys/netpfil/pf/pf_ruleset.c              |  16
 sys/netpfil/pf/pf_table.c                |   2
 sys/sys/proc.h                           |   1
 sys/sys/racct.h                          |   6
19 files changed, 484 insertions(+), 240 deletions(-)
diff --git a/sys/dev/sdio/sdiob.c b/sys/dev/sdio/sdiob.c
index 4ec2058fa2e4..cb2cc0da6b77 100644
--- a/sys/dev/sdio/sdiob.c
+++ b/sys/dev/sdio/sdiob.c
@@ -150,7 +150,7 @@ sdiob_rw_direct_sc(struct sdiob_softc *sc, uint8_t fn, uint32_t addr, bool wr,
 		sc->ccb = xpt_alloc_ccb();
 	else
 		memset(sc->ccb, 0, sizeof(*sc->ccb));
-	xpt_setup_ccb(&sc->ccb->ccb_h, sc->periph->path, CAM_PRIORITY_NONE);
+	xpt_setup_ccb(&sc->ccb->ccb_h, sc->periph->path, CAM_PRIORITY_NORMAL);
 	CAM_DEBUG(sc->ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("%s(fn=%d, addr=%#02x, wr=%d, *val=%#02x)\n", __func__,
 	    fn, addr, wr, *val));
@@ -250,7 +250,7 @@ sdiob_rw_extended_cam(struct sdiob_softc *sc, uint8_t fn, uint32_t addr,
 		sc->ccb = xpt_alloc_ccb();
 	else
 		memset(sc->ccb, 0, sizeof(*sc->ccb));
-	xpt_setup_ccb(&sc->ccb->ccb_h, sc->periph->path, CAM_PRIORITY_NONE);
+	xpt_setup_ccb(&sc->ccb->ccb_h, sc->periph->path, CAM_PRIORITY_NORMAL);
 	CAM_DEBUG(sc->ccb->ccb_h.path, CAM_DEBUG_TRACE,
 	    ("%s(fn=%d addr=%#0x wr=%d b_count=%u blksz=%u buf=%p incr=%d)\n",
 	    __func__, fn, addr, wr, b_count, blksz, buffer, incaddr));
@@ -977,9 +977,6 @@ sdiobdiscover(void *context, int pending)
 
 	if (sc->ccb == NULL)
 		sc->ccb = xpt_alloc_ccb();
-	else
-		memset(sc->ccb, 0, sizeof(*sc->ccb));
-	xpt_setup_ccb(&sc->ccb->ccb_h, periph->path, CAM_PRIORITY_NONE);
 
 	/*
 	 * Read CCCR and FBR of each function, get manufacturer and device IDs,
diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c
index 32872e8f3f3a..b90ce60ec664 100644
--- a/sys/fs/fuse/fuse_vnops.c
+++ b/sys/fs/fuse/fuse_vnops.c
@@ -1219,36 +1219,20 @@ fuse_vnop_getattr(struct vop_getattr_args *ap)
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
-	int err = 0;
-	int dataflags;
-
-	dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
-
-	/* Note that we are not bailing out on a dead file system just yet. */
-	if (!(dataflags & FSESS_INITED)) {
-		if (!vnode_isvroot(vp)) {
-			fdata_set_dead(fuse_get_mpdata(vnode_mount(vp)));
-			return (EXTERROR(ENOTCONN, "FUSE daemon is not "
-			    "initialized"));
-		} else {
-			goto fake;
-		}
-	}
+	int err;
 
 	err = fuse_internal_getattr(vp, vap, cred, td);
 	if (err == ENOTCONN && vnode_isvroot(vp)) {
-		/* see comment in fuse_vfsop_statfs() */
-		goto fake;
-	} else {
-		return err;
+		/*
+		 * We want to seem a legitimate fs even if the daemon is dead,
+		 * so that, eg., we can still do path based unmounting after
+		 * the daemon dies.
+		 */
+		err = 0;
+		bzero(vap, sizeof(*vap));
+		vap->va_type = vnode_vtype(vp);
 	}
-
-fake:
-	bzero(vap, sizeof(*vap));
-	vap->va_type = vnode_vtype(vp);
-
-	return 0;
+	return err;
 }
 
 /*
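The fuse_vnop_getattr() change above inverts the old flow: the daemon is always asked first, and zeroed, type-only attributes are synthesized only when the root vnode reports ENOTCONN. A minimal userland sketch of observing that behavior; the mount point path is a placeholder:

	/*
	 * Hypothetical check: stat() on the root of a FUSE mount whose daemon
	 * has died.  After this change the kernel fakes the attributes for the
	 * root vnode instead of failing, so the call succeeds and the vnode
	 * type (directory) is preserved.
	 */
	#include <sys/stat.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct stat sb;

		if (stat("/mnt/fuse", &sb) != 0) {	/* hypothetical mount point */
			perror("stat");
			return (1);
		}
		/* Other fields are zeroed; only the type survives. */
		printf("is directory: %d\n", S_ISDIR(sb.st_mode) != 0);
		return (0);
	}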
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 7ee3b9e2048a..7351e9cb6313 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -96,6 +96,13 @@ static void racct_sub_cred_locked(struct ucred *cred, int resource,
 		uint64_t amount);
 static void racct_add_cred_locked(struct ucred *cred, int resource,
 		uint64_t amount);
+static int racct_set_locked(struct proc *p, int resource, uint64_t amount,
+		int force);
+static void racct_updatepcpu_locked(struct proc *p);
+static void racct_updatepcpu_racct_locked(struct racct *racct);
+static void racct_updatepcpu_containers(void);
+static void racct_settime_locked(struct proc *p, bool exit);
+static void racct_zeropcpu_locked(struct proc *p);
 
 SDT_PROVIDER_DEFINE(racct);
 SDT_PROBE_DEFINE3(racct, , rusage, add,
@@ -308,68 +315,6 @@ fixpt_t ccpu_exp[] = {
 
 #define	CCPU_EXP_MAX	110
 
-/*
- * This function is analogical to the getpcpu() function in the ps(1) command.
- * They should both calculate in the same way so that the racct %cpu
- * calculations are consistent with the values shown by the ps(1) tool.
- * The calculations are more complex in the 4BSD scheduler because of the value
- * of the ccpu variable.  In ULE it is defined to be zero which saves us some
- * work.
- */
-static uint64_t
-racct_getpcpu(struct proc *p, u_int pcpu)
-{
-	u_int swtime;
-#ifdef SCHED_4BSD
-	fixpt_t pctcpu, pctcpu_next;
-#endif
-	fixpt_t p_pctcpu;
-	struct thread *td;
-
-	ASSERT_RACCT_ENABLED();
-	KASSERT((p->p_flag & P_IDLEPROC) == 0,
-	    ("racct_getpcpu: idle process %p", p));
-
-	swtime = (ticks - p->p_swtick) / hz;
-
-	/*
-	 * For short-lived processes, the sched_pctcpu() returns small
-	 * values even for cpu intensive processes.  Therefore we use
-	 * our own estimate in this case.
-	 */
-	if (swtime < RACCT_PCPU_SECS)
-		return (pcpu);
-
-	p_pctcpu = 0;
-	FOREACH_THREAD_IN_PROC(p, td) {
-		thread_lock(td);
-#ifdef SCHED_4BSD
-		pctcpu = sched_pctcpu(td);
-		/* Count also the yet unfinished second. */
-		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
-		pctcpu_next += sched_pctcpu_delta(td);
-		p_pctcpu += max(pctcpu, pctcpu_next);
-#else
-		/*
-		 * In ULE the %cpu statistics are updated on every
-		 * sched_pctcpu() call.  So special calculations to
-		 * account for the latest (unfinished) second are
-		 * not needed.
-		 */
-		p_pctcpu += sched_pctcpu(td);
-#endif
-		thread_unlock(td);
-	}
-
-#ifdef SCHED_4BSD
-	if (swtime <= CCPU_EXP_MAX)
-		return ((100 * (uint64_t)p_pctcpu * 1000000) /
-		    (FSCALE - ccpu_exp[swtime]));
-#endif
-
-	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
-}
-
 static void
 racct_add_racct(struct racct *dest, const struct racct *src)
 {
@@ -499,19 +444,6 @@ racct_adjust_resource(struct racct *racct, int resource,
 		    ("%s: resource %d usage < 0", __func__, resource));
 		racct->r_resources[resource] = 0;
 	}
-
-	/*
-	 * There are some cases where the racct %cpu resource would grow
-	 * beyond 100% per core.  For example in racct_proc_exit() we add
-	 * the process %cpu usage to the ucred racct containers.  If too
-	 * many processes terminated in a short time span, the ucred %cpu
-	 * resource could grow too much.  Also, the 4BSD scheduler sometimes
-	 * returns for a thread more than 100% cpu usage.  So we set a sane
-	 * boundary here to 100% * the maximum number of CPUs.
-	 */
-	if ((resource == RACCT_PCTCPU) &&
-	    (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
-		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
 }
 
 static int
@@ -635,10 +567,44 @@ racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
 	RACCT_UNLOCK();
 }
 
+static void
+racct_settime_locked(struct proc *p, bool exit)
+{
+	struct thread *td;
+	struct timeval wallclock;
+	uint64_t runtime;
+
+	ASSERT_RACCT_ENABLED();
+	RACCT_LOCK_ASSERT();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	if (exit) {
+		/*
+		 * proc_reap() has already calculated rux
+		 * and added crux to rux.
+		 */
+		runtime = cputick2usec(p->p_rux.rux_runtime -
+		    p->p_crux.rux_runtime);
+	} else {
+		PROC_STATLOCK(p);
+		FOREACH_THREAD_IN_PROC(p, td)
+			ruxagg(p, td);
+		PROC_STATUNLOCK(p);
+		runtime = cputick2usec(p->p_rux.rux_runtime);
+	}
+	microuptime(&wallclock);
+	timevalsub(&wallclock, &p->p_stats->p_start);
+
+	racct_set_locked(p, RACCT_CPU, runtime, 0);
+	racct_set_locked(p, RACCT_WALLCLOCK,
+	    (uint64_t)wallclock.tv_sec * 1000000 +
+	    wallclock.tv_usec, 0);
+}
+
 static int
 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 {
-	int64_t old_amount, decayed_amount, diff_proc, diff_cred;
+	int64_t old_amount, diff_proc, diff_cred;
 #ifdef RCTL
 	int error;
 #endif
@@ -655,17 +621,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 	 * The diffs may be negative.
 	 */
 	diff_proc = amount - old_amount;
-	if (resource == RACCT_PCTCPU) {
-		/*
-		 * Resources in per-credential racct containers may decay.
-		 * If this is the case, we need to calculate the difference
-		 * between the new amount and the proportional value of the
-		 * old amount that has decayed in the ucred racct containers.
-		 */
-		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
-		diff_cred = amount - decayed_amount;
-	} else
-		diff_cred = diff_proc;
+	diff_cred = diff_proc;
 #ifdef notyet
 	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
 	    ("%s: usage of non-droppable resource %d dropping", __func__,
@@ -908,8 +864,6 @@ racct_proc_fork(struct proc *parent, struct proc *child)
 		goto out;
 #endif
 
-	/* Init process cpu time. */
-	child->p_prev_runtime = 0;
 	child->p_throttled = 0;
 
 	/*
@@ -964,37 +918,16 @@ racct_proc_fork_done(struct proc *child)
 void
 racct_proc_exit(struct proc *p)
 {
-	struct timeval wallclock;
-	uint64_t pct_estimate, pct, runtime;
 	int i;
 
 	if (!racct_enable)
 		return;
 
 	PROC_LOCK(p);
-	/*
-	 * We don't need to calculate rux, proc_reap() has already done this.
-	 */
-	runtime = cputick2usec(p->p_rux.rux_runtime);
-#ifdef notyet
-	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
-#else
-	if (runtime < p->p_prev_runtime)
-		runtime = p->p_prev_runtime;
-#endif
-	microuptime(&wallclock);
-	timevalsub(&wallclock, &p->p_stats->p_start);
-	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
-		pct_estimate = (1000000 * runtime * 100) /
-		    ((uint64_t)wallclock.tv_sec * 1000000 +
-		    wallclock.tv_usec);
-	} else
-		pct_estimate = 0;
-	pct = racct_getpcpu(p, pct_estimate);
-
 	RACCT_LOCK();
-	racct_set_locked(p, RACCT_CPU, runtime, 0);
-	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
+
+	racct_settime_locked(p, true);
+	racct_zeropcpu_locked(p);
 
 	KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
 	    ("process reaped with %ju allocated for RSS\n",
@@ -1068,6 +1001,10 @@ racct_move(struct racct *dest, struct racct *src)
 	RACCT_LOCK();
 	racct_add_racct(dest, src);
 	racct_sub_racct(src, src);
+	dest->r_runtime = src->r_runtime;
+	dest->r_time = src->r_time;
+	src->r_runtime = 0;
+	timevalsub(&src->r_time, &src->r_time);
 	RACCT_UNLOCK();
 }
 
@@ -1170,8 +1107,6 @@ racct_proc_wakeup(struct proc *p)
 static void
 racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
 {
-	int64_t r_old, r_new;
-
 	ASSERT_RACCT_ENABLED();
 	RACCT_LOCK_ASSERT();
 
@@ -1181,15 +1116,6 @@ racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
 	rctl_throttle_decay(racct, RACCT_READIOPS);
 	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
 #endif
-
-	r_old = racct->r_resources[RACCT_PCTCPU];
-
-	/* If there is nothing to decay, just exit. */
-	if (r_old <= 0)
-		return;
-
-	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
-	racct->r_resources[RACCT_PCTCPU] = r_new;
 }
 
 static void
@@ -1221,15 +1147,105 @@ racct_decay(void)
 }
 
 static void
+racct_updatepcpu_racct_locked(struct racct *racct)
+{
+	struct timeval diff;
+	uint64_t elapsed;
+	uint64_t runtime;
+	uint64_t newpcpu;
+	uint64_t oldpcpu;
+
+	ASSERT_RACCT_ENABLED();
+	RACCT_LOCK_ASSERT();
+
+	/* Difference between now and previously-recorded time. */
+	microuptime(&diff);
+	timevalsub(&diff, &racct->r_time);
+	elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec;
+
+	/* Difference between current and previously-recorded runtime. */
+	runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime;
+
+	newpcpu = runtime * 100 * 1000000 / elapsed;
+	oldpcpu = racct->r_resources[RACCT_PCTCPU];
+	/*
+	 * This calculation is equivalent to
+	 * (1 - 0.3) * newpcpu + 0.3 * oldpcpu
+	 * where RACCT_DECAY_FACTOR = 0.3 * FSCALE.
+	 */
+	racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) *
+	    newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE;
+	if (racct->r_resources[RACCT_PCTCPU] >
+	    100 * 1000000 * (uint64_t)mp_ncpus)
+		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 *
+		    (uint64_t)mp_ncpus;
+
+	/* Record current times. */
+	racct->r_runtime = racct->r_resources[RACCT_CPU];
+	timevaladd(&racct->r_time, &diff);
+}
+
+static void
+racct_zeropcpu_locked(struct proc *p)
+{
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	p->p_racct->r_resources[RACCT_PCTCPU] = 0;
+}
+
+static void
+racct_updatepcpu_locked(struct proc *p)
+{
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	racct_updatepcpu_racct_locked(p->p_racct);
+}
+
+static void
+racct_updatepcpu_pre(void)
+{
+
+	RACCT_LOCK();
+}
+
+static void
+racct_updatepcpu_post(void)
+{
+
+	RACCT_UNLOCK();
+}
+
+static void
+racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1, void *dummy2)
+{
+	racct_updatepcpu_racct_locked(racct);
+}
+
+static void
+racct_updatepcpu_containers(void)
+{
+	ASSERT_RACCT_ENABLED();
+
+	ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+	    racct_updatepcpu_post, NULL, NULL);
+	loginclass_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+	    racct_updatepcpu_post, NULL, NULL);
+	prison_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+	    racct_updatepcpu_post, NULL, NULL);
+}
+
+static void
 racctd(void)
 {
-	struct thread *td;
 	struct proc *p;
-	struct timeval wallclock;
-	uint64_t pct, pct_estimate, runtime;
+	struct proc *idle;
 
 	ASSERT_RACCT_ENABLED();
 
+	idle = STAILQ_FIRST(&cpuhead)->pc_idlethread->td_proc;
+
 	for (;;) {
 		racct_decay();
 
@@ -1237,36 +1253,16 @@ racctd(void)
 
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
+			if (p == idle) {
+				PROC_UNLOCK(p);
+				continue;
+			}
 			if (p->p_state != PRS_NORMAL ||
 			    (p->p_flag & P_IDLEPROC) != 0) {
-				if (p->p_state == PRS_ZOMBIE)
-					racct_set(p, RACCT_PCTCPU, 0);
 				PROC_UNLOCK(p);
 				continue;
 			}
 
-			microuptime(&wallclock);
-			timevalsub(&wallclock, &p->p_stats->p_start);
-			PROC_STATLOCK(p);
-			FOREACH_THREAD_IN_PROC(p, td)
-				ruxagg(p, td);
-			runtime = cputick2usec(p->p_rux.rux_runtime);
-			PROC_STATUNLOCK(p);
-#ifdef notyet
-			KASSERT(runtime >= p->p_prev_runtime,
-			    ("runtime < p_prev_runtime"));
-#else
-			if (runtime < p->p_prev_runtime)
-				runtime = p->p_prev_runtime;
-#endif
-			p->p_prev_runtime = runtime;
-			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
-				pct_estimate = (1000000 * runtime * 100) /
-				    ((uint64_t)wallclock.tv_sec * 1000000 +
-				    wallclock.tv_usec);
-			} else
-				pct_estimate = 0;
-			pct = racct_getpcpu(p, pct_estimate);
 			RACCT_LOCK();
 #ifdef RCTL
 			rctl_throttle_decay(p->p_racct, RACCT_READBPS);
@@ -1274,11 +1270,8 @@ racctd(void)
 			rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
 			rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
 #endif
-			racct_set_locked(p, RACCT_PCTCPU, pct, 1);
-			racct_set_locked(p, RACCT_CPU, runtime, 0);
-			racct_set_locked(p, RACCT_WALLCLOCK,
-			    (uint64_t)wallclock.tv_sec * 1000000 +
-			    wallclock.tv_usec, 0);
+			racct_settime_locked(p, false);
+			racct_updatepcpu_locked(p);
 			RACCT_UNLOCK();
 			PROC_UNLOCK(p);
 		}
@@ -1306,6 +1299,8 @@ racctd(void)
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
+
+		racct_updatepcpu_containers();
 		pause("-", hz);
 	}
 }
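The kern_racct.c rewrite replaces the scheduler-coupled racct_getpcpu() path with interval sampling plus an exponentially weighted moving average. A standalone sketch of the arithmetic in racct_updatepcpu_racct_locked(), assuming FSCALE is 2048 and RACCT_DECAY_FACTOR is 0.3 * FSCALE as the in-tree comment states; values are percent scaled by 10^6, as elsewhere in racct:

	#include <stdint.h>
	#include <stdio.h>

	#define	FSCALE			2048			/* assumed */
	#define	RACCT_DECAY_FACTOR	(FSCALE * 3 / 10)	/* 0.3, fixed-point */

	static uint64_t
	decay_pcpu(uint64_t oldpcpu, uint64_t runtime_us, uint64_t elapsed_us)
	{
		uint64_t newpcpu;

		/* Raw estimate over the sampling interval. */
		newpcpu = runtime_us * 100 * 1000000 / elapsed_us;
		/* EWMA: 0.7 * new + 0.3 * old. */
		return (((FSCALE - RACCT_DECAY_FACTOR) * newpcpu +
		    RACCT_DECAY_FACTOR * oldpcpu) / FSCALE);
	}

	int
	main(void)
	{
		/* 500 ms of CPU over a 1 s interval, previous estimate 10%. */
		printf("%ju\n", (uintmax_t)decay_pcpu(10 * 1000000, 500000,
		    1000000));	/* prints ~38000000, i.e. ~38% */
		return (0);
	}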
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index 46e54339a171..3aed54c58e04 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -259,6 +259,7 @@ struct bridge_iflist {
 	struct epoch_context	bif_epoch_ctx;
 	ether_vlanid_t		bif_pvid;	/* port vlan id */
 	ifbvlan_set_t		bif_vlan_set;	/* if allowed tagged vlans */
+	uint16_t		bif_vlanproto;	/* vlan protocol */
 };
 
 /*
@@ -423,6 +424,7 @@ static int	bridge_ioctl_gflags(struct bridge_softc *, void *);
 static int	bridge_ioctl_sflags(struct bridge_softc *, void *);
 static int	bridge_ioctl_gdefpvid(struct bridge_softc *, void *);
 static int	bridge_ioctl_sdefpvid(struct bridge_softc *, void *);
+static int	bridge_ioctl_svlanproto(struct bridge_softc *, void *);
 static int	bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *,
 		    int);
 #ifdef INET
@@ -654,6 +656,9 @@ static const struct bridge_control bridge_control_table[] = {
 
 	{ bridge_ioctl_sdefpvid,	sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
+
+	{ bridge_ioctl_svlanproto,	sizeof(struct ifbreq),
+	  BC_F_COPYIN|BC_F_SUSER },
 };
 static const int bridge_control_table_size = nitems(bridge_control_table);
 
@@ -1494,8 +1499,11 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
 	bif->bif_ifp = ifs;
 	bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER;
 	bif->bif_savedcaps = ifs->if_capenable;
+	bif->bif_vlanproto = ETHERTYPE_VLAN;
 	if (sc->sc_flags & IFBRF_VLANFILTER)
 		bif->bif_pvid = sc->sc_defpvid;
+	if (sc->sc_flags & IFBRF_DEFQINQ)
+		bif->bif_flags |= IFBIF_QINQ;
 
 	/*
 	 * Assign the interface's MAC address to the bridge if it's the first
@@ -1577,6 +1585,7 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
 	req->ifbr_addrmax = bif->bif_addrmax;
 	req->ifbr_addrexceeded = bif->bif_addrexceeded;
 	req->ifbr_pvid = bif->bif_pvid;
+	req->ifbr_vlanproto = bif->bif_vlanproto;
 
 	/* Copy STP state options as flags */
 	if (bp->bp_operedge)
@@ -2252,6 +2261,24 @@ bridge_ioctl_sdefpvid(struct bridge_softc *sc, void *arg)
 	return (0);
 }
 
+static int
+bridge_ioctl_svlanproto(struct bridge_softc *sc, void *arg)
+{
+	struct ifbreq *req = arg;
+	struct bridge_iflist *bif;
+
+	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+	if (bif == NULL)
+		return (EXTERROR(ENOENT, "Interface is not a bridge member"));
+
+	if (req->ifbr_vlanproto != ETHERTYPE_VLAN &&
+	    req->ifbr_vlanproto != ETHERTYPE_QINQ)
+		return (EXTERROR(EINVAL, "Invalid VLAN protocol"));
+
+	bif->bif_vlanproto = req->ifbr_vlanproto;
+
+	return (0);
+}
+
 /*
  * bridge_ifdetach:
  *
@@ -2395,12 +2422,15 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m,
 	}
 
 	/*
-	 * If underlying interface can not do VLAN tag insertion itself
-	 * then attach a packet tag that holds it.
+	 * There are two cases where we have to insert our own tag:
+	 * if the member interface doesn't support hardware tagging,
+	 * or if the tag proto is not 802.1q.
 	 */
 	if ((m->m_flags & M_VLANTAG) &&
-	    (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
-		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
+	    ((dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0 ||
+	     bif->bif_vlanproto != ETHERTYPE_VLAN)) {
+		m = ether_vlanencap_proto(m, m->m_pkthdr.ether_vtag,
+		    bif->bif_vlanproto);
 		if (m == NULL) {
 			if_printf(dst_ifp,
 			    "unable to prepend VLAN header\n");
@@ -2828,9 +2858,29 @@ bridge_input(struct ifnet *ifp, struct mbuf *m)
 
 	NET_EPOCH_ASSERT();
 
+	/* We need the Ethernet header later, so make sure we have it now. */
+	if (m->m_len < ETHER_HDR_LEN) {
+		m = m_pullup(m, ETHER_HDR_LEN);
+		if (m == NULL) {
+			if_inc_counter(sc->sc_ifp, IFCOUNTER_IERRORS, 1);
+			m_freem(m);
+			return (NULL);
+		}
+	}
+
 	eh = mtod(m, struct ether_header *);
 	vlan = VLANTAGOF(m);
 
+	/*
+	 * If this frame has a VLAN tag and the receiving interface has a
+	 * vlan(4) trunk, then it is destined for vlan(4), not for us.
+	 * This means if vlan(4) and bridge(4) are configured on the same
+	 * interface, vlan(4) is preferred, which is what users typically
+	 * expect.
+	 */
+	if (vlan != DOT1Q_VID_NULL && ifp->if_vlantrunk != NULL)
+		return (m);
+
 	bif = ifp->if_bridge;
 	if (bif)
 		sc = bif->bif_sc;
@@ -3031,19 +3081,13 @@ bridge_input(struct ifnet *ifp, struct mbuf *m)
 	do { GRAB_OUR_PACKETS(bifp) } while (0);
 
 	/*
-	 * Check the interface the packet arrived on.  For tagged frames,
-	 * we need to do this even if member_ifaddrs is disabled because
-	 * vlan(4) might need to handle the traffic.
+	 * If member_ifaddrs is enabled, see if the packet is destined for
+	 * one of the members' addresses.
 	 */
-	if (V_member_ifaddrs || (vlan && ifp->if_vlantrunk))
+	if (V_member_ifaddrs) {
+		/* Check the interface the packet arrived on. */
 		do { GRAB_OUR_PACKETS(ifp) } while (0);
 
-	/*
-	 * We only need to check other members interface if member_ifaddrs
-	 * is enabled; otherwise we should have never traffic destined for
-	 * a member's lladdr.
-	 */
-	if (V_member_ifaddrs) {
 		CK_LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) {
 			GRAB_OUR_PACKETS(bif2->bif_ifp)
 		}
@@ -3250,6 +3294,18 @@ bridge_vfilter_in(const struct bridge_iflist *sbif, struct mbuf *m)
 	if ((sbif->bif_sc->sc_flags & IFBRF_VLANFILTER) == 0)
 		return (true);
 
+	/* If Q-in-Q is disabled, check for stacked tags. */
+	if ((sbif->bif_flags & IFBIF_QINQ) == 0) {
+		struct ether_header *eh;
+		uint16_t proto;
+
+		eh = mtod(m, struct ether_header *);
+		proto = ntohs(eh->ether_type);
+
+		if (proto == ETHERTYPE_VLAN || proto == ETHERTYPE_QINQ)
+			return (false);
+	}
+
 	if (vlan == DOT1Q_VID_NULL) {
 		/*
 		 * The frame doesn't have a tag.  If the interface does not
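For reference, the new bridge_enqueue() tagging rule boils down to a single predicate. This is a userland-compilable restatement, not kernel code; the two flag values are illustrative placeholders, not the kernel's definitions:

	#include <stdbool.h>
	#include <stdint.h>

	#define	M_VLANTAG		0x1	/* placeholder value */
	#define	IFCAP_VLAN_HWTAGGING	0x2	/* placeholder value */
	#define	ETHERTYPE_VLAN		0x8100

	/* The bridge must encapsulate the frame itself when the member
	 * lacks hardware tagging or uses a non-802.1Q tag protocol. */
	static bool
	bridge_must_sw_tag(uint32_t m_flags, uint32_t if_capenable,
	    uint16_t vlanproto)
	{
		return ((m_flags & M_VLANTAG) != 0 &&
		    ((if_capenable & IFCAP_VLAN_HWTAGGING) == 0 ||
		    vlanproto != ETHERTYPE_VLAN));
	}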
diff --git a/sys/net/if_bridgevar.h b/sys/net/if_bridgevar.h
index 6718c5ebcc34..b0f579f688ac 100644
--- a/sys/net/if_bridgevar.h
+++ b/sys/net/if_bridgevar.h
@@ -131,13 +131,15 @@
 #define	BRDGSFLAGS		35	/* set bridge flags (ifbrparam) */
 #define	BRDGGDEFPVID		36	/* get default pvid (ifbrparam) */
 #define	BRDGSDEFPVID		37	/* set default pvid (ifbrparam) */
+#define	BRDGSIFVLANPROTO	38	/* set if vlan protocol (ifbreq) */
 
 /* BRDGSFLAGS, Bridge flags (non-interface-specific) */
 typedef uint32_t ifbr_flags_t;
 
 #define	IFBRF_VLANFILTER	(1U<<0)	/* VLAN filtering enabled */
+#define	IFBRF_DEFQINQ		(1U<<1)	/* 802.1ad Q-in-Q allowed by default */
 
-#define	IFBRFBITS	"\020\01VLANFILTER"
+#define	IFBRFBITS	"\020\01VLANFILTER\02DEFQINQ"
 
 /*
  * Generic bridge control request.
@@ -156,6 +158,7 @@ struct ifbreq {
 	uint32_t	ifbr_addrmax;		/* member if addr max */
 	uint32_t	ifbr_addrexceeded;	/* member if addr violations */
 	ether_vlanid_t	ifbr_pvid;		/* member if PVID */
+	uint16_t	ifbr_vlanproto;		/* member if VLAN protocol */
 	uint8_t		pad[32];
 };
 
@@ -172,12 +175,11 @@ struct ifbreq {
 #define	IFBIF_BSTP_ADMEDGE	0x0200	/* member stp admin edge enabled */
 #define	IFBIF_BSTP_ADMCOST	0x0400	/* member stp admin path cost */
 #define	IFBIF_PRIVATE		0x0800	/* if is a private segment */
-/* was	IFBIF_VLANFILTER	0x1000 */
-#define	IFBIF_QINQ		0x2000	/* if allows 802.1ad Q-in-Q */
+#define	IFBIF_QINQ		0x1000	/* if allows 802.1ad Q-in-Q */
 
 #define	IFBIFBITS	"\020\001LEARNING\002DISCOVER\003STP\004SPAN" \
 			"\005STICKY\014PRIVATE\006EDGE\007AUTOEDGE\010PTP" \
-			"\011AUTOPTP"
+			"\011AUTOPTP\015QINQ"
 #define	IFBIFMASK	~(IFBIF_BSTP_EDGE|IFBIF_BSTP_AUTOEDGE|IFBIF_BSTP_PTP| \
 			IFBIF_BSTP_AUTOPTP|IFBIF_BSTP_ADMEDGE| \
 			IFBIF_BSTP_ADMCOST)	/* not saved */
@@ -252,6 +254,7 @@ struct ifbrparam {
 					 * addresses */
 #define	ifbrp_flags	ifbrp_ifbrpu.ifbrpu_int32	/* bridge flags */
 #define	ifbrp_defpvid	ifbrp_ifbrpu.ifbrpu_int16	/* default pvid */
+#define	ifbrp_vlanproto	ifbrp_ifbrpu.ifbrpu_int8	/* vlan protocol */
 
 /*
  * Bridge current operational parameters structure.
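Userland reaches the new BRDGSIFVLANPROTO command the same way as the other bridge ioctls: wrapped in SIOCSDRVSPEC with a struct ifdrv, which is the mechanism ifconfig(8)'s bridge code uses. A hedged sketch; the caller is assumed to have opened a socket(AF_LOCAL, SOCK_DGRAM, 0), and error handling is left to it:

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <sys/sockio.h>
	#include <net/if.h>
	#include <net/ethernet.h>
	#include <net/if_bridgevar.h>
	#include <stdint.h>
	#include <string.h>

	static int
	set_member_vlanproto(int s, const char *br, const char *member,
	    uint16_t proto)
	{
		struct ifdrv ifd;
		struct ifbreq req;

		memset(&ifd, 0, sizeof(ifd));
		memset(&req, 0, sizeof(req));
		strlcpy(ifd.ifd_name, br, sizeof(ifd.ifd_name));
		strlcpy(req.ifbr_ifsname, member, sizeof(req.ifbr_ifsname));
		/* ETHERTYPE_VLAN or ETHERTYPE_QINQ; anything else -> EINVAL */
		req.ifbr_vlanproto = proto;
		ifd.ifd_cmd = BRDGSIFVLANPROTO;
		ifd.ifd_len = sizeof(req);
		ifd.ifd_data = &req;
		return (ioctl(s, SIOCSDRVSPEC, &ifd));
	}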
diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c
index 7051e31565d4..4d35584925a1 100644
--- a/sys/net/if_epair.c
+++ b/sys/net/if_epair.c
@@ -58,6 +58,7 @@
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
+#include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <net/bpf.h>
@@ -97,6 +98,15 @@ static unsigned int next_index = 0;
 #define	EPAIR_LOCK()		mtx_lock(&epair_n_index_mtx)
 #define	EPAIR_UNLOCK()		mtx_unlock(&epair_n_index_mtx)
 
+SYSCTL_DECL(_net_link);
+static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "Pair of virtual cross-over connected Ethernet-like interfaces");
+
+static bool use_ether_gen_addr = false;
+SYSCTL_BOOL(_net_link_epair, OID_AUTO, ether_gen_addr, CTLFLAG_RWTUN,
+    &use_ether_gen_addr, false,
+    "Generate MAC with FreeBSD OUI using ether_gen_addr(9)");
+
 struct epair_softc;
 struct epair_queue {
 	struct mtx		 mtx;
@@ -496,15 +506,29 @@ epair_clone_match(struct if_clone *ifc, const char *name)
 }
 
 static void
+epair_generate_mac_byname(struct epair_softc *sc, uint8_t eaddr[])
+{
+	struct ether_addr gen_eaddr;
+	int i;
+
+	ether_gen_addr_byname(if_name(sc->ifp), &gen_eaddr);
+	for (i = 0; i < ETHER_ADDR_LEN; i++)
+		eaddr[i] = gen_eaddr.octet[i];
+}
+
+static void
 epair_clone_add(struct if_clone *ifc, struct epair_softc *scb)
 {
 	struct ifnet *ifp;
 	uint8_t eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 	ifp = scb->ifp;
-	/* Copy epairNa etheraddr and change the last byte. */
-	memcpy(eaddr, scb->oifp->if_hw_addr, ETHER_ADDR_LEN);
-	eaddr[5] = 0x0b;
+	if (!use_ether_gen_addr) {
+		/* Copy epairNa etheraddr and change the last byte. */
+		memcpy(eaddr, scb->oifp->if_hw_addr, ETHER_ADDR_LEN);
+		eaddr[5] = 0x0b;
+	} else
+		epair_generate_mac_byname(scb, eaddr);
 
 	ether_ifattach(ifp, eaddr);
 
 	if_clone_addif(ifc, ifp);
@@ -719,7 +743,10 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len,
 	/* Finish initialization of interface <n>a. */
 	ifp = sca->ifp;
 	epair_setup_ifp(sca, name, unit);
-	epair_generate_mac(sca, eaddr);
+	if (!use_ether_gen_addr)
+		epair_generate_mac(sca, eaddr);
+	else
+		epair_generate_mac_byname(sca, eaddr);
 
 	ether_ifattach(ifp, eaddr);
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 8dae95c2cc2e..c397f0b67896 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -2338,6 +2338,10 @@ VNET_DECLARE(uma_zone_t, pf_udp_mapping_z);
 #define	V_pf_udp_mapping_z	VNET(pf_udp_mapping_z)
 VNET_DECLARE(uma_zone_t, pf_state_scrub_z);
 #define	V_pf_state_scrub_z	VNET(pf_state_scrub_z)
+VNET_DECLARE(uma_zone_t, pf_anchor_z);
+#define	V_pf_anchor_z		VNET(pf_anchor_z)
+VNET_DECLARE(uma_zone_t, pf_eth_anchor_z);
+#define	V_pf_eth_anchor_z	VNET(pf_eth_anchor_z)
 
 extern void			 pf_purge_thread(void *);
 extern void			 pf_unload_vnet_purge(void);
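The new epair behavior is gated by the net.link.epair.ether_gen_addr knob declared above; since it is CTLFLAG_RWTUN it can be set from loader.conf or at runtime. A minimal sketch of flipping it programmatically before creating an epair (the sysctl name comes from the SYSCTL_BOOL above; the rest is an assumption):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdbool.h>
	#include <stdio.h>

	int
	main(void)
	{
		bool enable = true;

		/* Write-only sysctl update: oldp/oldlenp are NULL. */
		if (sysctlbyname("net.link.epair.ether_gen_addr", NULL, NULL,
		    &enable, sizeof(enable)) != 0) {
			perror("sysctlbyname");
			return (1);
		}
		return (0);
	}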
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 4405098a8620..66275cb04bdd 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -283,7 +283,7 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	/* Check arguments. */
-	KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end"));
+	KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("SEG_GT(rcv_start, rcv_end)"));
 
 	if ((rcv_start == rcv_end) &&
 	    (tp->rcv_numsacks >= 1) &&
@@ -498,8 +498,8 @@ tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
 	tp->snd_numholes--;
 	atomic_subtract_int(&V_tcp_sack_globalholes, 1);
 
-	KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
-	KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
+	KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes < 0"));
+	KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes < 0"));
 }
 
 /*
@@ -583,6 +583,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 	 */
 	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
 		left_edge_delta = th_ack - tp->snd_una;
+		delivered_data += left_edge_delta;
 		sack_blocks[num_sack_blks].start = tp->snd_una;
 		sack_blocks[num_sack_blks++].end = th_ack;
 		/*
@@ -590,7 +591,6 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 		 * due to DSACK blocks
 		 */
 		if (SEQ_LT(tp->snd_fack, th_ack)) {
-			delivered_data += th_ack - tp->snd_una;
 			tp->snd_fack = th_ack;
 			sack_changed = SACK_CHANGE;
 		}
@@ -684,7 +684,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 			delivered_data += sblkp->end - sblkp->start;
 			tp->sackhint.hole_bytes += temp->end - temp->start;
 			KASSERT(tp->sackhint.hole_bytes >= 0,
-			    ("sackhint hole bytes >= 0"));
+			    ("sackhint hole bytes < 0"));
 			tp->snd_fack = sblkp->end;
 			sblkp--;
 			sack_changed = SACK_NEWLOSS;
@@ -783,7 +783,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 				tp->sackhint.sack_bytes_rexmit -=
 				    (SEQ_MIN(cur->rxmit, cur->end) - cur->start);
 				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
-				    ("sackhint bytes rtx >= 0"));
+				    ("sackhint bytes rtx < 0"));
 				sack_changed = SACK_CHANGE;
 				if (SEQ_LEQ(sblkp->start, cur->start)) {
 					/* Data acks at least the beginning of hole. */
@@ -874,13 +874,13 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 
 	if (TAILQ_EMPTY(&tp->snd_holes)) {
 		KASSERT(tp->sackhint.hole_bytes == 0,
-		    ("SACK scoreboard empty, but accounting non-zero\n"));
+		    ("SACK scoreboard empty, but sackhint hole bytes != 0"));
 		tp->sackhint.sack_bytes_rexmit = 0;
 		tp->sackhint.sacked_bytes = 0;
 		tp->sackhint.lost_bytes = 0;
 	} else {
 		KASSERT(tp->sackhint.hole_bytes > 0,
-		    ("SACK scoreboard not empty, but has no bytes\n"));
+		    ("SACK scoreboard not empty, but sackhint hole bytes <= 0"));
 		tp->sackhint.delivered_data = delivered_data;
 		tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
 		KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0"));
@@ -918,9 +918,9 @@ tcp_free_sackholes(struct tcpcb *tp)
 	tp->sackhint.hole_bytes = 0;
 	tp->sackhint.lost_bytes = 0;
 
-	KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
+	KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes != 0"));
 	KASSERT(tp->sackhint.nexthole == NULL,
-	    ("tp->sackhint.nexthole == NULL"));
+	    ("tp->sackhint.nexthole != NULL"));
 }
 
 /*
@@ -1061,11 +1061,15 @@ tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
 			}
 		}
 	}
-	KASSERT(SEQ_LT(hole->start, hole->end), ("%s: hole.start >= hole.end", __func__));
+	KASSERT(SEQ_LT(hole->start, hole->end),
+	    ("%s: SEQ_GEQ(hole.start, hole.end)", __func__));
 	if (!(V_tcp_do_newsack)) {
-		KASSERT(SEQ_LT(hole->start, tp->snd_fack), ("%s: hole.start >= snd.fack", __func__));
-		KASSERT(SEQ_LT(hole->end, tp->snd_fack), ("%s: hole.end >= snd.fack", __func__));
-		KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack), ("%s: hole.rxmit >= snd.fack", __func__));
+		KASSERT(SEQ_LT(hole->start, tp->snd_fack),
+		    ("%s: SEG_GEQ(hole.start, snd.fack)", __func__));
+		KASSERT(SEQ_LT(hole->end, tp->snd_fack),
+		    ("%s: SEG_GEQ(hole.end, snd.fack)", __func__));
+		KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack),
+		    ("%s: SEQ_GEQ(hole.rxmit, snd.fack)", __func__));
 		if (SEQ_GEQ(hole->start, hole->end) ||
 		    SEQ_GEQ(hole->start, tp->snd_fack) ||
 		    SEQ_GEQ(hole->end, tp->snd_fack) ||
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index d1c4ba58bf55..fb013d3d17f0 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -509,11 +509,9 @@ void
 ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
     int32_t rstreason, int32_t tlen)
 {
-	if (tp != NULL) {
-		tcp_dropwithreset(m, th, tp, tlen, rstreason);
+	tcp_dropwithreset(m, th, tp, tlen, rstreason);
+	if (tp != NULL)
 		INP_WUNLOCK(tptoinpcb(tp));
-	} else
-		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 }
 
 void
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 4801b3e2c766..3fa7789efcfe 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -621,7 +621,7 @@ static void
 pf_packet_rework_nat(struct pf_pdesc *pd, int off, struct pf_state_key *nk)
 {
 
-	switch (pd->proto) {
+	switch (pd->virtual_proto) {
 	case IPPROTO_TCP: {
 		struct tcphdr *th = &pd->hdr.tcp;
 
@@ -1254,6 +1254,21 @@ pf_initialize(void)
 		    MTX_DEF | MTX_DUPOK);
 	}
 
+	/* Anchors */
+	V_pf_anchor_z = uma_zcreate("pf anchors",
+	    sizeof(struct pf_kanchor), NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	V_pf_limits[PF_LIMIT_ANCHORS].zone = V_pf_anchor_z;
+	uma_zone_set_max(V_pf_anchor_z, PF_ANCHOR_HIWAT);
+	uma_zone_set_warning(V_pf_anchor_z, "PF anchor limit reached");
+
+	V_pf_eth_anchor_z = uma_zcreate("pf Ethernet anchors",
+	    sizeof(struct pf_keth_anchor), NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	V_pf_limits[PF_LIMIT_ETH_ANCHORS].zone = V_pf_eth_anchor_z;
+	uma_zone_set_max(V_pf_eth_anchor_z, PF_ANCHOR_HIWAT);
+	uma_zone_set_warning(V_pf_eth_anchor_z, "PF Ethernet anchor limit reached");
+
 	/* ALTQ */
 	TAILQ_INIT(&V_pf_altqs[0]);
 	TAILQ_INIT(&V_pf_altqs[1]);
@@ -6376,7 +6391,7 @@ pf_translate_compat(struct pf_test_ctx *ctx)
 	KASSERT(ctx->sk != NULL, ("%s: null sk", __func__));
 	KASSERT(ctx->nk != NULL, ("%s: null nk", __func__));
 
-	switch (pd->proto) {
+	switch (pd->virtual_proto) {
 	case IPPROTO_TCP:
 		if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], pd->af) ||
 		    nk->port[pd->sidx] != pd->nsport) {
diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h
index cfff58064922..51b3fd6390e1 100644
--- a/sys/netpfil/pf/pf.h
+++ b/sys/netpfil/pf/pf.h
@@ -120,7 +120,8 @@ enum	{
 enum	{ PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO };
 enum	{ PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS,
-	  PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX };
+	  PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_ANCHORS, PF_LIMIT_ETH_ANCHORS,
+	  PF_LIMIT_MAX };
 #define PF_POOL_IDMASK		0x0f
 enum	{ PF_POOL_NONE, PF_POOL_BITMASK, PF_POOL_RANDOM, PF_POOL_SRCHASH,
 	  PF_POOL_ROUNDROBIN };
@@ -490,6 +491,7 @@ struct pf_osfp_ioctl {
 
 #define	PF_ANCHOR_NAME_SIZE	 64
 #define	PF_ANCHOR_MAXPATH	(MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)
+#define	PF_ANCHOR_HIWAT		 512
 #define	PF_OPTIMIZER_TABLE_PFX	"__automatic_"
 
 struct pf_rule {
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index 178ee01649c6..b6f5d74b5b42 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -331,6 +331,8 @@ pfattach_vnet(void)
 
 	V_pf_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT;
 	V_pf_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT;
+	V_pf_limits[PF_LIMIT_ANCHORS].limit = PF_ANCHOR_HIWAT;
+	V_pf_limits[PF_LIMIT_ETH_ANCHORS].limit = PF_ANCHOR_HIWAT;
 
 	RB_INIT(&V_pf_anchors);
 	pf_init_kruleset(&pf_main_ruleset);
@@ -4973,6 +4975,7 @@ DIOCCHANGEADDR_error:
 			goto fail;
 		}
 		PF_RULES_WLOCK();
+		io->pfrio_nadd = 0;
 		error = pfr_add_addrs(&io->pfrio_table, pfras,
 		    io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags |
 		    PFR_FLAG_USERIOCTL);
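Note the interplay of the two pf changes above: pfr_add_addrs() now accumulates into *nadd ("+=" instead of "=", see the pf_table.c hunk further down), so the DIOCRADDADDRS ioctl zeroes io->pfrio_nadd first, while the batched netlink path can sum across calls. A toy model of the changed counter contract, not kernel code:

	#include <stdio.h>

	/* Stand-in for pfr_add_addrs(): accumulates into *nadd. */
	static int
	add_addrs(int size, int *nadd)
	{
		*nadd += size;	/* "+=" semantics after this commit */
		return (0);
	}

	int
	main(void)
	{
		int nadd = 0;	/* callers must reset, as DIOCRADDADDRS now does */

		add_addrs(3, &nadd);
		add_addrs(2, &nadd);
		printf("%d\n", nadd);	/* 5: totals accumulate across calls */
		return (0);
	}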
diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c
index c5de1e84a287..09754359ec2d 100644
--- a/sys/netpfil/pf/pf_nl.c
+++ b/sys/netpfil/pf/pf_nl.c
@@ -2082,6 +2082,123 @@ pf_handle_clear_addrs(struct nlmsghdr *hdr, struct nl_pstate *npt)
 	return (error);
 }
 
+TAILQ_HEAD(pfr_addrq, pfr_addr_item);
+struct nl_parsed_table_addrs {
+	struct pfr_table table;
+	uint32_t flags;
+	struct pfr_addr addrs[256];
+	size_t addr_count;
+	int nadd;
+	int ndel;
+};
+#define	_OUT(_field)	offsetof(struct pfr_addr, _field)
+static const struct nlattr_parser nla_p_pfr_addr[] = {
+	{ .type = PFR_A_AF, .off = _OUT(pfra_af), .cb = nlattr_get_uint8 },
+	{ .type = PFR_A_NET, .off = _OUT(pfra_net), .cb = nlattr_get_uint8 },
+	{ .type = PFR_A_NOT, .off = _OUT(pfra_not), .cb = nlattr_get_bool },
+	{ .type = PFR_A_ADDR, .off = _OUT(pfra_u), .cb = nlattr_get_in6_addr },
+};
+#undef _OUT
+NL_DECLARE_ATTR_PARSER(pfra_addr_parser, nla_p_pfr_addr);
+
+static int
+nlattr_get_pfr_addr(struct nlattr *nla, struct nl_pstate *npt, const void *arg,
+    void *target)
+{
+	struct nl_parsed_table_addrs *attrs = target;
+	struct pfr_addr addr = { 0 };
+	int error;
+
+	if (attrs->addr_count >= nitems(attrs->addrs))
+		return (E2BIG);
+
+	error = nlattr_get_nested(nla, npt, &pfra_addr_parser, &addr);
+	if (error != 0)
+		return (error);
+
+	memcpy(&attrs->addrs[attrs->addr_count], &addr, sizeof(addr));
+	attrs->addr_count++;
+
+	return (0);
+}
+
+NL_DECLARE_ATTR_PARSER(nested_table_parser, nla_p_table);
+
+#define	_OUT(_field)	offsetof(struct nl_parsed_table_addrs, _field)
+static const struct nlattr_parser nla_p_table_addr[] = {
+	{ .type = PF_TA_TABLE, .off = _OUT(table), .arg = &nested_table_parser, .cb = nlattr_get_nested },
+	{ .type = PF_TA_ADDR, .cb = nlattr_get_pfr_addr },
+	{ .type = PF_TA_FLAGS, .off = _OUT(flags), .cb = nlattr_get_uint32 },
+};
+NL_DECLARE_PARSER(table_addr_parser, struct genlmsghdr, nlf_p_empty, nla_p_table_addr);
+#undef _OUT
+
+static int
+pf_handle_table_add_addrs(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	struct nl_parsed_table_addrs attrs = { 0 };
+	struct nl_writer *nw = npt->nw;
+	struct genlmsghdr *ghdr_new;
+	int error;
+
+	error = nl_parse_nlmsg(hdr, &table_addr_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	PF_RULES_WLOCK();
+	error = pfr_add_addrs(&attrs.table, &attrs.addrs[0],
+	    attrs.addr_count, &attrs.nadd, attrs.flags | PFR_FLAG_USERIOCTL);
+	PF_RULES_WUNLOCK();
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr)))
+		return (ENOMEM);
+
+	ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
+	ghdr_new->cmd = PFNL_CMD_TABLE_ADD_ADDR;
+	ghdr_new->version = 0;
+	ghdr_new->reserved = 0;
+
+	nlattr_add_u32(nw, PF_TA_NBR_ADDED, attrs.nadd);
+
+	if (!nlmsg_end(nw))
+		return (ENOMEM);
+
+	return (error);
+}
+
+static int
+pf_handle_table_del_addrs(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	struct nl_parsed_table_addrs attrs = { 0 };
+	struct nl_writer *nw = npt->nw;
+	struct genlmsghdr *ghdr_new;
+	int error;
+
+	error = nl_parse_nlmsg(hdr, &table_addr_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	PF_RULES_WLOCK();
+	error = pfr_del_addrs(&attrs.table, &attrs.addrs[0],
+	    attrs.addr_count, &attrs.ndel, attrs.flags | PFR_FLAG_USERIOCTL);
+	PF_RULES_WUNLOCK();
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr)))
+		return (ENOMEM);
+
+	ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
+	ghdr_new->cmd = PFNL_CMD_TABLE_DEL_ADDR;
+	ghdr_new->version = 0;
+	ghdr_new->reserved = 0;
+
+	nlattr_add_u32(nw, PF_TA_NBR_DELETED, attrs.ndel);
+
+	if (!nlmsg_end(nw))
+		return (ENOMEM);
+
+	return (error);
+}
+
 static const struct nlhdr_parser *all_parsers[] = {
 	&state_parser,
 	&addrule_parser,
@@ -2096,6 +2213,7 @@ static const struct nlhdr_parser *all_parsers[] = {
 	&add_addr_parser,
 	&ruleset_parser,
 	&table_parser,
+	&table_addr_parser,
 };
 
 static uint16_t family_id;
@@ -2318,6 +2436,20 @@ static const struct genl_cmd pf_cmds[] = {
 		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL,
 		.cmd_priv = PRIV_NETINET_PF,
 	},
+	{
+		.cmd_num = PFNL_CMD_TABLE_ADD_ADDR,
+		.cmd_name = "TABLE_ADD_ADDRS",
+		.cmd_cb = pf_handle_table_add_addrs,
+		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL,
+		.cmd_priv = PRIV_NETINET_PF,
+	},
+	{
+		.cmd_num = PFNL_CMD_TABLE_DEL_ADDR,
+		.cmd_name = "TABLE_DEL_ADDRS",
+		.cmd_cb = pf_handle_table_del_addrs,
+		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL,
+		.cmd_priv = PRIV_NETINET_PF,
+	},
 };
 
 void
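A userland caller for the new PFNL_CMD_TABLE_ADD_ADDR command would follow the attribute layout parsed above: one nested PF_TA_TABLE, one nested PF_TA_ADDR per address, and PF_TA_FLAGS. The sketch below uses FreeBSD's snl(3) helpers in the style of pfctl's other netlink requests; treat the exact snl calls and the PF_T_NAME table attribute as assumptions, and note that error handling and the PFR_A_ADDR payload are omitted:

	#include <sys/socket.h>

	#include <netlink/netlink.h>
	#include <netlink/netlink_generic.h>
	#include <netlink/netlink_snl.h>
	#include <netlink/netlink_snl_generic.h>

	static bool
	table_add_one_addr(struct snl_state *ss)
	{
		struct snl_writer nw;
		struct nlmsghdr *hdr;
		uint16_t family;
		int off;

		family = snl_get_genl_family(ss, "pfctl");

		snl_init_writer(ss, &nw);
		snl_create_genl_msg_request(&nw, family,
		    PFNL_CMD_TABLE_ADD_ADDR);

		off = snl_add_msg_attr_nested(&nw, PF_TA_TABLE);
		snl_add_msg_attr_string(&nw, PF_T_NAME, "mytable");	/* assumed */
		snl_end_attr_nested(&nw, off);

		off = snl_add_msg_attr_nested(&nw, PF_TA_ADDR);
		snl_add_msg_attr_u8(&nw, PFR_A_AF, AF_INET);
		snl_add_msg_attr_u8(&nw, PFR_A_NET, 32);	/* /32: one host */
		/* PFR_A_ADDR carries the in6_addr-sized address blob. */
		snl_end_attr_nested(&nw, off);

		snl_add_msg_attr_u32(&nw, PF_TA_FLAGS, 0);

		hdr = snl_finalize_msg(&nw);
		return (hdr != NULL && snl_send_message(ss, hdr));
	}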
diff --git a/sys/netpfil/pf/pf_nl.h b/sys/netpfil/pf/pf_nl.h
index d263a0b22deb..87daac393821 100644
--- a/sys/netpfil/pf/pf_nl.h
+++ b/sys/netpfil/pf/pf_nl.h
@@ -67,6 +67,8 @@ enum {
 	PFNL_CMD_GET_TSTATS = 29,
 	PFNL_CMD_CLR_TSTATS = 30,
 	PFNL_CMD_CLR_ADDRS = 31,
+	PFNL_CMD_TABLE_ADD_ADDR = 32,
+	PFNL_CMD_TABLE_DEL_ADDR = 33,
 	__PFNL_CMD_MAX,
 };
 #define PFNL_CMD_MAX (__PFNL_CMD_MAX -1)
@@ -461,6 +463,23 @@ enum pf_tstats_t {
 	PF_TS_NZERO		= 9, /* u64 */
 };
 
+enum pfr_addr_t {
+	PFR_A_UNSPEC,
+	PFR_A_AF	= 1, /* uint8_t */
+	PFR_A_NET	= 2, /* uint8_t */
+	PFR_A_NOT	= 3, /* bool */
+	PFR_A_ADDR	= 4, /* in6_addr */
+};
+
+enum pf_table_addrs_t {
+	PF_TA_UNSPEC,
+	PF_TA_TABLE	= 1, /* nested, pf_table_t */
+	PF_TA_ADDR	= 2, /* nested, pfr_addr_t */
+	PF_TA_FLAGS	= 3, /* u32 */
+	PF_TA_NBR_ADDED	= 4, /* u32 */
+	PF_TA_NBR_DELETED = 5, /* u32 */
+};
+
 #ifdef _KERNEL
 
 void	pf_nl_register(void);
diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c
index 8cea9557633c..a684d778ab42 100644
--- a/sys/netpfil/pf/pf_norm.c
+++ b/sys/netpfil/pf/pf_norm.c
@@ -118,6 +118,8 @@ VNET_DEFINE_STATIC(uma_zone_t, pf_frnode_z);
 #define	V_pf_frnode_z	VNET(pf_frnode_z)
 VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z);
 #define	V_pf_frag_z	VNET(pf_frag_z)
+VNET_DEFINE(uma_zone_t, pf_anchor_z);
+VNET_DEFINE(uma_zone_t, pf_eth_anchor_z);
 
 TAILQ_HEAD(pf_fragqueue, pf_fragment);
 TAILQ_HEAD(pf_cachequeue, pf_fragment);
diff --git a/sys/netpfil/pf/pf_ruleset.c b/sys/netpfil/pf/pf_ruleset.c
index 43b51f2933f4..039908a53126 100644
--- a/sys/netpfil/pf/pf_ruleset.c
+++ b/sys/netpfil/pf/pf_ruleset.c
@@ -238,7 +238,7 @@ pf_create_kanchor(struct pf_kanchor *parent, const char *aname)
 	    ((parent != NULL) && (strlen(parent->path) >= PF_ANCHOR_MAXPATH)))
 		return (NULL);
 
-	anchor = rs_malloc(sizeof(*anchor));
+	anchor = uma_zalloc(V_pf_anchor_z, M_NOWAIT | M_ZERO);
 	if (anchor == NULL)
 		return (NULL);
 
@@ -259,7 +259,7 @@ pf_create_kanchor(struct pf_kanchor *parent, const char *aname)
 		printf("%s: RB_INSERT1 "
 		    "'%s' '%s' collides with '%s' '%s'\n", __func__,
 		    anchor->path, anchor->name, dup->path, dup->name);
-		rs_free(anchor);
+		uma_zfree(V_pf_anchor_z, anchor);
 		return (NULL);
 	}
 
@@ -273,7 +273,7 @@ pf_create_kanchor(struct pf_kanchor *parent, const char *aname)
 			    anchor->name, dup->path, dup->name);
 			RB_REMOVE(pf_kanchor_global, &V_pf_anchors,
 			    anchor);
-			rs_free(anchor);
+			uma_zfree(V_pf_anchor_z, anchor);
 			return (NULL);
 		}
 	}
@@ -350,7 +350,7 @@ pf_remove_if_empty_kruleset(struct pf_kruleset *ruleset)
 		if ((parent = ruleset->anchor->parent) != NULL)
 			RB_REMOVE(pf_kanchor_node, &parent->children,
 			    ruleset->anchor);
-		rs_free(ruleset->anchor);
+		uma_zfree(V_pf_anchor_z, ruleset->anchor);
 		if (parent == NULL)
 			return;
 		ruleset = &parent->ruleset;
@@ -613,7 +613,7 @@ pf_find_or_create_keth_ruleset(const char *path)
 		rs_free(p);
 		return (NULL);
 	}
-	anchor = (struct pf_keth_anchor *)rs_malloc(sizeof(*anchor));
+	anchor = uma_zalloc(V_pf_eth_anchor_z, M_NOWAIT | M_ZERO);
 	if (anchor == NULL) {
 		rs_free(p);
 		return (NULL);
 	}
@@ -631,7 +631,7 @@ pf_find_or_create_keth_ruleset(const char *path)
 		printf("%s: RB_INSERT1 "
 		    "'%s' '%s' collides with '%s' '%s'\n", __func__,
 		    anchor->path, anchor->name, dup->path, dup->name);
-		rs_free(anchor);
+		uma_zfree(V_pf_eth_anchor_z, anchor);
 		rs_free(p);
 		return (NULL);
 	}
@@ -645,7 +645,7 @@ pf_find_or_create_keth_ruleset(const char *path)
 			    anchor->name, dup->path, dup->name);
 			RB_REMOVE(pf_keth_anchor_global, &V_pf_keth_anchors,
 			    anchor);
-			rs_free(anchor);
+			uma_zfree(V_pf_eth_anchor_z, anchor);
 			rs_free(p);
 			return (NULL);
 		}
 	}
@@ -754,7 +754,7 @@ pf_remove_if_empty_keth_ruleset(struct pf_keth_ruleset *ruleset)
 		if ((parent = ruleset->anchor->parent) != NULL)
 			RB_REMOVE(pf_keth_anchor_node, &parent->children,
 			    ruleset->anchor);
-		rs_free(ruleset->anchor);
+		uma_zfree(V_pf_eth_anchor_z, ruleset->anchor);
 		if (parent == NULL)
 			return;
 		ruleset = &parent->ruleset;
diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c
index ecc185f89ad7..73ec18fa7646 100644
--- a/sys/netpfil/pf/pf_table.c
+++ b/sys/netpfil/pf/pf_table.c
@@ -294,7 +294,7 @@ pfr_add_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
 	else
 		pfr_destroy_kentries(&workq);
 	if (nadd != NULL)
-		*nadd = xadd;
+		*nadd += xadd;
 	pfr_destroy_ktable(tmpkt, 0);
 	return (0);
 _bad:
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index af9cafa99dd0..9140cee56885 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -763,7 +763,6 @@ struct proc {
 	LIST_HEAD(, mqueue_notifier)	p_mqnotifier; /* (c) mqueue notifiers.*/
 	struct kdtrace_proc	*p_dtrace; /* (*) DTrace-specific data. */
 	struct cv	p_pwait; /* (*) wait cv for exit/exec. */
-	uint64_t	p_prev_runtime; /* (c) Resource usage accounting. */
 	struct racct	*p_racct; /* (b) Resource accounting. */
 	int		p_throttled; /* (c) Flag for racct pcpu throttling */
 	/*
diff --git a/sys/sys/racct.h b/sys/sys/racct.h
index c6020b82c865..92b50353774e 100644
--- a/sys/sys/racct.h
+++ b/sys/sys/racct.h
@@ -141,13 +141,17 @@ extern bool racct_enable;
 
 /*
  * The 'racct' structure defines resource consumption for a particular
- * subject, such as process or jail.
+ * subject, such as process or jail.  It also contains the total
+ * cpu time and real time of the subject, recorded at the most recent
+ * time that RACCT_PCPU was updated.
  *
  * This structure must be filled with zeroes initially.
  */
 struct racct {
 	int64_t			r_resources[RACCT_MAX + 1];
 	LIST_HEAD(, rctl_rule_link)	r_rule_links;
+	uint64_t		r_runtime;
+	struct timeval		r_time;
 };
 
 SYSCTL_DECL(_kern_racct);
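Finally, the r_runtime/r_time pair added to struct racct is what makes the delta-based %CPU sampling possible: each update works on the difference since the previous snapshot. A toy model of that bookkeeping, standalone code rather than kernel code:

	#include <stdint.h>
	#include <stdio.h>

	struct sample {
		uint64_t runtime_us;	/* total CPU time consumed so far */
		uint64_t uptime_us;	/* wallclock when it was recorded */
	};

	static uint64_t
	pcpu_between(const struct sample *prev, const struct sample *now)
	{
		uint64_t cpu = now->runtime_us - prev->runtime_us;
		uint64_t wall = now->uptime_us - prev->uptime_us;

		return (cpu * 100 * 1000000 / wall);	/* percent * 10^6 */
	}

	int
	main(void)
	{
		struct sample a = { 1000000, 10000000 };
		struct sample b = { 1250000, 11000000 };

		/* 250 ms of CPU over 1 s: prints 25000000, i.e. 25%. */
		printf("%ju\n", (uintmax_t)pcpu_between(&a, &b));
		return (0);
	}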