aboutsummaryrefslogblamecommitdiff
path: root/sys/netlink/route/nexthop.c
blob: d1652cfb1508c779fbcd686efdecfd4453c60539 (plain) (tree)
1
2
   
                                        
























                                                                             

                        



                      
                      
                      
                   
                      
                       


                       










                                 





                                    
                         





















































































































































































































                                                                                           
                  

                                                                 
                           

                                      























                                                                                     
                  




























                                                                                              
      







































































































                                                                                
                                                                            

                         






                                                                   
                                               

                          

                                                 






















                                                                     












                                                                                   












                                                               
                                                                      

































































                                                                                        
                                                                               


































                                                                                       
                      








































                                                                                          
                         































                                                                                        












                                                                                   



                                      

                                    




                                    


                                   



                                                               








                                                                                  











                                                                                       
                                                                                 


           









                                                                                       















































                                                                                         

























                                                                                 
          
                                                                                     






                                            
                                                                         


                                             
                                                                     




                                                                              
                                                                                           











                                                                                         
                                                



                                                      




                                                                                  




























































                                                                                     
                                                    




















































































                                                                                         



                                













                                                                





                                                                   
                                                                          







                                                                        

































                                                                                                    
 











                                                                                 



























                                                              
                                                                                     

    
                        



                                                                         
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_netlink.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"
#include <sys/types.h>
#include <sys/ck.h>
#include <sys/epoch.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/rmlock.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/nhop_utils.h>

#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <netinet6/scope6_var.h>
#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_route.h>
#include <netlink/route/route_var.h>

#define	DEBUG_MOD_NAME	nl_nhop
#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_INFO);

/*
 * This file contains the logic to maintain kernel nexthops and
 *  nexthop groups based on the data provided by the user.
 *
 * Kernel stores (nearly) all of the routing data in the nexthops,
 *  including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
 *
 * Netlink API provides higher-level abstraction for the user. Each
 *  user-created nexthop may map to multiple kernel nexthops.
 *
 * The following variations require separate kernel nexthop to be
 *  created:
 *  * prefix flags (NHF_HOST, NHF_DEFAULT)
 *  * using IPv6 gateway for IPv4 routes
 *  * different fibnum
 *
 * These kernel nexthops have the lifetime bound to the lifetime of
 *  the user_nhop object. They are not collected until user requests
 *  to delete the created user_nhop.
 *
 */
/*
 * Userland-visible nexthop (or nexthop group) record kept in the unhop
 * hash table.
 *
 * Records with un_fibfam == 0 are the "master" entries created on user
 * request. Records with a non-zero un_fibfam are per-fib/per-family clones
 * created on demand and chained to their master through un_nextchild.
 */
struct user_nhop {
        uint32_t                        un_idx; /* Userland-provided index */
	uint32_t			un_fibfam; /* fibnum+af(as highest byte) */
	uint8_t				un_protocol; /* protocol that installed the record */
	struct nhop_object		*un_nhop; /* "production" nexthop (set on clones) */
	struct nhop_object		*un_nhop_src; /* nexthop to copy from */
	struct weightened_nhop		*un_nhgrp_src; /* nexthops for nhg */
	uint32_t			un_nhgrp_count; /* number of nexthops in un_nhgrp_src */
        struct user_nhop		*un_next; /* next item in hash chain */
        struct user_nhop		*un_nextchild; /* master -> children */
	struct epoch_context		un_epoch_ctx;	/* context for epoch-deferred destruction */
};

/*
 * Accessors for the "unhop" hash table instantiated below.
 * NOTE(review): presumably these names are picked up by the CHT_SLIST_*
 * macros through the "unhop" prefix - confirm against nhop_utils.h.
 */
/* produce hash value for an object */
#define	unhop_hash_obj(_obj)	(hash_unhop(_obj))
/* compare two objects */
#define	unhop_cmp(_one, _two)	(cmp_unhop(_one, _two))
/* next object accessor */
#define	unhop_next(_obj)	(_obj)->un_next

CHT_SLIST_DEFINE(unhop, struct user_nhop);

/*
 * Per-VNET control block: the user nexthop hash table and the
 * read-mostly lock guarding it.
 */
struct unhop_ctl {
	struct unhop_head	un_head;	/* hash table of struct user_nhop */
	struct rmlock		un_lock;	/* protects un_head */
};
/*
 * Locking helpers for struct unhop_ctl.
 * The read-side macros expect a tracker declared with UN_TRACKER in the
 * calling scope.
 */
#define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
#define	UN_TRACKER		struct rm_priotracker un_tracker
#define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
#define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)

/*
 * No trailing semicolon inside the macros: the caller supplies it, so the
 * expansion stays a single statement (safe in unbraced if/else bodies).
 */
#define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock)
#define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock)

/* Per-VNET control block pointer; stays NULL until vnet_init_unhops() installs it */
VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
#define V_un_ctl	VNET(un_ctl)

/* Forward declarations: hash table maintenance and callbacks */
static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
static unsigned int hash_unhop(const struct user_nhop *obj);

/* Forward declarations: user nexthop lifecycle helpers */
static void destroy_unhop(struct user_nhop *unhop);
static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
    uint32_t fibnum, int family, int nh_flags);

/*
 * Hash callback: compare two objects.
 * Returns non-zero when both the user index and the fib/family key match.
 */
static int
cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
{
	if (a->un_idx != b->un_idx)
		return (0);
	return (a->un_fibfam == b->un_fibfam);
}

/*
 * Hash callback: derive the hash of an object by mixing the user index
 * with the fib/family key.
 */
static unsigned int
hash_unhop(const struct user_nhop *obj)
{
	unsigned int hash = obj->un_idx;

	hash ^= obj->un_fibfam;
	return (hash);
}

/* Master (user-created) records have no fib/family key; clones are keyed by fibnum and family */
#define	UNHOP_IS_MASTER(_unhop)	((_unhop)->un_fibfam == 0)

/*
 * Factory interface for creating matching kernel nexthops/nexthop groups
 *
 * @fibnum: fibnum nexthop will be used in
 * @family: upper family nexthop will be used in
 * @uidx: userland nexthop index used to create the nexthop
 * @nh_flags: desired nexthop prefix flags
 * @perror: pointer to store error to
 *
 * Returns referenced nexthop linked to @fibnum/@family rib on success.
 * Returns NULL and stores an errno value in @perror on failure.
 */
struct nhop_object *
nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
    int nh_flags, int *perror)
{
	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
	UN_TRACKER;

	/* No user nexthops were ever created in this VNET */
	if (__predict_false(ctl == NULL)) {
		*perror = ESRCH;
		return (NULL);
	}

	struct user_nhop key = {
		.un_idx = uidx,
		.un_fibfam = fibnum | ((uint32_t)family) << 24,
	};
	struct user_nhop *unhop;

	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);

	/* family 0 would produce the key reserved for master records */
	if (__predict_false(family == 0)) {
		*perror = EINVAL;
		return (NULL);
	}

	UN_RLOCK(ctl);
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
	if (unhop != NULL) {
		struct nhop_object *nh = unhop->un_nhop;
		/* Release (not re-acquire) the read lock before returning */
		UN_RUNLOCK(ctl);
		*perror = 0;
		nhop_ref_any(nh);
		return (nh);
	}

	/*
	 * Exact nexthop not found. Search for template nexthop to clone from.
	 */
	key.un_fibfam = 0;
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
	if (unhop == NULL) {
		UN_RUNLOCK(ctl);
		*perror = ESRCH;
		return (NULL);
	}

	UN_RUNLOCK(ctl);

	/* Create entry to insert first */
	struct user_nhop *un_new, *un_tmp;
	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
	if (un_new == NULL) {
		*perror = ENOMEM;
		return (NULL);
	}
	un_new->un_idx = uidx;
	un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;

	/* Relying on epoch to protect unhop here */
	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
	if (un_new->un_nhop == NULL) {
		free(un_new, M_NETLINK);
		*perror = ENOMEM;
		return (NULL);
	}

	/* Insert back and report */
	UN_WLOCK(ctl);

	/* First, find template record once again */
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
	if (unhop == NULL) {
		/* Someone deleted the nexthop during the call */
		UN_WUNLOCK(ctl);
		*perror = ESRCH;
		destroy_unhop(un_new);
		return (NULL);
	}

	/* Second, check the direct match */
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
	struct nhop_object *nh;
	if (un_tmp != NULL) {
		/* Another thread already created the desired nexthop, use it */
		nh = un_tmp->un_nhop;
	} else {
		/* Finally, insert the new nexthop and link it to the primary */
		nh = un_new->un_nhop;
		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
		un_new->un_nextchild = unhop->un_nextchild;
		unhop->un_nextchild = un_new;
		/* Ownership moved to the hash table: do not destroy below */
		un_new = NULL;
		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
	}

	UN_WUNLOCK(ctl);

	/* Lost the race: drop our now-redundant clone */
	if (un_new != NULL)
		destroy_unhop(un_new);

	*perror = 0;
	nhop_ref_any(nh);
	return (nh);
}

/*
 * Looks up the master (un_fibfam == 0) record for userland index @uidx.
 * Returns the record or NULL if none exists. The read lock is dropped
 * before returning, so no lock is held on the returned pointer.
 */
static struct user_nhop *
nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
{
	UN_TRACKER;
	struct user_nhop *found = NULL;
	struct user_nhop lookup_key = { .un_idx = uidx };

	UN_RLOCK(ctl);
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &lookup_key, found);
	UN_RUNLOCK(ctl);

	return (found);
}

/* Groups up to this size are cloned using an on-stack scratch array */
#define MAX_STACK_NHOPS	4
/*
 * Creates the kernel nexthop (or, with ROUTE_MPATH, nexthop group) for
 * @fibnum/@family with prefix flags @nh_flags, using master record @unhop
 * as a template.
 *
 * Returns the resulting object or NULL on failure.
 */
static struct nhop_object *
clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
{
#ifdef ROUTE_MPATH
	const struct weightened_nhop *wn;
	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
	uint32_t num_nhops, i;
#endif
	struct nhop_object *nh = NULL;
	int error = 0;

	if (unhop->un_nhop_src != NULL) {
		IF_DEBUG_LEVEL(LOG_DEBUG2) {
			char nhbuf[NHOP_PRINT_BUFSIZE];
			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
			    family, nh_flags);
		}
		nh = nhop_alloc(fibnum, AF_UNSPEC);
		if (nh == NULL)
			return (NULL);
		nhop_copy(nh, unhop->un_nhop_src);
		/* Check that nexthop gateway is compatible with the new family */
		if (!nhop_set_upper_family(nh, family)) {
			nhop_free(nh);
			return (NULL);
		}
		nhop_set_uidx(nh, unhop->un_idx);
		nhop_set_pxtype_flag(nh, nh_flags);
		return (nhop_get_nhop(nh, &error));
	}
#ifdef ROUTE_MPATH
	wn = unhop->un_nhgrp_src;
	num_nhops = unhop->un_nhgrp_count;

	/* Nothing to clone from; also keeps wn_new[0] below well-defined */
	if (wn == NULL || num_nhops == 0)
		return (NULL);

	if (num_nhops > MAX_STACK_NHOPS) {
		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
		if (wn_new == NULL)
			return (NULL);
	} else
		wn_new = wn_base;

	/* Resolve every group member to its clone for this fib/family */
	for (i = 0; i < num_nhops; i++) {
		uint32_t uidx = nhop_get_uidx(wn[i].nh);
		MPASS(uidx != 0);
		wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
		if (wn_new[i].nh == NULL || error != 0) {
			if (error == 0)
				error = ESRCH;
			break;
		}
		wn_new[i].weight = wn[i].weight;
	}

	if (error == 0) {
		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
		struct nhgrp_object *nhg = NULL;

		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
		/* Never hand out a pointer from a failed group creation */
		if (error == 0)
			nh = (struct nhop_object *)nhg;
	} else {
		/* Drop the references taken on members resolved before the failure */
		for (uint32_t j = 0; j < i; j++)
			nhop_free_any(wn_new[j].nh);
	}

	if (wn_new != wn_base)
		free(wn_new, M_TEMP);
#endif
	return (nh);
}

/*
 * Releases everything owned by @unhop: the references on the production
 * and template nexthops, the group member array (allocated by newnhg()
 * for group records) and the record itself.
 */
static void
destroy_unhop(struct user_nhop *unhop)
{
	if (unhop->un_nhop != NULL)
		nhop_free_any(unhop->un_nhop);
	if (unhop->un_nhop_src != NULL)
		nhop_free_any(unhop->un_nhop_src);
	/* NULL for non-group records and clones; free(9) accepts NULL */
	free(unhop->un_nhgrp_src, M_NETLINK);
	free(unhop, M_NETLINK);
}

/*
 * Epoch callback: recovers the user_nhop embedding @ctx and destroys it.
 */
static void
destroy_unhop_epoch(epoch_context_t ctx)
{
	destroy_unhop(__containerof(ctx, struct user_nhop, un_epoch_ctx));
}

static uint32_t
find_spare_uidx(struct unhop_ctl *ctl)
{
	struct user_nhop *unhop, key = {};
	uint32_t uidx = 0;
	UN_TRACKER;

	UN_RLOCK(ctl);
	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
	for (int i = 0; i < 16; i++) {
		key.un_idx = (arc4random() % 65536) + 65536 * 4;
		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
		if (unhop == NULL) {
			uidx = key.un_idx;
			break;
		}
	}
	UN_RUNLOCK(ctl);

	return (uidx);
}


/*
 * Actual netlink code
 */
/*
 * State shared by the dump/notification helpers in this file.
 * Only nw, hdr and error are referenced by the code visible here.
 */
struct netlink_walkargs {
	struct nl_writer *nw;	/* writer the replies are emitted to */
	struct nlmsghdr hdr;	/* header template for generated messages */
	struct nlpcb *so;
	int family;
	int error;		/* checked before finishing a dump */
	int count;
	int dumped;
};
/* Jumps to the enclosing function's "enomem" label when @_v is NULL */
#define	ENOMEM_IF_NULL(_v)	if ((_v) == NULL) goto enomem

/*
 * Writes a nexthop group message describing @unhop to @nw, using @hdr
 * as the header template.
 *
 * Returns true if the message was completed, false if it was aborted
 * because the writer ran out of space.
 */
static bool
dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
    struct nl_writer *nw)
{

	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
		goto enomem;

	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
	/* The reservation may fail; do not write through a NULL header */
	ENOMEM_IF_NULL(nhm);
	nhm->nh_family = AF_UNSPEC;
	nhm->nh_scope = 0;
	nhm->nh_protocol = unhop->un_protocol;
	nhm->nh_flags = 0;

	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);

	struct weightened_nhop *wn = unhop->un_nhgrp_src;
	uint32_t num_nhops = unhop->un_nhgrp_count;
	/* TODO: a better API? */
	/* NHA_GROUP is built by hand: attribute header plus nexthop_grp array */
	int nla_len = sizeof(struct nlattr);
	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
	if (nla == NULL)
		goto enomem;
	nla->nla_type = NHA_GROUP;
	nla->nla_len = nla_len;
	for (uint32_t i = 0; i < num_nhops; i++) {
		struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
		grp->id = nhop_get_uidx(wn[i].nh);
		grp->weight = wn[i].weight;
		grp->resvd1 = 0;
		grp->resvd2 = 0;
	}

	if (nlmsg_end(nw))
		return (true);
enomem:
	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
	nlmsg_abort(nw);
	return (false);
}

/*
 * Writes a message describing nexthop @nh to @nw, using @hdr as the
 * header template. A non-zero @uidx is reported as NHA_ID; with @uidx == 0
 * the kernel index, upper family and fib are added to the nested
 * NHA_FREEBSD attribute instead.
 *
 * Returns true if the message was completed, false if it was aborted.
 */
static bool
dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
    struct nl_writer *nw)
{
	struct nhmsg *nhm;
	int nest_off;

	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
		goto enomem;

	nhm = nlmsg_reserve_object(nw, struct nhmsg);
	ENOMEM_IF_NULL(nhm);
	nhm->nh_family = nhop_get_neigh_family(nh);
	nhm->nh_scope = 0; // XXX: what's that?
	nhm->nh_protocol = nhop_get_origin(nh);
	nhm->nh_flags = 0;

	if (uidx != 0)
		nlattr_add_u32(nw, NHA_ID, uidx);

	if ((nh->nh_flags & NHF_BLACKHOLE) != 0) {
		/* Blackhole nexthops carry no interface or gateway data */
		nlattr_add_flag(nw, NHA_BLACKHOLE);
	} else {
		nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index);

		switch (nh->gw_sa.sa_family) {
#ifdef INET
		case AF_INET:
			nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
			break;
#endif
#ifdef INET6
		case AF_INET6:
			{
				/* Report the address without the embedded scope */
				struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
				in6_clearscope(&gw6);
				nlattr_add(nw, NHA_GATEWAY, 16, &gw6);
				break;
			}
#endif
		default:
			break;
		}

		nest_off = nlattr_add_nested(nw, NHA_FREEBSD);
		if (nest_off != 0) {
			nlattr_add_u32(nw, NHAF_AIF, nh->nh_aifp->if_index);

			if (uidx == 0) {
				nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
				nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
				nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
			}

			nlattr_set_len(nw, nest_off);
		}
	}

	if (nlmsg_end(nw))
		return (true);
enomem:
	nlmsg_abort(nw);
	return (false);
}

/*
 * Dumps @unhop to @nw: as a group when it has no template nexthop,
 * as a plain nexthop otherwise.
 */
static void
dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
    struct nl_writer *nw)
{
	if (unhop->un_nhop_src == NULL) {
		dump_nhgrp(unhop, hdr, nw);
		return;
	}
	dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw);
}

/*
 * Removes the master record with index @uidx and all of its clones from
 * the hash table, sends an RTM_DELNEXTHOP notification to the
 * RTNLGRP_NEXTHOP group and schedules the records for destruction after
 * the current network epoch.
 *
 * Returns 0 on success, ENOENT if @uidx is unknown, or ENOMEM if the
 * notification could not be allocated (the records are still removed and
 * destroyed in that case).
 */
static int
delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
{
	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
	int error = 0;

	struct user_nhop key = { .un_idx = uidx };

	UN_WLOCK(ctl);

	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);

	if (unhop_base != NULL) {
		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
		IF_DEBUG_LEVEL(LOG_DEBUG3) {
			/*
			 * Master records never have un_nhop set: describe them
			 * by the template nexthop, or as a group when absent.
			 */
			if (unhop_base->un_nhop_src != NULL) {
				char nhbuf[NHOP_PRINT_BUFSIZE];
				nhop_print_buf_any(unhop_base->un_nhop_src,
				    nhbuf, sizeof(nhbuf));
				FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop_src,
				    "removed base nhop %u: %s", uidx, nhbuf);
			} else {
				NL_LOG(LOG_DEBUG3, "removed base nhgrp %u", uidx);
			}
		}
		/* Unlink all child nexthops as well, keeping the chain intact */
		unhop_chain = unhop_base->un_nextchild;
		while (unhop_chain != NULL) {
			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
			    unhop_ret);
			MPASS(unhop_chain == unhop_ret);
			IF_DEBUG_LEVEL(LOG_DEBUG3) {
				char nhbuf[NHOP_PRINT_BUFSIZE];
				nhop_print_buf_any(unhop_chain->un_nhop,
				    nhbuf, sizeof(nhbuf));
				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
				    "removed child nhop %u: %s", uidx, nhbuf);
			}
			unhop_chain = unhop_chain->un_nextchild;
		}
	}

	UN_WUNLOCK(ctl);

	if (unhop_base == NULL) {
		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
		return (ENOENT);
	}

	/* Report nexthop deletion */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
	};

	struct nl_writer nw = {};
	if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
		dump_unhop(unhop_base, &wa.hdr, &nw);
		nlmsg_flush(&nw);
	} else {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		error = ENOMEM;
	}

	/*
	 * The records are already unlinked from the table, so they must be
	 * destroyed even when the notification could not be sent.
	 */
	while (unhop_base != NULL) {
		unhop_chain = unhop_base->un_nextchild;
		NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
		unhop_base = unhop_chain;
	}

	return (error);
}

/*
 * Resizes the hash table to @new_size buckets. A @new_size of 0 means
 * no resize is needed. Allocation failure is silently ignored: the table
 * simply keeps its current size.
 */
static void
consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
{
	size_t bytes;
	void *spare;

	if (new_size == 0)
		return;

	bytes = CHT_SLIST_GET_RESIZE_SIZE(new_size);
	spare = malloc(bytes, M_NETLINK, M_NOWAIT | M_ZERO);
	if (spare == NULL)
		return;

	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
	UN_WLOCK(ctl);
	if (spare != NULL) {
		CHT_SLIST_RESIZE(&ctl->un_head, unhop, spare, new_size);
	}
	UN_WUNLOCK(ctl);

	/*
	 * NOTE(review): presumably CHT_SLIST_RESIZE swaps the old bucket
	 * array into the variable, so this frees the old storage - confirm
	 * against the macro definition.
	 */
	if (spare != NULL)
		free(spare, M_NETLINK);
}

/*
 * Lazily allocates the per-VNET control block with a 16-bucket hash table
 * and publishes it in V_un_ctl. Concurrent callers race through a
 * compare-and-set; the loser tears down its own copy.
 *
 * Returns true if V_un_ctl is set on return, false on allocation failure.
 */
static bool __noinline
vnet_init_unhops(void)
{
	uint32_t num_buckets = 16;
	size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);

	struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
	    M_NOWAIT | M_ZERO);
	if (ctl == NULL)
		return (false);

	void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
	if (ptr == NULL) {
		free(ctl, M_NETLINK);
		return (false);
	}
	CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
	UN_LOCK_INIT(ctl);

	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
		/* Lost the race: undo everything, including the lock init */
		rm_destroy(&ctl->un_lock);
		free(ptr, M_NETLINK);
		free(ctl, M_NETLINK);
	}

	if (atomic_load_ptr(&V_un_ctl) == NULL)
		return (false);

	NL_LOG(LOG_NOTICE, "UNHOPS init done");

	return (true);
}

/*
 * VNET teardown: unpublishes the control block, waits for in-flight
 * epoch readers, destroys every remaining record and releases the table,
 * its lock and the control block.
 */
static void
vnet_destroy_unhops(const void *unused __unused)
{
	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
	struct user_nhop *unhop, *tmp;

	if (ctl == NULL)
		return;
	V_un_ctl = NULL;

	/* Wait till all unhop users finish their reads */
	NET_EPOCH_WAIT();

	UN_WLOCK(ctl);
	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
		destroy_unhop(unhop);
	} CHT_SLIST_FOREACH_SAFE_END;
	UN_WUNLOCK(ctl);

	/* Pair the rm_init() from vnet_init_unhops() before freeing the lock's memory */
	rm_destroy(&ctl->un_lock);
	free(ctl->un_head.ptr, M_NETLINK);
	free(ctl, M_NETLINK);
}
VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
    vnet_destroy_unhops, NULL);

/*
 * Attribute parser callback for NHA_GROUP: checks that the payload is a
 * non-empty, whole number of struct nexthop_grp entries and stores the
 * attribute pointer itself in @target.
 *
 * Returns 0 on success or EINVAL on malformed length.
 */
static int
nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
{
	struct nlattr **slot = target;
	int data_len = NLA_DATA_LEN(nla);
	int count;

	/* Verify attribute correctness */
	count = data_len / sizeof(struct nexthop_grp);
	if (count == 0 || (count * sizeof(struct nexthop_grp) != data_len)) {
		NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
		return (EINVAL);
	}

	*slot = nla;
	return (0);
}

/*
 * For an IPv6 link-local address in @sa, sets the unicast scope id to the
 * interface index of @ifp. Does nothing for other addresses, when either
 * argument is NULL, or when the kernel is built without INET6.
 */
static void
set_scope6(struct sockaddr *sa, struct ifnet *ifp)
{
#ifdef INET6
	struct sockaddr_in6 *sa6;

	if (sa == NULL || sa->sa_family != AF_INET6 || ifp == NULL)
		return;

	sa6 = (struct sockaddr_in6 *)sa;
	if (!IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
		return;

	in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp));
#endif
}

/*
 * Parsed representation of a nexthop netlink request: the fixed nhmsg
 * header fields plus the NHA_* and nested NHAF_* attributes. Filled in by
 * the parser tables below.
 */
struct nl_parsed_nhop {
	uint32_t	nha_id;		/* NHA_ID; 0 when not supplied */
	uint8_t		nha_blackhole;	/* NHA_BLACKHOLE flag */
	uint8_t		nha_groups;	/* NHA_GROUPS flag */
	uint8_t		nhaf_knhops;	/* NHAF_KNHOPS flag */
	uint8_t		nhaf_family;	/* NHAF_FAMILY */
	struct ifnet	*nha_oif;	/* NHA_OIF */
	struct sockaddr	*nha_gw;	/* NHA_GATEWAY */
	struct nlattr	*nha_group;	/* raw NHA_GROUP attribute */
	uint8_t		nh_family;	/* nhmsg.nh_family */
	uint8_t		nh_protocol;	/* nhmsg.nh_protocol */
	uint32_t	nhaf_table;	/* NHAF_TABLE */
	uint32_t	nhaf_kid;	/* NHAF_KID */
	uint32_t	nhaf_aif;	/* NHAF_AIF */
};

/* Offsets into the wire header (_IN) and the parsed structure (_OUT) */
#define	_IN(_field)	offsetof(struct nhmsg, _field)
#define	_OUT(_field)	offsetof(struct nl_parsed_nhop, _field)
/* Attributes accepted inside the nested NHA_FREEBSD container */
static struct nlattr_parser nla_p_nh_fbsd[] = {
	{ .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
	{ .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
	{ .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
	{ .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
	{ .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
};
NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);

/* Fixed nhmsg header fields copied into the parsed structure */
static const struct nlfield_parser nlf_p_nh[] = {
	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
};

/* Top-level NHA_* attributes; NHA_FREEBSD recurses into nh_fbsd_parser */
static const struct nlattr_parser nla_p_nh[] = {
	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
	{ .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
};
#undef _IN
#undef _OUT

/*
 * Post-parse hook: applies the output interface scope to a link-local
 * IPv6 gateway. Always reports success.
 */
static bool
post_p_nh(void *_attrs, struct nl_pstate *npt)
{
	struct nl_parsed_nhop *parsed = _attrs;

	set_scope6(parsed->nha_gw, parsed->nha_oif);
	return (true);
}
NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);

/*
 * Tells whether @nh may be a nexthop group member: only nexthops with
 * the NHF_GATEWAY flag qualify.
 */
static bool
eligible_nhg(const struct nhop_object *nh)
{
	return ((nh->nh_flags & NHF_GATEWAY) != 0);
}

static int
newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
{
	struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
	int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
	struct weightened_nhop *wn;

	wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
	if (wn == NULL)
		return (ENOMEM);

	for (int i = 0; i < count; i++) {
		struct user_nhop *unhop;
		unhop = nl_find_base_unhop(ctl, grp[i].id);
		if (unhop == NULL) {
			NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
			free(wn, M_NETLINK);
			return (ESRCH);
		} else if (unhop->un_nhop_src == NULL) {
			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
			    grp[i].id);
			free(wn, M_NETLINK);
			return (ENOTSUP);
		} else if (!eligible_nhg(unhop->un_nhop_src)) {
			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
			    grp[i].id);
			free(wn, M_NETLINK);
			return (ENOTSUP);
		}
		/*
		 * TODO: consider more rigid eligibility checks:
		 * restrict nexthops with the same gateway
		 */
		wn[i].nh = unhop->un_nhop_src;
		wn[i].weight = grp[i].weight;
	}
	unhop->un_nhgrp_src = wn;
	unhop->un_nhgrp_count = count;
	return (0);
}

/*
 * Assigns gateway @gw to nexthop @nh.
 * An IPv6 link-local @gw is modified in place to embed the scope id of
 * @ifp, which is mandatory in that case.
 * Returns 0 on success or EINVAL when a link-local gateway lacks @ifp.
 */
int
nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp,
    struct nl_pstate *npt)
{
#ifdef INET6
	struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;

	if (gw->sa_family == AF_INET6 &&
	    IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
		if (ifp == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "interface not set");
			return (EINVAL);
		}
		in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index);
	}
#endif
	nhop_set_gw(nh, gw, true);
	return (0);
}

static int
newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
{
	struct ifaddr *ifa = NULL;
	struct nhop_object *nh;
	int error;

	if (!attrs->nha_blackhole) {
		if (attrs->nha_gw == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
			return (EINVAL);
		}
		if (attrs->nha_oif == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
			return (EINVAL);
		}
		if (ifa == NULL)
			ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
		if (ifa == NULL) {
			NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
			return (EINVAL);
		}
	}

	int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;

	nh = nhop_alloc(RT_DEFAULT_FIB, family);
	if (nh == NULL) {
		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
		return (ENOMEM);
	}
	nhop_set_uidx(nh, attrs->nha_id);
	nhop_set_origin(nh, attrs->nh_protocol);

	if (attrs->nha_blackhole)
		nhop_set_blackhole(nh, NHF_BLACKHOLE);
	else {
		error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
		if (error != 0) {
			nhop_free(nh);
			return (error);
		}
		nhop_set_transmit_ifp(nh, attrs->nha_oif);
		nhop_set_src(nh, ifa);
	}

	error = nhop_get_unlinked(nh);
	if (error != 0) {
		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
		return (error);
	}

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
		char nhbuf[NHOP_PRINT_BUFSIZE];
		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
	}

	unhop->un_nhop_src = nh;
	return (0);
}

static int
rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct user_nhop *unhop;
	int error;

        if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
		return (ENOMEM);
	struct unhop_ctl *ctl = V_un_ctl;

	struct nl_parsed_nhop attrs = {};
	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	/*
	 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
	 *  citizen.
	 */
	if (attrs.nha_id == 0) {
		attrs.nha_id = find_spare_uidx(ctl);
		if (attrs.nha_id == 0) {
			NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
			return (ENOSPC);
		}
	}

	NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0);

	unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
	if (unhop == NULL) {
		NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
		return (ENOMEM);
	}
	unhop->un_idx = attrs.nha_id;
	unhop->un_protocol = attrs.nh_protocol;

	if (attrs.nha_group)
		error = newnhg(ctl, &attrs, unhop);
	else
		error = newnhop(&attrs, unhop, npt);

	if (error != 0) {
		free(unhop, M_NETLINK);
		return (error);
	}

	UN_WLOCK(ctl);
	/* Check if uidx already exists */
	struct user_nhop *tmp = NULL;
	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
	if (tmp != NULL) {
		UN_WUNLOCK(ctl);
		NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
		destroy_unhop(unhop);
		return (EEXIST);
	}
	CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
	uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
	UN_WUNLOCK(ctl);

	/* Report addition of the next nexhop */
	struct netlink_walkargs wa = {
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	struct nl_writer nw = {};
	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
		NL_LOG(LOG_DEBUG, "error allocating message writer");
		return (ENOMEM);
	}

	dump_unhop(unhop, &wa.hdr, &nw);
	nlmsg_flush(&nw);

	consider_resize(ctl, num_buckets_new);

        return (0);
}

/*
 * RTM_DELNEXTHOP handler: deletes the user nexthop named by NHA_ID.
 *
 * Returns 0 on success, ESRCH when no user nexthops exist in this VNET,
 * EINVAL when NHA_ID is missing, or the parse/delete error.
 */
static int
rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct nl_parsed_nhop attrs = {};
	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
	int error;

	if (__predict_false(ctl == NULL))
		return (ESRCH);

	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	if (attrs.nha_id == 0) {
		NL_LOG(LOG_DEBUG, "NHA_ID not set");
		return (EINVAL);
	}

	return (delete_unhop(ctl, hdr, attrs.nha_id));
}

static bool
match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
{
	if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
		return (false);
	if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
		return (false);
	if (attrs->nha_oif != NULL &&
	    (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
		return (false);

	return (true);
}

/*
 * RTM_GETNEXTHOP handler. Depending on the request attributes it:
 *  - returns the single user nexthop named by NHA_ID,
 *  - returns the single kernel nexthop named by NHAF_KID,
 *  - dumps all kernel nexthops of a table/family (NHAF_KNHOPS), or
 *  - dumps all master user nexthops matching the filter.
 *
 * Returns 0 on success or an errno value.
 */
static int
rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
    struct nl_pstate *npt)
{
	struct nl_parsed_nhop attrs = {};
	struct user_nhop *unhop;
	struct unhop_ctl *ctl;
	UN_TRACKER;
	int error;

	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
	if (error != 0)
		return (error);

	struct netlink_walkargs wa = {
		.nw = npt->nw,
		.hdr.nlmsg_pid = hdr->nlmsg_pid,
		.hdr.nlmsg_seq = hdr->nlmsg_seq,
		.hdr.nlmsg_flags = hdr->nlmsg_flags,
		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
	};

	/* Single user nexthop lookup */
	if (attrs.nha_id != 0) {
		struct user_nhop key = { .un_idx = attrs.nha_id };

		ctl = atomic_load_ptr(&V_un_ctl);
		if (__predict_false(ctl == NULL))
			return (ESRCH);

		NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
		UN_RLOCK(ctl);
		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
		UN_RUNLOCK(ctl);

		if (unhop == NULL)
			return (ESRCH);
		dump_unhop(unhop, &wa.hdr, wa.nw);
		return (0);
	}

	/* Single kernel nexthop lookup by kernel index */
	if (attrs.nhaf_kid != 0) {
		struct nhop_iter iter = {
			.fibnum = attrs.nhaf_table,
			.family = attrs.nhaf_family,
		};
		struct nhop_object *nh;
		int ret = ESRCH;

		NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
		nh = nhops_iter_start(&iter);
		while (nh != NULL) {
			NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
			if (nhop_get_idx(nh) == attrs.nhaf_kid) {
				dump_nhop(nh, 0, &wa.hdr, wa.nw);
				ret = 0;
				break;
			}
			nh = nhops_iter_next(&iter);
		}
		nhops_iter_stop(&iter);
		return (ret);
	}

	if (attrs.nhaf_knhops) {
		/* Dump of all kernel nexthops in the table/family */
		struct nhop_iter iter = {
			.fibnum = attrs.nhaf_table,
			.family = attrs.nhaf_family,
		};
		struct nhop_object *nh;

		NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
		nh = nhops_iter_start(&iter);
		while (nh != NULL) {
			dump_nhop(nh, 0, &wa.hdr, wa.nw);
			nh = nhops_iter_next(&iter);
		}
		nhops_iter_stop(&iter);
	} else {
		/* Dump of all master user nexthops passing the filter */
		ctl = atomic_load_ptr(&V_un_ctl);
		if (__predict_false(ctl == NULL))
			return (ESRCH);

		NL_LOG(LOG_DEBUG2, "DUMP unhops");
		UN_RLOCK(ctl);
		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
		CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
			if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
				dump_unhop(unhop, &wa.hdr, wa.nw);
		} CHT_SLIST_FOREACH_END;
		UN_RUNLOCK(ctl);
	}

	/* Terminate the multi-part dump */
	if (wa.error == 0) {
		if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
			return (ENOMEM);
	}
	return (0);
}

/*
 * Nexthop message handlers registered by rtnl_nexthops_init().
 * The NEW/DEL entries set .priv to PRIV_NET_ROUTE; the GET entry sets
 * no privilege.
 */
static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_NEWNEXTHOP,
		.name = "RTM_NEWNEXTHOP",
		.cb = &rtnl_handle_newnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_DELNEXTHOP,
		.name = "RTM_DELNEXTHOP",
		.cb = &rtnl_handle_delnhop,
		.priv = PRIV_NET_ROUTE,
	},
	{
		.cmd = NL_RTM_GETNEXTHOP,
		.name = "RTM_GETNEXTHOP",
		.cb = &rtnl_handle_getnhop,
	}
};

/* Every parser declared in this file, passed to NL_VERIFY_PARSERS() at init */
static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };

/*
 * Module entry point: verifies the parser tables and registers the
 * nexthop message handlers.
 */
void
rtnl_nexthops_init(void)
{
	NL_VERIFY_PARSERS(all_parsers);
	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
}