Diffstat (limited to 'sys/netinet/ipfw')
-rw-r--r--  sys/netinet/ipfw/dn_heap.c            550
-rw-r--r--  sys/netinet/ipfw/dn_heap.h            191
-rw-r--r--  sys/netinet/ipfw/dn_sched.h           189
-rw-r--r--  sys/netinet/ipfw/dn_sched_fifo.c      120
-rw-r--r--  sys/netinet/ipfw/dn_sched_prio.c      229
-rw-r--r--  sys/netinet/ipfw/dn_sched_qfq.c       864
-rw-r--r--  sys/netinet/ipfw/dn_sched_rr.c        307
-rw-r--r--  sys/netinet/ipfw/dn_sched_wf2q.c      373
-rw-r--r--  sys/netinet/ipfw/dummynet.txt         860
-rw-r--r--  sys/netinet/ipfw/ip_dn_glue.c         845
-rw-r--r--  sys/netinet/ipfw/ip_dn_io.c           801
-rw-r--r--  sys/netinet/ipfw/ip_dn_private.h      406
-rw-r--r--  sys/netinet/ipfw/ip_dummynet.c        3867
-rw-r--r--  sys/netinet/ipfw/ip_fw2.c             89
-rw-r--r--  sys/netinet/ipfw/ip_fw_dynamic.c      16
-rw-r--r--  sys/netinet/ipfw/ip_fw_log.c          4
-rw-r--r--  sys/netinet/ipfw/ip_fw_pfil.c         16
-rw-r--r--  sys/netinet/ipfw/ip_fw_private.h      31
-rw-r--r--  sys/netinet/ipfw/ip_fw_sockopt.c      394
-rw-r--r--  sys/netinet/ipfw/ip_fw_table.c        9
-rw-r--r--  sys/netinet/ipfw/test/Makefile        51
-rw-r--r--  sys/netinet/ipfw/test/dn_test.h       175
-rw-r--r--  sys/netinet/ipfw/test/main.c          636
-rw-r--r--  sys/netinet/ipfw/test/mylist.h        49
-rw-r--r--  sys/netinet/ipfw/test/test_dn_heap.c  162
-rw-r--r--  sys/netinet/ipfw/test/test_dn_sched.c 89
26 files changed, 9281 insertions, 2042 deletions
diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c
new file mode 100644
index 000000000000..6773851327c3
--- /dev/null
+++ b/sys/netinet/ipfw/dn_heap.c
@@ -0,0 +1,550 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, used in dummynet
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#ifdef _KERNEL
+__FBSDID("$FreeBSD$");
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <netinet/ipfw/dn_heap.h>
+#ifndef log
+#define log(x, arg...)
+#endif
+
+#else /* !_KERNEL */
+
+#include <stdio.h>
+#include <dn_test.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x...) fprintf(stderr, ## x), exit(1)
+#define MALLOC_DEFINE(a, b, c)
+static void *my_malloc(int s) { return malloc(s); }
+static void my_free(void *p) { free(p); }
+#define malloc(s, t, w) my_malloc(s)
+#define free(p, t) my_free(p)
+#endif /* !_KERNEL */
+
+MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to initialize or expand the heap as needed;
+ * the size grows in rounded-up increments (see heap_resize() below).
+ * Returns 1 on error, 0 on success.
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( (x)+(x) + 1 )
+#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15
+
+static int
+heap_resize(struct dn_heap *h, unsigned int new_size)
+{
+ struct dn_heap_entry *p;
+
+ if (h->size >= new_size ) /* have enough room */
+ return 0;
+#if 1 /* round to the next power of 2 */
+ new_size |= new_size >> 1;
+ new_size |= new_size >> 2;
+ new_size |= new_size >> 4;
+ new_size |= new_size >> 8;
+ new_size |= new_size >> 16;
+#else
+ new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;
+#endif
+ p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
+ if (p == NULL) {
+ printf("--- %s, resize %d failed\n", __func__, new_size );
+ return 1; /* error */
+ }
+ if (h->size > 0) {
+ bcopy(h->p, p, h->size * sizeof(*p) );
+ free(h->p, M_DN_HEAP);
+ }
+ h->p = p;
+ h->size = new_size;
+ return 0;
+}
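+
+/*
+ * Note: the bit-smearing above actually rounds to the next power of
+ * two minus one (e.g. a request for 20 slots becomes 31), which is
+ * fine as an array size and avoids branches.
+ */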
+
+int
+heap_init(struct dn_heap *h, int size, int ofs)
+{
+ if (heap_resize(h, size))
+ return 1;
+ h->elements = 0;
+ h->ofs = ofs;
+ return 0;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If ofs > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \
+ } while (0)
+/*
+ * RESET_OFFSET is used for sanity checks. It sets ofs
+ * to an invalid value.
+ */
+#define RESET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \
+ } while (0)
+
+int
+heap_insert(struct dn_heap *h, uint64_t key1, void *p)
+{
+ int son = h->elements;
+
+ //log("%s key %llu p %p\n", __FUNCTION__, key1, p);
+ if (p == NULL) { /* data already there, set starting point */
+ son = key1;
+ } else { /* insert new element at the end, possibly resize */
+ son = h->elements;
+ if (son == h->size) /* need resize... */
+ // XXX expand by 16 or so
+ if (heap_resize(h, h->elements+16) )
+ return 1; /* failure... */
+ h->p[son].object = p;
+ h->p[son].key = key1;
+ h->elements++;
+ }
+ /* make sure that son >= father along the path */
+ while (son > 0) {
+ int father = HEAP_FATHER(son);
+ struct dn_heap_entry tmp;
+
+ if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+ break; /* found right position */
+ /* son smaller than father, swap and repeat */
+ HEAP_SWAP(h->p[son], h->p[father], tmp);
+ SET_OFFSET(h, son);
+ son = father;
+ }
+ SET_OFFSET(h, son);
+ return 0;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+ int child, father, max = h->elements - 1;
+
+ if (max < 0) {
+		printf("--- %s: empty heap %p\n", __FUNCTION__, h);
+ return;
+ }
+ if (obj == NULL)
+ father = 0; /* default: move up smallest child */
+ else { /* extract specific element, index is at offset */
+ if (h->ofs <= 0)
+ panic("%s: extract from middle not set on %p\n",
+ __FUNCTION__, h);
+ father = *((int *)((char *)obj + h->ofs));
+ if (father < 0 || father >= h->elements) {
+ panic("%s: father %d out of bound 0..%d\n",
+ __FUNCTION__, father, h->elements);
+ }
+ }
+ /*
+ * below, father is the index of the empty element, which
+ * we replace at each step with the smallest child until we
+ * reach the bottom level.
+ */
+	// XXX why does removing RESET_OFFSET increase runtime by 10% ?
+ RESET_OFFSET(h, father);
+ while ( (child = HEAP_LEFT(father)) <= max ) {
+ if (child != max &&
+ DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+ child++; /* take right child, otherwise left */
+ h->p[father] = h->p[child];
+ SET_OFFSET(h, father);
+ father = child;
+ }
+ h->elements--;
+ if (father != max) {
+ /*
+ * Fill hole with last entry and bubble up,
+ * reusing the insert code
+ */
+ h->p[father] = h->p[max];
+ heap_insert(h, father, NULL);
+ }
+}
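+
+/*
+ * Usage sketch (hypothetical caller, not part of this file): an object
+ * reserves an int32_t so it can later be extracted from the middle of
+ * the heap through the offset passed to heap_init().
+ *
+ *	struct my_obj {
+ *		uint64_t deadline;
+ *		int32_t heap_pos;	// maintained via SET_OFFSET()
+ *	};
+ *	struct dn_heap h = { 0 };
+ *	struct my_obj o = { .deadline = 100 };
+ *
+ *	heap_init(&h, 16, offsetof(struct my_obj, heap_pos));
+ *	heap_insert(&h, o.deadline, &o);	// ordered insert
+ *	heap_extract(&h, &o);	// random extraction via heap_pos
+ *	heap_free(&h);
+ */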
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+static void
+heap_move(struct dn_heap *h, uint64_t new_key, void *object)
+{
+ int temp, i, max = h->elements-1;
+ struct dn_heap_entry *p, buf;
+
+ if (h->ofs <= 0)
+ panic("cannot move items on this heap");
+ p = h->p; /* shortcut */
+
+ i = *((int *)((char *)object + h->ofs));
+ if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
+ p[i].key = new_key;
+ for (; i>0 &&
+ DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
+ i = temp ) { /* bubble up */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ }
+ } else { /* must move down */
+ p[i].key = new_key;
+ while ( (temp = HEAP_LEFT(i)) <= max ) {
+ /* found left child */
+ if (temp != max &&
+ DN_KEY_LT(p[temp+1].key, p[temp].key))
+ temp++; /* select child with min key */
+			if (DN_KEY_LT(p[temp].key, new_key)) {
+ /* go down */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ } else
+ break;
+ i = temp;
+ }
+ }
+ SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+ int i;
+
+ for (i = 0; i < h->elements; i++ )
+ heap_insert(h, i , NULL);
+}
+
+int
+heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
+ uintptr_t arg)
+{
+ int i, ret, found;
+
+ for (i = found = 0 ; i < h->elements ;) {
+ ret = fn(h->p[i].object, arg);
+ if (ret & HEAP_SCAN_DEL) {
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ found++ ;
+ } else
+ i++ ;
+ if (ret & HEAP_SCAN_END)
+ break;
+ }
+ if (found)
+ heapify(h);
+ return found;
+}
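+
+/*
+ * Example heap_scan() callback (illustrative only): remove every entry
+ * whose object matches 'arg', leaving the others in place.
+ *
+ *	static int
+ *	drop_match(void *obj, uintptr_t arg)
+ *	{
+ *		return (obj == (void *)arg) ? HEAP_SCAN_DEL : 0;
+ *	}
+ *
+ *	found = heap_scan(&h, drop_match, (uintptr_t)victim);
+ */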
+
+/*
+ * cleanup the heap and free data structure
+ */
+void
+heap_free(struct dn_heap *h)
+{
+ if (h->size >0 )
+ free(h->p, M_DN_HEAP);
+ bzero(h, sizeof(*h) );
+}
+
+/*
+ * hash table support.
+ */
+
+struct dn_ht {
+ int buckets; /* how many buckets, really buckets - 1*/
+ int entries; /* how many entries */
+ int ofs; /* offset of link field */
+ uint32_t (*hash)(uintptr_t, int, void *arg);
+ int (*match)(void *_el, uintptr_t key, int, void *);
+ void *(*newh)(uintptr_t, int, void *);
+ void **ht; /* bucket heads */
+};
+/*
+ * Initialize, allocating bucket pointers inline.
+ * Recycle previous record if possible.
+ * If the 'newh' function is not supplied, we assume that the
+ * key passed to ht_find is the same object to be stored in.
+ */
+struct dn_ht *
+dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
+ uint32_t (*h)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *))
+{
+ int l;
+
+ /*
+ * Notes about rounding bucket size to a power of two.
+ * Given the original bucket size, we compute the nearest lower and
+ * higher power of two, minus 1 (respectively b_min and b_max) because
+ * this value will be used to do an AND with the index returned
+ * by hash function.
+	 * To choose between these two values, the original bucket size is
+	 * compared with b_min. If the original size is greater than 4/3 b_min,
+	 * we round the bucket size to b_max, else to b_min.
+	 * This ratio tries to round to the nearest power of two, favoring
+	 * the larger size when the gap between two powers is relatively
+	 * big.
+	 * Rounding the bucket size to a power of two avoids a modulo
+	 * operation when computing the bucket.
+	 * The ht->buckets variable stores the bucket size - 1, so the
+	 * bucket is simply an AND of the hash index with ht->buckets
+	 * instead of a modulo.
+ */
+ int b_min; /* min buckets */
+ int b_max; /* max buckets */
+ int b_ori; /* original buckets */
+
+ if (h == NULL || match == NULL) {
+		printf("--- missing hash or match function\n");
+ return NULL;
+ }
+ if (buckets < 1 || buckets > 65536)
+ return NULL;
+
+ b_ori = buckets;
+ /* calculate next power of 2, - 1*/
+ buckets |= buckets >> 1;
+ buckets |= buckets >> 2;
+ buckets |= buckets >> 4;
+ buckets |= buckets >> 8;
+ buckets |= buckets >> 16;
+
+ b_max = buckets; /* Next power */
+ b_min = buckets >> 1; /* Previous power */
+
+ /* Calculate the 'nearest' bucket size */
+ if (b_min * 4000 / 3000 < b_ori)
+ buckets = b_max;
+ else
+ buckets = b_min;
+
+ if (ht) { /* see if we can reuse */
+ if (buckets <= ht->buckets) {
+ ht->buckets = buckets;
+ } else {
+ /* free pointers if not allocated inline */
+ if (ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ ht = NULL;
+ }
+ }
+ if (ht == NULL) {
+		/* Allocate buckets + 1 entries because 'buckets' stores
+		 * size - 1 and is used as an AND mask on the index
+		 * returned by the hash function
+		 */
+ l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
+ ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
+ }
+ if (ht) {
+ ht->ht = (void **)(ht + 1);
+ ht->buckets = buckets;
+ ht->ofs = ofs;
+ ht->hash = h;
+ ht->match = match;
+ ht->newh = newh;
+ }
+ return ht;
+}
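+
+/*
+ * Worked example of the rounding in dn_ht_init(): for a requested size
+ * of 100 buckets, the bit-smearing yields b_max = 127 and b_min = 63.
+ * Since 63 * 4/3 = 84 < 100 we keep b_max, so ht->buckets = 127 and the
+ * bucket index is computed as (hash & 127) instead of (hash % 100).
+ */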
+
+/* dummy callback for dn_ht_free to unlink all */
+static int
+do_del(void *obj, void *arg)
+{
+ return DNHT_SCAN_DEL;
+}
+
+void
+dn_ht_free(struct dn_ht *ht, int flags)
+{
+ if (ht == NULL)
+ return;
+ if (flags & DNHT_REMOVE) {
+ (void)dn_ht_scan(ht, do_del, NULL);
+ } else {
+ if (ht->ht && ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ }
+}
+
+int
+dn_ht_entries(struct dn_ht *ht)
+{
+ return ht ? ht->entries : 0;
+}
+
+/* lookup and optionally create or delete element */
+void *
+dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
+{
+ int i;
+ void **pp, *p;
+
+ if (ht == NULL) /* easy on an empty hash */
+ return NULL;
+ i = (ht->buckets == 1) ? 0 :
+ (ht->hash(key, flags, arg) & ht->buckets);
+
+ for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) {
+ if (flags & DNHT_MATCH_PTR) {
+ if (key == (uintptr_t)p)
+ break;
+ } else if (ht->match(p, key, flags, arg)) /* found match */
+ break;
+ }
+ if (p) {
+ if (flags & DNHT_REMOVE) {
+ /* link in the next element */
+ *pp = *(void **)((char *)p + ht->ofs);
+ *(void **)((char *)p + ht->ofs) = NULL;
+ ht->entries--;
+ }
+ } else if (flags & DNHT_INSERT) {
+ // printf("%s before calling new, bucket %d ofs %d\n",
+ // __FUNCTION__, i, ht->ofs);
+ p = ht->newh ? ht->newh(key, flags, arg) : (void *)key;
+ // printf("%s newh returns %p\n", __FUNCTION__, p);
+ if (p) {
+ ht->entries++;
+ *(void **)((char *)p + ht->ofs) = ht->ht[i];
+ ht->ht[i] = p;
+ }
+ }
+ return p;
+}
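+
+/*
+ * Usage sketch for dn_ht_find() (hypothetical object, illustrative
+ * only); 'ht_link' is the field whose offset was passed to dn_ht_init().
+ *
+ *	struct my_entry {
+ *		struct my_entry *ht_link;
+ *		uintptr_t key;
+ *	};
+ *
+ *	// lookup, creating the entry through newh() if absent:
+ *	e = dn_ht_find(ht, key, DNHT_INSERT, NULL);
+ *	// unlink a specific entry, matching by pointer:
+ *	dn_ht_find(ht, (uintptr_t)e, DNHT_MATCH_PTR | DNHT_REMOVE, NULL);
+ */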
+
+/*
+ * do a scan with the option to delete the object. Extract next before
+ * running the callback because the element may be destroyed there.
+ */
+int
+dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ for (i = 0; i <= ht->buckets; i++) {
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ }
+ return found;
+}
+
+/*
+ * Similar to dn_ht_scan(), except that the scan is performed only
+ * on the single bucket '*bucket'. If the bucket number is out of
+ * range, it is reset to a valid value through the pointer.
+ */
+int
+dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
+ void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ if (*bucket > ht->buckets)
+ *bucket = 0;
+ i = *bucket;
+
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ return found;
+}
+
diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h
new file mode 100644
index 000000000000..c95473ade392
--- /dev/null
+++ b/sys/netinet/ipfw/dn_heap.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, header file
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_HEAP_H
+#define _IP_DN_HEAP_H
+
+#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0)
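+/*
+ * The signed difference makes the comparison robust to 64-bit
+ * wraparound: e.g. DN_KEY_LT(0xfffffffffffffffe, 1) is true because
+ * (int64_t)(a - b) is negative, so a key just before the wrap still
+ * sorts ahead of one just after it.
+ */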
+
+/*
+ * This module implements a binary heap supporting random extraction.
+ *
+ * A heap entry contains an uint64_t key and a pointer to object.
+ * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'
+ *
+ * The heap is a struct dn_heap plus a dynamically allocated
+ * array of dn_heap_entry entries. 'size' represents the size of
+ * the array, 'elements' count entries in use. The topmost
+ * element has the smallest key.
+ * The heap supports ordered insert, and extract from the top.
+ * To extract an object from the middle of the heap, the object
+ * must reserve an 'int32_t' to store the position of the object
+ * in the heap itself, and the location of this field must be
+ * passed as an argument to heap_init() -- use -1 if the feature
+ * is not used.
+ */
+struct dn_heap_entry {
+ uint64_t key; /* sorting key, smallest comes first */
+ void *object; /* object pointer */
+};
+
+struct dn_heap {
+ int size; /* the size of the array */
+ int elements; /* elements in use */
+ int ofs; /* offset in the object of heap index */
+ struct dn_heap_entry *p; /* array of "size" entries */
+};
+
+enum {
+ HEAP_SCAN_DEL = 1,
+ HEAP_SCAN_END = 2,
+};
+
+/*
+ * heap_init() reinitializes the heap setting the size and the offset
+ * of the index for random extraction (use -1 if not used).
+ * The 'elements' counter is set to 0.
+ *
+ * SET_HEAP_OFS() indicates where, in the object, is stored the index
+ * for random extractions from the heap.
+ *
+ * heap_free() frees the memory associated to a heap.
+ *
+ * heap_insert() adds a key-pointer pair to the heap
+ *
+ * HEAP_TOP() returns a pointer to the top element of the heap,
+ * but makes no check on its existence (XXX should we change ?)
+ *
+ * heap_extract() removes the entry at the top, returning the pointer.
+ * (the key should have been read before).
+ *
+ * heap_scan() invokes a callback on each entry of the heap.
+ * The callback can return a combination of HEAP_SCAN_DEL and
+ * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must
+ * be removed, and HEAP_SCAN_END means to terminate the scan.
+ * heap_scan() returns the number of elements removed.
+ * Because the order is not guaranteed, we should use heap_scan()
+ * only as a last resort mechanism.
+ */
+#define HEAP_TOP(h) ((h)->p)
+#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0)
+int heap_init(struct dn_heap *h, int size, int ofs);
+int heap_insert(struct dn_heap *h, uint64_t key1, void *p);
+void heap_extract(struct dn_heap *h, void *obj);
+void heap_free(struct dn_heap *h);
+int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t);
+
+/*------------------------------------------------------
+ * This module implements a generic hash table with support for
+ * running callbacks on the entire table. To avoid allocating
+ * memory during hash table operations, objects must reserve
+ * space for a link field. XXX if the heap is moderately full,
+ * an SLIST suffices, and we can tolerate the cost of a hash
+ * computation on each removal.
+ *
+ * dn_ht_init() initializes the table, setting the number of
+ * buckets, the offset of the link field, the main callbacks.
+ * Callbacks are:
+ *
+ * hash(key, flags, arg) called to return a bucket index.
+ * match(obj, key, flags, arg) called to determine if key
+ * matches the current 'obj' in the heap
+ * newh(key, flags, arg) optional, used to allocate a new
+ * object during insertions.
+ *
+ * dn_ht_free() frees the table or unlinks its elements.
+ *	DNHT_REMOVE unlinks the elements, 0 frees the table.
+ *	You need two calls to do both.
+ *
+ * dn_ht_find() is the main lookup function, which can also be
+ * used to insert or delete elements in the hash table.
+ * The final 'arg' is passed to all callbacks.
+ *
+ * dn_ht_scan() is used to invoke a callback on all entries of
+ *	the table, or possibly on just one bucket. The callback
+ *	is invoked with a pointer to the object, and may return a
+ *	combination of DNHT_SCAN_DEL and DNHT_SCAN_END to request the
+ *	removal of the object from the table and/or the end of the
+ *	scan, respectively.
+ *
+ * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans
+ *	only the specified bucket of the table. The bucket is an in-out
+ *	parameter, and is reset to a valid value if the original
+ *	is invalid.
+ *
+ * A combination of flags can be used to modify the operation
+ * of the dn_ht_find(), and of the callbacks:
+ *
+ * DNHT_KEY_IS_OBJ means the key is the object pointer.
+ *	It is usually of interest for the hash and match functions.
+ *
+ * DNHT_MATCH_PTR during a lookup, match pointers instead
+ * of calling match(). Normally used when removing specific
+ * entries. Does not imply KEY_IS_OBJ as the latter _is_ used
+ * by the match function.
+ *
+ * DNHT_INSERT inserts the element if not found.
+ *	Calls newh() to allocate a new object unless
+ *	DNHT_KEY_IS_OBJ is set.
+ *
+ * DNHT_UNIQUE only insert if object not found.
+ * XXX should it imply DNHT_INSERT ?
+ *
+ * DNHT_REMOVE remove objects if we find them.
+ */
+struct dn_ht; /* should be opaque */
+
+struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs,
+ uint32_t (*hash)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *));
+void dn_ht_free(struct dn_ht *, int flags);
+
+void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *);
+int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *);
+int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *);
+int dn_ht_entries(struct dn_ht *);
+
+enum { /* flags values.
+ * first two are returned by the scan callback to indicate
+ * to delete the matching element or to end the scan
+ */
+ DNHT_SCAN_DEL = 0x0001,
+ DNHT_SCAN_END = 0x0002,
+ DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */
+ DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */
+ DNHT_INSERT = 0x0010, /* insert if not found */
+ DNHT_UNIQUE = 0x0020, /* report error if already there */
+ DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */
+};
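+
+/*
+ * Minimal callback sketch (hypothetical, for illustration): the object
+ * itself is the key (DNHT_KEY_IS_OBJ style) and embeds the link field.
+ *
+ *	static uint32_t
+ *	my_hash(uintptr_t key, int flags, void *arg)
+ *	{
+ *		return (uint32_t)(key >> 4);	// any cheap mix will do
+ *	}
+ *
+ *	static int
+ *	my_match(void *obj, uintptr_t key, int flags, void *arg)
+ *	{
+ *		return (obj == (void *)key);
+ *	}
+ *
+ *	ht = dn_ht_init(NULL, 64, offsetof(struct my_entry, ht_link),
+ *	    my_hash, my_match, NULL);
+ */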
+
+#endif /* _IP_DN_HEAP_H */
diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h
new file mode 100644
index 000000000000..b6bf24e466af
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The API to write a packet scheduling algorithm for dummynet.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DN_SCHED_H
+#define _DN_SCHED_H
+
+#define DN_MULTIQUEUE 0x01
+/*
+ * Descriptor for a scheduling algorithm.
+ * Contains all function pointers for a given scheduler
+ * This is typically created when a module is loaded, and stored
+ * in a global list of schedulers.
+ */
+struct dn_alg {
+ uint32_t type; /* the scheduler type */
+ const char *name; /* scheduler name */
+ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */
+
+ /*
+ * The following define the size of 3 optional data structures
+ * that may need to be allocated at runtime, and are appended
+ * to each of the base data structures: scheduler, sched.inst,
+ * and queue. We don't have a per-flowset structure.
+ */
+ /* + parameters attached to the template, e.g.
+ * default queue sizes, weights, quantum size, and so on;
+ */
+ size_t schk_datalen;
+
+ /* + per-instance parameters, such as timestamps,
+ * containers for queues, etc;
+ */
+ size_t si_datalen;
+
+ size_t q_datalen; /* per-queue parameters (e.g. S,F) */
+
+ /*
+ * Methods implemented by the scheduler:
+ * enqueue enqueue packet 'm' on scheduler 's', queue 'q'.
+ * q is NULL for !MULTIQUEUE.
+ *	Return 0 on success, 1 on drop (packet consumed anyway).
+ * Note that q should be interpreted only as a hint
+ * on the flow that the mbuf belongs to: while a
+ * scheduler will normally enqueue m into q, it is ok
+ * to leave q alone and put the mbuf elsewhere.
+ * This function is called in two cases:
+ * - when a new packet arrives to the scheduler;
+ * - when a scheduler is reconfigured. In this case the
+ * call is issued by the new_queue callback, with a
+ * non empty queue (q) and m pointing to the first
+ * mbuf in the queue. For this reason, the function
+ * should internally check for (m != q->mq.head)
+ * before calling dn_enqueue().
+ *
+ * dequeue Called when scheduler instance 's' can
+ * dequeue a packet. Return NULL if none are available.
+ * XXX what about non work-conserving ?
+ *
+ * config called on 'sched X config ...', normally writes
+ * in the area of size sch_arg
+ *
+ * destroy called on 'sched delete', frees everything
+ * in sch_arg (other parts are handled by more specific
+ * functions)
+ *
+ * new_sched called when a new instance is created, e.g.
+ * to create the local queue for !MULTIQUEUE, set V or
+ * copy parameters for WFQ, and so on.
+ *
+ * free_sched called when deleting an instance, cleans
+ * extra data in the per-instance area.
+ *
+ * new_fsk called when a flowset is linked to a scheduler,
+ * e.g. to validate parameters such as weights etc.
+ * free_fsk when a flowset is unlinked from a scheduler.
+ * (probably unnecessary)
+ *
+ * new_queue called to set the per-queue parameters,
+ * e.g. S and F, adjust sum of weights in the parent, etc.
+ *
+ *	The new_queue callback is normally called when
+ *	creating a new queue. In some cases (such as a
+ *	scheduler change or reconfiguration) it can be called
+ *	with a non-empty queue. In that case the callback may
+ *	need to call the enqueue function; it should then
+ *	eventually call enqueue() passing as m the first
+ *	element in the queue.
+ *
+ * free_queue actions related to a queue removal, e.g. undo
+ * all the above. If the queue has data in it, also remove
+ * from the scheduler. This can e.g. happen during a reconfigure.
+ */
+ int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
+ struct mbuf *);
+ struct mbuf * (*dequeue)(struct dn_sch_inst *);
+
+ int (*config)(struct dn_schk *);
+ int (*destroy)(struct dn_schk*);
+ int (*new_sched)(struct dn_sch_inst *);
+ int (*free_sched)(struct dn_sch_inst *);
+ int (*new_fsk)(struct dn_fsk *f);
+ int (*free_fsk)(struct dn_fsk *f);
+ int (*new_queue)(struct dn_queue *q);
+ int (*free_queue)(struct dn_queue *q);
+
+ /* run-time fields */
+ int ref_count; /* XXX number of instances in the system */
+ SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
+};
+
+/* MSVC does not support designated initializers so we need this ugly macro */
+#ifdef _WIN32
+#define _SI(fld)
+#else
+#define _SI(fld) fld
+#endif
+
+/*
+ * Additionally, dummynet exports some functions and macros
+ * to be used by schedulers:
+ */
+
+void dn_free_pkts(struct mbuf *mnext);
+int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
+/* bound a variable between min and max */
+int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);
+
+/*
+ * Extract the head of a queue, update stats. Must be the very last
+ * thing done on a dequeue as the queue itself may go away.
+ */
+static __inline struct mbuf*
+dn_dequeue(struct dn_queue *q)
+{
+ struct mbuf *m = q->mq.head;
+ if (m == NULL)
+ return NULL;
+ q->mq.head = m->m_nextpkt;
+ q->ni.length--;
+ q->ni.len_bytes -= m->m_pkthdr.len;
+ if (q->_si) {
+ q->_si->ni.length--;
+ q->_si->ni.len_bytes -= m->m_pkthdr.len;
+ }
+ if (q->ni.length == 0) /* queue is now idle */
+ q->q_time = dn_cfg.curr_time;
+ return m;
+}
+
+int dn_sched_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNSCHED_MODULE(name, dnsched) \
+ static moduledata_t name##_mod = { \
+ #name, dn_sched_modevent, dnsched \
+ }; \
+ DECLARE_MODULE(name, name##_mod, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3);
+#endif /* _DN_SCHED_H */
diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c
new file mode 100644
index 000000000000..0bb3800a9c2a
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_fifo.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+/*
+ * This file implements a FIFO scheduler for a single queue.
+ * The queue is allocated as part of the scheduler instance,
+ * and the single flowset in the template stores the
+ * queue size and policy.
+ * Enqueue and dequeue use the default library functions.
+ */
+static int
+fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
+{
+ /* XXX if called with q != NULL and m=NULL, this is a
+ * re-enqueue from an existing scheduler, which we should
+ * handle.
+ */
+ return dn_enqueue((struct dn_queue *)(si+1), m, 0);
+}
+
+static struct mbuf *
+fifo_dequeue(struct dn_sch_inst *si)
+{
+ return dn_dequeue((struct dn_queue *)(si + 1));
+}
+
+static int
+fifo_new_sched(struct dn_sch_inst *si)
+{
+ /* This scheduler instance contains the queue */
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+
+ set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+ q->_si = si;
+ q->fs = si->sched->fs;
+ return 0;
+}
+
+static int
+fifo_free_sched(struct dn_sch_inst *si)
+{
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+ dn_free_pkts(q->mq.head);
+ bzero(q, sizeof(*q));
+ return 0;
+}
+
+/*
+ * FIFO scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fifo_desc = {
+ _SI( .type = ) DN_SCHED_FIFO,
+ _SI( .name = ) "FIFO",
+ _SI( .flags = ) 0,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct dn_queue),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fifo_enqueue,
+ _SI( .dequeue = ) fifo_dequeue,
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) fifo_new_sched,
+ _SI( .free_sched = ) fifo_free_sched,
+ _SI( .new_fsk = ) NULL,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL,
+ _SI( .free_queue = ) NULL,
+};
+
+DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/sys/netinet/ipfw/dn_sched_prio.c b/sys/netinet/ipfw/dn_sched_prio.c
new file mode 100644
index 000000000000..28f60062cfc5
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_prio.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_PRIO 5 //XXX
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
+#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+/* Size of the array of queue pointers. */
+#define BITMAP_T unsigned long
+#define MAXPRIO (sizeof(BITMAP_T) * 8)
+
+/*
+ * The scheduler instance contains an array of pointers to queues,
+ * one for each priority, and a bitmap listing backlogged queues.
+ */
+struct prio_si {
+ BITMAP_T bitmap; /* array bitmap */
+	struct dn_queue *q_array[MAXPRIO]; /* Array of queue pointers */
+};
+
+/*
+ * If a queue with the same priority is already backlogged, use
+ * that one instead of the queue passed as argument.
+ */
+static int
+prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+ int prio = q->fs->fs.par[0];
+
+ if (test_bit(prio, &si->bitmap) == 0) {
+ /* No queue with this priority, insert */
+ __set_bit(prio, &si->bitmap);
+ si->q_array[prio] = q;
+ } else { /* use the existing queue */
+ q = si->q_array[prio];
+ }
+ if (dn_enqueue(q, m, 0))
+ return 1;
+ return 0;
+}
+
+/*
+ * Packets are dequeued only from the highest priority queue.
+ * ffs() returns the position of the lowest set bit in the bitmap;
+ * that position, minus one, is the array index holding the pointer
+ * to the highest priority queue.
+ * After the dequeue, if this queue becomes empty, its index is removed
+ * from the bitmap.
+ * The scheduler is idle if the bitmap is empty.
+ *
+ * NOTE: highest priority is 0, lowest is sched->max_prio_q
+ */
+static struct mbuf *
+prio_dequeue(struct dn_sch_inst *_si)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+ struct mbuf *m;
+ struct dn_queue *q;
+ int prio;
+
+ if (si->bitmap == 0) /* scheduler idle */
+ return NULL;
+
+ prio = ffs(si->bitmap) - 1;
+
+ /* Take the highest priority queue in the scheduler */
+ q = si->q_array[prio];
+ // assert(q)
+
+ m = dn_dequeue(q);
+ if (q->mq.head == NULL) {
+ /* Queue is now empty, remove from scheduler
+ * and mark it
+ */
+ si->q_array[prio] = NULL;
+ __clear_bit(prio, &si->bitmap);
+ }
+ return m;
+}
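+
+/*
+ * Example: with priorities 0 and 2 backlogged, si->bitmap == 0x5;
+ * ffs(0x5) returns 1, so prio = 0 and q_array[0] is served first.
+ */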
+
+static int
+prio_new_sched(struct dn_sch_inst *_si)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+
+ bzero(si->q_array, sizeof(si->q_array));
+ si->bitmap = 0;
+
+ return 0;
+}
+
+static int
+prio_new_fsk(struct dn_fsk *fs)
+{
+	/* Check that the priority is between 0 and MAXPRIO-1 */
+ ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority");
+ return 0;
+}
+
+static int
+prio_new_queue(struct dn_queue *q)
+{
+ struct prio_si *si = (struct prio_si *)(q->_si + 1);
+ int prio = q->fs->fs.par[0];
+ struct dn_queue *oldq;
+
+ q->ni.oid.subtype = DN_SCHED_PRIO;
+
+ if (q->mq.head == NULL)
+ return 0;
+
+	/* Queue is not empty: must insert it into the scheduler or append
+	 * its mbufs to an existing queue. This partly duplicates prio_enqueue
+ */
+ if (test_bit(prio, &si->bitmap) == 0) {
+ /* No queue with this priority, insert */
+ __set_bit(prio, &si->bitmap);
+ si->q_array[prio] = q;
+ } else if ( (oldq = si->q_array[prio]) != q) {
+ /* must append to the existing queue.
+ * can simply append q->mq.head to q2->...
+ * and add the counters to those of q2
+ */
+ oldq->mq.tail->m_nextpkt = q->mq.head;
+ oldq->mq.tail = q->mq.tail;
+ oldq->ni.length += q->ni.length;
+ q->ni.length = 0;
+ oldq->ni.len_bytes += q->ni.len_bytes;
+ q->ni.len_bytes = 0;
+ q->mq.tail = q->mq.head = NULL;
+ }
+ return 0;
+}
+
+static int
+prio_free_queue(struct dn_queue *q)
+{
+ int prio = q->fs->fs.par[0];
+ struct prio_si *si = (struct prio_si *)(q->_si + 1);
+
+ if (si->q_array[prio] == q) {
+ si->q_array[prio] = NULL;
+ __clear_bit(prio, &si->bitmap);
+ }
+ return 0;
+}
+
+
+static struct dn_alg prio_desc = {
+ _SI( .type = ) DN_SCHED_PRIO,
+ _SI( .name = ) "PRIO",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ /* we need extra space in the si and the queue */
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct prio_si),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) prio_enqueue,
+ _SI( .dequeue = ) prio_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) prio_new_sched,
+ _SI( .free_sched = ) NULL,
+
+ _SI( .new_fsk = ) prio_new_fsk,
+ _SI( .free_fsk = ) NULL,
+
+ _SI( .new_queue = ) prio_new_queue,
+ _SI( .free_queue = ) prio_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc);
diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c
new file mode 100644
index 000000000000..44555ee09e28
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_qfq.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifdef QFQ_DEBUG
+struct qfq_sched;
+static void dump_sched(struct qfq_sched *q, const char *msg);
+#define NO(x) x
+#else
+#define NO(x)
+#endif
+#define DN_SCHED_QFQ 4 // XXX Where?
+typedef unsigned long bitmap;
+
+/*
+ * bitmap ops are critical. Some linux versions have __fls
+ * and the bitmap ops. Some machines have ffs
+ */
+#if defined(_WIN32)
+int fls(unsigned int n)
+{
+ int i = 0;
+ for (i = 0; n > 0; n >>= 1, i++)
+ ;
+ return i;
+}
+#endif
+
+#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32)
+static inline unsigned long __fls(unsigned long word)
+{
+ return fls(word) - 1;
+}
+#endif
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#ifdef QFQ_DEBUG
+int test_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ return *p & (1<<ix);
+}
+void __set_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ *p |= (1<<ix);
+}
+void __clear_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ *p &= ~(1<<ix);
+}
+#else /* !QFQ_DEBUG */
+/* XXX do we have fast version, or leave it to the compiler ? */
+#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
+#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif /* !QFQ_DEBUG */
+#endif /* !__linux__ */
+
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+/*-------------------------------------------*/
+/*
+
+Virtual time computations.
+
+S, F and V are all computed in fixed point arithmetic with
+FRAC_BITS decimal bits.
+
+ QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+ one bit per index.
+ QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+ The layout of the bits is as below:
+
+ [ MTU_SHIFT ][ FRAC_BITS ]
+ [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
+ ^.__grp->index = 0
+ *.__grp->slot_shift
+
+ where MIN_SLOT_SHIFT is derived by difference from the others.
+
+The max group index corresponds to Lmax/w_min, where
+Lmax=1<<MTU_SHIFT, w_min = 1 .
+From this, and knowing how many groups (MAX_INDEX) we want,
+we can derive the shift corresponding to each group.
+
+Because we often need to compute
+ F = S + len/w_i and V = V + len/wsum
+instead of storing w_i store the value
+ inv_w = (1<<FRAC_BITS)/w_i
+so we can do F = S + len * inv_w * wsum.
+We use W_TOT in the formulas so we can easily move between
+static and adaptive weight sum.
+
+The per-scheduler-instance data contain all the data structures
+for the scheduler: bitmaps and bucket lists.
+
+ */
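+/*
+ * Worked example of the fixed-point scaling above: with FRAC_BITS = 30,
+ * ONE_FP = 1<<30. A flow of weight w = 2 stores inv_w = ONE_FP/2, so a
+ * 1000-byte packet advances F by 1000 * inv_w = (1000/2) << FRAC_BITS,
+ * i.e. len/w with 30 fractional bits and no division in the fast path.
+ */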
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group. This is approx lmax/lmin + 5.
+ * XXX check because it poses constraints on MAX_INDEX
+ */
+#define QFQ_MAX_SLOTS 32
+/*
+ * Shifts used for class<->group mapping. Class weights are
+ * in the range [1, QFQ_MAX_WEIGHT]; we map each class i to the
+ * group with the smallest index that can support the L_i / r_i
+ * configured for the class.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ *
+ * When computing the group index, we do (len<<FP_SHIFT)/weight,
+ * then compute an FLS (which is like a log2()), and if the result
+ * is below the MAX_INDEX region we use 0 (which is the same as
+ * using a larger len).
+ */
+#define QFQ_MAX_INDEX 19
+#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */
+
+#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT)
+#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT)
+//#define IWSUM (q->i_wsum)
+#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM)
+
+#define FRAC_BITS 30 /* fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT 11 /* log2(max_len) */
+#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
+
+/*
+ * Possible group states, also indexes for the bitmaps array in
+ * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+/*
+ * additional queue info. Some of this info should come from
+ * the flowset, we copy them here for faster processing.
+ * This is an overlay of the struct dn_queue
+ */
+struct qfq_class {
+ struct dn_queue _q;
+ uint64_t S, F; /* flow timestamps (exact) */
+ struct qfq_class *next; /* Link for the slot list. */
+
+ /* group we belong to. In principle we would need the index,
+ * which is log_2(lmax/weight), but we never reference it
+ * directly, only the group.
+ */
+ struct qfq_group *grp;
+
+ /* these are copied from the flowset. */
+ uint32_t inv_w; /* ONE_FP/weight */
+ uint32_t lmax; /* Max packet size for this flow. */
+};
+
+/* Group descriptor, see the paper for details.
+ * Basically this contains the bucket lists
+ */
+struct qfq_group {
+ uint64_t S, F; /* group timestamps (approx). */
+ unsigned int slot_shift; /* Slot shift. */
+ unsigned int index; /* Group index. */
+ unsigned int front; /* Index of the front slot. */
+ bitmap full_slots; /* non-empty slots */
+
+ /* Array of lists of active classes. */
+ struct qfq_class *slots[QFQ_MAX_SLOTS];
+};
+
+/* scheduler instance descriptor. */
+struct qfq_sched {
+ uint64_t V; /* Precise virtual time. */
+ uint32_t wsum; /* weight sum */
+ NO(uint32_t i_wsum; /* ONE_FP/w_sum */
+ uint32_t _queued; /* debugging */
+ uint32_t loops; /* debugging */)
+ bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
+ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+};
+
+/*---- support functions ----------------------------*/
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(uint64_t a, uint64_t b)
+{
+ return (int64_t)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
+{
+ return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+ unsigned long bitmap)
+{
+ int index = ffs(bitmap) - 1; // zero-based
+ return &q->groups[index];
+}
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
+{
+	uint64_t slot_size = (uint64_t)maxlen * inv_w;
+ unsigned long size_map;
+ int index = 0;
+
+ size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
+ if (!size_map)
+ goto out;
+
+ index = __fls(size_map) + 1; // basically a log_2()
+ index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+
+ if (index < 0)
+ index = 0;
+
+out:
+ ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
+ return index;
+}
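+
+/*
+ * Example: maxlen = 1500 and weight = 1 give inv_w = ONE_FP, so
+ * slot_size = 1500 << FRAC_BITS. With QFQ_MIN_SLOT_SHIFT = 22,
+ * size_map = 1500 << 8 and __fls(size_map) = 18, hence index = 19 =
+ * QFQ_MAX_INDEX: max-MTU, weight-1 flows land in the group with the
+ * largest slot size.
+ */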
+/*---- end support functions ----*/
+
+/*-------- API calls --------------------------------*/
+/*
+ * Validate and copy parameters from flowset.
+ */
+static int
+qfq_new_queue(struct dn_queue *_q)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ int i;
+ uint32_t w; /* approximated weight */
+
+ /* import parameters from the flowset. They should be correct
+ * already.
+ */
+ w = _q->fs->fs.par[0];
+ cl->lmax = _q->fs->fs.par[1];
+ if (!w || w > QFQ_MAX_WEIGHT) {
+ w = 1;
+ D("rounding weight to 1");
+ }
+ cl->inv_w = ONE_FP/w;
+ w = ONE_FP/cl->inv_w;
+ if (q->wsum + w > QFQ_MAX_WSUM)
+ return EINVAL;
+
+ i = qfq_calc_index(cl->inv_w, cl->lmax);
+ cl->grp = &q->groups[i];
+ q->wsum += w;
+ // XXX cl->S = q->V; ?
+ // XXX compute q->i_wsum
+ return 0;
+}
+
+/* remove an empty queue */
+static int
+qfq_free_queue(struct dn_queue *_q)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ if (cl->inv_w) {
+ q->wsum -= ONE_FP/cl->inv_w;
+ cl->inv_w = 0; /* reset weight to avoid run twice */
+ }
+ return 0;
+}
+
+/* Calculate a mask to mimic what would be ffs_from(). */
+static inline unsigned long
+mask_from(unsigned long bitmap, int from)
+{
+ return bitmap & ~((1UL << from) - 1);
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static inline unsigned int
+qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
+{
+ /* if S > V we are not eligible */
+ unsigned int state = qfq_gt(grp->S, q->V);
+ unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (qfq_gt(grp->F, next->F))
+ state |= EB;
+ }
+
+ return state;
+}
+
+/*
+ * In principle
+ * q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ * q->bitmaps[src] &= ~mask;
+ * but we should make sure that src != dst
+ */
+static inline void
+qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
+{
+ q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ q->bitmaps[src] &= ~mask;
+}
+
+static inline void
+qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
+{
+ unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (!qfq_gt(next->F, old_finish))
+ return;
+ }
+
+ mask = (1UL << index) - 1;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+ old_V ^= q->V;
+ old_V >>= QFQ_MIN_SLOT_SHIFT;
+ if (old_V) {
+ ...
+ }
+ *
+ */
+static inline void
+qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+ unsigned long mask, vslot, old_vslot;
+
+ vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
+ old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+
+ if (vslot != old_vslot) {
+ mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
+ qfq_move_groups(q, mask, IR, ER);
+ qfq_move_groups(q, mask, IB, EB);
+ }
+}
+
+/*
+ * XXX we should make sure that slot becomes less than 32.
+ * This is guaranteed by the input values.
+ * roundedS is always cl->S rounded on grp->slot_shift bits.
+ */
+static inline void
+qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
+{
+ uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
+ unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+ cl->next = grp->slots[i];
+ grp->slots[i] = cl;
+ __set_bit(slot, &grp->full_slots);
+}
+
+/*
+ * remove the entry from the slot
+ */
+static inline void
+qfq_front_slot_remove(struct qfq_group *grp)
+{
+ struct qfq_class **h = &grp->slots[grp->front];
+
+ *h = (*h)->next;
+ if (!*h)
+ __clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first full queue in a group. As a side effect,
+ * adjust the bucket list so the first non-empty bucket is at
+ * position 0 in full_slots.
+ */
+static inline struct qfq_class *
+qfq_slot_scan(struct qfq_group *grp)
+{
+ int i;
+
+ ND("grp %d full %x", grp->index, grp->full_slots);
+ if (!grp->full_slots)
+ return NULL;
+
+ i = ffs(grp->full_slots) - 1; // zero-based
+ if (i > 0) {
+ grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+ grp->full_slots >>= i;
+ }
+
+ return grp->slots[grp->front];
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static inline void
+qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
+{
+ unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+ grp->full_slots <<= i;
+ grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+
+static inline void
+qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+ bitmap ineligible;
+
+ ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+ if (ineligible) {
+ if (!q->bitmaps[ER]) {
+ struct qfq_group *grp;
+ grp = qfq_ffs(q, ineligible);
+ if (qfq_gt(grp->S, q->V))
+ q->V = grp->S;
+ }
+ qfq_make_eligible(q, old_V);
+ }
+}
+
+/*
+ * Updates the class, returns true if also the group needs to be updated.
+ */
+static inline int
+qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_class *cl)
+{
+
+ cl->S = cl->F;
+ if (cl->_q.mq.head == NULL) {
+ qfq_front_slot_remove(grp);
+ } else {
+ unsigned int len;
+ uint64_t roundedS;
+
+ len = cl->_q.mq.head->m_pkthdr.len;
+ cl->F = cl->S + (uint64_t)len * cl->inv_w;
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ if (roundedS == grp->S)
+ return 0;
+
+ qfq_front_slot_remove(grp);
+ qfq_slot_insert(grp, cl, roundedS);
+ }
+ return 1;
+}
+
+static struct mbuf *
+qfq_dequeue(struct dn_sch_inst *si)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ struct qfq_class *cl;
+ struct mbuf *m;
+ uint64_t old_V;
+
+ NO(q->loops++;)
+ if (!q->bitmaps[ER]) {
+ NO(if (q->queued)
+ dump_sched(q, "start dequeue");)
+ return NULL;
+ }
+
+ grp = qfq_ffs(q, q->bitmaps[ER]);
+
+ cl = grp->slots[grp->front];
+ /* extract from the first bucket in the bucket list */
+ m = dn_dequeue(&cl->_q);
+
+ if (!m) {
+		D("BUG: non-workconserving leaf");
+ return NULL;
+ }
+ NO(q->queued--;)
+ old_V = q->V;
+ q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
+ ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
+
+ if (qfq_update_class(q, grp, cl)) {
+ uint64_t old_F = grp->F;
+ cl = qfq_slot_scan(grp);
+ if (!cl) { /* group gone, remove from ER */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ // grp->S = grp->F + 1; // XXX debugging only
+ } else {
+ uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ unsigned int s;
+
+ if (grp->S == roundedS)
+ goto skip_unblock;
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ /* remove from ER and put in the new set */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ /* we need to unblock even if the group has gone away */
+ qfq_unblock_groups(q, grp->index, old_F);
+ }
+
+skip_unblock:
+ qfq_update_eligible(q, old_V);
+ NO(if (!q->bitmaps[ER] && q->queued)
+ dump_sched(q, "end dequeue");)
+
+ return m;
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for \hat{F} are multiples of \sigma_i
+ * no greater than V+\sigma_i. Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static inline void
+qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+ unsigned long mask;
+ uint32_t limit, roundedF;
+ int slot_shift = cl->grp->slot_shift;
+
+ roundedF = qfq_round_down(cl->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
+
+ if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], cl->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ cl->S = next->F;
+ return;
+ }
+ }
+ cl->S = q->V;
+ } else { /* timestamp is not stale */
+ cl->S = cl->F;
+ }
+}
+
+static int
+qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ uint64_t roundedS;
+ int s;
+
+ NO(q->loops++;)
+ DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
+ _q, cl->inv_w, cl->grp->index);
+ /* XXX verify that the packet obeys the parameters */
+ if (m != _q->mq.head) {
+ if (dn_enqueue(_q, m, 0)) /* packet was dropped */
+ return 1;
+ NO(q->queued++;)
+ if (m != _q->mq.head)
+ return 0;
+ }
+	/* If we reach this point, queue q was idle */
+ grp = cl->grp;
+ qfq_update_start(q, cl); /* adjust start time */
+ /* compute new finish time and rounded start. */
+ cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+ /*
+ * insert cl in the correct bucket.
+ * If cl->S >= grp->S we don't need to adjust the
+ * bucket list and simply go to the insertion phase.
+ * Otherwise grp->S is decreasing, we must make room
+ * in the bucket list, and also recompute the group state.
+ * Finally, if there were no flows in this group and nobody
+ * was in ER make sure to adjust V.
+ */
+ if (grp->full_slots) {
+ if (!qfq_gt(grp->S, cl->S))
+ goto skip_update;
+ /* create a slot for this cl->S */
+ qfq_slot_rotate(q, grp, roundedS);
+ /* group was surely ineligible, remove */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+ q->V = roundedS;
+
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ ND("new state %d 0x%x", s, q->bitmaps[s]);
+ ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
+skip_update:
+ qfq_slot_insert(grp, cl, roundedS);
+
+ return 0;
+}
+
+
+#if 0
+static inline void
+qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_class *cl, struct qfq_class **pprev)
+{
+ unsigned int i, offset;
+ uint64_t roundedS;
+
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ offset = (roundedS - grp->S) >> grp->slot_shift;
+ i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+#ifdef notyet
+ if (!pprev) {
+ pprev = &grp->slots[i];
+ while (*pprev && *pprev != cl)
+ pprev = &(*pprev)->next;
+ }
+#endif
+
+ *pprev = cl->next;
+ if (!grp->slots[i])
+ __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ * XXX description to be completed.
+ */
+static void
+qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
+ struct qfq_class **pprev)
+{
+ struct qfq_group *grp = &q->groups[cl->index];
+ unsigned long mask;
+ uint64_t roundedS;
+ int s;
+
+ cl->F = cl->S; // not needed if the class goes away.
+ qfq_slot_remove(q, grp, cl, pprev);
+
+ if (!grp->full_slots) {
+ /* nothing left in the group, remove from all sets.
+ * Do ER last because if we were blocking other groups
+ * we must unblock them.
+ */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+
+ if (test_bit(grp->index, &q->bitmaps[ER]) &&
+ !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+ mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+ if (mask)
+ mask = ~((1UL << __fls(mask)) - 1);
+ else
+ mask = ~0UL;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+ }
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ } else if (!grp->slots[grp->front]) {
+ cl = qfq_slot_scan(grp);
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ if (grp->S != roundedS) {
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ }
+ qfq_update_eligible(q, q->V);
+}
+#endif
+
+static int
+qfq_new_fsk(struct dn_fsk *f)
+{
+ ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
+ ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
+ ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
+ return 0;
+}
+
+/*
+ * initialize a new scheduler instance
+ */
+static int
+qfq_new_sched(struct dn_sch_inst *si)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ int i;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ grp->index = i;
+ grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
+ (QFQ_MAX_INDEX - i);
+ }
+ return 0;
+}
+
+/*
+ * QFQ scheduler descriptor
+ */
+static struct dn_alg qfq_desc = {
+ _SI( .type = ) DN_SCHED_QFQ,
+ _SI( .name = ) "QFQ",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct qfq_sched),
+ _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) qfq_enqueue,
+ _SI( .dequeue = ) qfq_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) qfq_new_sched,
+ _SI( .free_sched = ) NULL,
+ _SI( .new_fsk = ) qfq_new_fsk,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) qfq_new_queue,
+ _SI( .free_queue = ) qfq_free_queue,
+};
+
+DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
+
+#ifdef QFQ_DEBUG
+static void
+dump_groups(struct qfq_sched *q, uint32_t mask)
+{
+ int i, j;
+
+ for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
+ struct qfq_group *g = &q->groups[i];
+
+ if (0 == (mask & (1<<i)))
+ continue;
+ for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+ if (g->slots[j])
+ D(" bucket %d %p", j, g->slots[j]);
+ }
+ D("full_slots 0x%x", g->full_slots);
+ D(" %2d S 0x%20llx F 0x%llx %c", i,
+ g->S, g->F,
+ mask & (1<<i) ? '1' : '0');
+ }
+}
+
+static void
+dump_sched(struct qfq_sched *q, const char *msg)
+{
+ D("--- in %s: ---", msg);
+ ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
+ D(" ER 0x%08x", q->bitmaps[ER]);
+ D(" EB 0x%08x", q->bitmaps[EB]);
+ D(" IR 0x%08x", q->bitmaps[IR]);
+ D(" IB 0x%08x", q->bitmaps[IB]);
+ dump_groups(q, 0xffffffff);
+}
+#endif /* QFQ_DEBUG */
diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c
new file mode 100644
index 000000000000..1bbd80057c3f
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_rr.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_RR 3 // XXX Where?
+
+struct rr_queue {
+ struct dn_queue q; /* Standard queue */
+ int status; /* 1: queue is in the list */
+	int credit;		/* remaining credit, in bytes */
+	int quantum;		/* weight * quantum size, in bytes */
+	struct rr_queue *qnext;	/* next queue in the circular list */
+};
+
+/* struct rr_schk contains global config parameters
+ * and is right after dn_schk
+ */
+struct rr_schk {
+ int min_q; /* Min quantum */
+ int max_q; /* Max quantum */
+ int q_bytes; /* Bytes per quantum */
+};
+
+/* per-instance round robin list, right after dn_sch_inst */
+struct rr_si {
+	struct rr_queue *head, *tail;	/* first and last queue in the list */
+};
+
+/* Append a queue to the rr list */
+static inline void
+rr_append(struct rr_queue *q, struct rr_si *si)
+{
+ q->status = 1; /* mark as in-rr_list */
+ q->credit = q->quantum; /* initialize credit */
+
+ /* append to the tail */
+ if (si->head == NULL)
+ si->head = q;
+ else
+ si->tail->qnext = q;
+ si->tail = q; /* advance the tail pointer */
+ q->qnext = si->head; /* make it circular */
+}
+
+/* Remove the head queue from circular list. */
+static inline void
+rr_remove_head(struct rr_si *si)
+{
+ if (si->head == NULL)
+ return; /* empty queue */
+ si->head->status = 0;
+
+ if (si->head == si->tail) {
+ si->head = si->tail = NULL;
+ return;
+ }
+
+ si->head = si->head->qnext;
+ si->tail->qnext = si->head;
+}
+
+/* Remove a queue from circular list.
+ * XXX see if it can be merged with remove_queue()
+ */
+static inline void
+remove_queue_q(struct rr_queue *q, struct rr_si *si)
+{
+ struct rr_queue *prev;
+
+ if (q->status != 1)
+ return;
+ if (q == si->head) {
+ rr_remove_head(si);
+ return;
+ }
+
+ for (prev = si->head; prev; prev = prev->qnext) {
+ if (prev->qnext != q)
+ continue;
+ prev->qnext = q->qnext;
+ if (q == si->tail)
+ si->tail = prev;
+ q->status = 0;
+ break;
+ }
+}
+
+
+static inline void
+next_pointer(struct rr_si *si)
+{
+ if (si->head == NULL)
+ return; /* empty queue */
+
+ si->head = si->head->qnext;
+ si->tail = si->tail->qnext;
+}
+
+static int
+rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct rr_si *si;
+ struct rr_queue *rrq;
+
+ if (m != q->mq.head) {
+ if (dn_enqueue(q, m, 0)) /* packet was dropped */
+ return 1;
+ if (m != q->mq.head)
+ return 0;
+ }
+
+	/* If we reach this point, queue q was idle */
+ si = (struct rr_si *)(_si + 1);
+ rrq = (struct rr_queue *)q;
+
+ if (rrq->status == 1) /* Queue is already in the queue list */
+ return 0;
+
+ /* Insert the queue in the queue list */
+ rr_append(rrq, si);
+
+ return 0;
+}
+
+static struct mbuf *
+rr_dequeue(struct dn_sch_inst *_si)
+{
+ /* Access scheduler instance private data */
+ struct rr_si *si = (struct rr_si *)(_si + 1);
+ struct rr_queue *rrq;
+ uint64_t len;
+
+ while ( (rrq = si->head) ) {
+ struct mbuf *m = rrq->q.mq.head;
+		if (m == NULL) {
+ /* empty queue, remove from list */
+ rr_remove_head(si);
+ continue;
+ }
+ len = m->m_pkthdr.len;
+
+ if (len > rrq->credit) {
+ /* Packet too big */
+ rrq->credit += rrq->quantum;
+ /* Try next queue */
+ next_pointer(si);
+ } else {
+ rrq->credit -= len;
+ return dn_dequeue(&rrq->q);
+ }
+ }
+
+	/* no packet to dequeue */
+ return NULL;
+}
+
+static int
+rr_config(struct dn_schk *_schk)
+{
+ struct rr_schk *schk = (struct rr_schk *)(_schk + 1);
+ ND("called");
+
+ /* use reasonable quantums (64..2k bytes, default 1500) */
+ schk->min_q = 64;
+ schk->max_q = 2048;
+ schk->q_bytes = 1500; /* quantum */
+
+ return 0;
+}
+
+static int
+rr_new_sched(struct dn_sch_inst *_si)
+{
+ struct rr_si *si = (struct rr_si *)(_si + 1);
+
+ ND("called");
+ si->head = si->tail = NULL;
+
+ return 0;
+}
+
+static int
+rr_free_sched(struct dn_sch_inst *_si)
+{
+ ND("called");
+ /* Nothing to do? */
+ return 0;
+}
+
+static int
+rr_new_fsk(struct dn_fsk *fs)
+{
+ struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1);
+ /* par[0] is the weight, par[1] is the quantum step */
+ ipdn_bound_var(&fs->fs.par[0], 1,
+ 1, 65536, "RR weight");
+ ipdn_bound_var(&fs->fs.par[1], schk->q_bytes,
+ schk->min_q, schk->max_q, "RR quantum");
+ return 0;
+}
+
+static int
+rr_new_queue(struct dn_queue *_q)
+{
+ struct rr_queue *q = (struct rr_queue *)_q;
+
+ _q->ni.oid.subtype = DN_SCHED_RR;
+
+ q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
+ ND("called, q->quantum %d", q->quantum);
+ q->credit = q->quantum;
+ q->status = 0;
+
+ if (_q->mq.head != NULL) {
+ /* Queue NOT empty, insert in the queue list */
+ rr_append(q, (struct rr_si *)(_q->_si + 1));
+ }
+ return 0;
+}
+
+static int
+rr_free_queue(struct dn_queue *_q)
+{
+ struct rr_queue *q = (struct rr_queue *)_q;
+
+ ND("called");
+ if (q->status == 1) {
+ struct rr_si *si = (struct rr_si *)(_q->_si + 1);
+ remove_queue_q(q, si);
+ }
+ return 0;
+}
+
+/*
+ * RR scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg rr_desc = {
+ _SI( .type = ) DN_SCHED_RR,
+ _SI( .name = ) "RR",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct rr_si),
+ _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) rr_enqueue,
+ _SI( .dequeue = ) rr_dequeue,
+
+ _SI( .config = ) rr_config,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) rr_new_sched,
+ _SI( .free_sched = ) rr_free_sched,
+ _SI( .new_fsk = ) rr_new_fsk,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) rr_new_queue,
+ _SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c
new file mode 100644
index 000000000000..55a49550b7f9
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_wf2q.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifndef MAX64
+#define MAX64(x,y) ((((int64_t)((y) - (x))) > 0) ? (y) : (x))
+#endif
+
+/*
+ * timestamps are computed on 64 bits using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
+ * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
+ * using an unsigned 32-bit division, and to avoid wraparounds we need
+ * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
+ * As an example
+ * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
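+ * (with these values, LMAX_BITS + WMAX_BITS + FRAC_BITS = 59 < 64)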
+ */
+#ifndef FRAC_BITS
+#define FRAC_BITS 28 /* shift for fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+#endif
+
+/*
+ * Private information for the scheduler instance:
+ * sch_heap (key is Finish time) returns the next queue to serve
+ * ne_heap (key is Start time) stores not-eligible queues
+ * idle_heap (key=start/finish time) stores idle flows. It must
+ * support extract-from-middle.
+ * A flow is only in 1 of the three heaps.
+ * XXX todo: use a more efficient data structure, e.g. a tree sorted
+ * by F with min_subtree(S) in each node
+ */
+struct wf2qp_si {
+ struct dn_heap sch_heap; /* top extract - key Finish time */
+ struct dn_heap ne_heap; /* top extract - key Start time */
+ struct dn_heap idle_heap; /* random extract - key Start=Finish time */
+ uint64_t V; /* virtual time */
+ uint32_t inv_wsum; /* inverse of sum of weights */
+ uint32_t wsum; /* sum of weights */
+};
+
+struct wf2qp_queue {
+ struct dn_queue _q;
+ uint64_t S, F; /* start time, finish time */
+ uint32_t inv_w; /* ONE_FP / weight */
+ int32_t heap_pos; /* position (index) of struct in heap */
+};
+
+/*
+ * This file implements a WF2Q+ scheduler as it has been in dummynet
+ * since 2000.
+ * The scheduler supports per-flow queues and has O(log N) complexity.
+ *
+ * WF2Q+ needs to drain entries from the idle heap so that we
+ * can keep the sum of weights up to date. We can do it whenever
+ * we get a chance, or periodically, or following some other
+ * strategy. The function idle_check() drains at most N elements
+ * from the idle heap.
+ */
+static void
+idle_check(struct wf2qp_si *si, int n, int force)
+{
+ struct dn_heap *h = &si->idle_heap;
+ while (n-- > 0 && h->elements > 0 &&
+ (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
+ struct dn_queue *q = HEAP_TOP(h)->object;
+ struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+
+ heap_extract(h, NULL);
+ /* XXX to let the flowset delete the queue we should
+ * mark it as 'unused' by the scheduler.
+ */
+ alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
+ si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */
+ if (si->wsum > 0)
+ si->inv_wsum = ONE_FP/si->wsum;
+ }
+}
+
+static int
+wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct dn_fsk *fs = q->fs;
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ struct wf2qp_queue *alg_fq;
+ uint64_t len = m->m_pkthdr.len;
+
+ if (m != q->mq.head) {
+ if (dn_enqueue(q, m, 0)) /* packet was dropped */
+ return 1;
+ if (m != q->mq.head) /* queue was already busy */
+ return 0;
+ }
+
+	/* If we reach this point, queue q was idle */
+ alg_fq = (struct wf2qp_queue *)q;
+
+ if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
+		/* F<S means timestamps are invalid -> brand new queue. */
+ alg_fq->S = si->V; /* init start time */
+ si->wsum += fs->fs.par[0]; /* add weight of new queue. */
+ si->inv_wsum = ONE_FP/si->wsum;
+ } else { /* if it was idle then it was in the idle heap */
+ heap_extract(&si->idle_heap, q);
+ alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */
+ }
+ alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
+
+ /* if nothing is backlogged, make sure this flow is eligible */
+ if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
+ si->V = MAX64(alg_fq->S, si->V);
+
+ /*
+	 * Look at eligibility. A flow is not eligible if S>V (when
+	 * this happens, it means that there is some other flow already
+	 * scheduled for the same pipe, so the sch_heap cannot be
+	 * empty). If the flow is not eligible we just store it in the
+	 * ne_heap. Otherwise, we store it in the sch_heap.
+	 * Note that for all flows in sch_heap (SCH), S_i <= V,
+	 * and for all flows in ne_heap (NEH), S_i > V.
+	 * So when we need to compute max(V, min(S_i)) for all i in
+ * SCH+NEH, we only need to look into NEH.
+ */
+ if (DN_KEY_LT(si->V, alg_fq->S)) {
+ /* S>V means flow Not eligible. */
+ if (si->sch_heap.elements == 0)
+ D("++ ouch! not eligible but empty scheduler!");
+ heap_insert(&si->ne_heap, alg_fq->S, q);
+ } else {
+ heap_insert(&si->sch_heap, alg_fq->F, q);
+ }
+ return 0;
+}
+
+/* XXX invariant: sch > 0 || V >= min(S in neh) */
+static struct mbuf *
+wf2qp_dequeue(struct dn_sch_inst *_si)
+{
+ /* Access scheduler instance private data */
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ struct mbuf *m;
+ struct dn_queue *q;
+ struct dn_heap *sch = &si->sch_heap;
+ struct dn_heap *neh = &si->ne_heap;
+ struct wf2qp_queue *alg_fq;
+
+ if (sch->elements == 0 && neh->elements == 0) {
+ /* we have nothing to do. We could kill the idle heap
+ * altogether and reset V
+ */
+ idle_check(si, 0x7fffffff, 1);
+ si->V = 0;
+ si->wsum = 0; /* should be set already */
+ return NULL; /* quick return if nothing to do */
+ }
+ idle_check(si, 1, 0); /* drain something from the idle heap */
+
+ /* make sure at least one element is eligible, bumping V
+ * and moving entries that have become eligible.
+ * We need to repeat the first part twice, before and
+ * after extracting the candidate, or enqueue() will
+ * find the data structure in a wrong state.
+ */
+ m = NULL;
+	for (;;) {
+ /*
+ * Compute V = max(V, min(S_i)). Remember that all elements
+ * in sch have by definition S_i <= V so if sch is not empty,
+ * V is surely the max and we must not update it. Conversely,
+ * if sch is empty we only need to look at neh.
+ * We don't need to move the queues, as it will be done at the
+ * next enqueue
+ */
+ if (sch->elements == 0 && neh->elements > 0) {
+ si->V = MAX64(si->V, HEAP_TOP(neh)->key);
+ }
+ while (neh->elements > 0 &&
+ DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
+ q = HEAP_TOP(neh)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ heap_extract(neh, NULL);
+ heap_insert(sch, alg_fq->F, q);
+ }
+ if (m) /* pkt found in previous iteration */
+ break;
+ /* ok we have at least one eligible pkt */
+ q = HEAP_TOP(sch)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ m = dn_dequeue(q);
+ heap_extract(sch, NULL); /* Remove queue from heap. */
+ si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
+ alg_fq->S = alg_fq->F; /* Update start time. */
+ if (q->mq.head == 0) { /* not backlogged any more. */
+ heap_insert(&si->idle_heap, alg_fq->F, q);
+ } else { /* Still backlogged. */
+ /* Update F, store in neh or sch */
+ uint64_t len = q->mq.head->m_pkthdr.len;
+ alg_fq->F += len * alg_fq->inv_w;
+ if (DN_KEY_LEQ(alg_fq->S, si->V)) {
+ heap_insert(sch, alg_fq->F, q);
+ } else {
+ heap_insert(neh, alg_fq->S, q);
+ }
+ }
+ }
+ return m;
+}
+
+static int
+wf2qp_new_sched(struct dn_sch_inst *_si)
+{
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ int ofs = offsetof(struct wf2qp_queue, heap_pos);
+
+ /* all heaps support extract from middle */
+ if (heap_init(&si->idle_heap, 16, ofs) ||
+ heap_init(&si->sch_heap, 16, ofs) ||
+ heap_init(&si->ne_heap, 16, ofs)) {
+ heap_free(&si->ne_heap);
+ heap_free(&si->sch_heap);
+ heap_free(&si->idle_heap);
+ return ENOMEM;
+ }
+ return 0;
+}
+
+static int
+wf2qp_free_sched(struct dn_sch_inst *_si)
+{
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+
+ heap_free(&si->sch_heap);
+ heap_free(&si->ne_heap);
+ heap_free(&si->idle_heap);
+
+ return 0;
+}
+
+static int
+wf2qp_new_fsk(struct dn_fsk *fs)
+{
+ ipdn_bound_var(&fs->fs.par[0], 1,
+ 1, 100, "WF2Q+ weight");
+ return 0;
+}
+
+static int
+wf2qp_new_queue(struct dn_queue *_q)
+{
+ struct wf2qp_queue *q = (struct wf2qp_queue *)_q;
+
+ _q->ni.oid.subtype = DN_SCHED_WF2QP;
+ q->F = 0; /* not strictly necessary */
+ q->S = q->F + 1; /* mark timestamp as invalid. */
+ q->inv_w = ONE_FP / _q->fs->fs.par[0];
+ if (_q->mq.head != NULL) {
+ wf2qp_enqueue(_q->_si, _q, _q->mq.head);
+ }
+ return 0;
+}
+
+/*
+ * Called when the infrastructure removes a queue (e.g. flowset
+ * is reconfigured). Nothing to do if we did not 'own' the queue,
+ * otherwise remove it from the right heap and adjust the sum
+ * of weights.
+ */
+static int
+wf2qp_free_queue(struct dn_queue *q)
+{
+ struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+ struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
+
+ if (alg_fq->S >= alg_fq->F + 1)
+ return 0; /* nothing to do, not in any heap */
+ si->wsum -= q->fs->fs.par[0];
+ if (si->wsum > 0)
+ si->inv_wsum = ONE_FP/si->wsum;
+
+ /* extract from the heap. XXX TODO we may need to adjust V
+ * to make sure the invariants hold.
+ */
+ if (q->mq.head == NULL) {
+ heap_extract(&si->idle_heap, q);
+ } else if (DN_KEY_LT(si->V, alg_fq->S)) {
+ heap_extract(&si->ne_heap, q);
+ } else {
+ heap_extract(&si->sch_heap, q);
+ }
+ return 0;
+}
+
+/*
+ * WF2Q+ scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg wf2qp_desc = {
+ _SI( .type = ) DN_SCHED_WF2QP,
+ _SI( .name = ) "WF2Q+",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ /* we need extra space in the si and the queue */
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct wf2qp_si),
+ _SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
+ sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) wf2qp_enqueue,
+ _SI( .dequeue = ) wf2qp_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) wf2qp_new_sched,
+ _SI( .free_sched = ) wf2qp_free_sched,
+
+ _SI( .new_fsk = ) wf2qp_new_fsk,
+ _SI( .free_fsk = ) NULL,
+
+ _SI( .new_queue = ) wf2qp_new_queue,
+ _SI( .free_queue = ) wf2qp_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/sys/netinet/ipfw/dummynet.txt b/sys/netinet/ipfw/dummynet.txt
new file mode 100644
index 000000000000..0ed6ad15d327
--- /dev/null
+++ b/sys/netinet/ipfw/dummynet.txt
@@ -0,0 +1,860 @@
+#
+# $FreeBSD$
+#
+
+Notes on the internal structure of dummynet (2010 version)
+by Riccardo Panicucci and Luigi Rizzo
+Work supported by the EC project ONELAB2
+
+
+*********
+* INDEX *
+*********
+Implementation of new dummynet
+ Internal structure
+ Files
+Packet arrival
+ The reconfiguration routine
+dummynet_task()
+Configuration
+ Add a pipe
+ Add a scheduler
+ Add a flowset
+Listing objects
+Deleting objects
+ Deleting a pipe
+ Deleting a flowset
+ Deleting a scheduler
+Compatibility with the FreeBSD 7.2 and FreeBSD 8 ipfw binary
+ ip_dummynet_glue.c
+ ip_fw_glue.c
+How to configure dummynet
+How to implement a new scheduler
+
+
+
+OPEN ISSUES
+------------------------------
+20100131 deleting RR causes infinite loop
+ presumably in the rr_free_queue() call -- seems to hang
+ forever when deleting a live flow
+------------------------------
+
+Dummynet is a traffic shaper and network emulator. Packets are
+selected by an external filter such as ipfw, and passed to the emulator
+with a tag such as "pipe 10" or "queue 5" which tells the emulator
+what to do with the packet. As an example
+
+ ipfw add queue 5 icmp from 10.0.0.2 to all
+
+All packets with the same tag belong to a "flowset", or a set
+of flows which can be further partitioned according to a mask.
+Flowsets are then passed to a scheduler for processing. The
+association of flowsets and schedulers is configurable e.g.
+
+ ipfw queue 5 config sched 10 weight 3 flow_mask xxxx
+ ipfw queue 8 config sched 10 weight 1 ...
+ ipfw queue 3 config sched 20 weight 1 ...
+
+"sched 10" represents one or more scheduler instances,
+selected through a mask on the 5-tuple itself.
+
+ ipfw sched 20 config type FIFO sched_mask yyy ...
+
+There are in fact two masks applied to each packet:
++ the "sched_mask" sends packets arriving at a scheduler_id to
+ one of many instances.
++ the "flow_mask" together with the flowset_id is used to
+ collect packets into independent flows on each scheduler.
+
+As an example, the following configuration
+ ipfw queue 5 config sched 10 flow_mask src-ip 0x000000ff
+ ipfw sched 10 config type WF2Q+ sched_mask src-ip 0xffffff00
+
+means that sched 10 will have one instance per /24 source subnet,
+and within that, each individual source will be a flow.
+
+Internal structure
+-----------------
+Dummynet-related data is split into several data structures,
+part of them constituting the userland-kernel API, and others
+specific to the kernel.
+NOTE: for up-to-date details please look at the relevant source
+ headers (ip_dummynet.h, ip_dn_private.h, dn_sched.h)
+
+USERLAND-KERNEL API (ip_dummynet.h)
+
+ struct dn_link:
+ contains data about the physical link such as
+	bandwidth, delay, burst size;
+
+ struct dn_fs:
+ describes a flowset, i.e. a template for queues.
+ Main parameters are the scheduler we attach to, a flow_mask,
+ buckets, queue size, plr, weight, and other scheduler-specific
+ parameters.
+
+ struct dn_flow
+ contains information on a flow, including masks and
+ statistics
+
+ struct dn_sch:
+ defines a scheduler (and a link attached to it).
+ Parameters include scheduler type, sched_mask, number of
+ buckets, and possibly other scheduler-specific parameters,
+
+ struct dn_profile:
+ fields to simulate a delay profile
+
+
+KERNEL REPRESENTATION (ip_dn_private.h)
+
+ struct mq
+ a queue of mbufs with head and tail.
+
+ struct dn_queue
+ individual queue of packets, created by a flowset using
+ flow_mask and attached to a scheduler instance selected
+ through sched_mask.
+ A dn_queue has a pointer to the dn_fsk (which in turn counts
+ how many queues point to it), a pointer to the
+ dn_sch_inst it attaches to, and is in a hash table in the
+	flowset. Scheduler instances should also store queues in
+	their own containers used for scheduling (lists, trees, etc.).
+ CREATE: done on packet arrivals when a flow matches a flowset.
+ DELETE: done only when deleting the parent dn_sch_inst
+ or draining memory.
+
+ struct dn_fsk
+ includes a dn_fs; a pointer to the dn_schk; a link field
+ for the list of dn_fsk attached to the same scheduler,
+ or for the unlinked list;
+	a refcount for the number of queues pointing to it.
+	The dn_fsk is in a hash table, fshash.
+ CREATE: done on configuration commands.
+ DELETE: on configuration commands.
+
+ struct dn_sch_inst
+ a scheduler instance, created from a dn_schk applying sched_mask.
+ Contains a delay line, a reference to the parent, and scheduler-
+ specific info. Both dn_sch_inst and its delay line can be in the
+ evheap if they have events to be processed.
+ CREATE: created from a dn_schk applying sched_mask
+	DELETE: a configuration command deletes the scheduler, which in
+	turn sweeps the hash table of instances, deleting them.
+
+ struct dn_schk
+ includes dn_sch, dn_link, a pointer to dn_profile,
+ a hash table of dn_sch_inst, a list of dn_fsk
+ attached to it.
+ CREATE: configuration command. If there are flowsets that
+ refer to this number, they are attached and moved
+ to the hash table
+ DELETE: manual, see dn_sch_inst
+
+
+ fshash schedhash
+ +---------------+ sched +--------------+
+ | sched-------------------->| NEW_SCHK|
+ -<----*sch_chain |<-----------------*fsk_list |
+ |NEW_FSK |<----. | [dn_link] |
+ +---------------+ | +--------------+
+ |qht (hash) | | | siht(hash) |
+ | [dn_queue] | | | [dn_si] |
+ | [dn_queue] | | | [dn_si] |
+ | ... | | | ... |
+ | +--------+ | | | +---------+ |
+ | |dn_queue| | | | |dn_si | |
+ | | fs *----------' | | | |
+ | | si *---------------------->| | |
+ | +---------+ | | +---------+ |
+ +---------------+ +--------------+
+
+The following global data structures contain all
+schedulers and flowsets.
+
+- schedhash[x]: contains all scheduler templates in the system.
+ Looked up only on manual configurations, where flowsets
+ are attached to matching schedulers.
+ We have one entry per 'sched X config' command
+ (plus one for each 'pipe X config').
+
+- fshash[x]: contains all flowsets.
+ We do a lookup on this for each packet.
+ We have one entry for each 'queue X config'
+ (plus one for each 'pipe X config').
+
+Additionally, there is a list that contains all unlinked flowsets:
+- fsu: contains flowsets that are not linked to any scheduler.
+	Flowsets are put in this list when they refer to a
+	non-existing scheduler.
+	We don't need an efficient data structure as we never search
+	this list on packet arrivals.
+
+Scheduler instances and the delay lines associated with each scheduler
+instance need to be woken up at certain times. Because we have many
+such objects, we keep them in a priority heap (system_heap).
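+
+As a rough sketch of how such a heap is used (heap_init(), heap_insert(),
+HEAP_TOP(), heap_extract() and heap_free() are the calls in dn_heap.h;
+my_obj, when and now are made up for the example, and error handling and
+locking are omitted):
+
+	struct dn_heap h;
+	struct my_obj *o;	/* must contain an int32_t heap_pos field */
+
+	heap_init(&h, 16, offsetof(struct my_obj, heap_pos));
+	heap_insert(&h, when, o);		/* key, object */
+	while (h.elements > 0 &&
+	    DN_KEY_LEQ(HEAP_TOP(&h)->key, now)) {
+		o = HEAP_TOP(&h)->object;	/* earliest deadline */
+		heap_extract(&h, NULL);		/* remove the top entry */
+		/* ... process o ... */
+	}
+	heap_free(&h);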
+
+Almost all objects in this implementation are preceded by a structure
+(struct dn_id) which makes it easier to identify them.
+
+
+Files
+-----
+The dummynet code is split in several files.
+All kernel code is in sys/netinet/ipfw except ip_dummynet.h
+All userland code is in sbin/ipfw.
+Files are
+- sys/netinet/ip_dummynet.h defines the kernel-userland API
+- ip_dn_private.h contains the kernel-specific APIs
+ and data structures
+- dn_sched.h defines the scheduler API
+- ip_dummynet.c contains module glue and sockopt handlers, with all
+ functions to configure and list objects.
+- ip_dn_io.c contains the functions directly related to packet processing,
+ and run in the critical path. It also contains some functions
+ exported to the schedulers.
+- dn_heap.[ch] implement a binary heap and a generic hash table
+- dn_sched_* implement the various scheduler modules
+
+- dummynet.c implements the user side of dummynet.
+  It contains the command line parsing functions, and functions to
+  show the output of dummynet objects.
+Moreover, there are two new files (ip_dummynet_glue.c and ip_fw_glue.c)
+that are used to allow compatibility with the "ipfw" binary from
+FreeBSD 7.2 and FreeBSD 8.
+
+LOCKING
+=======
+At the moment the entire processing occurs under a single lock
+which is expected to be acquired in exclusive mode
+DN_BH_WLOCK() / DN_BH_WUNLOCK().
+
+Longer term, we aim at the following:
+- the 'busy' flag, 'pending' list and all structures modified by packet
+ arrivals and departures are protected by the BH_WLOCK.
+ This is normally acquired in exclusive mode by the packet processing
+ functions for short sections of code (exception -- the timer).
+ If 'busy' is not set, we can do regular packet processing.
+ If 'busy' is set, no pieces can be accessed.
+ We must enqueue the packet on 'pending' and return immediately.
+
+- the 'busy' flag is set/cleared by long sections of code as follows:
+ UH_WLOCK(); KASSERT(busy == 0);
+ BH_WLOCK(); busy=1; BH_WUNLOCK();
+ ... do processing ...
+ BH_WLOCK(); busy=0; drain_queue(pending); BH_WUNLOCK();
+ UH_WUNLOCK();
+ this normally happens when the upper half has something heavy
+ to do. The prologue and epilogue are not in the critical path.
+
+- the main containers (fshash, schedhash, ...) are protected by
+ UH_WLOCK.
+
+Packet processing
+=================
+A packet enters dummynet through dummynet_io(). We first look up
+the flowset number in fshash using dn_ht_find(), then find the scheduler
+instance using ipdn_si_find(), then possibly identify the correct
+queue with ipdn_q_find().
+If successful, we call the scheduler's enqueue function, and
+if needed start I/O on the link by calling serve_sched().
+If the packet can be returned immediately, this is done by
+leaving *m0 set. Otherwise, the packet is absorbed by dummynet
+and we simply return, possibly with some appropriate error code.
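+
+In rough pseudo-C, with argument lists elided and most checks omitted
+(this is a sketch, not the actual code in ip_dn_io.c), the fast path is:
+
+	/* sketch of dummynet_io() */
+	fs = dn_ht_find(fshash, fs_id, ...);	/* locate the flowset */
+	si = ipdn_si_find(fs->sched, ...);	/* its scheduler instance */
+	q  = ipdn_q_find(fs, si, ...);		/* the per-flow queue */
+	if (enqueue(si, q, m))			/* scheduler's enqueue cb */
+		goto dropit;			/* packet was dropped */
+	serve_sched(...);			/* start I/O on the link */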
+
+Reconfiguration
+---------------
+Reconfiguration is the complex part of the system because we need to
+keep track of the various objects and containers.
+At the moment we do not use reference counts for objects so all
+processing must be done under a lock.
+
+The main entry point for configuration is the ip_dn_ctl() handler
+for the IP_DUMMYNET3 sockopt (others are provided only for backward
+compatibility). Modifications to the configuration call do_config().
+The argument is a sequence of blocks each starting with a struct dn_id
+which specifies its content.
+The first dn_id must contain DN_API_VERSION as its obj.id.
+The obj.type is DN_CMD_CONFIG (followed by actual objects),
+DN_CMD_DELETE (with the correct subtype and list of objects), or
+DN_CMD_FLUSH.
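+
+For reference, struct dn_id is roughly of this shape (a sketch; see
+ip_dummynet.h for the authoritative definition):
+
+	struct dn_id {
+		uint16_t len;		/* total length of this object */
+		uint16_t type;		/* e.g. DN_CMD_CONFIG */
+		uint16_t subtype;
+		uint32_t id;		/* generic id, e.g. DN_API_VERSION */
+	};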
+
+DN_CMD_CONFIG is followed by objects to add/reconfigure. In general,
+if an object already exists it is reconfigured, otherwise it is
+created in a way that keeps the structure consistent.
+We have the following objects in the system, normally numbered with
+an identifier N between 1 and 65535. For certain objects we have
+"shadow" copies numbered N+NMAX and N+2*NMAX which are used to
+implement certain backward compatibility features.
+
+In general we have the following linking
+
+ TRADITIONAL DUMMYNET QUEUES "queue N config ... pipe M ..."
+ corresponds to a dn_fs object numbered N
+
+ TRADITIONAL DUMMYNET PIPES "pipe N config ..."
+ dn_fs N+2*NMAX --> dn_sch N+NMAX type FIFO --> dn_link N+NMAX
+
+ GENERIC SCHEDULER "sched N config ... "
+ [dn_fs N+NMAX] --> dn_sch N --> dn_link N
+ The flowset N+NMAX is created only if the scheduler is not
+ of type MULTIQUEUE.
+
+ DELAY PROFILE "pipe N config profile ..."
+ it is always attached to an existing dn_link N
+
+Because traditional dummynet pipes actually configure both a
+'standalone' instance and one that can be used by queues,
+we do the following:
+
+ "pipe N config ..." configures:
+ dn_sched N type WF2Q+
+ dn_sched N+NMAX type FIFO
+ dn_fs N+2NMAX attached to dn_sched N+NMAX
+ dn_pipe N
+ dn_pipe N+NMAX
+
+ "queue N config" configures
+ dn_fs N
+
+ "sched N config" configures
+ dn_sched N type as desired
+ dn_fs N+NMAX attached to dn_sched N
+
+
+dummynet_task()
+===============
+dummynet_task() is the main dummynet processing function and is called
+every tick. This function first computes the new current time, then
+checks whether it is time to wake up objects from the system_heap,
+comparing the current time with the key at the top of the heap. Two
+types of objects (really, the heap contains pointers to objects) are in
+the system_heap (a sketch of the wakeup loop follows the list below):
+
+- scheduler instance: if a scheduler instance is woken up, the dequeue()
+  function is called as long as the instance has credit. If dequeue()
+  returns packets, the scheduler instance is reinserted in the heap with
+  a new key that depends on the data that will be sent out. If the
+  scheduler instance is left with some credit, it means it has no more
+  packets to send, so the instance is not reinserted in the heap.
+
+  If the scheduler instance extracted from the heap has the DELETE flag
+  set, dequeue() is not called and the instance is destroyed immediately.
+
+- delay line: when a delay line is extracted, the function transmit_event()
+  is called to send out the packets sitting in the delay line.
+
+  If the scheduler instance associated with this delay line no longer
+  exists, the delay line is deleted immediately.
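+
+In rough pseudo-C the wakeup loop looks as follows (a sketch:
+is_delay_line() and serve_instance() are made-up helpers standing for
+logic that is inlined in the real dummynet_task()):
+
+	/* inside dummynet_task(), once per tick; sketch only */
+	curr_time++;
+	while (system_heap.elements > 0 &&
+	    DN_KEY_LEQ(HEAP_TOP(&system_heap)->key, curr_time)) {
+		void *obj = HEAP_TOP(&system_heap)->object;
+		heap_extract(&system_heap, NULL);
+		if (is_delay_line(obj))		/* made-up predicate */
+			transmit_event(obj);	/* drain the delay line */
+		else
+			serve_instance(obj);	/* made-up: run dequeue()
+						 * while credit remains,
+						 * reinsert if backlogged */
+	}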
+
+Configuration
+=============
+To create a pipe, queue or scheduler, the user should type commands like:
+"ipfw pipe x config"
+"ipfw queue y config pipe x"
+"ipfw pipe x config sched <type>"
+
+The userland side of dummynet prepares a buffer containing the data to
+pass to the kernel side.
+The buffer contains all the structs needed to configure an object. In more
+detail, to configure a pipe all three structs (dn_link, dn_sch, dn_fs) are
+needed, plus the delay profile struct if the pipe has a delay profile.
+
+If configuring a scheduler, only the struct dn_sch is written to the
+buffer; if configuring a flowset, only the dn_fs struct is written.
+
+The first struct in the buffer contains the type of command request, that
+is, whether it configures a pipe, a queue, or a scheduler. Then come the
+structs needed to configure the object, and finally the struct that marks
+the end of the buffer.
+
+To support the insertion of pipes and queues using the old syntax, when
+adding a pipe it is necessary to create a FIFO flowset and a FIFO
+scheduler, numbered x + DN_PIPEOFFSET.
+
+Add a pipe
+----------
+A pipe is only a template for a link.
+If the pipe already exists, parameters are updated. If a delay profile exists
+it is deleted and a new one is created.
+If the pipe doesn't exist a new one is created. After the creation, the
+unlinked flowset list is scanned to see if there are flowsets that should
+be linked to this pipe. If so, these flowsets will be of wf2q+ type (for
+compatibility) and a new wf2q+ scheduler is created immediately.
+
+Add a scheduler
+---------------
+If the scheduler already exists, and the type and the mask are the same,
+the scheduler is simply reconfigured by calling the config_scheduler()
+scheduler function with the RECONFIGURE flag set.
+If the type or the mask differ, it is necessary to delete the old
+scheduler and create a new one.
+If the scheduler doesn't exist, a new one is created. If the scheduler
+has a mask, a hash table is created to store pointers to the scheduler
+instances.
+When a new scheduler is created, it is necessary to scan the unlinked
+flowset list in search of flowsets that should be linked to this
+scheduler number. If some are found, the flowsets take the type of this
+scheduler and are configured properly.
+
+Add a flowset
+-------------
+Flowset pointers are stored in the system in two lists. The unlinked
+flowset list contains all flowsets that aren't linked to a scheduler; the
+flowset list contains flowsets linked to a scheduler, and so they have a
+type.
+When adding a new flowset, we first check whether the flowset exists
+(that is, whether it is in the flowset list). If it doesn't exist, a new
+flowset is created: it is added to the unlinked flowset list if the
+scheduler to which it should be linked doesn't exist, or added to the
+flowset list and configured properly if the scheduler exists. If the
+flowset (before being created) was in the unlinked flowset list, it is
+removed and deleted, and then recreated.
+If the flowset exists, to allow reconfiguration of this flowset, the
+scheduler number and type must match the ones in memory. If they don't,
+the flowset is deleted and a new one will be created. Actually, the
+flowset isn't deleted right away: it is removed from the flowset list and
+will be deleted later, because there could be some queues still using it.
+
+Listing objects
+===============
+The user can request a list of the objects present in dummynet through
+the command "ipfw [-v] pipe|queue [x] list|show".
+The kernel side of dummynet sends a buffer to the user side containing
+all pipes, all schedulers, all flowsets, plus all scheduler instances
+and all queues. The dummynet userland will format the output and show
+only the relevant information.
+The buffer sent starts with all the pipes in the system. The entire
+struct dn_link is passed, except the delay_profile struct, which is
+useless in user space.
+After the pipes, all flowsets are written to the buffer. The struct
+containing scheduler-specific flowset data is linked to the flowset by
+writing the 'obj' id of the extension into the 'alg_fs' pointer.
+Then the schedulers are written. If a scheduler has one or more
+scheduler instances, these are linked to the parent scheduler by writing
+the id of the parent in the 'ptr_sched' pointer. If a scheduler instance
+has queues, these are written to the buffer and linked through the 'obj'
+and 'sched_inst' pointers.
+Finally, the flowsets in the unlinked flowset list are written to the
+buffer, and then a struct gen is saved to mark the last struct in the
+buffer.
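+
+Since every object in the buffer starts with a struct dn_id whose 'len'
+field holds the total size of that object, userland can walk the reply
+with a loop of this shape (a sketch; buf and end are assumed to delimit
+the reply):
+
+	struct dn_id *o;
+
+	for (o = (struct dn_id *)buf; (char *)o < end;
+	    o = (struct dn_id *)((char *)o + o->len)) {
+		switch (o->type) {
+		/* one case per object type: link, flowset,
+		 * scheduler, scheduler instance, queue, ... */
+		}
+	}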
+
+
+Deleting objects
+================
+An object is usually removed by the user through a command like
+"ipfw pipe|queue x delete". XXX sched?
+ipfw passes to the kernel a struct gen that contains the type and the
+number of the object to remove.
+
+Deleting pipe x
+---------------
+A pipe can be deleted by the user through the command 'ipfw pipe x delete'.
+To delete a pipe, the pipe is removed from the pipe list, and then deleted.
+The scheduler associated with this pipe should also be deleted.
+For compatibility with the old dummynet syntax, the associated FIFO
+scheduler and FIFO flowset must be deleted as well.
+
+Deleting flowset x
+------------------
+To remove a flowset, we must be sure that it is no longer referenced by
+any object. If the flowset to remove is in the unlinked flowset list,
+there is no issue: the flowset can be safely removed with a free() (the
+flowset extension is not yet created while the flowset is in this list).
+If the flowset is in the flowset list, we first remove it from the list,
+so newly arriving packets are discarded. Next, the flowset is marked as
+being deleted.
+Now we must check whether some queue is using this flowset.
+For this purpose a counter (active_f) is provided, indicating how many
+queues currently use this flowset.
+The active_f counter is automatically incremented when a queue is created
+and decremented when a queue is deleted.
+If the counter is 0, the flowset can be safely deleted, and the
+delete_alg_fs() scheduler function is called before the memory is
+deallocated.
+If the counter is not 0, the flowset remains in memory until the counter
+becomes zero. When a queue is deleted (by the dn_delete_queue() function)
+we check whether the linked flowset is being deleted, and if so the
+counter is decremented. When the counter reaches 0, the flowset is
+deleted.
+The deletion of a queue can be done only by the scheduler, or when the
+scheduler is destroyed.
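+
+In sketch form, the check done at queue deletion looks like this (the
+'being_deleted' flag is an illustrative name, not necessarily the one
+used in the sources):
+
+	/* in dn_delete_queue(), after unlinking the queue */
+	fs->active_f--;
+	if (fs->active_f == 0 && fs->being_deleted) {
+		delete_alg_fs(fs);	/* scheduler-specific cleanup */
+		free(fs);		/* safe: no queue references it */
+	}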
+
+Deleting scheduler x
+--------------------
+To delete a scheduler we must be sure that no scheduler instance of this
+type is left in the system_heap. For this purpose a counter
+(inst_counter) is provided.
+This counter is managed by the system: it is incremented every time an
+instance is inserted in the system_heap, and decremented every time one
+is extracted from it.
+To delete the scheduler, first we remove it from the scheduler list, so
+newly arriving packets are discarded, and mark the scheduler as being
+deleted.
+
+If the counter is 0, we can remove the scheduler safely by calling the
+really_deletescheduler() function. This function scans all scheduler
+instances and calls the delete_scheduler_instance() function, which
+deletes each instance. When all instances are deleted, the scheduler
+template is deleted by calling delete_scheduler_template(). If the delay
+line associated with the scheduler is empty, it is deleted now, otherwise
+it will be deleted when it becomes empty.
+If the counter was not 0, we wait for it to drain. Every time the
+dummynet_task() function extracts a scheduler from the system_heap, the
+counter is decremented. If the scheduler has the delete flag set,
+dequeue() is not called and delete_scheduler_instance() is called to
+delete the instance. Obviously this scheduler instance is not reinserted
+in the system_heap. When the counter reaches 0, the
+delete_scheduler_template() function is called and all memory is
+released.
+NOTE: flowsets that belong to this scheduler are not deleted, so if a
+	new scheduler with the same number is created it will use these
+	flowsets.
+	The cleanest approach would be to insert these flowsets in the
+	unlinked flowset list, but doing that now would be very expensive.
+	So flowsets remain in memory, linked to a scheduler that no longer
+	exists, until a packet belonging to such a flowset arrives. When
+	that packet arrives, the reconfigure() function is called because
+	the generation number mismatches the one stored in the flowset;
+	the flowset is then moved to the unlinked flowset list, or linked
+	to the new scheduler if a new one was created.
+
+
+COMPATIBILITY WITH FREEBSD 7.2 AND FREEBSD 8 'IPFW' BINARY
+==========================================================
+Dummynet is not compatible with the old ipfw binary because the internal
+structs have changed. Moreover, the old ipfw binary is not compatible
+with new kernels because the struct that represents a firewall rule has
+changed. So, if a user installs a new kernel on FreeBSD 7.2, ipfw (and
+possibly many other commands) will not work.
+The new dummynet uses a new socket option, IP_DUMMYNET3, used for both
+set and get. The old options can be used to provide compatibility with
+the 'ipfw' binary of older versions of FreeBSD (tested with 7.2 and 8.0).
+Two files are provided for this purpose:
+- ip_dummynet_glue.c translates old dummynet requests to the new ones,
+- ip_fw_glue.c converts the rule format between the 7.2 and 8 versions.
+Let us look at these two files in detail.
+
+IP_DUMMYNET_GLUE.C
+------------------
+The internal structs of the new dummynet are very different from the
+original ones. Moreover, because there are some differences between
+dummynet in FreeBSD 7.2 and dummynet in FreeBSD 8 (the FreeBSD 8 version
+includes support for the pipe delay profile and the burst option), I had
+to include both header files. I copied revision 191715 (for version 7.2)
+and revision 196045 (for version 8) and appended a number to each struct
+to mark them.
+
+The main function of this file is ip_dummynet_compat(), which is called
+by ip_dn_ctl() when it receives a request through an old socket option.
+
+A global variable ('is7') stores the version of 'ipfw' in use.
+This variable is set every time a configuration request is made, because
+with such a request we receive a buffer whose size depends on the ipfw
+version. Because in general the first action is a configuration, this
+variable is usually set correctly. If the first action is a request for
+a listing of pipes or queues, the system cannot know the version of
+ipfw, and version 7.2 is assumed. If the assumed version is wrong, the
+output can be meaningless, but the application should not crash.
+
+There are four requests for the old dummynet:
+- IP_DUMMYNET_FLUSH: the flush option has no parameters, so the
+  dummynet_flush() function is simply called;
+- IP_DUMMYNET_DEL: the delete option needs to be translated.
+  It is only necessary to extract the number and the type of the object
+  (pipe or queue) to delete from the received buffer, build a new struct
+  gen containing the right parameters, and call the delete_object()
+  function;
+- IP_DUMMYNET_CONFIGURE: the configure command receives a buffer whose
+  layout depends on the ipfw version. After properly extracting all the
+  data (which depends on the ipfw version used), the new structures are
+  filled in and the dummynet config_link() function is called. Note that
+  the 7.2 version does not support some parameters, such as burst or the
+  delay profile.
+- IP_DUMMYNET_GET: the get command must send back to ipfw a buffer that
+  matches its version. There are two functions that build the correct
+  buffer, ip_dummynet_get7() and ip_dummynet_get8(). These functions
+  reproduce the buffer exactly as 'ipfw' expects. The only difference is
+  that the weight parameter for a queue is no longer reported by
+  dummynet, so it is set to 0.
+  Moreover, because the internal structure has changed, the bucket size
+  of a queue may not be correct, because all flowsets now share the hash
+  table.
+  If the version of ipfw is wrong, the output may be meaningless or
+  truncated, but the application should not crash.
+
+IP_FW_GLUE.C
+------------
+The ipfw binary is also used to add rules to the FreeBSD firewall.
+Because struct ip_fw changed between FreeBSD 7.2 and FreeBSD 8, some
+glue code is necessary to allow the use of ipfw from FreeBSD 7.2 with
+the kernel provided with FreeBSD 8.
+This file contains two functions to convert a rule from the FreeBSD 7.2
+format to the FreeBSD 8 format, and vice versa.
+The conversion must be done whenever a rule passes from userspace to
+kernel space and vice versa.
+I had to modify the ip_fw2.c file to manage these two cases, and added
+a variable (is7) to store the ipfw version in use, with an approach
+like that of the previous file:
+- when a new rule is added (option IP_FW_ADD) the is7 variable is set if
+  the size of the received rule corresponds to the FreeBSD 7.2 ipfw
+  version. If so, the rule is converted to the version 8 format by
+  calling convert_rule_to_8(). Moreover, after the insertion of the
+  rule, the rule is converted back to version 7 because the ipfw binary
+  will print it.
+- when the user requests a list of rules (option IP_FW_GET) the is7
+  variable should already be set correctly, because we assume that a
+  configure command was done first; otherwise we assume the FreeBSD
+  version is 8. The function ipfw_getrules() in ip_fw2.c returns all the
+  rules to the ipfw binary, converted to version 7 if needed (if is7 is
+  set).
+The conversion of a rule is quite simple. The only difference between
+the two structures (struct ip_fw) is that the new one has an extra field
+(uint32_t id). So, the entire rule is copied into a buffer and then
+copied into the right position in the new (or old) struct. The sizes of
+the commands have not changed, and the copy is done in a loop.
+
+How to configure dummynet
+=========================
+It is possible to configure dummynet through two main commands:
+'ipfw pipe' and 'ipfw queue'.
+To allow compatibility with old versions, it is possible to configure
+dummynet using the old command syntax. Doing so, obviously, it is only
+possible to configure a FIFO or a wf2q+ scheduler.
+A new command, 'ipfw pipe x config sched <type>', is supported to add a
+new scheduler to the system.
+
+- ipfw pipe x config ...
+ create a new pipe with the link parameters
+ create a new scheduler fifo (x + offset)
+ create a new flowset fifo (x + offset)
+ if a mask is specified, it is stored in the FIFO scheduler
+
+- ipfw queue y config pipe x ...
+ create a new flowset y linked to sched x.
+ The type of the flowset depends on the specified scheduler.
+ If the scheduler does not exist, this flowset is inserted in a special
+ list and will not be active.
+ If pipe x exists and sched x does not, a new wf2q+ scheduler is
+ created and the flowset will be linked to this new scheduler (this is
+ done for compatibility with the old syntax).
+
+- ipfw pipe x config sched <type> ...
+ create a new scheduler x of type <type>.
+ Search the unlinked flowset list for flowsets that should be linked
+ to this new scheduler.
+
+- ipfw pipe x delete
+ delete the pipe x
+ delete the scheduler fifo (x + offset)
+ delete the scheduler x
+ delete the flowset fifo (x + offset)
+
+- ipfw queue x delete
+ delete the flowset x
+
+- ipfw sched x delete ///XXX
+ delete the scheduler x
+
+Some examples of how to configure dummynet follow:
+- Ex1:
+ ipfw pipe 10 config bw 1M delay 15 // create a pipe with bandwidth and
+                                       delay. A FIFO flowset and
+                                       scheduler are also created
+ ipfw queue 5 config pipe 10 weight 56 // create a flowset. This flowset
+                                          will be of wf2q+ type because
+                                          pipe 10 exists. Moreover, the
+                                          wf2q+ scheduler is created now.
+- Ex2:
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset. Scheduler 10
+ does not exist, so this flowset
+ is inserted in the unlinked
+ flowset list.
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+                              Because a flowset with 'pipe 10' exists,
+                              a wf2q+ scheduler is created now and that
+                              flowset is linked to this scheduler.
+
+- Ex3:
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+ pipe 10
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset
+ will belong to scheduler 10 and
+ it is of type RR
+
+- Ex4:
+ ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+                                 pipe 10 (which does not exist yet)
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset
+                                          will belong to scheduler 10 and
+                                          is of type RR
+ ipfw pipe 10 config sched wf2q+ // Modify the type of scheduler 10. It
+                                    becomes a wf2q+ scheduler.
+                                    When a new packet for flowset 5 arrives,
+                                    flowset 5 switches to the wf2q+ type.
+
+How to implement a new scheduler
+================================
+In dummynet, a scheduler algorithm is represented by two main structs, some
+functions and other minor structs.
+- A struct dn_sch_xyz (where xyz is the 'type' of the scheduler algorithm
+ implemented) contains data global to the scheduler, such as parameters
+ common to all instances of the scheduler.
+- A struct dn_sch_inst_xyz contains data for a single scheduler
+ instance, such as local status variables that depend, for example, on
+ the flows linked to the scheduler.
+To add a scheduler to dummynet, the user types a command like:
+'ipfw pipe x config sched <type> [mask ... ...]'
+This command creates a new struct dn_sch_xyz of type <type>, and
+stores the optional parameters in that struct.
+
+The mask parameter determines how many instances of this
+scheduler may exist. For example, it is possible to divide traffic
+depending on the source port (or destination, or IP address...),
+so that every scheduler instance acts as an independent scheduler.
+If the mask is not set, all traffic goes to the same instance.
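+
+For example, a command like the following (arbitrary pipe number, mask
+specifier as in ipfw's usual mask syntax) creates one RR scheduler
+instance per source port:
+'ipfw pipe 1 config sched rr mask src-port 0xffff'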
+
+When a packet arrives at a scheduler, the system searches for the correct
+scheduler instance; if it does not exist, it is created on the spot (the
+struct dn_sch_inst_xyz is allocated by the system, and the scheduler
+fills in the fields). It is the scheduler's task to create
+the struct that contains all queues for a scheduler instance.
+Dummynet provides functions to create a hash table to store
+queues, but the scheduling algorithm can choose its own struct.
+
+To link a flow to a scheduler, the user should type a command like:
+'ipfw queue z config pipe x [mask... ...]'
+
+This command creates a new 'dn_fs' struct that will be inserted
+in the system. If the scheduler x exists, this flowset is
+linked to that scheduler and the flowset type becomes the same as
+the scheduler type. At this point, the function create_alg_fs_xyz()
+is called to store any scheduler-dependent parameters for the
+flowset (for example the 'weight' parameter for a wf2q+
+scheduler, or some priority...). A mask parameter can also be used
+for a flowset. If the mask parameter is set, the scheduler instance
+can separate packets according to their flow id (src and dst IP,
+ports...) and assign them to separate queues. This is done by the
+scheduler, so it can ignore the mask if it wants.
+
+The two main structs look as follows:
+struct dn_sch_xyz {
+ struct gen g; /* important the name g */
+ /* global params */
+};
+struct dn_sch_inst_xyz {
+ struct gen g; /* important the name g */
+ /* params of the instance */
+};
+It is important to embed the struct gen as the first member. The struct gen
+contains some values that the scheduler must fill in (the 'type' of the
+scheduler, the 'len' of the struct...).
+The function create_scheduler_xyz() should be implemented to initialize the
+global parameters in the first struct, and if it allocates memory it is
+mandatory to implement the delete_scheduler_template() function to free that
+memory.
+The function create_scheduler_instance_xyz() must be implemented even if the
+scheduler instance does not use extra parameters. In this function the struct
+gen fields must be filled in with correct values. The
+delete_scheduler_instance_xyz() function must be implemented if the instance
+has allocated memory in the previous function.
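+
+As a rough sketch (assuming DN_XYZ is the subtype constant of this
+scheduler, as in the flowset example below; everything else is
+illustrative):
+int
+create_scheduler_instance_xyz(void *s)
+{
+	struct dn_sch_inst_xyz *si = s;	/* memory allocated by the system */
+
+	/* fill in the struct gen fields */
+	si->g.subtype = DN_XYZ;
+	si->g.len = sizeof(struct dn_sch_inst_xyz);
+	/* initialize per-instance state here, e.g. the queue container */
+	return 0;
+}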
+
+To store data belonging to a flowset, the following struct is used:
+struct alg_fs_xyz {
+ struct gen g;
+ /* fill in the gen struct correctly:
+ g.subtype = DN_XYZ;
+ g.len = sizeof(struct alg_fs_xyz)
+ ...
+ */
+ /* params for the flow */
+};
+The create_alg_fs_xyz() function is mandatory, because it must fill in the
+struct gen, but delete_alg_fs_xyz() is mandatory only if the previous
+function has allocated some memory.
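+
+A minimal sketch, using the create_alg_fs() signature listed later in this
+document (option parsing is omitted):
+int
+create_alg_fs_xyz(char *command, struct gen *g, int reconfigure)
+{
+	struct alg_fs_xyz *fs = (struct alg_fs_xyz *)g;
+
+	fs->g.subtype = DN_XYZ;
+	fs->g.len = sizeof(struct alg_fs_xyz);
+	/* parse scheduler-specific options from 'command' here;
+	 * 'reconfigure' tells whether an existing flowset is being updated */
+	return 0;
+}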
+
+A struct dn_queue contains the packets belonging to a queue and some
+statistical data. If the scheduler needs to store extra data in this struct,
+it must define a dn_queue_xyz struct:
+struct dn_queue_xyz {
+ struct dn_queue q;
+ /* parameters for a queue */
+};
+
+All structures are allocated by the system. To allow this, the scheduler
+must declare the sizes of its structs in the scheduler descriptor:
+scheduler_size: sizeof(dn_sch_xyz)
+scheduler_i_size: sizeof(dn_sch_inst_xyz)
+flowset_size: sizeof(alg_fs_xyz)
+queue_size: sizeof(dn_queue_xyz)
+The scheduler_size may be 0, but the other structs must contain at least a
+struct gen.
+
+
+After the definition of structs, it is necessary to implement the
+scheduler functions.
+
+- int (*config_scheduler)(char *command, void *sch, int reconfigure);
+ Configure a scheduler, or reconfigure it if 'reconfigure' == 1.
+ This function performs any additional allocation and initialization of
+ the global parameters for this scheduler.
+ If memory is allocated here, the delete_scheduler_template() function
+ should be implemented to free that memory.
+- int (*delete_scheduler_template)(void* sch);
+ Delete a scheduler template. This function is mandatory if the scheduler
+ uses extra data beyond struct dn_sch.
+- int (*create_scheduler_instance)(void *s);
+ Create a new scheduler instance. The system allocates the necessary memory
+ and the scheduler can access it through the 's' pointer.
+ The scheduler instance stores all queues, and for this purpose it can use
+ the hash table provided by the system.
+- int (*delete_scheduler_instance)(void *s);
+ Delete a scheduler instance. It is important to free the memory allocated
+ by the create_scheduler_instance() function. The memory allocated by the
+ system is freed by the system itself. The struct that contains all the
+ queues also has to be deleted.
+- int (*enqueue)(void *s, struct gen *f, struct mbuf *m,
+ struct ipfw_flow_id *id);
+ Called when a packet arrives. The packet 'm' belongs to the scheduler
+ instance 's', has flowset 'f', and the flowid 'id' has already been
+ masked. enqueue() must call dn_queue_packet(q, m) to actually enqueue
+ the packet in the queue q. The queue 'q' is chosen by the scheduler,
+ and if it does not exist it should be created by calling
+ dn_create_queue(). If the scheduler wants to drop the packet, it must
+ call dn_drop_packet() and then return 1 (see the sketch after this list).
+- struct mbuf * (*dequeue)(void *s);
+ Called when the timer expires (or when a packet arrives and the scheduler
+ instance is idle).
+ This function is called when at least one packet can be sent out. The
+ scheduler chooses the packet and returns it; if no packets are in the
+ scheduler instance, the function must return NULL.
+ Before returning a packet, it is important to call
+ dn_return_packet() to update some statistics of the queue and update
+ the queue counters.
+- int (*drain_queue)(void *s, int flag);
+ The system asks the scheduler to delete all queues that are not in use,
+ in order to free memory. The flag parameter indicates whether a queue
+ must be deleted even if it is active.
+
+- int (*create_alg_fs)(char *command, struct gen *g, int reconfigure);
+ Called when a flowset is linked with a scheduler. This is done
+ when the scheduler is defined, so the type of the flowset is known.
+ The function initializes the flowset parameters by parsing the command
+ line. The parameters are stored in the g struct, which has the right
+ size allocated by the system. If the reconfigure flag is set, the
+ flowset is being reconfigured.
+- int (*delete_alg_fs)(struct gen *f);
+ Called when a flowset is being deleted. It must free the memory allocated
+ by the create_alg_fs() function.
+
+- int (*create_queue_alg)(struct dn_queue *q, struct gen *f);
+ Called when a queue is created. The function should link the queue
+ to the struct used by the scheduler instance to store all queues.
+- int (*delete_queue_alg)(struct dn_queue *q);
+ Called when a queue is being deleted. The function should remove any
+ extra data and update the struct that contains all queues in the
+ scheduler instance.
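+
+As an illustration, a minimal enqueue() sketch for a hypothetical 'xyz'
+scheduler. Only dn_queue_packet(), dn_create_queue() and dn_drop_packet()
+are named in this document; their exact argument lists, the struct layouts
+and the lookup helper are assumptions:
+int
+enqueue_xyz(void *s, struct gen *f, struct mbuf *m, struct ipfw_flow_id *id)
+{
+	struct dn_sch_inst_xyz *si = s;
+	struct dn_queue *q;
+
+	q = xyz_find_queue(si, id);		/* hypothetical lookup helper */
+	if (q == NULL)
+		q = dn_create_queue(si, f, id);	/* assumed argument list */
+	if (q == NULL) {
+		dn_drop_packet(m);		/* assumed argument list */
+		return 1;
+	}
+	dn_queue_packet(q, m);
+	return 0;
+}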
+
+The struct scheduler is the scheduler descriptor that is passed to
+dummynet when a scheduler module is loaded.
+This struct contains the type of the scheduler, the lengths of all the
+structs and all the function pointers.
+If a function is not implemented it should be initialized to NULL. Some
+functions are mandatory; others are mandatory only if some memory must be
+freed.
+Mandatory functions:
+- create_scheduler_instance()
+- enqueue()
+- dequeue()
+- create_alg_fs()
+- drain_queue()
+Optional functions:
+- config_scheduler()
+- create_queue_alg()
+Mandatory functions if the corresponding create...() has allocated memory:
+- delete_scheduler_template()
+- delete_scheduler_instance()
+- delete_alg_fs()
+- delete_queue_alg()
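+
+Putting it together, the descriptor of a hypothetical 'xyz' scheduler could
+be sketched as follows. The size fields and function pointers are the ones
+listed above; the 'type' field name and the DN_SCHED_XYZ constant are
+assumptions:
+static struct scheduler xyz_desc = {
+	.type = DN_SCHED_XYZ,		/* assumed field/constant name */
+	.scheduler_size = sizeof(struct dn_sch_xyz),
+	.scheduler_i_size = sizeof(struct dn_sch_inst_xyz),
+	.flowset_size = sizeof(struct alg_fs_xyz),
+	.queue_size = sizeof(struct dn_queue_xyz),
+	.create_scheduler_instance = create_scheduler_instance_xyz,
+	.delete_scheduler_instance = delete_scheduler_instance_xyz,
+	.enqueue = enqueue_xyz,
+	.dequeue = dequeue_xyz,
+	.create_alg_fs = create_alg_fs_xyz,
+	.drain_queue = drain_queue_xyz,
+	/* optional hooks (config_scheduler, create_queue_alg, ...) are NULL */
+};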
+
diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c
new file mode 100644
index 000000000000..a31ec1f71b81
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_glue.c
@@ -0,0 +1,845 @@
+/*-
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/taskqueue.h>
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* FREEBSD7.2 ip_dummynet.h r191715 */
+
+struct dn_heap_entry7 {
+ int64_t key; /* sorting key. Topmost element is smallest one */
+ void *object; /* object pointer */
+};
+
+struct dn_heap7 {
+ int size;
+ int elements;
+ int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
+ struct dn_heap_entry7 *p; /* really an array of "size" entries */
+};
+
+/* Common to 7.2 and 8 */
+struct dn_flow_set {
+ SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
+
+ u_short fs_nr ; /* flow_set number */
+ u_short flags_fs;
+#define DNOLD_HAVE_FLOW_MASK 0x0001
+#define DNOLD_IS_RED 0x0002
+#define DNOLD_IS_GENTLE_RED 0x0004
+#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
+#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */
+#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
+#define DNOLD_IS_PIPE 0x4000
+#define DNOLD_IS_QUEUE 0x8000
+
+ struct dn_pipe7 *pipe ; /* pointer to parent pipe */
+ u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
+
+ int weight ; /* WFQ queue weight */
+ int qsize ; /* queue size in slots or bytes */
+ int plr ; /* pkt loss rate (2^31-1 means 100%) */
+
+ struct ipfw_flow_id flow_mask ;
+
+ /* hash table of queues onto this flow_set */
+ int rq_size ; /* number of slots */
+ int rq_elements ; /* active elements */
+ struct dn_flow_queue7 **rq; /* array of rq_size entries */
+
+ u_int32_t last_expired ; /* do not expire too frequently */
+ int backlogged ; /* #active queues for this flowset */
+
+ /* RED parameters */
+#define SCALE_RED 16
+#define SCALE(x) ( (x) << SCALE_RED )
+#define SCALE_VAL(x) ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+ int avg_pkt_size ; /* medium packet size */
+ int max_pkt_size ; /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+#define DN_IS_PIPE 0x4000
+#define DN_IS_QUEUE 0x8000
+struct dn_flow_queue7 {
+ struct dn_flow_queue7 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ u_long numbytes;
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ u_int32_t q_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+struct dn_pipe7 { /* a pipe */
+ SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ int numbytes;
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+
+ /*
+ * When the tx clock come from an interface (if_name[0] != '\0'), its name
+ * is stored below, whereas the ifp is filled when the rule is configured.
+ */
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+};
+SLIST_HEAD(dn_pipe_head7, dn_pipe7);
+
+
+/* FREEBSD8 ip_dummynet.h r196045 */
+struct dn_flow_queue8 {
+ struct dn_flow_queue8 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ uint64_t numbytes ; /* credit for transmission (dynamic queues) */
+ int64_t extra_bits; /* extra bits simulating unavailable channel */
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ int64_t idle_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+struct dn_pipe8 { /* a pipe */
+ SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ /* Same as in dn_flow_queue, numbytes can become large */
+ int64_t numbytes; /* bits I can transmit (more or less). */
+ uint64_t burst; /* burst size, scaled: bits * hz */
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+ int64_t idle_time; /* start of pipe idle time */
+
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+ /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN 32
+ char name[ED_MAX_NAME_LEN];
+ int loss_level;
+ int samples_no;
+ int *samples;
+};
+
+#define ED_MAX_SAMPLES_NO 1024
+struct dn_pipe_max8 {
+ struct dn_pipe8 pipe;
+ int samples[ED_MAX_SAMPLES_NO];
+};
+SLIST_HEAD(dn_pipe_head8, dn_pipe8);
+
+/*
+ * Changes from 7.2 to 8:
+ * dn_pipe:
+ * numbytes from int to int64_t
+ * add burst (int64_t)
+ * add idle_time (int64_t)
+ * add profile
+ * add struct dn_pipe_max
+ * add flag DN_HAS_PROFILE
+ *
+ * dn_flow_queue
+ * numbytes from u_long to int64_t
+ * add extra_bits (int64_t)
+ * q_time from u_int32_t to int64_t and name idle_time
+ *
+ * dn_flow_set unchanged
+ *
+ */
+
+/* NOTE:XXX copied from dummynet.c */
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+ oid->len = len;
+ oid->type = type;
+ oid->subtype = 0;
+ oid->id = id;
+}
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+ struct dn_id *ret = *o;
+ oid_fill(ret, len, type, 0);
+ *o = O_NEXT(*o, len);
+ return ret;
+}
+
+
+static size_t pipesize7 = sizeof(struct dn_pipe7);
+static size_t pipesize8 = sizeof(struct dn_pipe8);
+static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
+
+/* Indicate the 'ipfw' version
+ * 1: from FreeBSD 7.2
+ * 0: from FreeBSD 8
+ * -1: unknown (currently unused)
+ *
+ * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request
+ * arrives.
+ * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
+ * it is assumed to be the FreeBSD 8 version.
+ */
+static int is7 = 0;
+
+static int
+convertflags2new(int src)
+{
+ int dst = 0;
+
+ if (src & DNOLD_HAVE_FLOW_MASK)
+ dst |= DN_HAVE_MASK;
+ if (src & DNOLD_QSIZE_IS_BYTES)
+ dst |= DN_QSIZE_BYTES;
+ if (src & DNOLD_NOERROR)
+ dst |= DN_NOERROR;
+ if (src & DNOLD_IS_RED)
+ dst |= DN_IS_RED;
+ if (src & DNOLD_IS_GENTLE_RED)
+ dst |= DN_IS_GENTLE_RED;
+ if (src & DNOLD_HAS_PROFILE)
+ dst |= DN_HAS_PROFILE;
+
+ return dst;
+}
+
+static int
+convertflags2old(int src)
+{
+ int dst = 0;
+
+ if (src & DN_HAVE_MASK)
+ dst |= DNOLD_HAVE_FLOW_MASK;
+ if (src & DN_IS_RED)
+ dst |= DNOLD_IS_RED;
+ if (src & DN_IS_GENTLE_RED)
+ dst |= DNOLD_IS_GENTLE_RED;
+ if (src & DN_NOERROR)
+ dst |= DNOLD_NOERROR;
+ if (src & DN_HAS_PROFILE)
+ dst |= DNOLD_HAS_PROFILE;
+ if (src & DN_QSIZE_BYTES)
+ dst |= DNOLD_QSIZE_IS_BYTES;
+
+ return dst;
+}
+
+static int
+dn_compat_del(void *v)
+{
+ struct dn_pipe7 *p = (struct dn_pipe7 *) v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
+ struct {
+ struct dn_id oid;
+ uintptr_t a[1]; /* add more if we want a list */
+ } cmd;
+
+ /* XXX DN_API_VERSION ??? */
+ oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+
+ if (is7) {
+ if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
+ return EINVAL;
+ if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
+ return EINVAL;
+ } else {
+ if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0)
+ return EINVAL;
+ if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0)
+ return EINVAL;
+ }
+
+ if (p->pipe_nr != 0) { /* pipe x delete */
+ cmd.a[0] = p->pipe_nr;
+ cmd.oid.subtype = DN_LINK;
+ } else { /* queue x delete */
+ cmd.oid.subtype = DN_FS;
+ cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr;
+ }
+
+ return do_config(&cmd, cmd.oid.len);
+}
+
+static int
+dn_compat_config_queue(struct dn_fs *fs, void* v)
+{
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+ struct dn_flow_set *f;
+
+ if (is7)
+ f = &p7->fs;
+ else
+ f = &p8->fs;
+
+ fs->fs_nr = f->fs_nr;
+ fs->sched_nr = f->parent_nr;
+ fs->flow_mask = f->flow_mask;
+ fs->buckets = f->rq_size;
+ fs->qsize = f->qsize;
+ fs->plr = f->plr;
+ fs->par[0] = f->weight;
+ fs->flags = convertflags2new(f->flags_fs);
+ if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) {
+ fs->w_q = f->w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->max_p;
+ }
+
+ return 0;
+}
+
+static int
+dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p,
+ struct dn_fs *fs, void* v)
+{
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+ int i = p7->pipe_nr;
+
+ sch->sched_nr = i;
+ sch->oid.subtype = 0;
+ p->link_nr = i;
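+	/* the FIFO flowset and scheduler for pipe i live at fixed offsets
+	 * from the pipe number (see dummynet.txt) */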
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+
+ /* Common to 7 and 8 */
+ p->bandwidth = p7->bandwidth;
+ p->delay = p7->delay;
+ if (!is7) {
+ /* FreeBSD 8 has burst */
+ p->burst = p8->burst;
+ }
+
+ /* fill the fifo flowset */
+ dn_compat_config_queue(fs, v);
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+
+ /* Move scheduler related parameter from fs to sch */
+ sch->buckets = fs->buckets; /*XXX*/
+ fs->buckets = 0;
+ if (fs->flags & DN_HAVE_MASK) {
+ sch->flags |= DN_HAVE_MASK;
+ fs->flags &= ~DN_HAVE_MASK;
+ sch->sched_mask = fs->flow_mask;
+ bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
+ }
+
+ return 0;
+}
+
+static int
+dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
+ void *v)
+{
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+ p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
+
+ pf->link_nr = p->link_nr;
+ pf->loss_level = p8->loss_level;
+// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
+ pf->samples_no = p8->samples_no;
+ strncpy(pf->name, p8->name,sizeof(pf->name));
+ bcopy(p8->samples, pf->samples, sizeof(pf->samples));
+
+ return 0;
+}
+
+/*
+ * If p->pipe_nr != 0 the command is 'pipe x config', so we need to create
+ * the three main structs; otherwise only a flowset is created.
+ */
+static int
+dn_compat_configure(void *v)
+{
+ struct dn_id *buf = NULL, *base;
+ struct dn_sch *sch = NULL;
+ struct dn_link *p = NULL;
+ struct dn_fs *fs = NULL;
+ struct dn_profile *pf = NULL;
+ int lmax;
+ int error;
+
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+ int i; /* number of object to configure */
+
+ lmax = sizeof(struct dn_id); /* command header */
+ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+ sizeof(struct dn_fs) + sizeof(struct dn_profile);
+
+ base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO);
+ o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+ base->id = DN_API_VERSION;
+
+ /* pipe_nr is the same in p7 and p8 */
+ i = p7->pipe_nr;
+ if (i != 0) { /* pipe config */
+ sch = o_next(&buf, sizeof(*sch), DN_SCH);
+ p = o_next(&buf, sizeof(*p), DN_LINK);
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+ error = dn_compat_config_pipe(sch, p, fs, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ if (!is7 && p8->samples_no > 0) {
+ /* Add profiles*/
+ pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+ error = dn_compat_config_profile(pf, p, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ }
+ } else { /* queue config */
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+ error = dn_compat_config_queue(fs, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ }
+ error = do_config(base, (char *)buf - (char *)base);
+
+ if (buf)
+ free(buf, M_DUMMYNET);
+ return error;
+}
+
+int
+dn_compat_calc_size(void)
+{
+ int need = 0;
+ /* XXX use FreeBSD 8 struct size */
+ /* NOTE:
+ * - half scheduler: schk_count/2
+ * - all flowset: fsk_count
+ * - all flowset queues: queue_count
+ * - all pipe queue: si_count
+ */
+ need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
+ need += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
+ need += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
+ need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
+
+ return need;
+}
+
+int
+dn_c_copy_q (void *_ni, void *arg)
+{
+ struct copy_args *a = arg;
+ struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start;
+ struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start;
+ struct dn_flow *ni = (struct dn_flow *)_ni;
+ int size = 0;
+
+ /* XXX hash slot not set */
+ /* No difference between 7.2/8 */
+ fq7->len = ni->length;
+ fq7->len_bytes = ni->len_bytes;
+ fq7->id = ni->fid;
+
+ if (is7) {
+ size = sizeof(struct dn_flow_queue7);
+ fq7->tot_pkts = ni->tot_pkts;
+ fq7->tot_bytes = ni->tot_bytes;
+ fq7->drops = ni->drops;
+ } else {
+ size = sizeof(struct dn_flow_queue8);
+ fq8->tot_pkts = ni->tot_pkts;
+ fq8->tot_bytes = ni->tot_bytes;
+ fq8->drops = ni->drops;
+ }
+
+ *a->start += size;
+ return 0;
+}
+
+int
+dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
+{
+ struct dn_link *l = &s->link;
+ struct dn_fsk *f = s->fs;
+
+ struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
+ struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
+ struct dn_flow_set *fs;
+ int size = 0;
+
+ if (is7) {
+ fs = &pipe7->fs;
+ size = sizeof(struct dn_pipe7);
+ } else {
+ fs = &pipe8->fs;
+ size = sizeof(struct dn_pipe8);
+ }
+
+	/* These 4 fields are the same in pipe7 and pipe8 */
+ pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
+ pipe7->bandwidth = l->bandwidth;
+ pipe7->delay = l->delay;
+ pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
+
+ if (!is7) {
+ if (s->profile) {
+ struct dn_profile *pf = s->profile;
+ strncpy(pipe8->name, pf->name, sizeof(pf->name));
+ pipe8->loss_level = pf->loss_level;
+ pipe8->samples_no = pf->samples_no;
+ }
+ pipe8->burst = div64(l->burst , 8 * hz);
+ }
+
+ fs->flow_mask = s->sch.sched_mask;
+ fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
+
+ fs->parent_nr = l->link_nr - DN_MAX_ID;
+ fs->qsize = f->fs.qsize;
+ fs->plr = f->fs.plr;
+ fs->w_q = f->fs.w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->fs.max_p;
+ fs->rq_elements = nq;
+
+ fs->flags_fs = convertflags2old(f->fs.flags);
+
+ *a->start += size;
+ return 0;
+}
+
+
+int
+dn_compat_copy_pipe(struct copy_args *a, void *_o)
+{
+ int have = a->end - *a->start;
+ int need = 0;
+ int pipe_size = sizeof(struct dn_pipe8);
+ int queue_size = sizeof(struct dn_flow_queue8);
+ int n_queue = 0; /* number of queues */
+
+ struct dn_schk *s = (struct dn_schk *)_o;
+ /* calculate needed space:
+ * - struct dn_pipe
+ * - if there are instances, dn_queue * n_instances
+ */
+ n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) :
+ (s->siht ? 1 : 0));
+ need = pipe_size + queue_size * n_queue;
+ if (have < need) {
+ D("have %d < need %d", have, need);
+ return 1;
+ }
+ /* copy pipe */
+ dn_c_copy_pipe(s, a, n_queue);
+
+ /* copy queues */
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, dn_c_copy_q, a);
+ else if (s->siht)
+ dn_c_copy_q(s->siht, a);
+ return 0;
+}
+
+int
+dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
+{
+ struct dn_flow_set *fs = (struct dn_flow_set *)*a->start;
+
+ fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+ fs->fs_nr = f->fs.fs_nr;
+ fs->qsize = f->fs.qsize;
+ fs->plr = f->fs.plr;
+ fs->w_q = f->fs.w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->fs.max_p;
+ fs->flow_mask = f->fs.flow_mask;
+ fs->rq_elements = nq;
+ fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
+ fs->parent_nr = f->fs.sched_nr;
+ fs->weight = f->fs.par[0];
+
+ fs->flags_fs = convertflags2old(f->fs.flags);
+ *a->start += sizeof(struct dn_flow_set);
+ return 0;
+}
+
+int
+dn_compat_copy_queue(struct copy_args *a, void *_o)
+{
+ int have = a->end - *a->start;
+ int need = 0;
+ int fs_size = sizeof(struct dn_flow_set);
+ int queue_size = sizeof(struct dn_flow_queue8);
+
+ struct dn_fsk *fs = (struct dn_fsk *)_o;
+ int n_queue = 0; /* number of queues */
+
+ n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) :
+ (fs->qht ? 1 : 0));
+
+ need = fs_size + queue_size * n_queue;
+ if (have < need) {
+ D("have < need");
+ return 1;
+ }
+
+ /* copy flowset */
+ dn_c_copy_fs(fs, a, n_queue);
+
+ /* copy queues */
+ if (fs->fs.flags & DN_HAVE_MASK)
+ dn_ht_scan(fs->qht, dn_c_copy_q, a);
+ else if (fs->qht)
+ dn_c_copy_q(fs->qht, a);
+
+ return 0;
+}
+
+int
+copy_data_helper_compat(void *_o, void *_arg)
+{
+ struct copy_args *a = _arg;
+
+ if (a->type == DN_COMPAT_PIPE) {
+ struct dn_schk *s = _o;
+ if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) {
+ return 0; /* not old type */
+ }
+		/* copy the pipe parameters and, if an instance exists, copy
+		 * the other parameters and possibly the queues.
+		 */
+ if(dn_compat_copy_pipe(a, _o))
+ return DNHT_SCAN_END;
+ } else if (a->type == DN_COMPAT_QUEUE) {
+ struct dn_fsk *fs = _o;
+ if (fs->fs.fs_nr >= DN_MAX_ID)
+ return 0;
+ if (dn_compat_copy_queue(a, _o))
+ return DNHT_SCAN_END;
+ }
+ return 0;
+}
+
+/* Main function to manage old requests */
+int
+ip_dummynet_compat(struct sockopt *sopt)
+{
+ int error=0;
+ void *v = NULL;
+ struct dn_id oid;
+
+	/* Length of the data, used to detect the ipfw version... */
+ int len = sopt->sopt_valsize;
+
+ /* len can be 0 if command was dummynet_flush */
+ if (len == pipesize7) {
+ D("setting compatibility with FreeBSD 7.2");
+ is7 = 1;
+ }
+ else if (len == pipesize8 || len == pipesizemax8) {
+ D("setting compatibility with FreeBSD 8");
+ is7 = 0;
+ }
+
+ switch (sopt->sopt_name) {
+ default:
+ printf("dummynet: -- unknown option %d", sopt->sopt_name);
+ error = EINVAL;
+ break;
+
+ case IP_DUMMYNET_FLUSH:
+ oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+ do_config(&oid, oid.len);
+ break;
+
+ case IP_DUMMYNET_DEL:
+ v = malloc(len, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, v, len, len);
+ if (error)
+ break;
+ error = dn_compat_del(v);
+		free(v, M_TEMP);	/* match the M_TEMP allocation above */
+ break;
+
+ case IP_DUMMYNET_CONFIGURE:
+ v = malloc(len, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, v, len, len);
+ if (error)
+ break;
+ error = dn_compat_configure(v);
+		free(v, M_TEMP);	/* match the M_TEMP allocation above */
+ break;
+
+ case IP_DUMMYNET_GET: {
+ void *buf;
+ int ret;
+ int original_size = sopt->sopt_valsize;
+ int size;
+
+ ret = dummynet_get(sopt, &buf);
+ if (ret)
+ return 0;//XXX ?
+ size = sopt->sopt_valsize;
+ sopt->sopt_valsize = original_size;
+ D("size=%d, buf=%p", size, buf);
+ ret = sooptcopyout(sopt, buf, size);
+ if (ret)
+ printf(" %s ERROR sooptcopyout\n", __FUNCTION__);
+ if (buf)
+ free(buf, M_DUMMYNET);
+ }
+ }
+
+ return error;
+}
+
+
diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c
new file mode 100644
index 000000000000..152010eda48a
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_io.c
@@ -0,0 +1,801 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Dummynet portions related to packet handling.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ * instead of dn_cfg.curr_time
+ */
+
+struct dn_parms dn_cfg;
+//VNET_DEFINE(struct dn_parms, _base_dn_cfg);
+
+static long tick_last; /* Last tick duration (usec). */
+static long tick_delta; /* Last vs standard tick diff (usec). */
+static long tick_delta_sum; /* Accumulated tick difference (usec).*/
+static long tick_adjustment; /* Tick adjustments done. */
+static long tick_lost; /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static unsigned long io_pkt;
+static unsigned long io_pkt_fast;
+static unsigned long io_pkt_drop;
+
+/*
+ * We use a heap to store entities for which we have pending timer events.
+ * The heap is checked at every tick and all entities with expired events
+ * are extracted.
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+
+/* wrapper to pass dn_cfg fields to SYSCTL_* */
+//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x))
+#define DC(x) (&(dn_cfg.x))
+/* parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+ CTLFLAG_RW, DC(hash_size), 0, "Default hash table size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+ CTLFLAG_RW, DC(slot_limit), 0,
+ "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+ CTLFLAG_RW, DC(byte_limit), 0,
+ "Upper limit in bytes for pipe queue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+ CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
+ CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+ CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+ CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
+
+/* RED parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+ CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+ CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+ CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");
+
+/* time adjustment */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+ CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+ CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+ CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+ CTLFLAG_RD, &tick_diff, 0,
+ "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+ CTLFLAG_RD, &tick_lost, 0,
+ "Number of ticks coalesced by dummynet taskqueue.");
+
+/* statistics */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
+ CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
+ CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
+ CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
+ CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+ CTLFLAG_RD, &io_pkt, 0,
+ "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+ CTLFLAG_RD, &io_pkt_fast, 0,
+ "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+ CTLFLAG_RD, &io_pkt_drop, 0,
+ "Number of packets dropped by dummynet.");
+#undef DC
+SYSEND
+
+#endif
+
+static void dummynet_send(struct mbuf *);
+
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.
+ * Outside dummynet, only the 'rule' field is relevant, and it must
+ * be at the beginning of the structure.
+ */
+struct dn_pkt_tag {
+ struct ipfw_rule_ref rule; /* matching rule */
+
+ /* second part, dummynet specific */
+ int dn_dir; /* action when packet comes out.*/
+ /* see ip_fw_private.h */
+ uint64_t output_time; /* when the pkt is due for delivery*/
+ struct ifnet *ifp; /* interface, for ip_output */
+ struct _ip6dn_args ip6opt; /* XXX ipv6 options */
+};
+
+/*
+ * Return the mbuf tag holding the dummynet state (it should
+ * be the first one on the list).
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+ struct m_tag *mtag = m_tag_first(m);
+ KASSERT(mtag != NULL &&
+ mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+ mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+ ("packet on dummynet queue w/o dummynet tag!"));
+ return (struct dn_pkt_tag *)(mtag+1);
+}
+
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+
+/*
+ * Dispose of a list of packets. Use a function so that, if we need to
+ * do more work, this is a central point to do it.
+ */
+void dn_free_pkts(struct mbuf *mnext)
+{
+ struct mbuf *m;
+
+ while ((m = mnext) != NULL) {
+ mnext = m->m_nextpkt;
+ FREE_PKT(m);
+ }
+}
+
+static int
+red_drops (struct dn_queue *q, int len)
+{
+ /*
+ * RED algorithm
+ *
+ * RED calculates the average queue size (avg) using a low-pass filter
+ * with an exponential weighted (w_q) moving average:
+ * avg <- (1-w_q) * avg + w_q * q_size
+ * where q_size is the queue length (measured in bytes or packets).
+ *
+ * If q_size == 0, we compute the idle time for the link, and set
+ * avg = (1 - w_q)^(idle/s)
+ * where s is the time needed for transmitting a medium-sized packet.
+ *
+ * Now, if avg < min_th the packet is enqueued.
+ * If avg > max_th the packet is dropped. Otherwise, the packet is
+ * dropped with probability P function of avg.
+ */
+
+ struct dn_fsk *fs = q->fs;
+ int64_t p_b = 0;
+
+ /* Queue in bytes or packets? */
+ uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
+ q->ni.len_bytes : q->ni.length;
+
+ /* Average queue size estimation. */
+ if (q_size != 0) {
+ /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+ int diff = SCALE(q_size) - q->avg;
+ int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+ q->avg += (int)v;
+ } else {
+ /*
+ * Queue is empty, find for how long the queue has been
+ * empty and use a lookup table for computing
+		 * (1 - w_q)^(idle_time/s) where s is the time to send a
+ * (small) packet.
+ * XXX check wraps...
+ */
+ if (q->avg) {
+ u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
+
+ q->avg = (t < fs->lookup_depth) ?
+ SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+ }
+ }
+
+	/* Should I drop? */
+ if (q->avg < fs->min_th) {
+ q->count = -1;
+ return (0); /* accept packet */
+ }
+ if (q->avg >= fs->max_th) { /* average queue >= max threshold */
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ /*
+ * According to Gentle-RED, if avg is greater than
+ * max_th the packet is dropped with a probability
+ * p_b = c_3 * avg - c_4
+ * where c_3 = (1 - max_p) / max_th
+ * c_4 = 1 - 2 * max_p
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+ fs->c_4;
+ } else {
+ q->count = -1;
+ return (1);
+ }
+ } else if (q->avg > fs->min_th) {
+ /*
+ * We compute p_b using the linear dropping function
+ * p_b = c_1 * avg - c_2
+ * where c_1 = max_p / (max_th - min_th)
+ * c_2 = max_p * min_th / (max_th - min_th)
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+ }
+
+ if (fs->fs.flags & DN_QSIZE_BYTES)
+ p_b = div64((p_b * len) , fs->max_pkt_size);
+ if (++q->count == 0)
+ q->random = random() & 0xffff;
+ else {
+ /*
+ * q->count counts packets arrived since last drop, so a greater
+ * value of q->count means a greater packet drop probability.
+ */
+ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+ q->count = 0;
+ /* After a drop we calculate a new random value. */
+ q->random = random() & 0xffff;
+ return (1); /* drop */
+ }
+ }
+ /* End of RED algorithm. */
+
+ return (0); /* accept */
+
+}
+
+/*
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed either way.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+ struct dn_fs *f;
+ struct dn_flow *ni; /* stats for scheduler instance */
+ uint64_t len;
+
+ if (q->fs == NULL || q->_si == NULL) {
+ printf("%s fs %p si %p, dropping\n",
+ __FUNCTION__, q->fs, q->_si);
+ FREE_PKT(m);
+ return 1;
+ }
+ f = &(q->fs->fs);
+ ni = &q->_si->ni;
+ len = m->m_pkthdr.len;
+ /* Update statistics, then check reasons to drop pkt. */
+ q->ni.tot_bytes += len;
+ q->ni.tot_pkts++;
+ ni->tot_bytes += len;
+ ni->tot_pkts++;
+ if (drop)
+ goto drop;
+ if (f->plr && random() < f->plr)
+ goto drop;
+ if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len))
+ goto drop;
+ if (f->flags & DN_QSIZE_BYTES) {
+ if (q->ni.len_bytes > f->qsize)
+ goto drop;
+ } else if (q->ni.length >= f->qsize) {
+ goto drop;
+ }
+ mq_append(&q->mq, m);
+ q->ni.length++;
+ q->ni.len_bytes += len;
+ ni->length++;
+ ni->len_bytes += len;
+ return 0;
+
+drop:
+ io_pkt_drop++;
+ q->ni.drops++;
+ ni->drops++;
+ FREE_PKT(m);
+ return 1;
+}
+
+/*
+ * Fetch packets from the delay line which are due now. If there are
+ * leftover packets, reinsert the delay line in the heap.
+ * Runs under scheduler lock.
+ */
+static void
+transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
+{
+ struct mbuf *m;
+ struct dn_pkt_tag *pkt = NULL;
+
+ dline->oid.subtype = 0; /* not in heap */
+ while ((m = dline->mq.head) != NULL) {
+ pkt = dn_tag_get(m);
+ if (!DN_KEY_LEQ(pkt->output_time, now))
+ break;
+ dline->mq.head = m->m_nextpkt;
+ mq_append(q, m);
+ }
+ if (m != NULL) {
+ dline->oid.subtype = 1; /* in heap */
+ heap_insert(&dn_cfg.evheap, pkt->output_time, dline);
+ }
+}
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. The samples are
+ * in milliseconds so we need to divide by 1000.
+ */
+static uint64_t
+extra_bits(struct mbuf *m, struct dn_schk *s)
+{
+ int index;
+ uint64_t bits;
+ struct dn_profile *pf = s->profile;
+
+ if (!pf || pf->samples_no == 0)
+ return 0;
+ index = random() % pf->samples_no;
+ bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
+ if (index >= pf->loss_level) {
+ struct dn_pkt_tag *dt = dn_tag_get(m);
+ if (dt)
+ dt->dn_dir = DIR_DROP;
+ }
+ return bits;
+}
+
+/*
+ * Send traffic from a scheduler instance due by 'now'.
+ * Return a pointer to the head of the queue.
+ */
+static struct mbuf *
+serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
+{
+ struct mq def_q;
+ struct dn_schk *s = si->sched;
+ struct mbuf *m = NULL;
+ int delay_line_idle = (si->dline.mq.head == NULL);
+ int done, bw;
+
+ if (q == NULL) {
+ q = &def_q;
+ q->head = NULL;
+ }
+
+ bw = s->link.bandwidth;
+ si->kflags &= ~DN_ACTIVE;
+
+ if (bw > 0)
+ si->credit += (now - si->sched_time) * bw;
+ else
+ si->credit = 0;
+ si->sched_time = now;
+ done = 0;
+ while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+ uint64_t len_scaled;
+ done++;
+ len_scaled = (bw == 0) ? 0 : hz *
+ (m->m_pkthdr.len * 8 + extra_bits(m, s));
+ si->credit -= len_scaled;
+ /* Move packet in the delay line */
+ dn_tag_get(m)->output_time += s->link.delay ;
+ mq_append(&si->dline.mq, m);
+ }
+ /*
+ * If credit >= 0 the instance is idle, mark time.
+ * Otherwise put back in the heap, and adjust the output
+ * time of the last inserted packet, m, which was too early.
+ */
+ if (si->credit >= 0) {
+ si->idle_time = now;
+ } else {
+ uint64_t t;
+ KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
+ t = div64(bw - 1 - si->credit, bw);
+ if (m)
+ dn_tag_get(m)->output_time += t;
+ si->kflags |= DN_ACTIVE;
+ heap_insert(&dn_cfg.evheap, now + t, si);
+ }
+ if (delay_line_idle && done)
+ transmit_event(q, &si->dline, now);
+ return q->head;
+}
+
+/*
+ * The timer handler for dummynet. Time is computed in ticks, but
+ * the code is tolerant of the actual rate at which this is called.
+ * Once complete, the function reschedules itself for the next tick.
+ */
+void
+dummynet_task(void *context, int pending)
+{
+ struct timeval t;
+ struct mq q = { NULL, NULL }; /* queue to accumulate results */
+
+ CURVNET_SET((struct vnet *)context);
+
+ DN_BH_WLOCK();
+
+ /* Update number of lost(coalesced) ticks. */
+ tick_lost += pending - 1;
+
+ getmicrouptime(&t);
+ /* Last tick duration (usec). */
+ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
+ (t.tv_usec - dn_cfg.prev_t.tv_usec);
+ /* Last tick vs standard tick difference (usec). */
+ tick_delta = (tick_last * hz - 1000000) / hz;
+ /* Accumulated tick difference (usec). */
+ tick_delta_sum += tick_delta;
+
+ dn_cfg.prev_t = t;
+
+ /*
+ * Adjust curr_time if the accumulated tick difference is
+ * greater than the 'standard' tick. Since curr_time should
+ * be monotonically increasing, we do positive adjustments
+ * as required, and throttle curr_time in case of negative
+ * adjustment.
+ */
+ dn_cfg.curr_time++;
+ if (tick_delta_sum - tick >= 0) {
+ int diff = tick_delta_sum / tick;
+
+ dn_cfg.curr_time += diff;
+ tick_diff += diff;
+ tick_delta_sum %= tick;
+ tick_adjustment++;
+ } else if (tick_delta_sum + tick <= 0) {
+ dn_cfg.curr_time--;
+ tick_diff--;
+ tick_delta_sum += tick;
+ tick_adjustment++;
+ }
+
+ /* serve pending events, accumulate in q */
+ for (;;) {
+ struct dn_id *p; /* generic parameter to handler */
+
+ if (dn_cfg.evheap.elements == 0 ||
+ DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
+ break;
+ p = HEAP_TOP(&dn_cfg.evheap)->object;
+ heap_extract(&dn_cfg.evheap, NULL);
+
+ if (p->type == DN_SCH_I) {
+ serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
+ } else { /* extracted a delay line */
+ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
+ }
+ }
+ if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
+ dn_cfg.expire_cycle = 0;
+ dn_drain_scheduler();
+ dn_drain_queue();
+ }
+
+ DN_BH_WUNLOCK();
+ dn_reschedule();
+ if (q.head != NULL)
+ dummynet_send(q.head);
+ CURVNET_RESTORE();
+}
+
+/*
+ * forward a chain of packets to the proper destination.
+ * This runs outside the dummynet lock.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+ struct mbuf *n;
+
+ for (; m != NULL; m = n) {
+ struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */
+ struct m_tag *tag;
+ int dst;
+
+ n = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ tag = m_tag_first(m);
+ if (tag == NULL) { /* should not happen */
+ dst = DIR_DROP;
+ } else {
+ struct dn_pkt_tag *pkt = dn_tag_get(m);
+ /* extract the dummynet info, rename the tag
+ * to carry reinject info.
+ */
+ dst = pkt->dn_dir;
+ ifp = pkt->ifp;
+ tag->m_tag_cookie = MTAG_IPFW_RULE;
+ tag->m_tag_id = 0;
+ }
+
+ switch (dst) {
+ case DIR_OUT:
+ SET_HOST_IPLEN(mtod(m, struct ip *));
+ ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+ break ;
+
+ case DIR_IN :
+ /* put header in network format for ip_input() */
+ //SET_NET_IPLEN(mtod(m, struct ip *));
+ netisr_dispatch(NETISR_IP, m);
+ break;
+
+#ifdef INET6
+ case DIR_IN | PROTO_IPV6:
+ netisr_dispatch(NETISR_IPV6, m);
+ break;
+
+ case DIR_OUT | PROTO_IPV6:
+ SET_HOST_IPLEN(mtod(m, struct ip *));
+ ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+ break;
+#endif
+
+ case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+ if (bridge_dn_p != NULL)
+ ((*bridge_dn_p)(m, ifp));
+ else
+ printf("dummynet: if_bridge not loaded\n");
+
+ break;
+
+ case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+ /*
+ * The Ethernet code assumes the Ethernet header is
+ * contiguous in the first mbuf header.
+			 * Ensure this is true.
+ */
+ if (m->m_len < ETHER_HDR_LEN &&
+ (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+ printf("dummynet/ether: pullup failed, "
+ "dropping packet\n");
+ break;
+ }
+ ether_demux(m->m_pkthdr.rcvif, m);
+ break;
+
+ case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
+ ether_output_frame(ifp, m);
+ break;
+
+ case DIR_DROP:
+ /* drop the packet after some time */
+ FREE_PKT(m);
+ break;
+
+ default:
+ printf("dummynet: bad switch %d!\n", dst);
+ FREE_PKT(m);
+ break;
+ }
+ }
+}
+
+static inline int
+tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
+{
+ struct dn_pkt_tag *dt;
+ struct m_tag *mtag;
+
+ mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+ sizeof(*dt), M_NOWAIT | M_ZERO);
+ if (mtag == NULL)
+ return 1; /* Cannot allocate packet header. */
+ m_tag_prepend(m, mtag); /* Attach to mbuf chain. */
+ dt = (struct dn_pkt_tag *)(mtag + 1);
+ dt->rule = fwa->rule;
+ dt->rule.info &= IPFW_ONEPASS; /* only keep this info */
+ dt->dn_dir = dir;
+ dt->ifp = fwa->oif;
+	/* dt->output_time is updated as we move through */
+ dt->output_time = dn_cfg.curr_time;
+ return 0;
+}
+
+
+/*
+ * dummynet hook for packets.
+ * We use the argument to locate the flowset fs and the sched_set sch
+ * associated with it. Then we apply flow_mask and sched_mask to
+ * determine the queue and scheduler instances.
+ *
+ * dir		where to send the packet after dummynet.
+ * *m0		the mbuf with the packet
+ * ifp		the 'ifp' parameter from the caller.
+ *		NULL in ip_input, destination interface in ip_output.
+ */
+int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+ struct mbuf *m = *m0;
+ struct dn_fsk *fs = NULL;
+ struct dn_sch_inst *si;
+ struct dn_queue *q = NULL; /* default */
+
+ int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
+ ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
+ DN_BH_WLOCK();
+ io_pkt++;
+ /* we could actually tag outside the lock, but who cares... */
+ if (tag_mbuf(m, dir, fwa))
+ goto dropit;
+ if (dn_cfg.busy) {
+ /* if the upper half is busy doing something expensive,
+	 * let's queue the packet and move forward
+ */
+ mq_append(&dn_cfg.pending, m);
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+ /* XXX locate_flowset could be optimised with a direct ref. */
+ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
+ if (fs == NULL)
+ goto dropit; /* This queue/pipe does not exist! */
+ if (fs->sched == NULL) /* should not happen */
+ goto dropit;
+ /* find scheduler instance, possibly applying sched_mask */
+ si = ipdn_si_find(fs->sched, &(fwa->f_id));
+ if (si == NULL)
+ goto dropit;
+ /*
+ * If the scheduler supports multiple queues, find the right one
+ * (otherwise it will be ignored by enqueue).
+ */
+ if (fs->sched->fp->flags & DN_MULTIQUEUE) {
+ q = ipdn_q_find(fs, si, &(fwa->f_id));
+ if (q == NULL)
+ goto dropit;
+ }
+ if (fs->sched->fp->enqueue(si, q, m)) {
+ /* packet was dropped by enqueue() */
+ m = *m0 = NULL;
+ goto dropit;
+ }
+
+ if (si->kflags & DN_ACTIVE) {
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+
+ /* compute the initial allowance */
+ {
+ struct dn_link *p = &fs->sched->link;
+ si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
+ if (p->burst) {
+ uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
+ if (burst > p->burst)
+ burst = p->burst;
+ si->credit += burst;
+ }
+ }
+ /* pass through scheduler and delay line */
+ m = serve_sched(NULL, si, dn_cfg.curr_time);
+
+ /* optimization -- pass it back to ipfw for immediate send */
+	/* XXX Don't call dummynet_send() if the scheduler returned the
+	 * packet just enqueued. This avoids a lock order reversal.
+	 */
+ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
+ /* fast io, rename the tag * to carry reinject info. */
+ struct m_tag *tag = m_tag_first(m);
+
+ tag->m_tag_cookie = MTAG_IPFW_RULE;
+ tag->m_tag_id = 0;
+ io_pkt_fast++;
+ if (m->m_nextpkt != NULL) {
+ printf("dummynet: fast io: pkt chain detected!\n");
+ m->m_nextpkt = NULL;
+ }
+ m = NULL;
+ } else {
+ *m0 = NULL;
+ }
+done:
+ DN_BH_WUNLOCK();
+ if (m)
+ dummynet_send(m);
+ return 0;
+
+dropit:
+ io_pkt_drop++;
+ DN_BH_WUNLOCK();
+ if (m)
+ FREE_PKT(m);
+ *m0 = NULL;
+ return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
+}
diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h
new file mode 100644
index 000000000000..03b43dba55d4
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_private.h
@@ -0,0 +1,406 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * internal dummynet APIs.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_PRIVATE_H
+#define _IP_DN_PRIVATE_H
+
+/* debugging support
+ * use ND() to compile a message out, D() to print a line,
+ * DX(level, ...) to print only above a certain debug level.
+ * If you redefine D() you are expected to redefine all of them.
+ */
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n", \
+ __FUNCTION__, ## __VA_ARGS__)
+#define DX(lev, fmt, ...) do { \
+ if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
+#endif
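+/* A minimal usage sketch for the macros above (the message text and
+ * the level are purely illustrative):
+ *
+ *	D("fs %d has no scheduler", fs_nr);	// always printed
+ *	DX(2, "bucket %d", i);		// only if dn_cfg.debug > 2
+ *	ND("per-packet trace");		// compiled out entirely
+ */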
+
+MALLOC_DECLARE(M_DUMMYNET);
+
+#ifndef FREE_PKT
+#define FREE_PKT(m) m_freem(m)
+#endif
+
+#ifndef __linux__
+#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
+#endif
+
+#define DN_LOCK_INIT() do { \
+ mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \
+ mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \
+ } while (0)
+#define DN_LOCK_DESTROY() do { \
+ mtx_destroy(&dn_cfg.uh_mtx); \
+ mtx_destroy(&dn_cfg.bh_mtx); \
+ } while (0)
+#if 0 /* not used yet */
+#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+#endif
+
+#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+
+SLIST_HEAD(dn_schk_head, dn_schk);
+SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
+SLIST_HEAD(dn_fsk_head, dn_fsk);
+SLIST_HEAD(dn_queue_head, dn_queue);
+SLIST_HEAD(dn_alg_head, dn_alg);
+
+struct mq {	/* a basic queue of packets */
+ struct mbuf *head, *tail;
+};
+
+static inline void
+set_oid(struct dn_id *o, int type, int len)
+{
+ o->type = type;
+ o->len = len;
+ o->subtype = 0;
+}
+
+/*
+ * configuration and global data for a dummynet instance
+ *
+ * When a configuration is modified from userland, 'id' is incremented
+ * so we can use the value to check for stale pointers.
+ */
+struct dn_parms {
+ uint32_t id; /* configuration version */
+
+ /* defaults (sysctl-accessible) */
+ int red_lookup_depth;
+ int red_avg_pkt_size;
+ int red_max_pkt_size;
+ int hash_size;
+ int max_hash_size;
+ long byte_limit; /* max queue sizes */
+ long slot_limit;
+
+ int io_fast;
+ int debug;
+
+ /* timekeeping */
+ struct timeval prev_t; /* last time dummynet_tick ran */
+ struct dn_heap evheap; /* scheduled events */
+
+ /* counters of objects -- used for reporting space */
+ int schk_count;
+ int si_count;
+ int fsk_count;
+ int queue_count;
+
+ /* ticks and other stuff */
+ uint64_t curr_time;
+ /* flowsets and schedulers are in hash tables, with 'hash_size'
+ * buckets. fshash is looked up at every packet arrival
+ * so better be generous if we expect many entries.
+ */
+ struct dn_ht *fshash;
+ struct dn_ht *schedhash;
+ /* list of flowsets without a scheduler -- use sch_chain */
+ struct dn_fsk_head fsu; /* list of unlinked flowsets */
+ struct dn_alg_head schedlist; /* list of algorithms */
+
+	/* Store the fs/sch to scan when draining. The value is the
+	 * bucket number of the hash table. Expiration can be disabled
+	 * with net.inet.ip.dummynet.expire=0, otherwise it happens
+	 * every 'expire' ticks.
+	 */
+ int drain_fs;
+ int drain_sch;
+ uint32_t expire;
+ uint32_t expire_cycle; /* tick count */
+
+ int init_done;
+
+ /* if the upper half is busy doing something long,
+	 * it can set the busy flag and we will enqueue packets in
+	 * a pending queue for later processing.
+ */
+ int busy;
+ struct mq pending;
+
+#ifdef _KERNEL
+ /*
+ * This file is normally used in the kernel, unless we do
+ * some userland tests, in which case we do not need a mtx.
+ * uh_mtx arbitrates between system calls and also
+	 * protects fshash, schedhash and fsu (the unlinked flowsets).
+	 * These structures are readonly for the lower half.
+	 * bh_mtx protects all other structures which may be
+	 * modified upon packet arrivals.
+ */
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t uh_mtx;
+ spinlock_t bh_mtx;
+#else
+ struct mtx uh_mtx;
+ struct mtx bh_mtx;
+#endif
+
+#endif /* _KERNEL */
+};
+
+/*
+ * Delay line, contains all packets on output from a link.
+ * Every scheduler instance has one.
+ */
+struct delay_line {
+ struct dn_id oid;
+ struct dn_sch_inst *si;
+ struct mq mq;
+};
+
+/*
+ * The kernel side of a flowset. It is linked in a hash table
+ * of flowsets, and in a list of children of its parent scheduler.
+ * qht is either a single queue or (if HAVE_MASK) a hash table of queues.
+ * Note that the mask to use is the (flow_mask|sched_mask), which
+ * changes as we attach/detach schedulers. So we store it here.
+ *
+ * XXX If we want to add scheduler-specific parameters, we need to
+ * put them in external storage because the scheduler may not be
+ * available when the fsk is created.
+ */
+struct dn_fsk { /* kernel side of a flowset */
+ struct dn_fs fs;
+ SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */
+
+ struct ipfw_flow_id fsk_mask;
+
+ /* qht is a hash table of queues, or just a single queue
+ * a bit in fs.flags tells us which one
+ */
+ struct dn_ht *qht;
+ struct dn_schk *sched; /* Sched we are linked to */
+ SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */
+
+ /* bucket index used by drain routine to drain queues for this
+ * flowset
+ */
+ int drain_bucket;
+	/* Parameters related to RED / GRED */
+ /* original values are in dn_fs*/
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+	int avg_pkt_size ;	/* average packet size */
+ int max_pkt_size ; /* max packet size */
+};
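+/*
+ * For reference, the scaled RED constants above give the drop
+ * probability in the linear region (min_th < avg < max_th) as
+ *	p_b = c_1 * avg - c_2
+ *	    = max_p * (avg - min_th) / (max_th - min_th)
+ * with all terms in the same fixed-point scale.
+ */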
+
+/*
+ * A queue is created as a child of a flowset unless it belongs to
+ * a !MULTIQUEUE scheduler. It is normally in a hash table in the
+ * flowset. fs always points to the parent flowset.
+ * si normally points to the sch_inst, unless the flowset has been
+ * detached from the scheduler -- in this case si == NULL and we
+ * should not enqueue.
+ */
+struct dn_queue {
+ struct dn_flow ni; /* oid, flow_id, stats */
+ struct mq mq; /* packets queue */
+ struct dn_sch_inst *_si; /* owner scheduler instance */
+ SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
+ struct dn_fsk *fs; /* parent flowset. */
+
+ /* RED parameters */
+ int avg; /* average queue length est. (scaled) */
+ int count; /* arrivals since last RED drop */
+ int random; /* random value (scaled) */
+ uint64_t q_time; /* start of queue idle time */
+
+};
+
+/*
+ * The kernel side of a scheduler. Contains the userland config,
+ * a link, pointer to extra config arguments from command line,
+ * kernel flags, and a pointer to the scheduler methods.
+ * It is stored in a hash table, and holds a list of all
+ * flowsets and scheduler instances.
+ * XXX sch must be at the beginning, see schk_hash().
+ */
+struct dn_schk {
+ struct dn_sch sch;
+ struct dn_alg *fp; /* Pointer to scheduler functions */
+ struct dn_link link; /* The link, embedded */
+ struct dn_profile *profile; /* delay profile, if any */
+ struct dn_id *cfg; /* extra config arguments */
+
+ SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */
+
+ struct dn_fsk_head fsk_list; /* all fsk linked to me */
+ struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */
+
+ /* bucket index used by the drain routine to drain the scheduler
+	 * instances for this scheduler.
+ */
+ int drain_bucket;
+
+ /* Hash table of all instances (through sch.sched_mask)
+ * or single instance if no mask. Always valid.
+ */
+ struct dn_ht *siht;
+};
+
+
+/*
+ * Scheduler instance.
+ * Contains variables and all queues relative to this instance.
+ * This struct is created at runtime.
+ */
+struct dn_sch_inst {
+ struct dn_flow ni; /* oid, flowid and stats */
+ SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
+ struct delay_line dline;
+ struct dn_schk *sched; /* the template */
+ int kflags; /* DN_ACTIVE */
+
+ int64_t credit; /* bits I can transmit (more or less). */
+ uint64_t sched_time; /* time link was scheduled in ready_heap */
+ uint64_t idle_time; /* start of scheduler instance idle time */
+
+	/* q_count is the number of queues that reference this instance.
+ * The counter is incremented or decremented when
+ * a reference from the queue is created or deleted.
+ * It is used to make sure that a scheduler instance can be safely
+ * deleted by the drain routine. See notes below.
+ */
+ int q_count;
+
+};
+
+/*
+ * NOTE about object drain.
+ * The system will automatically (XXX check when) drain queues and
+ * scheduler instances when they are idle.
+ * A queue is idle when it has no packets; an instance is idle when
+ * it is not in the evheap, and the corresponding delay line is empty.
+ * A queue can be safely deleted when it is idle because the scheduler
+ * function xxx_free_queue() will remove any references to it.
+ * An instance can only be deleted when no queues reference it. To be
+ * sure of that, a counter (q_count) stores the number of queues that
+ * are pointing to the instance.
+ *
+ * XXX
+ * Order of scan:
+ * - take all flowsets in a bucket of the flowset hash table
+ * - take all queues in a bucket of each such flowset
+ * - increment the queue bucket
+ * - scan the next flowset bucket
+ * Nothing is done if a bucket contains no entries.
+ *
+ * The same scheme is used for scheduler instances.
+ */
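+/*
+ * A sketch of one drain pass, as implied by the scan order above
+ * (dn_drain_queue()/dn_drain_scheduler() below do the real work):
+ *
+ *	for each flowset fs in fshash bucket dn_cfg.drain_fs:
+ *		delete idle queues in qht bucket fs->drain_bucket,
+ *		then advance fs->drain_bucket;
+ *	advance dn_cfg.drain_fs for the next pass.
+ */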
+
+
+/* kernel-side flags. Linux has DN_DELETE in fcntl.h
+ */
+enum {
+ /* 1 and 2 are reserved for the SCAN flags */
+ DN_DESTROY = 0x0004, /* destroy */
+ DN_DELETE_FS = 0x0008, /* destroy flowset */
+ DN_DETACH = 0x0010,
+ DN_ACTIVE = 0x0020, /* object is in evheap */
+ DN_F_DLINE = 0x0040, /* object is a delay line */
+ DN_F_SCHI = 0x00C0, /* object is a sched.instance */
+ DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */
+};
+
+extern struct dn_parms dn_cfg;
+//VNET_DECLARE(struct dn_parms, _base_dn_cfg);
+//#define dn_cfg VNET(_base_dn_cfg)
+
+int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+void dummynet_task(void *context, int pending);
+void dn_reschedule(void);
+
+struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *,
+ struct ipfw_flow_id *);
+struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);
+
+/*
+ * copy_range is a template for requests for ranges of pipes/queues/scheds.
+ * The number of ranges is variable and can be derived from o.len.
+ * As a default, we use a small number of entries so that the struct
+ * fits easily on the stack and is sufficient for most common requests.
+ */
+#define DEFAULT_RANGES 5
+struct copy_range {
+ struct dn_id o;
+ uint32_t r[ 2 * DEFAULT_RANGES ];
+};
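+/*
+ * Layout sketch (an assumption, not spelled out above): the ranges
+ * are stored as consecutive (start, end) pairs, so r[0],r[1] describe
+ * the first range, r[2],r[3] the second, and the pair count follows
+ * from o.len.
+ */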
+
+struct copy_args {
+ char **start;
+ char *end;
+ int flags;
+ int type;
+ struct copy_range *extra; /* extra filtering */
+};
+
+struct sockopt;
+int ip_dummynet_compat(struct sockopt *sopt);
+int dummynet_get(struct sockopt *sopt, void **compat);
+int dn_c_copy_q (void *_ni, void *arg);
+int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
+int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
+int dn_compat_copy_queue(struct copy_args *a, void *_o);
+int dn_compat_copy_pipe(struct copy_args *a, void *_o);
+int copy_data_helper_compat(void *_o, void *_arg);
+int dn_compat_calc_size(void);
+int do_config(void *p, int l);
+
+/* functions to drain idle objects */
+void dn_drain_scheduler(void);
+void dn_drain_queue(void);
+
+#endif /* _IP_DN_PRIVATE_H */
diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c
index 267776f567a7..01714aa66341 100644
--- a/sys/netinet/ipfw/ip_dummynet.c
+++ b/sys/netinet/ipfw/ip_dummynet.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
* Portions Copyright (c) 2000 Akamba Corp.
* All rights reserved
*
@@ -28,32 +28,12 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#define DUMMYNET_DEBUG
-
-#include "opt_inet6.h"
-
/*
- * This module implements IP dummynet, a bandwidth limiter/delay emulator
- * used in conjunction with the ipfw package.
- * Description of the data structures used is in ip_dummynet.h
- * Here you mainly find the following blocks of code:
- * + variable declarations;
- * + heap management functions;
- * + scheduler and dummynet functions;
- * + configuration and initialization.
- *
- * NOTA BENE: critical sections are protected by the "dummynet lock".
- *
- * Most important Changes:
- *
- * 011004: KLDable
- * 010124: Fixed WF2Q behaviour
- * 010122: Fixed spl protection.
- * 000601: WF2Q support
- * 000106: large rewrite, use heaps to handle very many pipes.
- * 980513: initial release
+ * Configuration and internal object management for dummynet.
*/
+#include "opt_inet6.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
@@ -67,2201 +47,2115 @@ __FBSDID("$FreeBSD$");
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
-#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
-#include <net/netisr.h>
#include <netinet/in.h>
-#include <netinet/ip.h> /* ip_len, ip_off */
#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
#include <netinet/ip_fw.h>
#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* which objects to copy */
+#define DN_C_LINK 0x01
+#define DN_C_SCH 0x02
+#define DN_C_FLOW 0x04
+#define DN_C_FS 0x08
+#define DN_C_QUEUE 0x10
+
+/* argument used when creating a new scheduler with schk_new */
+struct schk_new_arg {
+ struct dn_alg *fp;
+ struct dn_sch *sch;
+};
-#include <netinet/if_ether.h> /* various ether_* routines */
-
-#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
-#include <netinet6/ip6_var.h>
-
-/*
- * We keep a private variable for the simulation time, but we could
- * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
- */
-static dn_key curr_time = 0 ; /* current simulation time */
+/*---- callout hooks. ----*/
+static struct callout dn_timeout;
+static struct task dn_task;
+static struct taskqueue *dn_tq = NULL;
-static int dn_hash_size = 64 ; /* default hash size */
+static void
+dummynet(void * __unused unused)
+{
-/* statistics on number of queue searches and search steps */
-static long searches, search_steps ;
-static int pipe_expire = 1 ; /* expire queue if empty */
-static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+ taskqueue_enqueue(dn_tq, &dn_task);
+}
-static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
-static long pipe_byte_limit = 1024 * 1024;
+void
+dn_reschedule(void)
+{
+ callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+/*----- end of callout hooks -----*/
-static int red_lookup_depth = 256; /* RED - default lookup table depth */
-static int red_avg_pkt_size = 512; /* RED - default medium packet size */
-static int red_max_pkt_size = 1500; /* RED - default max packet size */
+/* Return a scheduler descriptor given the type or name. */
+static struct dn_alg *
+find_sched_type(int type, char *name)
+{
+ struct dn_alg *d;
-static struct timeval prev_t;
-static long tick_last; /* Last tick duration (usec). */
-static long tick_delta; /* Last vs standard tick diff (usec). */
-static long tick_delta_sum; /* Accumulated tick difference (usec).*/
-static long tick_adjustment; /* Tick adjustments done. */
-static long tick_lost; /* Lost(coalesced) ticks number. */
-/* Adjusted vs non-adjusted curr_time difference (ticks). */
-static long tick_diff;
+ SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
+ if (d->type == type || (name && !strcmp(d->name, name)))
+ return d;
+ }
+ return NULL; /* not found */
+}
-static int io_fast;
-static unsigned long io_pkt;
-static unsigned long io_pkt_fast;
-static unsigned long io_pkt_drop;
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+ int oldv = *v;
+ const char *op = NULL;
+ if (oldv < lo) {
+ *v = dflt;
+ op = "Bump";
+ } else if (oldv > hi) {
+ *v = hi;
+ op = "Clamp";
+ } else
+ return *v;
+ if (op && msg)
+ printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
+ return *v;
+}
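+/*
+ * Usage sketch (the bounds here are illustrative, not the module's
+ * actual values):
+ *
+ *	ipdn_bound_var(&dn_cfg.hash_size, 64, 16, 65536, "hash size");
+ *
+ * bumps the value to the default 64 if below 16, clamps it to 65536
+ * if above, and prints a one-line notice in either case.
+ */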
+/*---- flow_id mask, hash and compare functions ---*/
/*
- * Three heaps contain queues and pipes that the scheduler handles:
- *
- * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
- *
- * wfq_ready_heap contains the pipes associated with WF2Q flows
- *
- * extract_heap contains pipes associated with delay lines.
- *
+ * The flow_id includes the 5-tuple, the queue/pipe number
+ * which we store in the extra area in host order,
+ * and for ipv6 also the flow_id6.
+ * XXX see if we want the tos byte (can store in 'flags')
*/
+static struct ipfw_flow_id *
+flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id);
-MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
-
-static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
-
-static int heap_init(struct dn_heap *h, int size);
-static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
-static void heap_extract(struct dn_heap *h, void *obj);
-static void transmit_event(struct dn_pipe *pipe, struct mbuf **head,
- struct mbuf **tail);
-static void ready_event(struct dn_flow_queue *q, struct mbuf **head,
- struct mbuf **tail);
-static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
- struct mbuf **tail);
-
-#define HASHSIZE 16
-#define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
-static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */
-static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */
-
-static struct callout dn_timeout;
-
-extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
-
-#ifdef SYSCTL_NODE
-SYSCTL_DECL(_net_inet);
-SYSCTL_DECL(_net_inet_ip);
-
-SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
- CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
-#if 0 /* curr_time is 64 bit */
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
- CTLFLAG_RD, &curr_time, 0, "Current tick");
-#endif
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
- CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
- CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
- CTLFLAG_RD, &searches, 0, "Number of queue searches");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
- CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
- CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
- CTLFLAG_RW, &dn_max_ratio, 0,
- "Max ratio between dynamic queues and buckets");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
- CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
- CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
- CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
- CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
- CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
- CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
- CTLFLAG_RD, &tick_diff, 0,
- "Adjusted vs non-adjusted curr_time difference (ticks).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
- CTLFLAG_RD, &tick_lost, 0,
- "Number of ticks coalesced by dummynet taskqueue.");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
- CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
- CTLFLAG_RD, &io_pkt, 0,
- "Number of packets passed to dummynet.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
- CTLFLAG_RD, &io_pkt_fast, 0,
- "Number of packets bypassed dummynet scheduler.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
- CTLFLAG_RD, &io_pkt_drop, 0,
- "Number of packets dropped by dummynet.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
- CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
- CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
-#endif
-
-#ifdef DUMMYNET_DEBUG
-int dummynet_debug = 0;
-#ifdef SYSCTL_NODE
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
- 0, "control debugging printfs");
-#endif
-#define DPRINTF(X) if (dummynet_debug) printf X
-#else
-#define DPRINTF(X)
-#endif
-
-static struct task dn_task;
-static struct taskqueue *dn_tq = NULL;
-static void dummynet_task(void *, int);
-
-static struct mtx dummynet_mtx;
-#define DUMMYNET_LOCK_INIT() \
- mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
-#define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
-#define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx)
-#define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx)
-#define DUMMYNET_LOCK_ASSERT() mtx_assert(&dummynet_mtx, MA_OWNED)
-
-static int config_pipe(struct dn_pipe *p);
-static int ip_dn_ctl(struct sockopt *sopt);
-
-static void dummynet(void *);
-static void dummynet_flush(void);
-static void dummynet_send(struct mbuf *);
-static int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+ id->dst_port &= mask->dst_port;
+ id->src_port &= mask->src_port;
+ id->proto &= mask->proto;
+ id->extra &= mask->extra;
+ if (is_v6) {
+ APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
+ APPLY_MASK(&id->src_ip6, &mask->src_ip6);
+ id->flow_id6 &= mask->flow_id6;
+ } else {
+ id->dst_ip &= mask->dst_ip;
+ id->src_ip &= mask->src_ip;
+ }
+ return id;
+}
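+/*
+ * Example (illustrative values): with a mask whose only nonzero
+ * fields are dst_ip = 0xffffff00 and proto = 0xff, flow_id_mask()
+ * clears the ports and source address, so all flows with the same
+ * protocol and destination /24 collapse onto one masked flow_id,
+ * i.e. one queue or scheduler instance.
+ */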
-/*
- * Flow queue is idle if:
- * 1) it's empty for at least 1 tick
- * 2) it has invalid timestamp (WF2Q case)
- * 3) parent pipe has no 'exhausted' burst.
- */
-#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \
- curr_time > (q)->idle_time + 1 && \
- ((q)->numbytes + (curr_time - (q)->idle_time - 1) * \
- (q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst))
+/* compute the OR of two masks; the result is stored in dst and also returned */
+static struct ipfw_flow_id *
+flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
+{
+ int is_v6 = IS_IP6_FLOW_ID(dst);
-/*
- * Heap management functions.
- *
- * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
- * Some macros help finding parent/children so we can optimize them.
- *
- * heap_init() is called to expand the heap when needed.
- * Increment size in blocks of 16 entries.
- * XXX failure to allocate a new element is a pretty bad failure
- * as we basically stall a whole queue forever!!
- * Returns 1 on error, 0 on success
- */
-#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
-#define HEAP_LEFT(x) ( 2*(x) + 1 )
-#define HEAP_IS_LEFT(x) ( (x) & 1 )
-#define HEAP_RIGHT(x) ( 2*(x) + 2 )
-#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
-#define HEAP_INCREMENT 15
+ dst->dst_port |= src->dst_port;
+ dst->src_port |= src->src_port;
+ dst->proto |= src->proto;
+ dst->extra |= src->extra;
+ if (is_v6) {
+#define OR_MASK(_d, _s) \
+ (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
+ (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
+ (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
+ (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
+ OR_MASK(&dst->dst_ip6, &src->dst_ip6);
+ OR_MASK(&dst->src_ip6, &src->src_ip6);
+#undef OR_MASK
+ dst->flow_id6 |= src->flow_id6;
+ } else {
+ dst->dst_ip |= src->dst_ip;
+ dst->src_ip |= src->src_ip;
+ }
+ return dst;
+}
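+/*
+ * Per the note on struct dn_fsk in ip_dn_private.h, this is
+ * presumably how the effective fsk_mask is built when a flowset is
+ * attached to a scheduler: OR-ing sched_mask into flow_mask gives
+ * the (flow_mask|sched_mask) used for queue lookups.
+ */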
static int
-heap_init(struct dn_heap *h, int new_size)
+nonzero_mask(struct ipfw_flow_id *m)
{
- struct dn_heap_entry *p;
+ if (m->dst_port || m->src_port || m->proto || m->extra)
+ return 1;
+ if (IS_IP6_FLOW_ID(m)) {
+ return
+ m->dst_ip6.__u6_addr.__u6_addr32[0] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[1] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[2] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[3] ||
+ m->src_ip6.__u6_addr.__u6_addr32[0] ||
+ m->src_ip6.__u6_addr.__u6_addr32[1] ||
+ m->src_ip6.__u6_addr.__u6_addr32[2] ||
+ m->src_ip6.__u6_addr.__u6_addr32[3] ||
+ m->flow_id6;
+ } else {
+ return m->dst_ip || m->src_ip;
+ }
+}
- if (h->size >= new_size ) {
- printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
- h->size, new_size);
- return 0 ;
- }
- new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
- p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
- if (p == NULL) {
- printf("dummynet: %s, resize %d failed\n", __func__, new_size );
- return 1 ; /* error */
- }
- if (h->size > 0) {
- bcopy(h->p, p, h->size * sizeof(*p) );
- free(h->p, M_DUMMYNET);
+/* XXX we may want a better hash function */
+static uint32_t
+flow_id_hash(struct ipfw_flow_id *id)
+{
+ uint32_t i;
+
+ if (IS_IP6_FLOW_ID(id)) {
+ uint32_t *d = (uint32_t *)&id->dst_ip6;
+ uint32_t *s = (uint32_t *)&id->src_ip6;
+ i = (d[0] ) ^ (d[1]) ^
+ (d[2] ) ^ (d[3]) ^
+ (d[0] >> 15) ^ (d[1] >> 15) ^
+ (d[2] >> 15) ^ (d[3] >> 15) ^
+ (s[0] << 1) ^ (s[1] << 1) ^
+ (s[2] << 1) ^ (s[3] << 1) ^
+ (s[0] << 16) ^ (s[1] << 16) ^
+ (s[2] << 16) ^ (s[3] << 16) ^
+ (id->dst_port << 1) ^ (id->src_port) ^
+ (id->extra) ^
+ (id->proto ) ^ (id->flow_id6);
+ } else {
+ i = (id->dst_ip) ^ (id->dst_ip >> 15) ^
+ (id->src_ip << 1) ^ (id->src_ip >> 16) ^
+ (id->extra) ^
+ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto);
}
- h->p = p ;
- h->size = new_size ;
- return 0 ;
+ return i;
}
-/*
- * Insert element in heap. Normally, p != NULL, we insert p in
- * a new position and bubble up. If p == NULL, then the element is
- * already in place, and key is the position where to start the
- * bubble-up.
- * Returns 1 on failure (cannot allocate new heap entry)
- *
- * If offset > 0 the position (index, int) of the element in the heap is
- * also stored in the element itself at the given offset in bytes.
- */
-#define SET_OFFSET(heap, node) \
- if (heap->offset > 0) \
- *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
-/*
- * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
- */
-#define RESET_OFFSET(heap, node) \
- if (heap->offset > 0) \
- *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+/* Like bcmp, returns 0 if ids match, 1 otherwise. */
static int
-heap_insert(struct dn_heap *h, dn_key key1, void *p)
-{
- int son = h->elements ;
-
- if (p == NULL) /* data already there, set starting point */
- son = key1 ;
- else { /* insert new element at the end, possibly resize */
- son = h->elements ;
- if (son == h->size) /* need resize... */
- if (heap_init(h, h->elements+1) )
- return 1 ; /* failure... */
- h->p[son].object = p ;
- h->p[son].key = key1 ;
- h->elements++ ;
- }
- while (son > 0) { /* bubble up */
- int father = HEAP_FATHER(son) ;
- struct dn_heap_entry tmp ;
-
- if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
- break ; /* found right position */
- /* son smaller than father, swap and repeat */
- HEAP_SWAP(h->p[son], h->p[father], tmp) ;
- SET_OFFSET(h, son);
- son = father ;
- }
- SET_OFFSET(h, son);
- return 0 ;
+flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id1);
+
+ if (!is_v6) {
+ if (IS_IP6_FLOW_ID(id2))
+ return 1; /* different address families */
+
+ return (id1->dst_ip == id2->dst_ip &&
+ id1->src_ip == id2->src_ip &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra) ? 0 : 1;
+ }
+ /* the ipv6 case */
+ return (
+ !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
+ !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra &&
+ id1->flow_id6 == id2->flow_id6) ? 0 : 1;
}
+/*--------- end of flow-id mask, hash and compare ---------*/
-/*
- * remove top element from heap, or obj if obj != NULL
+/*--- support functions for the qht hashtable ----
+ * Entries are hashed by flow-id
*/
-static void
-heap_extract(struct dn_heap *h, void *obj)
+static uint32_t
+q_hash(uintptr_t key, int flags, void *arg)
{
- int child, father, max = h->elements - 1 ;
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_queue *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
- if (max < 0) {
- printf("dummynet: warning, extract from empty heap 0x%p\n", h);
- return ;
- }
- father = 0 ; /* default: move up smallest child */
- if (obj != NULL) { /* extract specific element, index is at offset */
- if (h->offset <= 0)
- panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
- father = *((int *)((char *)obj + h->offset)) ;
- if (father < 0 || father >= h->elements) {
- printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
- father, h->elements);
- panic("dummynet: heap_extract");
+ return flow_id_hash(id);
+}
+
+static int
+q_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *o = (struct dn_queue *)obj;
+ struct ipfw_flow_id *id2;
+
+ if (flags & DNHT_KEY_IS_OBJ) {
+		/* key is an object, use its flow_id */
+ id2 = &((struct dn_queue *)key)->ni.fid;
+ } else {
+ id2 = (struct ipfw_flow_id *)key;
}
- }
- RESET_OFFSET(h, father);
- child = HEAP_LEFT(father) ; /* left child */
- while (child <= max) { /* valid entry */
- if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
- child = child+1 ; /* take right child, otherwise left */
- h->p[father] = h->p[child] ;
- SET_OFFSET(h, father);
- father = child ;
- child = HEAP_LEFT(child) ; /* left child for next loop */
- }
- h->elements-- ;
- if (father != max) {
- /*
- * Fill hole with last entry and bubble up, reusing the insert code
- */
- h->p[father] = h->p[max] ;
- heap_insert(h, father, NULL); /* this one cannot fail */
- }
+ return (0 == flow_id_cmp(&o->ni.fid, id2));
}
-#if 0
/*
- * change object position and update references
- * XXX this one is never used!
+ * create a new queue instance for the given 'key'.
*/
-static void
-heap_move(struct dn_heap *h, dn_key new_key, void *object)
-{
- int temp;
- int i ;
- int max = h->elements-1 ;
- struct dn_heap_entry buf ;
-
- if (h->offset <= 0)
- panic("cannot move items on this heap");
-
- i = *((int *)((char *)object + h->offset));
- if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
- h->p[i].key = new_key ;
- for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
- i = temp ) { /* bubble up */
- HEAP_SWAP(h->p[i], h->p[temp], buf) ;
- SET_OFFSET(h, i);
- }
- } else { /* must move down */
- h->p[i].key = new_key ;
- while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
- if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
- temp++ ; /* select child with min key */
- if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
- HEAP_SWAP(h->p[i], h->p[temp], buf) ;
- SET_OFFSET(h, i);
- } else
- break ;
- i = temp ;
+static void *
+q_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *q, *template = arg;
+ struct dn_fsk *fs = template->fs;
+ int size = sizeof(*q) + fs->sched->fp->q_datalen;
+
+ q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (q == NULL) {
+ D("no memory for new queue");
+ return NULL;
}
- }
- SET_OFFSET(h, i);
+
+ set_oid(&q->ni.oid, DN_QUEUE, size);
+ if (fs->fs.flags & DN_QHT_HASH)
+ q->ni.fid = *(struct ipfw_flow_id *)key;
+ q->fs = fs;
+ q->_si = template->_si;
+ q->_si->q_count++;
+
+ if (fs->sched->fp->new_queue)
+ fs->sched->fp->new_queue(q);
+ dn_cfg.queue_count++;
+ return q;
}
-#endif /* heap_move, unused */
/*
- * heapify() will reorganize data inside an array to maintain the
- * heap property. It is needed when we delete a bunch of entries.
+ * Notify schedulers that a queue is going away.
+ * If (flags & DN_DESTROY), also free the packets.
+ * The version for callbacks is called q_delete_cb().
*/
static void
-heapify(struct dn_heap *h)
+dn_delete_queue(struct dn_queue *q, int flags)
{
- int i ;
+ struct dn_fsk *fs = q->fs;
+
+ // D("fs %p si %p\n", fs, q->_si);
+ /* notify the parent scheduler that the queue is going away */
+ if (fs && fs->sched->fp->free_queue)
+ fs->sched->fp->free_queue(q);
+ q->_si->q_count--;
+ q->_si = NULL;
+ if (flags & DN_DESTROY) {
+ if (q->mq.head)
+ dn_free_pkts(q->mq.head);
+ bzero(q, sizeof(*q)); // safety
+ free(q, M_DUMMYNET);
+ dn_cfg.queue_count--;
+ }
+}
- for (i = 0 ; i < h->elements ; i++ )
- heap_insert(h, i , NULL) ;
+static int
+q_delete_cb(void *q, void *arg)
+{
+ int flags = (int)(uintptr_t)arg;
+ dn_delete_queue(q, flags);
+ return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
}
/*
- * cleanup the heap and free data structure
+ * calls dn_delete_queue/q_delete_cb on all queues,
+ * which notifies the parent scheduler and possibly drains packets.
+ * flags & DN_DESTROY: drains the queues and destroys the qht.
*/
static void
-heap_free(struct dn_heap *h)
+qht_delete(struct dn_fsk *fs, int flags)
{
- if (h->size >0 )
- free(h->p, M_DUMMYNET);
- bzero(h, sizeof(*h) );
+ ND("fs %d start flags %d qht %p",
+ fs->fs.fs_nr, flags, fs->qht);
+ if (!fs->qht)
+ return;
+ if (fs->fs.flags & DN_QHT_HASH) {
+ dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
+ if (flags & DN_DESTROY) {
+ dn_ht_free(fs->qht, 0);
+ fs->qht = NULL;
+ }
+ } else {
+ dn_delete_queue((struct dn_queue *)(fs->qht), flags);
+ if (flags & DN_DESTROY)
+ fs->qht = NULL;
+ }
}
/*
- * --- end of heap management functions ---
+ * Find and possibly create the queue for a MULTIQUEUE scheduler.
+ * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
*/
+struct dn_queue *
+ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si,
+ struct ipfw_flow_id *id)
+{
+ struct dn_queue template;
+
+ template._si = si;
+ template.fs = fs;
+
+ if (fs->fs.flags & DN_QHT_HASH) {
+ struct ipfw_flow_id masked_id;
+ if (fs->qht == NULL) {
+ fs->qht = dn_ht_init(NULL, fs->fs.buckets,
+ offsetof(struct dn_queue, q_next),
+ q_hash, q_match, q_new);
+ if (fs->qht == NULL)
+ return NULL;
+ }
+ masked_id = *id;
+ flow_id_mask(&fs->fsk_mask, &masked_id);
+ return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
+ DNHT_INSERT, &template);
+ } else {
+ if (fs->qht == NULL)
+ fs->qht = q_new(0, 0, &template);
+ return (struct dn_queue *)fs->qht;
+ }
+}
+/*--- end of queue hash table ---*/
-/*
- * Dispose a list of packet. Use an inline functions so if we
- * need to free extra state associated to a packet, this is a
- * central point to do it.
+/*--- support functions for the sch_inst hashtable ----
+ *
+ * These are hashed by flow-id
*/
-
-static __inline void dn_free_pkts(struct mbuf *mnext)
+static uint32_t
+si_hash(uintptr_t key, int flags, void *arg)
{
- struct mbuf *m;
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
- while ((m = mnext) != NULL) {
- mnext = m->m_nextpkt;
- FREE_PKT(m);
- }
+ return flow_id_hash(id);
}
-/*
- * Return the mbuf tag holding the dummynet state. As an optimization
- * this is assumed to be the first tag on the list. If this turns out
- * wrong we'll need to search the list.
- */
-static struct dn_pkt_tag *
-dn_tag_get(struct mbuf *m)
+static int
+si_match(void *obj, uintptr_t key, int flags, void *arg)
{
- struct m_tag *mtag = m_tag_first(m);
- KASSERT(mtag != NULL &&
- mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
- mtag->m_tag_id == PACKET_TAG_DUMMYNET,
- ("packet on dummynet queue w/o dummynet tag!"));
- return (struct dn_pkt_tag *)(mtag+1);
+ struct dn_sch_inst *o = obj;
+ struct ipfw_flow_id *id2;
+
+ id2 = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+ return flow_id_cmp(&o->ni.fid, id2) == 0;
}
/*
- * Scheduler functions:
- *
- * transmit_event() is called when the delay-line needs to enter
- * the scheduler, either because of existing pkts getting ready,
- * or new packets entering the queue. The event handled is the delivery
- * time of the packet.
- *
- * ready_event() does something similar with fixed-rate queues, and the
- * event handled is the finish time of the head pkt.
- *
- * wfq_ready_event() does something similar with WF2Q queues, and the
- * event handled is the start time of the head pkt.
- *
- * In all cases, we make sure that the data structures are consistent
- * before passing pkts out, because this might trigger recursive
- * invocations of the procedures.
+ * create a new instance for the given 'key'.
+ * Allocate memory for the instance, the delay line and the scheduler
+ * private data.
*/
-static void
-transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+static void *
+si_new(uintptr_t key, int flags, void *arg)
{
- struct mbuf *m;
- struct dn_pkt_tag *pkt;
-
- DUMMYNET_LOCK_ASSERT();
-
- while ((m = pipe->head) != NULL) {
- pkt = dn_tag_get(m);
- if (!DN_KEY_LEQ(pkt->output_time, curr_time))
- break;
-
- pipe->head = m->m_nextpkt;
- if (*tail != NULL)
- (*tail)->m_nextpkt = m;
- else
- *head = m;
- *tail = m;
+ struct dn_schk *s = arg;
+ struct dn_sch_inst *si;
+ int l = sizeof(*si) + s->fp->si_datalen;
+
+ si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si == NULL)
+ goto error;
+ /* Set length only for the part passed up to userland. */
+ set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
+ set_oid(&(si->dline.oid), DN_DELAY_LINE,
+ sizeof(struct delay_line));
+ /* mark si and dline as outside the event queue */
+ si->ni.oid.id = si->dline.oid.id = -1;
+
+ si->sched = s;
+ si->dline.si = si;
+
+ if (s->fp->new_sched && s->fp->new_sched(si)) {
+ D("new_sched error");
+ goto error;
}
- if (*tail != NULL)
- (*tail)->m_nextpkt = NULL;
+ if (s->sch.flags & DN_HAVE_MASK)
+ si->ni.fid = *(struct ipfw_flow_id *)key;
- /* If there are leftover packets, put into the heap for next event. */
- if ((m = pipe->head) != NULL) {
- pkt = dn_tag_get(m);
- /*
- * XXX Should check errors on heap_insert, by draining the
- * whole pipe p and hoping in the future we are more successful.
- */
- heap_insert(&extract_heap, pkt->output_time, pipe);
+ dn_cfg.si_count++;
+ return si;
+
+error:
+ if (si) {
+ bzero(si, sizeof(*si)); // safety
+ free(si, M_DUMMYNET);
}
+ return NULL;
}
-#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
/*
- * Compute how many ticks we have to wait before being able to send
- * a packet. This is computed as the "wire time" for the packet
- * (length + extra bits), minus the credit available, scaled to ticks.
- * Check that the result is not be negative (it could be if we have
- * too much leftover credit in q->numbytes).
+ * Callback from siht to delete all scheduler instances. Remove
+ * si and delay line from the system heap, destroy all queues.
+ * We assume that all flowsets have been notified and do not
+ * point to us anymore.
*/
-static inline dn_key
-set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+static int
+si_destroy(void *_si, void *arg)
{
- int64_t ret;
-
- ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz
- - q->numbytes + p->bandwidth - 1 , p->bandwidth);
- if (ret < 0)
- ret = 0;
- return ret;
+ struct dn_sch_inst *si = _si;
+ struct dn_schk *s = si->sched;
+ struct delay_line *dl = &si->dline;
+
+ if (dl->oid.subtype) /* remove delay line from event heap */
+ heap_extract(&dn_cfg.evheap, dl);
+ dn_free_pkts(dl->mq.head); /* drain delay line */
+ if (si->kflags & DN_ACTIVE) /* remove si from event heap */
+ heap_extract(&dn_cfg.evheap, si);
+ if (s->fp->free_sched)
+ s->fp->free_sched(si);
+ bzero(si, sizeof(*si)); /* safety */
+ free(si, M_DUMMYNET);
+ dn_cfg.si_count--;
+ return DNHT_SCAN_DEL;
}
/*
- * Convert the additional MAC overheads/delays into an equivalent
- * number of bits for the given data rate. The samples are in milliseconds
- * so we need to divide by 1000.
+ * Find the scheduler instance for this packet. If we need to apply
+ * a mask, do so on a local copy of the flow_id to preserve the original.
+ * Assume siht is always initialized if we have a mask.
*/
-static dn_key
-compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+struct dn_sch_inst *
+ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
{
- int index;
- dn_key extra_bits;
- if (!p->samples || p->samples_no == 0)
- return 0;
- index = random() % p->samples_no;
- extra_bits = div64((dn_key)p->samples[index] * p->bandwidth, 1000);
- if (index >= p->loss_level) {
- struct dn_pkt_tag *dt = dn_tag_get(pkt);
- if (dt)
- dt->dn_dir = DIR_DROP;
+ if (s->sch.flags & DN_HAVE_MASK) {
+ struct ipfw_flow_id id_t = *id;
+ flow_id_mask(&s->sch.sched_mask, &id_t);
+ return dn_ht_find(s->siht, (uintptr_t)&id_t,
+ DNHT_INSERT, s);
}
- return extra_bits;
+ if (!s->siht)
+ s->siht = si_new(0, 0, s);
+ return (struct dn_sch_inst *)s->siht;
}
-static void
-free_pipe(struct dn_pipe *p)
+/* callback to flush credit for the scheduler instance */
+static int
+si_reset_credit(void *_si, void *arg)
{
- if (p->samples)
- free(p->samples, M_DUMMYNET);
- free(p, M_DUMMYNET);
+ struct dn_sch_inst *si = _si;
+ struct dn_link *p = &si->sched->link;
+
+ si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0);
+ return 0;
}
-/*
- * extract pkt from queue, compute output time (could be now)
- * and put into delay line (p_queue)
- */
static void
-move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
- int len)
+schk_reset_credit(struct dn_schk *s)
{
- struct dn_pkt_tag *dt = dn_tag_get(pkt);
-
- q->head = pkt->m_nextpkt ;
- q->len-- ;
- q->len_bytes -= len ;
-
- dt->output_time = curr_time + p->delay ;
-
- if (p->head == NULL)
- p->head = pkt;
- else
- p->tail->m_nextpkt = pkt;
- p->tail = pkt;
- p->tail->m_nextpkt = NULL;
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, si_reset_credit, NULL);
+ else if (s->siht)
+ si_reset_credit(s->siht, NULL);
}
+/*---- end of sch_inst hashtable ---------------------*/
-/*
- * ready_event() is invoked every time the queue must enter the
- * scheduler, either because the first packet arrives, or because
- * a previously scheduled event fired.
- * On invokation, drain as many pkts as possible (could be 0) and then
- * if there are leftover packets reinsert the pkt in the scheduler.
+/*-------------------------------------------------------
+ * flowset hash (fshash) support. Entries are hashed by fs_nr.
+ * New allocations are put in the unlinked list (dn_cfg.fsu), from
+ * which they are removed once they are attached to a specific scheduler.
*/
-static void
-ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+static uint32_t
+fsk_hash(uintptr_t key, int flags, void *arg)
{
- struct mbuf *pkt;
- struct dn_pipe *p = q->fs->pipe;
- int p_was_empty;
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
- DUMMYNET_LOCK_ASSERT();
-
- if (p == NULL) {
- printf("dummynet: ready_event- pipe is gone\n");
- return;
- }
- p_was_empty = (p->head == NULL);
+ return ( (i>>8)^(i>>4)^i );
+}
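+/*
+ * Worked example: fs_nr = 0x123 hashes to 0x1 ^ 0x12 ^ 0x123 = 0x130,
+ * which dn_ht then presumably reduces to a bucket index modulo the
+ * bucket count.
+ */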
- /*
- * Schedule fixed-rate queues linked to this pipe:
- * account for the bw accumulated since last scheduling, then
- * drain as many pkts as allowed by q->numbytes and move to
- * the delay line (in p) computing output time.
- * bandwidth==0 (no limit) means we can drain the whole queue,
- * setting len_scaled = 0 does the job.
- */
- q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
- while ((pkt = q->head) != NULL) {
- int len = pkt->m_pkthdr.len;
- dn_key len_scaled = p->bandwidth ? len*8*hz
- + q->extra_bits*hz
- : 0;
-
- if (DN_KEY_GT(len_scaled, q->numbytes))
- break;
- q->numbytes -= len_scaled;
- move_pkt(pkt, q, p, len);
- if (q->head)
- q->extra_bits = compute_extra_bits(q->head, p);
- }
- /*
- * If we have more packets queued, schedule next ready event
- * (can only occur when bandwidth != 0, otherwise we would have
- * flushed the whole queue in the previous loop).
- * To this purpose we record the current time and compute how many
- * ticks to go for the finish time of the packet.
- */
- if ((pkt = q->head) != NULL) { /* this implies bandwidth != 0 */
- dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+static int
+fsk_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs = obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
- q->sched_time = curr_time;
- heap_insert(&ready_heap, curr_time + t, (void *)q);
- /*
- * XXX Should check errors on heap_insert, and drain the whole
- * queue on error hoping next time we are luckier.
- */
- } else /* RED needs to know when the queue becomes empty. */
- q->idle_time = curr_time;
+ return (fs->fs.fs_nr == i);
+}
- /*
- * If the delay line was empty call transmit_event() now.
- * Otherwise, the scheduler will take care of it.
- */
- if (p_was_empty)
- transmit_event(p, head, tail);
+static void *
+fsk_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs;
+
+ fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (fs) {
+ set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
+ dn_cfg.fsk_count++;
+ fs->drain_bucket = 0;
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+ }
+ return fs;
}
/*
- * Called when we can transmit packets on WF2Q queues. Take pkts out of
- * the queues at their start time, and enqueue into the delay line.
- * Packets are drained until p->numbytes < 0. As long as
- * len_scaled >= p->numbytes, the packet goes into the delay line
- * with a deadline p->delay. For the last packet, if p->numbytes < 0,
- * there is an additional delay.
+ * detach flowset from its current scheduler. Flags as follows:
+ * DN_DETACH removes from the fsk_list
+ * DN_DESTROY deletes individual queues
+ * DN_DELETE_FS destroys the flowset (otherwise it goes in the unlinked list).
*/
static void
-ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+fsk_detach(struct dn_fsk *fs, int flags)
{
- int p_was_empty = (p->head == NULL);
- struct dn_heap *sch = &(p->scheduler_heap);
- struct dn_heap *neh = &(p->not_eligible_heap);
- int64_t p_numbytes = p->numbytes;
-
- /*
- * p->numbytes is only 32bits in FBSD7, but we might need 64 bits.
- * Use a local variable for the computations, and write back the
- * results when done, saturating if needed.
- * The local variable has no impact on performance and helps
- * reducing diffs between the various branches.
- */
-
- DUMMYNET_LOCK_ASSERT();
-
- if (p->if_name[0] == 0) /* tx clock is simulated */
- p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
- else { /*
- * tx clock is for real,
- * the ifq must be empty or this is a NOP.
- */
- if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
- return;
- else {
- DPRINTF(("dummynet: pipe %d ready from %s --\n",
- p->pipe_nr, p->if_name));
- }
- }
-
- /*
- * While we have backlogged traffic AND credit, we need to do
- * something on the queue.
- */
- while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
- if (sch->elements > 0) {
- /* Have some eligible pkts to send out. */
- struct dn_flow_queue *q = sch->p[0].object;
- struct mbuf *pkt = q->head;
- struct dn_flow_set *fs = q->fs;
- uint64_t len = pkt->m_pkthdr.len;
- int len_scaled = p->bandwidth ? len * 8 * hz : 0;
-
- heap_extract(sch, NULL); /* Remove queue from heap. */
- p_numbytes -= len_scaled;
- move_pkt(pkt, q, p, len);
-
- p->V += div64((len << MY_M), p->sum); /* Update V. */
- q->S = q->F; /* Update start time. */
- if (q->len == 0) {
- /* Flow not backlogged any more. */
- fs->backlogged--;
- heap_insert(&(p->idle_heap), q->F, q);
- } else {
- /* Still backlogged. */
-
- /*
- * Update F and position in backlogged queue,
- * then put flow in not_eligible_heap
- * (we will fix this later).
- */
- len = (q->head)->m_pkthdr.len;
- q->F += div64((len << MY_M), fs->weight);
- if (DN_KEY_LEQ(q->S, p->V))
- heap_insert(neh, q->S, q);
- else
- heap_insert(sch, q->F, q);
- }
- }
- /*
- * Now compute V = max(V, min(S_i)). Remember that all elements
- * in sch have by definition S_i <= V so if sch is not empty,
- * V is surely the max and we must not update it. Conversely,
- * if sch is empty we only need to look at neh.
- */
- if (sch->elements == 0 && neh->elements > 0)
- p->V = MAX64(p->V, neh->p[0].key);
- /* Move from neh to sch any packets that have become eligible */
- while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
- struct dn_flow_queue *q = neh->p[0].object;
- heap_extract(neh, NULL);
- heap_insert(sch, q->F, q);
- }
-
- if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
- p_numbytes = -1; /* Mark not ready for I/O. */
- break;
- }
+ if (flags & DN_DELETE_FS)
+ flags |= DN_DESTROY;
+ ND("fs %d from sched %d flags %s %s %s",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ (flags & DN_DELETE_FS) ? "DEL_FS":"",
+ (flags & DN_DESTROY) ? "DEL":"",
+ (flags & DN_DETACH) ? "DET":"");
+ if (flags & DN_DETACH) { /* detach from the list */
+ struct dn_fsk_head *h;
+ h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
+ SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
}
- if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0) {
- p->idle_time = curr_time;
- /*
- * No traffic and no events scheduled.
- * We can get rid of idle-heap.
- */
- if (p->idle_heap.elements > 0) {
- int i;
-
- for (i = 0; i < p->idle_heap.elements; i++) {
- struct dn_flow_queue *q;
-
- q = p->idle_heap.p[i].object;
- q->F = 0;
- q->S = q->F + 1;
- }
- p->sum = 0;
- p->V = 0;
- p->idle_heap.elements = 0;
- }
- }
- /*
- * If we are getting clocks from dummynet (not a real interface) and
- * If we are under credit, schedule the next ready event.
- * Also fix the delivery time of the last packet.
+	/* Free the RED parameters; they will be recomputed on a
+	 * subsequent attach if needed.
*/
- if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
- dn_key t = 0; /* Number of ticks i have to wait. */
-
- if (p->bandwidth > 0)
- t = div64(p->bandwidth - 1 - p_numbytes, p->bandwidth);
- dn_tag_get(p->tail)->output_time += t;
- p->sched_time = curr_time;
- heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
- /*
- * XXX Should check errors on heap_insert, and drain the whole
- * queue on error hoping next time we are luckier.
- */
+ if (fs->w_q_lookup)
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
+ qht_delete(fs, flags);
+ if (fs->sched && fs->sched->fp->free_fsk)
+ fs->sched->fp->free_fsk(fs);
+ fs->sched = NULL;
+ if (flags & DN_DELETE_FS) {
+		bzero(fs, sizeof(*fs));	/* safety */
+ free(fs, M_DUMMYNET);
+ dn_cfg.fsk_count--;
+ } else {
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
}
-
- /* Write back p_numbytes (adjust 64->32bit if necessary). */
- p->numbytes = p_numbytes;
-
- /*
- * If the delay line was empty call transmit_event() now.
- * Otherwise, the scheduler will take care of it.
- */
- if (p_was_empty)
- transmit_event(p, head, tail);
}
/*
- * This is called one tick, after previous run. It is used to
- * schedule next run.
+ * Detach or destroy all flowsets in a list.
+ * flags specifies what to do:
+ * DN_DESTROY: flush all queues
+ * DN_DELETE_FS:	DN_DESTROY + destroy the flowset
+ *	(so DN_DELETE_FS implies DN_DESTROY)
*/
static void
-dummynet(void * __unused unused)
+fsk_detach_list(struct dn_fsk_head *h, int flags)
{
-
- taskqueue_enqueue(dn_tq, &dn_task);
+ struct dn_fsk *fs;
+ int n = 0; /* only for stats */
+
+ ND("head %p flags %x", h, flags);
+ while ((fs = SLIST_FIRST(h))) {
+ SLIST_REMOVE_HEAD(h, sch_chain);
+ n++;
+ fsk_detach(fs, flags);
+ }
+ ND("done %d flowsets", n);
}
/*
- * The timer handler for dummynet. Time is computed in ticks, but
- * but the code is tolerant to the actual rate at which this is called.
- * Once complete, the function reschedules itself for the next tick.
+ * called on 'queue X delete' -- removes the flowset from fshash,
+ * deletes all queues for the flowset, and frees the flowset itself.
*/
-static void
-dummynet_task(void *context, int pending)
+static int
+delete_fs(int i, int locked)
{
- struct mbuf *head = NULL, *tail = NULL;
- struct dn_pipe *pipe;
- struct dn_heap *heaps[3];
- struct dn_heap *h;
- void *p; /* generic parameter to handler */
- int i;
- struct timeval t;
-
- DUMMYNET_LOCK();
-
- heaps[0] = &ready_heap; /* fixed-rate queues */
- heaps[1] = &wfq_ready_heap; /* wfq queues */
- heaps[2] = &extract_heap; /* delay line */
-
- /* Update number of lost(coalesced) ticks. */
- tick_lost += pending - 1;
-
- getmicrouptime(&t);
- /* Last tick duration (usec). */
- tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
- (t.tv_usec - prev_t.tv_usec);
- /* Last tick vs standard tick difference (usec). */
- tick_delta = (tick_last * hz - 1000000) / hz;
- /* Accumulated tick difference (usec). */
- tick_delta_sum += tick_delta;
-
- prev_t = t;
-
- /*
- * Adjust curr_time if accumulated tick difference greater than
- * 'standard' tick. Since curr_time should be monotonically increasing,
- * we do positive adjustment as required and throttle curr_time in
- * case of negative adjustment.
- */
- curr_time++;
- if (tick_delta_sum - tick >= 0) {
- int diff = tick_delta_sum / tick;
-
- curr_time += diff;
- tick_diff += diff;
- tick_delta_sum %= tick;
- tick_adjustment++;
- } else if (tick_delta_sum + tick <= 0) {
- curr_time--;
- tick_diff--;
- tick_delta_sum += tick;
- tick_adjustment++;
- }
-
- for (i = 0; i < 3; i++) {
- h = heaps[i];
- while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
- if (h->p[0].key > curr_time)
- printf("dummynet: warning, "
- "heap %d is %d ticks late\n",
- i, (int)(curr_time - h->p[0].key));
- /* store a copy before heap_extract */
- p = h->p[0].object;
- /* need to extract before processing */
- heap_extract(h, NULL);
- if (i == 0)
- ready_event(p, &head, &tail);
- else if (i == 1) {
- struct dn_pipe *pipe = p;
- if (pipe->if_name[0] != '\0')
- printf("dummynet: bad ready_event_wfq "
- "for pipe %s\n", pipe->if_name);
- else
- ready_event_wfq(p, &head, &tail);
- } else
- transmit_event(p, &head, &tail);
- }
- }
-
- /* Sweep pipes trying to expire idle flow_queues. */
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_FOREACH(pipe, &pipehash[i], next) {
- if (pipe->idle_heap.elements > 0 &&
- DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
- struct dn_flow_queue *q =
- pipe->idle_heap.p[0].object;
-
- heap_extract(&(pipe->idle_heap), NULL);
- /* Mark timestamp as invalid. */
- q->S = q->F + 1;
- pipe->sum -= q->fs->weight;
- }
- }
- }
-
- DUMMYNET_UNLOCK();
+ struct dn_fsk *fs;
+ int err = 0;
+
+ if (!locked)
+ DN_BH_WLOCK();
+ fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
+ ND("fs %d found %p", i, fs);
+ if (fs) {
+ fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
+ err = 0;
+ } else
+ err = EINVAL;
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return err;
+}
- if (head != NULL)
- dummynet_send(head);
+/*----- end of flowset hashtable support -------------*/
- callout_reset(&dn_timeout, 1, dummynet, NULL);
+/*------------------------------------------------------------
+ * Scheduler hash. When searching by index we pass sched_nr,
+ * otherwise we pass struct dn_sch * which is the first field in
+ * struct dn_schk so we can cast between the two. We use this trick
+ * because in the create phase only a struct dn_sch is available
+ * (but this should be fixed).
+ */
+static uint32_t
+schk_hash(uintptr_t key, int flags, void *_arg)
+{
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_schk *)key)->sch.sched_nr;
+ return ( (i>>8)^(i>>4)^i );
}
-static void
-dummynet_send(struct mbuf *m)
+static int
+schk_match(void *obj, uintptr_t key, int flags, void *_arg)
{
- struct mbuf *n;
-
- for (; m != NULL; m = n) {
- struct ifnet *ifp;
- int dst;
- struct m_tag *tag;
-
- n = m->m_nextpkt;
- m->m_nextpkt = NULL;
- tag = m_tag_first(m);
- if (tag == NULL) {
- dst = DIR_DROP;
- } else {
- struct dn_pkt_tag *pkt = dn_tag_get(m);
- /* extract the dummynet info, rename the tag */
- dst = pkt->dn_dir;
- ifp = pkt->ifp;
- /* rename the tag so it carries reinject info */
- tag->m_tag_cookie = MTAG_IPFW_RULE;
- tag->m_tag_id = 0;
- }
-
- switch (dst) {
- case DIR_OUT:
- SET_HOST_IPLEN(mtod(m, struct ip *));
- ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
- break ;
- case DIR_IN :
- /* put header in network format for ip_input() */
- //SET_NET_IPLEN(mtod(m, struct ip *));
- netisr_dispatch(NETISR_IP, m);
- break;
-#ifdef INET6
- case DIR_IN | PROTO_IPV6:
- netisr_dispatch(NETISR_IPV6, m);
- break;
-
- case DIR_OUT | PROTO_IPV6:
- SET_HOST_IPLEN(mtod(m, struct ip *));
- ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
- break;
-#endif
- case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
- if (bridge_dn_p != NULL)
- ((*bridge_dn_p)(m, ifp));
- else
- printf("dummynet: if_bridge not loaded\n");
-
- break;
- case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
- /*
- * The Ethernet code assumes the Ethernet header is
- * contiguous in the first mbuf header.
- * Insure this is true.
- */
- if (m->m_len < ETHER_HDR_LEN &&
- (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
- printf("dummynet/ether: pullup failed, "
- "dropping packet\n");
- break;
- }
- ether_demux(m->m_pkthdr.rcvif, m);
- break;
- case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
- ether_output_frame(ifp, m);
- break;
-
- case DIR_DROP:
- /* drop the packet after some time */
- FREE_PKT(m);
- break;
-
- default:
- printf("dummynet: bad switch %d!\n", dst);
- FREE_PKT(m);
- break;
- }
- }
+ struct dn_schk *s = (struct dn_schk *)obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_schk *)key)->sch.sched_nr;
+ return (s->sch.sched_nr == i);
}
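
For illustration, a minimal sketch of the two lookup styles this supports,
using the dn_ht_find() calls that appear elsewhere in this patch (locking
omitted; the index 10 is arbitrary):

	/* lookup by number: the key is the scheduler index */
	struct dn_schk *s = dn_ht_find(dn_cfg.schedhash, 10, 0, NULL);

	/* lookup by object: the key is the object itself, flagged as
	 * such; this works because 'sch' is the first field of
	 * struct dn_schk, so the pointer can be cast either way */
	if (s != NULL)
		s = dn_ht_find(dn_cfg.schedhash, (uintptr_t)s,
		    DNHT_KEY_IS_OBJ, NULL);
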
/*
- * Unconditionally expire empty queues in case of shortage.
- * Returns the number of queues freed.
+ * Create the entry and initialize it (and its instance hash if needed).
+ * Leave s->fp unset so we can tell whether a dn_ht_find() returns
+ * a new object or a previously existing one.
*/
-static int
-expire_queues(struct dn_flow_set *fs)
-{
- struct dn_flow_queue *q, *prev ;
- int i, initial_elements = fs->rq_elements ;
-
- if (fs->last_expired == time_uptime)
- return 0 ;
- fs->last_expired = time_uptime ;
- for (i = 0 ; i <= fs->rq_size ; i++) { /* last one is overflow */
- for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) {
- if (!QUEUE_IS_IDLE(q)) {
- prev = q ;
- q = q->next ;
- } else { /* entry is idle, expire it */
- struct dn_flow_queue *old_q = q ;
-
- if (prev != NULL)
- prev->next = q = q->next ;
- else
- fs->rq[i] = q = q->next ;
- fs->rq_elements-- ;
- free(old_q, M_DUMMYNET);
- }
+static void *
+schk_new(uintptr_t key, int flags, void *arg)
+{
+ struct schk_new_arg *a = arg;
+ struct dn_schk *s;
+	int l = sizeof(*s) + a->fp->schk_datalen;
+
+ s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s == NULL)
+ return NULL;
+ set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
+ s->sch = *a->sch; // copy initial values
+ s->link.link_nr = s->sch.sched_nr;
+ SLIST_INIT(&s->fsk_list);
+ /* initialize the hash table or create the single instance */
+ s->fp = a->fp; /* si_new needs this */
+ s->drain_bucket = 0;
+ if (s->sch.flags & DN_HAVE_MASK) {
+ s->siht = dn_ht_init(NULL, s->sch.buckets,
+ offsetof(struct dn_sch_inst, si_next),
+ si_hash, si_match, si_new);
+ if (s->siht == NULL) {
+ free(s, M_DUMMYNET);
+ return NULL;
+ }
}
- }
- return initial_elements - fs->rq_elements ;
+ s->fp = NULL; /* mark as a new scheduler */
+ dn_cfg.schk_count++;
+ return s;
}
/*
- * If room, create a new queue and put at head of slot i;
- * otherwise, create or use the default queue.
+ * Callback for sched delete. Notifies all attached flowsets to
+ * detach from the scheduler, destroys the internal flowset and
+ * all instances, and finally frees the scheduler itself.
+ * arg is 0 (only detach flowsets and destroy instances)
+ * DN_DESTROY (detach & delete queues, delete schk)
+ * or DN_DELETE_FS (delete queues and flowsets, delete schk)
*/
-static struct dn_flow_queue *
-create_queue(struct dn_flow_set *fs, int i)
+static int
+schk_delete_cb(void *obj, void *arg)
{
- struct dn_flow_queue *q;
-
- if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
- expire_queues(fs) == 0) {
- /* No way to get room, use or create overflow queue. */
- i = fs->rq_size;
- if (fs->rq[i] != NULL)
- return fs->rq[i];
- }
- q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (q == NULL) {
- printf("dummynet: sorry, cannot allocate queue for new flow\n");
- return (NULL);
+ struct dn_schk *s = obj;
+#if 0
+ int a = (int)arg;
+ ND("sched %d arg %s%s",
+ s->sch.sched_nr,
+ a&DN_DESTROY ? "DEL ":"",
+ a&DN_DELETE_FS ? "DEL_FS":"");
+#endif
+ fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
+ /* no more flowset pointing to us now */
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, si_destroy, NULL);
+ else if (s->siht)
+ si_destroy(s->siht, NULL);
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
}
- q->fs = fs;
- q->hash_slot = i;
- q->next = fs->rq[i];
- q->S = q->F + 1; /* hack - mark timestamp as invalid. */
- q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0);
- fs->rq[i] = q;
- fs->rq_elements++;
- return (q);
+ s->siht = NULL;
+ if (s->fp->destroy)
+ s->fp->destroy(s);
+ bzero(s, sizeof(*s)); // safety
+ free(obj, M_DUMMYNET);
+ dn_cfg.schk_count--;
+ return DNHT_SCAN_DEL;
}
/*
- * Given a flow_set and a pkt in last_pkt, find a matching queue
- * after appropriate masking. The queue is moved to front
- * so that further searches take less time.
+ * called on a 'sched X delete' command. Deletes a single scheduler.
+ * This is done by removing from the schedhash, unlinking all
+ * flowsets and deleting their traffic.
*/
-static struct dn_flow_queue *
-find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
-{
- int i = 0 ; /* we need i and q for new allocations */
- struct dn_flow_queue *q, *prev;
- int is_v6 = IS_IP6_FLOW_ID(id);
-
- if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
- q = fs->rq[0] ;
- else {
- /* first, do the masking, then hash */
- id->dst_port &= fs->flow_mask.dst_port ;
- id->src_port &= fs->flow_mask.src_port ;
- id->proto &= fs->flow_mask.proto ;
- id->flags = 0 ; /* we don't care about this one */
- if (is_v6) {
- APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
- APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
- id->flow_id6 &= fs->flow_mask.flow_id6;
-
- i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
-
- ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
-
- ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
-
- ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^
-
- (id->dst_port << 1) ^ (id->src_port) ^
- (id->proto ) ^
- (id->flow_id6);
- } else {
- id->dst_ip &= fs->flow_mask.dst_ip ;
- id->src_ip &= fs->flow_mask.src_ip ;
-
- i = ( (id->dst_ip) & 0xffff ) ^
- ( (id->dst_ip >> 15) & 0xffff ) ^
- ( (id->src_ip << 1) & 0xffff ) ^
- ( (id->src_ip >> 16 ) & 0xffff ) ^
- (id->dst_port << 1) ^ (id->src_port) ^
- (id->proto );
- }
- i = i % fs->rq_size ;
- /* finally, scan the current list for a match */
- searches++ ;
- for (prev=NULL, q = fs->rq[i] ; q ; ) {
- search_steps++;
- if (is_v6 &&
- IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&
- IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&
- id->dst_port == q->id.dst_port &&
- id->src_port == q->id.src_port &&
- id->proto == q->id.proto &&
- id->flags == q->id.flags &&
- id->flow_id6 == q->id.flow_id6)
- break ; /* found */
-
- if (!is_v6 && id->dst_ip == q->id.dst_ip &&
- id->src_ip == q->id.src_ip &&
- id->dst_port == q->id.dst_port &&
- id->src_port == q->id.src_port &&
- id->proto == q->id.proto &&
- id->flags == q->id.flags)
- break ; /* found */
-
- /* No match. Check if we can expire the entry */
- if (pipe_expire && QUEUE_IS_IDLE(q)) {
- /* entry is idle and not in any heap, expire it */
- struct dn_flow_queue *old_q = q ;
-
- if (prev != NULL)
- prev->next = q = q->next ;
- else
- fs->rq[i] = q = q->next ;
- fs->rq_elements-- ;
- free(old_q, M_DUMMYNET);
- continue ;
- }
- prev = q ;
- q = q->next ;
- }
- if (q && prev != NULL) { /* found and not in front */
- prev->next = q->next ;
- q->next = fs->rq[i] ;
- fs->rq[i] = q ;
- }
- }
- if (q == NULL) { /* no match, need to allocate a new entry */
- q = create_queue(fs, i);
- if (q != NULL)
- q->id = *id ;
- }
- return q ;
+static int
+delete_schk(int i)
+{
+ struct dn_schk *s;
+
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ ND("%d %p", i, s);
+ if (!s)
+ return EINVAL;
+ delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */
+ /* then detach flowsets, delete traffic */
+ schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
+ return 0;
}
+/*--- end of schk hashtable support ---*/
static int
-red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+copy_obj(char **start, char *end, void *_o, const char *msg, int i)
{
- /*
- * RED algorithm
- *
- * RED calculates the average queue size (avg) using a low-pass filter
- * with an exponential weighted (w_q) moving average:
- * avg <- (1-w_q) * avg + w_q * q_size
- * where q_size is the queue length (measured in bytes or * packets).
- *
- * If q_size == 0, we compute the idle time for the link, and set
- * avg = (1 - w_q)^(idle/s)
- * where s is the time needed for transmitting a medium-sized packet.
- *
- * Now, if avg < min_th the packet is enqueued.
- * If avg > max_th the packet is dropped. Otherwise, the packet is
- * dropped with probability P function of avg.
- */
-
- int64_t p_b = 0;
+ struct dn_id *o = _o;
+ int have = end - *start;
- /* Queue in bytes or packets? */
- u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
- q->len_bytes : q->len;
-
- DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
-
- /* Average queue size estimation. */
- if (q_size != 0) {
- /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
- int diff = SCALE(q_size) - q->avg;
- int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
-
- q->avg += (int)v;
- } else {
- /*
- * Queue is empty, find for how long the queue has been
- * empty and use a lookup table for computing
- * (1 - * w_q)^(idle_time/s) where s is the time to send a
- * (small) packet.
- * XXX check wraps...
- */
- if (q->avg) {
- u_int t = div64(curr_time - q->idle_time,
- fs->lookup_step);
-
- q->avg = (t < fs->lookup_depth) ?
- SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
- }
- }
- DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
-
- /* Should i drop? */
- if (q->avg < fs->min_th) {
- q->count = -1;
- return (0); /* accept packet */
- }
- if (q->avg >= fs->max_th) { /* average queue >= max threshold */
- if (fs->flags_fs & DN_IS_GENTLE_RED) {
- /*
- * According to Gentle-RED, if avg is greater than
- * max_th the packet is dropped with a probability
- * p_b = c_3 * avg - c_4
- * where c_3 = (1 - max_p) / max_th
- * c_4 = 1 - 2 * max_p
- */
- p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
- fs->c_4;
- } else {
- q->count = -1;
- DPRINTF(("dummynet: - drop"));
- return (1);
- }
- } else if (q->avg > fs->min_th) {
- /*
- * We compute p_b using the linear dropping function
- * p_b = c_1 * avg - c_2
- * where c_1 = max_p / (max_th - min_th)
- * c_2 = max_p * min_th / (max_th - min_th)
- */
- p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+ if (have < o->len || o->len == 0 || o->type == 0) {
+ D("(WARN) type %d %s %d have %d need %d",
+ o->type, msg, i, have, o->len);
+ return 1;
}
-
- if (fs->flags_fs & DN_QSIZE_IS_BYTES)
- p_b = div64(p_b * len, fs->max_pkt_size);
- if (++q->count == 0)
- q->random = random() & 0xffff;
- else {
- /*
- * q->count counts packets arrived since last drop, so a greater
- * value of q->count means a greater packet drop probability.
- */
- if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
- q->count = 0;
- DPRINTF(("dummynet: - red drop"));
- /* After a drop we calculate a new random value. */
- q->random = random() & 0xffff;
- return (1); /* drop */
- }
+ ND("type %d %s %d len %d", o->type, msg, i, o->len);
+ bcopy(_o, *start, o->len);
+ if (o->type == DN_LINK) {
+ /* Adjust burst parameter for link */
+ struct dn_link *l = (struct dn_link *)*start;
+ l->burst = div64(l->burst, 8 * hz);
+ } else if (o->type == DN_SCH) {
+ /* Set id->id to the number of instances */
+ struct dn_schk *s = _o;
+ struct dn_id *id = (struct dn_id *)(*start);
+ id->id = (s->sch.flags & DN_HAVE_MASK) ?
+ dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
}
- /* End of RED algorithm. */
-
- return (0); /* accept */
+ *start += o->len;
+ return 0;
}
-static __inline struct dn_flow_set *
-locate_flowset(int fs_nr)
+/* Specific function to copy a queue.
+ * Copies only the user-visible part of a queue (which is in
+ * a struct dn_flow), and sets len accordingly.
+ */
+static int
+copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
{
- struct dn_flow_set *fs;
-
- SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
- if (fs->fs_nr == fs_nr)
- return (fs);
-
- return (NULL);
+ struct dn_id *o = _o;
+ int have = end - *start;
+ int len = sizeof(struct dn_flow); /* see above comment */
+
+ if (have < len || o->len == 0 || o->type != DN_QUEUE) {
+ D("ERROR type %d %s %d have %d need %d",
+ o->type, msg, i, have, len);
+ return 1;
+ }
+ ND("type %d %s %d len %d", o->type, msg, i, len);
+ bcopy(_o, *start, len);
+ ((struct dn_id*)(*start))->len = len;
+ *start += len;
+ return 0;
}
-static __inline struct dn_pipe *
-locate_pipe(int pipe_nr)
+static int
+copy_q_cb(void *obj, void *arg)
{
- struct dn_pipe *pipe;
-
- SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
- if (pipe->pipe_nr == pipe_nr)
- return (pipe);
-
- return (NULL);
+ struct dn_queue *q = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start);
+ if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
+ ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
+ return 0;
}
-/*
- * dummynet hook for packets. Below 'pipe' is a pipe or a queue
- * depending on whether WF2Q or fixed bw is used.
- *
- * pipe_nr pipe or queue the packet is destined for.
- * dir where shall we send the packet after dummynet.
- * m the mbuf with the packet
- * ifp the 'ifp' parameter from the caller.
- * NULL in ip_input, destination interface in ip_output,
- * rule matching rule, in case of multiple passes
- */
static int
-dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
-{
- struct mbuf *m = *m0, *head = NULL, *tail = NULL;
- struct dn_pkt_tag *pkt;
- struct m_tag *mtag;
- struct dn_flow_set *fs = NULL;
- struct dn_pipe *pipe;
- uint64_t len = m->m_pkthdr.len;
- struct dn_flow_queue *q = NULL;
- int is_pipe = fwa->rule.info & IPFW_IS_PIPE;
-
- KASSERT(m->m_nextpkt == NULL,
- ("dummynet_io: mbuf queue passed to dummynet"));
-
- DUMMYNET_LOCK();
- io_pkt++;
- /*
- * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
- */
- if (is_pipe) {
- pipe = locate_pipe(fwa->rule.info & IPFW_INFO_MASK);
- if (pipe != NULL)
- fs = &(pipe->fs);
- } else
- fs = locate_flowset(fwa->rule.info & IPFW_INFO_MASK);
-
- if (fs == NULL)
- goto dropit; /* This queue/pipe does not exist! */
- pipe = fs->pipe;
- if (pipe == NULL) { /* Must be a queue, try find a matching pipe. */
- pipe = locate_pipe(fs->parent_nr);
- if (pipe != NULL)
- fs->pipe = pipe;
- else {
- printf("dummynet: no pipe %d for queue %d, drop pkt\n",
- fs->parent_nr, fs->fs_nr);
- goto dropit;
- }
- }
- q = find_queue(fs, &(fwa->f_id));
- if (q == NULL)
- goto dropit; /* Cannot allocate queue. */
-
- /* Update statistics, then check reasons to drop pkt. */
- q->tot_bytes += len;
- q->tot_pkts++;
- if (fs->plr && random() < fs->plr)
- goto dropit; /* Random pkt drop. */
- if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
- if (q->len_bytes > fs->qsize)
- goto dropit; /* Queue size overflow. */
- } else {
- if (q->len >= fs->qsize)
- goto dropit; /* Queue count overflow. */
- }
- if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
- goto dropit;
-
- /* XXX expensive to zero, see if we can remove it. */
- mtag = m_tag_get(PACKET_TAG_DUMMYNET,
- sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
- if (mtag == NULL)
- goto dropit; /* Cannot allocate packet header. */
- m_tag_prepend(m, mtag); /* Attach to mbuf chain. */
-
- pkt = (struct dn_pkt_tag *)(mtag + 1);
- /*
- * Ok, i can handle the pkt now...
- * Build and enqueue packet + parameters.
- */
- pkt->rule = fwa->rule;
- pkt->rule.info &= IPFW_ONEPASS; /* only keep this info */
- pkt->dn_dir = dir;
- pkt->ifp = fwa->oif;
-
- if (q->head == NULL)
- q->head = m;
+copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+ if (!fs->qht)
+ return 0;
+ if (fs->fs.flags & DN_QHT_HASH)
+ dn_ht_scan(fs->qht, copy_q_cb, a);
else
- q->tail->m_nextpkt = m;
- q->tail = m;
- q->len++;
- q->len_bytes += len;
-
- if (q->head != m) /* Flow was not idle, we are done. */
- goto done;
-
- if (is_pipe) { /* Fixed rate queues. */
- if (q->idle_time < curr_time) {
- /* Calculate available burst size. */
- q->numbytes +=
- (curr_time - q->idle_time - 1) * pipe->bandwidth;
- if (q->numbytes > pipe->burst)
- q->numbytes = pipe->burst;
- if (io_fast)
- q->numbytes += pipe->bandwidth;
- }
- } else { /* WF2Q. */
- if (pipe->idle_time < curr_time &&
- pipe->scheduler_heap.elements == 0 &&
- pipe->not_eligible_heap.elements == 0) {
- /* Calculate available burst size. */
- pipe->numbytes +=
- (curr_time - pipe->idle_time - 1) * pipe->bandwidth;
- if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst)
- pipe->numbytes = pipe->burst;
- if (io_fast)
- pipe->numbytes += pipe->bandwidth;
- }
- pipe->idle_time = curr_time;
- }
- /* Necessary for both: fixed rate & WF2Q queues. */
- q->idle_time = curr_time;
-
- /*
- * If we reach this point the flow was previously idle, so we need
- * to schedule it. This involves different actions for fixed-rate or
- * WF2Q queues.
- */
- if (is_pipe) {
- /* Fixed-rate queue: just insert into the ready_heap. */
- dn_key t = 0;
-
- if (pipe->bandwidth) {
- q->extra_bits = compute_extra_bits(m, pipe);
- t = set_ticks(m, q, pipe);
- }
- q->sched_time = curr_time;
- if (t == 0) /* Must process it now. */
- ready_event(q, &head, &tail);
- else
- heap_insert(&ready_heap, curr_time + t , q);
- } else {
- /*
- * WF2Q. First, compute start time S: if the flow was
- * idle (S = F + 1) set S to the virtual time V for the
- * controlling pipe, and update the sum of weights for the pipe;
- * otherwise, remove flow from idle_heap and set S to max(F,V).
- * Second, compute finish time F = S + len / weight.
- * Third, if pipe was idle, update V = max(S, V).
- * Fourth, count one more backlogged flow.
- */
- if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */
- q->S = pipe->V;
- pipe->sum += fs->weight; /* Add weight of new queue. */
- } else {
- heap_extract(&(pipe->idle_heap), q);
- q->S = MAX64(q->F, pipe->V);
- }
- q->F = q->S + div64(len << MY_M, fs->weight);
-
- if (pipe->not_eligible_heap.elements == 0 &&
- pipe->scheduler_heap.elements == 0)
- pipe->V = MAX64(q->S, pipe->V);
- fs->backlogged++;
- /*
- * Look at eligibility. A flow is not eligibile if S>V (when
- * this happens, it means that there is some other flow already
- * scheduled for the same pipe, so the scheduler_heap cannot be
- * empty). If the flow is not eligible we just store it in the
- * not_eligible_heap. Otherwise, we store in the scheduler_heap
- * and possibly invoke ready_event_wfq() right now if there is
- * leftover credit.
- * Note that for all flows in scheduler_heap (SCH), S_i <= V,
- * and for all flows in not_eligible_heap (NEH), S_i > V.
- * So when we need to compute max(V, min(S_i)) forall i in
- * SCH+NEH, we only need to look into NEH.
- */
- if (DN_KEY_GT(q->S, pipe->V)) { /* Not eligible. */
- if (pipe->scheduler_heap.elements == 0)
- printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
- heap_insert(&(pipe->not_eligible_heap), q->S, q);
- } else {
- heap_insert(&(pipe->scheduler_heap), q->F, q);
- if (pipe->numbytes >= 0) { /* Pipe is idle. */
- if (pipe->scheduler_heap.elements != 1)
- printf("dummynet: OUCH! pipe should have been idle!\n");
- DPRINTF(("dummynet: waking up pipe %d at %d\n",
- pipe->pipe_nr, (int)(q->F >> MY_M)));
- pipe->sched_time = curr_time;
- ready_event_wfq(pipe, &head, &tail);
- }
- }
- }
-done:
- if (head == m && (dir & PROTO_LAYER2) == 0 ) {
- /* Fast io. */
- io_pkt_fast++;
- if (m->m_nextpkt != NULL)
- printf("dummynet: fast io: pkt chain detected!\n");
- head = m->m_nextpkt = NULL;
- } else
- *m0 = NULL; /* Normal io. */
-
- DUMMYNET_UNLOCK();
- if (head != NULL)
- dummynet_send(head);
- return (0);
-
-dropit:
- io_pkt_drop++;
- if (q)
- q->drops++;
- DUMMYNET_UNLOCK();
- FREE_PKT(m);
- *m0 = NULL;
- return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
+ copy_q_cb(fs->qht, a);
+ return 0;
}
/*
- * Dispose all packets and flow_queues on a flow_set.
- * If all=1, also remove red lookup table and other storage,
- * including the descriptor itself.
- * For the one in dn_pipe MUST also cleanup ready_heap...
+ * This routine only copies the initial part of a profile (XXX verify).
*/
-static void
-purge_flow_set(struct dn_flow_set *fs, int all)
+static int
+copy_profile(struct copy_args *a, struct dn_profile *p)
{
- struct dn_flow_queue *q, *qn;
- int i;
-
- DUMMYNET_LOCK_ASSERT();
+ int have = a->end - *a->start;
+ /* XXX here we check for max length */
+ int profile_len = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
- for (i = 0; i <= fs->rq_size; i++) {
- for (q = fs->rq[i]; q != NULL; q = qn) {
- dn_free_pkts(q->head);
- qn = q->next;
- free(q, M_DUMMYNET);
- }
- fs->rq[i] = NULL;
+ if (p == NULL)
+ return 0;
+ if (have < profile_len) {
+ D("error have %d need %d", have, profile_len);
+ return 1;
}
+ bcopy(p, *a->start, profile_len);
+ ((struct dn_id *)(*a->start))->len = profile_len;
+ *a->start += profile_len;
+ return 0;
+}
- fs->rq_elements = 0;
- if (all) {
- /* RED - free lookup table. */
- if (fs->w_q_lookup != NULL)
- free(fs->w_q_lookup, M_DUMMYNET);
- if (fs->rq != NULL)
- free(fs->rq, M_DUMMYNET);
- /* If this fs is not part of a pipe, free it. */
- if (fs->pipe == NULL || fs != &(fs->pipe->fs))
- free(fs, M_DUMMYNET);
+static int
+copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+ struct dn_fs *ufs = (struct dn_fs *)(*a->start);
+ if (!fs)
+ return 0;
+ ND("flowset %d", fs->fs.fs_nr);
+ if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
+ return DNHT_SCAN_END;
+ ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
+ dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
+ if (flags) { /* copy queues */
+ copy_q(a, fs, 0);
}
+ return 0;
}
-/*
- * Dispose all packets queued on a pipe (not a flow_set).
- * Also free all resources associated to a pipe, which is about
- * to be deleted.
- */
-static void
-purge_pipe(struct dn_pipe *pipe)
+static int
+copy_si_cb(void *obj, void *arg)
{
+ struct dn_sch_inst *si = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start);
+ if (copy_obj(a->start, a->end, &si->ni, "inst",
+ si->sched->sch.sched_nr))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
+ ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL);
+ return 0;
+}
- purge_flow_set( &(pipe->fs), 1 );
-
- dn_free_pkts(pipe->head);
-
- heap_free( &(pipe->scheduler_heap) );
- heap_free( &(pipe->not_eligible_heap) );
- heap_free( &(pipe->idle_heap) );
+static int
+copy_si(struct copy_args *a, struct dn_schk *s, int flags)
+{
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, copy_si_cb, a);
+ else if (s->siht)
+ copy_si_cb(s->siht, a);
+ return 0;
}
/*
- * Delete all pipes and heaps returning memory. Must also
- * remove references from all ipfw rules to all pipes.
+ * Compute the list of flowsets attached to a scheduler and copy it up.
*/
-static void
-dummynet_flush(void)
+static int
+copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
{
- struct dn_pipe *pipe, *pipe1;
- struct dn_flow_set *fs, *fs1;
- int i;
-
- DUMMYNET_LOCK();
- /* Free heaps so we don't have unwanted events. */
- heap_free(&ready_heap);
- heap_free(&wfq_ready_heap);
- heap_free(&extract_heap);
+ struct dn_fsk *fs;
+ struct dn_id *o;
+ uint32_t *p;
+
+ int n = 0, space = sizeof(*o);
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ n++;
+ }
+ space += n * sizeof(uint32_t);
+ DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
+ if (a->end - *(a->start) < space)
+ return DNHT_SCAN_END;
+ o = (struct dn_id *)(*(a->start));
+ o->len = space;
+ *a->start += o->len;
+ o->type = DN_TEXT;
+ p = (uint32_t *)(o+1);
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain)
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ *p++ = fs->fs.fs_nr;
+ return 0;
+}
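
On the receiving side, the record emitted above is a struct dn_id of type
DN_TEXT whose payload is an array of uint32_t flowset numbers. A
hypothetical userland reader (print_fsk_list is illustrative only, assuming
struct dn_id from ip_dummynet.h):

	#include <stdio.h>
	#include <stdint.h>

	static void
	print_fsk_list(struct dn_id *o)
	{
		/* payload follows the header; o->len covers both */
		uint32_t *p = (uint32_t *)(o + 1);
		int n = (o->len - sizeof(*o)) / sizeof(uint32_t);

		while (n-- > 0)
			printf("  flowset %u\n", (unsigned)*p++);
	}
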
- /*
- * Now purge all queued pkts and delete all pipes.
- *
- * XXXGL: can we merge the for(;;) cycles into one or not?
- */
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
- SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
- purge_flow_set(fs, 1);
+static int
+copy_data_helper(void *_o, void *_arg)
+{
+ struct copy_args *a = _arg;
+ uint32_t *r = a->extra->r; /* start of first range */
+ uint32_t *lim; /* first invalid pointer */
+ int n;
+
+ lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);
+
+ if (a->type == DN_LINK || a->type == DN_SCH) {
+ /* pipe|sched show, we receive a dn_schk */
+ struct dn_schk *s = _o;
+
+ n = s->sch.sched_nr;
+ if (a->type == DN_SCH && n >= DN_MAX_ID)
+ return 0; /* not a scheduler */
+ if (a->type == DN_LINK && n <= DN_MAX_ID)
+ return 0; /* not a pipe */
+
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) {
+ if (n < r[0] || n > r[1])
+ continue;
+ /* Found a valid entry, copy and we are done */
+ if (a->flags & DN_C_LINK) {
+ if (copy_obj(a->start, a->end,
+ &s->link, "link", n))
+ return DNHT_SCAN_END;
+ if (copy_profile(a, s->profile))
+ return DNHT_SCAN_END;
+ if (copy_flowset(a, s->fs, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_SCH) {
+ if (copy_obj(a->start, a->end,
+ &s->sch, "sched", n))
+ return DNHT_SCAN_END;
+ /* list all attached flowsets */
+ if (copy_fsk_list(a, s, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_FLOW)
+ copy_si(a, s, 0);
+ break;
}
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
- SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
- purge_pipe(pipe);
- free_pipe(pipe);
+ } else if (a->type == DN_FS) {
+ /* queue show, skip internal flowsets */
+ struct dn_fsk *fs = _o;
+
+ n = fs->fs.fs_nr;
+ if (n >= DN_MAX_ID)
+ return 0;
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) {
+ if (n < r[0] || n > r[1])
+ continue;
+ if (copy_flowset(a, fs, 0))
+ return DNHT_SCAN_END;
+ copy_q(a, fs, 0);
+ break; /* we are done */
}
- DUMMYNET_UNLOCK();
+ }
+ return 0;
+}
+
+static inline struct dn_schk *
+locate_scheduler(int i)
+{
+ return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
}
/*
- * setup RED parameters
+ * red parameters are in fixed point arithmetic.
*/
static int
-config_red(struct dn_flow_set *p, struct dn_flow_set *x)
+config_red(struct dn_fsk *fs)
{
- int i;
-
- x->w_q = p->w_q;
- x->min_th = SCALE(p->min_th);
- x->max_th = SCALE(p->max_th);
- x->max_p = p->max_p;
-
- x->c_1 = p->max_p / (p->max_th - p->min_th);
- x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
-
- if (x->flags_fs & DN_IS_GENTLE_RED) {
- x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
- x->c_4 = SCALE(1) - 2 * p->max_p;
+ int64_t s, idle, weight, w0;
+ int t, i;
+
+ fs->w_q = fs->fs.w_q;
+ fs->max_p = fs->fs.max_p;
+ D("called");
+ /* Doing stuff that was in userland */
+ i = fs->sched->link.bandwidth;
+ s = (i <= 0) ? 0 :
+ hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
+
+ idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
+ fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
+ /* fs->lookup_step not scaled, */
+ if (!fs->lookup_step)
+ fs->lookup_step = 1;
+ w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled
+
+ for (t = fs->lookup_step; t > 1; --t)
+ weight = SCALE_MUL(weight, w0);
+ fs->lookup_weight = (int)(weight); // scaled
+
+ /* Now doing stuff that was in kerneland */
+ fs->min_th = SCALE(fs->fs.min_th);
+ fs->max_th = SCALE(fs->fs.max_th);
+
+ fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
+ fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
+
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
+ fs->c_4 = SCALE(1) - 2 * fs->max_p;
}
/* If the lookup table already exist, free and create it again. */
- if (x->w_q_lookup) {
- free(x->w_q_lookup, M_DUMMYNET);
- x->w_q_lookup = NULL;
+ if (fs->w_q_lookup) {
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
}
- if (red_lookup_depth == 0) {
+ if (dn_cfg.red_lookup_depth == 0) {
printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
"must be > 0\n");
- free(x, M_DUMMYNET);
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
return (EINVAL);
}
- x->lookup_depth = red_lookup_depth;
- x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
+ fs->lookup_depth = dn_cfg.red_lookup_depth;
+ fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
M_DUMMYNET, M_NOWAIT);
- if (x->w_q_lookup == NULL) {
+ if (fs->w_q_lookup == NULL) {
printf("dummynet: sorry, cannot allocate red lookup table\n");
- free(x, M_DUMMYNET);
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
return(ENOSPC);
}
/* Fill the lookup table with (1 - w_q)^x */
- x->lookup_step = p->lookup_step;
- x->lookup_weight = p->lookup_weight;
- x->w_q_lookup[0] = SCALE(1) - x->w_q;
-
- for (i = 1; i < x->lookup_depth; i++)
- x->w_q_lookup[i] =
- SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+ fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
+
+ for (i = 1; i < fs->lookup_depth; i++)
+ fs->w_q_lookup[i] =
+ SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
+
+ if (dn_cfg.red_avg_pkt_size < 1)
+ dn_cfg.red_avg_pkt_size = 512;
+ fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
+ if (dn_cfg.red_max_pkt_size < 1)
+ dn_cfg.red_max_pkt_size = 1500;
+ fs->max_pkt_size = dn_cfg.red_max_pkt_size;
+ D("exit");
+ return 0;
+}
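
The table fill above depends on the SCALE()/SCALE_MUL() fixed-point
helpers. A standalone userland sketch of the same computation, with the
macros defined locally (the kernel's actual shift width may differ; 16 bits
and lookup_step = 1 are assumed purely for illustration):

	#include <stdio.h>
	#include <stdint.h>

	#define SCALE_SHIFT	16
	#define SCALE(x)	((int64_t)(x) << SCALE_SHIFT)
	#define SCALE_MUL(a, b)	(((a) * (b)) >> SCALE_SHIFT)

	int
	main(void)
	{
		int64_t w_q = SCALE(1) / 500;	/* w_q = 0.002, scaled */
		int64_t v = SCALE(1) - w_q;	/* table[0] = 1 - w_q */
		int i;

		/* table[i] = (1 - w_q)^(i+1), as in the loop above */
		for (i = 0; i < 4; i++) {
			printf("(1-w_q)^%d = %.6f\n", i + 1,
			    (double)v / SCALE(1));
			v = SCALE_MUL(v, SCALE(1) - w_q);
		}
		return 0;
	}
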
- if (red_avg_pkt_size < 1)
- red_avg_pkt_size = 512;
- x->avg_pkt_size = red_avg_pkt_size;
- if (red_max_pkt_size < 1)
- red_max_pkt_size = 1500;
- x->max_pkt_size = red_max_pkt_size;
- return (0);
+/* Scan all flowsets attached to this scheduler and update their RED parameters */
+static void
+update_red(struct dn_schk *s)
+{
+ struct dn_fsk *fs;
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+ if (fs && (fs->fs.flags & DN_IS_RED))
+ config_red(fs);
+ }
}
-static int
-alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
-{
- if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
- int l = pfs->rq_size;
-
- if (l == 0)
- l = dn_hash_size;
- if (l < 4)
- l = 4;
- else if (l > DN_MAX_HASH_SIZE)
- l = DN_MAX_HASH_SIZE;
- x->rq_size = l;
- } else /* one is enough for null mask */
- x->rq_size = 1;
- x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
- M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (x->rq == NULL) {
- printf("dummynet: sorry, cannot allocate queue\n");
- return (ENOMEM);
- }
- x->rq_elements = 0;
- return 0 ;
+/* attach flowset to scheduler s, possibly requeue */
+static void
+fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
+{
+ ND("remove fs %d from fsunlinked, link to sched %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
+ fs->sched = s;
+ SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
+ if (s->fp->new_fsk)
+ s->fp->new_fsk(fs);
+ /* XXX compute fsk_mask */
+ fs->fsk_mask = fs->fs.flow_mask;
+ if (fs->sched->sch.flags & DN_HAVE_MASK)
+ flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
+ if (fs->qht) {
+ /*
+ * we must drain qht according to the old
+ * type, and reinsert according to the new one.
+ * The requeue is complex -- in general we need to
+ * reclassify every single packet.
+ * For the time being, let's hope qht is never set
+ * when we reach this point.
+ */
+ D("XXX TODO requeue from fs %d to sch %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ fs->qht = NULL;
+ }
+ /* set the new type for qht */
+ if (nonzero_mask(&fs->fsk_mask))
+ fs->fs.flags |= DN_QHT_HASH;
+ else
+ fs->fs.flags &= ~DN_QHT_HASH;
+
+ /* XXX config_red() can fail... */
+ if (fs->fs.flags & DN_IS_RED)
+ config_red(fs);
}
+/* update all flowsets which may refer to this scheduler */
static void
-set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
-{
- x->flags_fs = src->flags_fs;
- x->qsize = src->qsize;
- x->plr = src->plr;
- x->flow_mask = src->flow_mask;
- if (x->flags_fs & DN_QSIZE_IS_BYTES) {
- if (x->qsize > pipe_byte_limit)
- x->qsize = 1024 * 1024;
- } else {
- if (x->qsize == 0)
- x->qsize = 50;
- if (x->qsize > pipe_slot_limit)
- x->qsize = 50;
+update_fs(struct dn_schk *s)
+{
+ struct dn_fsk *fs, *tmp;
+
+ SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) {
+ if (s->sch.sched_nr != fs->fs.sched_nr) {
+ D("fs %d for sch %d not %d still unlinked",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ s->sch.sched_nr);
+ continue;
+ }
+ fsk_attach(fs, s);
}
- /* Configuring RED. */
- if (x->flags_fs & DN_IS_RED)
- config_red(src, x); /* XXX should check errors */
}
/*
- * Setup pipe or queue parameters.
+ * Configuration -- to preserve backward compatibility we use
+ * the following scheme (N is 65536)
+ * NUMBER SCHED LINK FLOWSET
+ * 1 .. N-1 (1)WFQ (2)WFQ (3)queue
+ * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1
+ * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1
+ *
+ * "pipe i config" configures #1, #2 and #3
+ * "sched i config" configures #1 and possibly #6
+ * "queue i config" configures #3
+ * #1 is configured with 'pipe i config' or 'sched i config'
+ * #2 is configured with 'pipe i config', and created by
+ * 'sched i config' if it does not already exist
+ * #3 is configured with 'queue i config'
+ * #4 is automatically configured after #1, can only be FIFO
+ * #5 is automatically configured after #2
+ * #6 is automatically created when #1 is !MULTIQUEUE,
+ * and can be updated.
+ * #7 is automatically configured after #2
+ */
+
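As a concrete reading of the table, these are the object numbers involved
for i = 10 (a sketch; N stands for DN_MAX_ID = 65536, per the comment
above):

	enum { N = 65536 };		/* DN_MAX_ID */
	int i = 10;

	int wf2q_sched = i;		/* #1: scheduler for pipe 10 */
	int wf2q_link  = i;		/* #2: link for pipe 10 */
	int user_fs    = i;		/* #3: flowset, 'queue 10 config' */
	int fifo_sched = i + N;		/* #4: FIFO variant of sched 10 */
	int fifo_link  = i + N;		/* #5: FIFO variant of link 10 */
	int int_fs     = i + N;		/* #6: internal fs for sched 10 */
	int fifo_fs    = i + 2*N;	/* #7: internal fs for sched 10+N */
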
+/*
+ * configure a link (and its FIFO instance)
*/
static int
-config_pipe(struct dn_pipe *p)
+config_link(struct dn_link *p, struct dn_id *arg)
{
- struct dn_flow_set *pfs = &(p->fs);
- struct dn_flow_queue *q;
- int i, error;
+ int i;
+ if (p->oid.len != sizeof(*p)) {
+ D("invalid pipe len %d", p->oid.len);
+ return EINVAL;
+ }
+ i = p->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
/*
* The config program passes parameters as follows:
* bw = bits/second (0 means no limits),
* delay = ms, must be translated into ticks.
* qsize = slots/bytes
+ * burst ???
*/
p->delay = (p->delay * hz) / 1000;
/* Scale burst size: bytes -> bits * hz */
p->burst *= 8 * hz;
- /* We need either a pipe number or a flow_set number. */
- if (p->pipe_nr == 0 && pfs->fs_nr == 0)
- return (EINVAL);
- if (p->pipe_nr != 0 && pfs->fs_nr != 0)
- return (EINVAL);
- if (p->pipe_nr != 0) { /* this is a pipe */
- struct dn_pipe *pipe;
-
- DUMMYNET_LOCK();
- pipe = locate_pipe(p->pipe_nr); /* locate pipe */
-
- if (pipe == NULL) { /* new pipe */
- pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
- M_NOWAIT | M_ZERO);
- if (pipe == NULL) {
- DUMMYNET_UNLOCK();
- printf("dummynet: no memory for new pipe\n");
- return (ENOMEM);
- }
- pipe->pipe_nr = p->pipe_nr;
- pipe->fs.pipe = pipe;
- /*
- * idle_heap is the only one from which
- * we extract from the middle.
+
+ DN_BH_WLOCK();
+ /* do it twice, base link and FIFO link */
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+ struct dn_schk *s = locate_scheduler(i);
+ if (s == NULL) {
+ DN_BH_WUNLOCK();
+ D("sched %d not found", i);
+ return EINVAL;
+ }
+ /* remove profile if exists */
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ /* copy all parameters */
+ s->link.oid = p->oid;
+ s->link.link_nr = i;
+ s->link.delay = p->delay;
+ if (s->link.bandwidth != p->bandwidth) {
+ /* XXX bandwidth changes, need to update red params */
+ s->link.bandwidth = p->bandwidth;
+ update_red(s);
+ }
+ s->link.burst = p->burst;
+ schk_reset_credit(s);
+ }
+ dn_cfg.id++;
+ DN_BH_WUNLOCK();
+ return 0;
+}
+
+/*
+ * Configure a flowset. Can be called from inside with locked=1.
+ */
+static struct dn_fsk *
+config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
+{
+ int i;
+ struct dn_fsk *fs;
+
+ if (nfs->oid.len != sizeof(*nfs)) {
+ D("invalid flowset len %d", nfs->oid.len);
+ return NULL;
+ }
+ i = nfs->fs_nr;
+ if (i <= 0 || i >= 3*DN_MAX_ID)
+ return NULL;
+ ND("flowset %d", i);
+ /* XXX other sanity checks */
+ if (nfs->flags & DN_QSIZE_BYTES) {
+ ipdn_bound_var(&nfs->qsize, 16384,
+ 1500, dn_cfg.byte_limit, NULL); // "queue byte size");
+ } else {
+ ipdn_bound_var(&nfs->qsize, 50,
+ 1, dn_cfg.slot_limit, NULL); // "queue slot size");
+ }
+ if (nfs->flags & DN_HAVE_MASK) {
+ /* make sure we have some buckets */
+ ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "flowset buckets");
+ } else {
+ nfs->buckets = 1; /* we only need 1 */
+ }
+ if (!locked)
+ DN_BH_WLOCK();
+ do { /* exit with break when done */
+ struct dn_schk *s;
+ int flags = nfs->sched_nr ? DNHT_INSERT : 0;
+ int j;
+ int oldc = dn_cfg.fsk_count;
+ fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
+ if (fs == NULL) {
+ D("missing sched for flowset %d", i);
+ break;
+ }
+ /* grab some defaults from the existing one */
+ if (nfs->sched_nr == 0) /* reuse */
+ nfs->sched_nr = fs->fs.sched_nr;
+ for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
+ if (nfs->par[j] == -1) /* reuse */
+ nfs->par[j] = fs->fs.par[j];
+ }
+ if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
+ ND("flowset %d unchanged", i);
+ break; /* no change, nothing to do */
+ }
+ if (oldc != dn_cfg.fsk_count) /* new item */
+ dn_cfg.id++;
+ s = locate_scheduler(nfs->sched_nr);
+ /* detach from old scheduler if needed, preserving
+ * queues if we need to reattach. Then update the
+ * configuration, and possibly attach to the new sched.
+ */
+ DX(2, "fs %d changed sched %d@%p to %d@%p",
+ fs->fs.fs_nr,
+ fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
+ if (fs->sched) {
+ int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
+ flags |= DN_DESTROY; /* XXX temporary */
+ fsk_detach(fs, flags);
+ }
+ fs->fs = *nfs; /* copy configuration */
+ if (s != NULL)
+ fsk_attach(fs, s);
+ } while (0);
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return fs;
+}
+
+/*
+ * config/reconfig a scheduler and its FIFO variant.
+ * For !MULTIQUEUE schedulers, also set up the flowset.
+ *
+ * On reconfigurations (detected because s->fp is set),
+ * detach existing flowsets preserving traffic, preserve link,
+ * and delete the old scheduler creating a new one.
+ */
+static int
+config_sched(struct dn_sch *_nsch, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ struct schk_new_arg a; /* argument for schk_new */
+ int i;
+ struct dn_link p; /* copy of oldlink */
+ struct dn_profile *pf = NULL; /* copy of old link profile */
+	/* Used to preserve the mask parameter */
+ struct ipfw_flow_id new_mask;
+ int new_buckets = 0;
+ int new_flags = 0;
+ int pipe_cmd;
+ int err = ENOMEM;
+
+ a.sch = _nsch;
+ if (a.sch->oid.len != sizeof(*a.sch)) {
+ D("bad sched len %d", a.sch->oid.len);
+ return EINVAL;
+ }
+ i = a.sch->sched_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* make sure we have some buckets */
+ if (a.sch->flags & DN_HAVE_MASK)
+ ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "sched buckets");
+ /* XXX other sanity checks */
+ bzero(&p, sizeof(p));
+
+ pipe_cmd = a.sch->flags & DN_PIPE_CMD;
+	a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if it is not set?
+ if (pipe_cmd) {
+ /* Copy mask parameter */
+ new_mask = a.sch->sched_mask;
+ new_buckets = a.sch->buckets;
+ new_flags = a.sch->flags;
+ }
+ DN_BH_WLOCK();
+again: /* run twice, for wfq and fifo */
+ /*
+ * lookup the type. If not supplied, use the previous one
+ * or default to WF2Q+. Otherwise, return an error.
+ */
+ dn_cfg.id++;
+ a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
+ if (a.fp != NULL) {
+ /* found. Lookup or create entry */
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
+ } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
+		/* No type given: look up an existing scheduler or retry with WF2Q+ */
+ s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
+ if (s != NULL) {
+ a.fp = s->fp;
+ /* Scheduler exists, skip to FIFO scheduler
+ * if command was pipe config...
*/
- pipe->idle_heap.size = pipe->idle_heap.elements = 0;
- pipe->idle_heap.offset =
- offsetof(struct dn_flow_queue, heap_pos);
+ if (pipe_cmd)
+ goto next;
} else {
- /* Flush accumulated credit for all queues. */
- for (i = 0; i <= pipe->fs.rq_size; i++) {
- for (q = pipe->fs.rq[i]; q; q = q->next) {
- q->numbytes = p->burst +
- (io_fast ? p->bandwidth : 0);
- }
+ /* New scheduler, create a wf2q+ with no mask
+ * if command was pipe config...
+ */
+ if (pipe_cmd) {
+ /* clear mask parameter */
+ bzero(&a.sch->sched_mask, sizeof(new_mask));
+ a.sch->buckets = 0;
+ a.sch->flags &= ~DN_HAVE_MASK;
}
+ a.sch->oid.subtype = DN_SCHED_WF2QP;
+ goto again;
}
-
- pipe->bandwidth = p->bandwidth;
- pipe->burst = p->burst;
- pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0);
- bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
- pipe->ifp = NULL; /* reset interface ptr */
- pipe->delay = p->delay;
- set_fs_parms(&(pipe->fs), pfs);
-
- /* Handle changes in the delay profile. */
- if (p->samples_no > 0) {
- if (pipe->samples_no != p->samples_no) {
- if (pipe->samples != NULL)
- free(pipe->samples, M_DUMMYNET);
- pipe->samples =
- malloc(p->samples_no*sizeof(dn_key),
- M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (pipe->samples == NULL) {
- DUMMYNET_UNLOCK();
- printf("dummynet: no memory "
- "for new samples\n");
- return (ENOMEM);
- }
- pipe->samples_no = p->samples_no;
+ } else {
+ D("invalid scheduler type %d %s",
+ a.sch->oid.subtype, a.sch->name);
+ err = EINVAL;
+ goto error;
+ }
+ /* normalize name and subtype */
+ a.sch->oid.subtype = a.fp->type;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
+ if (s == NULL) {
+ D("cannot allocate scheduler %d", i);
+ goto error;
+ }
+ /* restore existing link if any */
+ if (p.link_nr) {
+ s->link = p;
+ if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
+ s->profile = NULL; /* XXX maybe not needed */
+ } else {
+ s->profile = malloc(sizeof(struct dn_profile),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("cannot allocate profile");
+ goto error; //XXX
}
-
- strncpy(pipe->name,p->name,sizeof(pipe->name));
- pipe->loss_level = p->loss_level;
- for (i = 0; i<pipe->samples_no; ++i)
- pipe->samples[i] = p->samples[i];
- } else if (pipe->samples != NULL) {
- free(pipe->samples, M_DUMMYNET);
- pipe->samples = NULL;
- pipe->samples_no = 0;
+ bcopy(pf, s->profile, sizeof(*pf));
}
-
- if (pipe->fs.rq == NULL) { /* a new pipe */
- error = alloc_hash(&(pipe->fs), pfs);
- if (error) {
- DUMMYNET_UNLOCK();
- free_pipe(pipe);
- return (error);
- }
- SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
- pipe, next);
+ }
+ p.link_nr = 0;
+ if (s->fp == NULL) {
+ DX(2, "sched %d new type %s", i, a.fp->name);
+ } else if (s->fp != a.fp ||
+ bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
+ /* already existing. */
+ DX(2, "sched %d type changed from %s to %s",
+ i, s->fp->name, a.fp->name);
+ DX(4, " type/sub %d/%d -> %d/%d",
+ s->sch.oid.type, s->sch.oid.subtype,
+ a.sch->oid.type, a.sch->oid.subtype);
+ if (s->link.link_nr == 0)
+ D("XXX WARNING link 0 for sched %d", i);
+ p = s->link; /* preserve link */
+ if (s->profile) {/* preserve profile */
+ if (!pf)
+ pf = malloc(sizeof(*pf),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (pf) /* XXX should issue a warning otherwise */
+ bcopy(s->profile, pf, sizeof(*pf));
}
- DUMMYNET_UNLOCK();
- } else { /* config queue */
- struct dn_flow_set *fs;
-
- DUMMYNET_LOCK();
- fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
-
- if (fs == NULL) { /* new */
- if (pfs->parent_nr == 0) { /* need link to a pipe */
- DUMMYNET_UNLOCK();
- return (EINVAL);
- }
- fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
- M_NOWAIT | M_ZERO);
- if (fs == NULL) {
- DUMMYNET_UNLOCK();
- printf(
- "dummynet: no memory for new flow_set\n");
- return (ENOMEM);
- }
- fs->fs_nr = pfs->fs_nr;
- fs->parent_nr = pfs->parent_nr;
- fs->weight = pfs->weight;
- if (fs->weight == 0)
- fs->weight = 1;
- else if (fs->weight > 100)
- fs->weight = 100;
+ /* remove from the hash */
+ dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ /* Detach flowsets, preserve queues. */
+ // schk_delete_cb(s, NULL);
+ // XXX temporarily, kill queues
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ goto again;
+ } else {
+ DX(4, "sched %d unchanged type %s", i, a.fp->name);
+ }
+ /* complete initialization */
+ s->sch = *a.sch;
+ s->fp = a.fp;
+ s->cfg = arg;
+ // XXX schk_reset_credit(s);
+ /* create the internal flowset if needed,
+ * trying to reuse existing ones if available
+ */
+ if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
+ s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
+ if (!s->fs) {
+ struct dn_fs fs;
+ bzero(&fs, sizeof(fs));
+ set_oid(&fs.oid, DN_FS, sizeof(fs));
+ fs.fs_nr = i + DN_MAX_ID;
+ fs.sched_nr = i;
+ s->fs = config_fs(&fs, NULL, 1 /* locked */);
+ }
+ if (!s->fs) {
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ D("error creating internal fs for %d", i);
+ goto error;
+ }
+ }
+ /* call init function after the flowset is created */
+ if (s->fp->config)
+ s->fp->config(s);
+ update_fs(s);
+next:
+ if (i < DN_MAX_ID) { /* now configure the FIFO instance */
+ i += DN_MAX_ID;
+ if (pipe_cmd) {
+ /* Restore mask parameter for FIFO */
+ a.sch->sched_mask = new_mask;
+ a.sch->buckets = new_buckets;
+ a.sch->flags = new_flags;
} else {
- /*
- * Change parent pipe not allowed;
- * must delete and recreate.
- */
- if (pfs->parent_nr != 0 &&
- fs->parent_nr != pfs->parent_nr) {
- DUMMYNET_UNLOCK();
- return (EINVAL);
+ /* sched config shouldn't modify the FIFO scheduler */
+ if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
+ /* FIFO already exist, don't touch it */
+ err = 0; /* and this is not an error */
+ goto error;
}
}
+ a.sch->sched_nr = i;
+ a.sch->oid.subtype = DN_SCHED_FIFO;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ goto again;
+ }
+ err = 0;
+error:
+ DN_BH_WUNLOCK();
+ if (pf)
+ free(pf, M_DUMMYNET);
+ return err;
+}
- set_fs_parms(fs, pfs);
+/*
+ * attach a profile to a link
+ */
+static int
+config_profile(struct dn_profile *pf, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ int i, olen, err = 0;
- if (fs->rq == NULL) { /* a new flow_set */
- error = alloc_hash(fs, pfs);
- if (error) {
- DUMMYNET_UNLOCK();
- free(fs, M_DUMMYNET);
- return (error);
- }
- SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
- fs, next);
+ if (pf->oid.len < sizeof(*pf)) {
+ D("short profile len %d", pf->oid.len);
+ return EINVAL;
+ }
+ i = pf->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* XXX other sanity checks */
+ DN_BH_WLOCK();
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+ s = locate_scheduler(i);
+
+ if (s == NULL) {
+ err = EINVAL;
+ break;
+ }
+ dn_cfg.id++;
+ /*
+ * If we had a profile and the new one does not fit,
+ * or it is deleted, then we need to free memory.
+ */
+ if (s->profile && (pf->samples_no == 0 ||
+ s->profile->oid.len < pf->oid.len)) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
}
- DUMMYNET_UNLOCK();
+ if (pf->samples_no == 0)
+ continue;
+ /*
+ * new profile, possibly allocate memory
+ * and copy data.
+ */
+ if (s->profile == NULL)
+ s->profile = malloc(pf->oid.len,
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("no memory for profile %d", i);
+ err = ENOMEM;
+ break;
+ }
+ /* preserve larger length XXX double check */
+ olen = s->profile->oid.len;
+ if (olen < pf->oid.len)
+ olen = pf->oid.len;
+ bcopy(pf, s->profile, pf->oid.len);
+ s->profile->oid.len = olen;
}
- return (0);
+ DN_BH_WUNLOCK();
+ return err;
}
/*
- * Helper function to remove from a heap queues which are linked to
- * a flow_set about to be deleted.
+ * Delete all objects.
*/
static void
-fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
+dummynet_flush(void)
{
- int i, found;
- for (i = found = 0 ; i < h->elements ;) {
- if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
- h->elements-- ;
- h->p[i] = h->p[h->elements] ;
- found++ ;
- } else
- i++ ;
- }
- if (found)
- heapify(h);
+ /* delete all schedulers and related links/queues/flowsets */
+ dn_ht_scan(dn_cfg.schedhash, schk_delete_cb,
+ (void *)(uintptr_t)DN_DELETE_FS);
+ /* delete all remaining (unlinked) flowsets */
+ DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
+ dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
+ fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);
+ /* Reinitialize system heap... */
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
}
/*
- * helper function to remove a pipe from a heap (can be there at most once)
+ * Main handler for configuration. We are guaranteed to be called
+ * with an oid which is at least a dn_id.
+ * - the first object is the command (config, delete, flush, ...)
+ * - config_link must be issued after the corresponding config_sched
+ * - parameters (DN_TEXT) for an object must precede the object
+ *   they refer to; they are passed as 'arg' to the next config call.
*/
-static void
-pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
+int
+do_config(void *p, int l)
{
- int i;
+ struct dn_id *next, *o;
+ int err = 0, err2 = 0;
+ struct dn_id *arg = NULL;
+ uintptr_t *a;
+
+ o = p;
+ if (o->id != DN_API_VERSION) {
+ D("invalid api version got %d need %d",
+ o->id, DN_API_VERSION);
+ return EINVAL;
+ }
+ for (; l >= sizeof(*o); o = next) {
+ struct dn_id *prev = arg;
+ if (o->len < sizeof(*o) || l < o->len) {
+ D("bad len o->len %d len %d", o->len, l);
+ err = EINVAL;
+ break;
+ }
+ l -= o->len;
+ next = (struct dn_id *)((char *)o + o->len);
+ err = 0;
+ switch (o->type) {
+ default:
+ D("cmd %d not implemented", o->type);
+ break;
- for (i=0; i < h->elements ; i++ ) {
- if (h->p[i].object == p) { /* found it */
- h->elements-- ;
- h->p[i] = h->p[h->elements] ;
- heapify(h);
- break ;
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation.
+ * if we recognize the command, jump to the correct
+ * handler and return
+ */
+ case DN_SYSCTL_SET:
+ err = kesysctl_emu_set(p, l);
+ return err;
+#endif
+
+ case DN_CMD_CONFIG: /* simply a header */
+ break;
+
+ case DN_CMD_DELETE:
+ /* the argument is in the first uintptr_t after o */
+ a = (uintptr_t *)(o+1);
+ if (o->len < sizeof(*o) + sizeof(*a)) {
+ err = EINVAL;
+ break;
+ }
+ switch (o->subtype) {
+ case DN_LINK:
+ /* delete base and derived schedulers */
+ DN_BH_WLOCK();
+ err = delete_schk(*a);
+ err2 = delete_schk(*a + DN_MAX_ID);
+ DN_BH_WUNLOCK();
+ if (!err)
+ err = err2;
+ break;
+
+ default:
+ D("invalid delete type %d",
+ o->subtype);
+ err = EINVAL;
+ break;
+
+ case DN_FS:
+			err = (*a < 1 || *a >= DN_MAX_ID) ?
+				EINVAL : delete_fs(*a, 0);
+ break;
+ }
+ break;
+
+ case DN_CMD_FLUSH:
+ DN_BH_WLOCK();
+ dummynet_flush();
+ DN_BH_WUNLOCK();
+ break;
+		case DN_TEXT:	/* store argument for the next block */
+ prev = NULL;
+ arg = o;
+ break;
+ case DN_LINK:
+ err = config_link((struct dn_link *)o, arg);
+ break;
+ case DN_PROFILE:
+ err = config_profile((struct dn_profile *)o, arg);
+ break;
+ case DN_SCH:
+ err = config_sched((struct dn_sch *)o, arg);
+ break;
+ case DN_FS:
+ err = (NULL==config_fs((struct dn_fs *)o, arg, 0));
+ break;
}
+ if (prev)
+ arg = NULL;
+ if (err != 0)
+ break;
}
+ return err;
}
-/*
- * Fully delete a pipe or a queue, cleaning up associated info.
- */
static int
-delete_pipe(struct dn_pipe *p)
+compute_space(struct dn_id *cmd, struct copy_args *a)
{
+ int x = 0, need = 0;
+ int profile_size = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
+
+ /* NOTE about compute space:
+ * NP = dn_cfg.schk_count
+ * NSI = dn_cfg.si_count
+ * NF = dn_cfg.fsk_count
+ * NQ = dn_cfg.queue_count
+ * - ipfw pipe show
+ * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+ * link, scheduler template, flowset
+ * integrated in scheduler and header
+ * for flowset list
+	 *     (NSI)*(dn_flow) all scheduler instances (includes
+ * the queue instance)
+ * - ipfw sched show
+ * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+ * link, scheduler template, flowset
+ * integrated in scheduler and header
+ * for flowset list
+ * (NSI * dn_flow) all scheduler instances
+	 *     (NF * sizeof(uint32_t)) space for flowset list linked to scheduler
+	 *     (NQ * dn_queue) all queues [XXX for now not listed]
+ * - ipfw queue show
+ * (NF * dn_fs) all flowset
+ * (NQ * dn_queue) all queues
+ */
+ switch (cmd->subtype) {
+ default:
+ return -1;
+ /* XXX where do LINK and SCH differ ? */
+ /* 'ipfw sched show' could list all queues associated to
+ * a scheduler. This feature for now is disabled
+ */
+ case DN_LINK: /* pipe show */
+ x = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
+ need += dn_cfg.schk_count *
+ (sizeof(struct dn_fs) + profile_size) / 2;
+ need += dn_cfg.fsk_count * sizeof(uint32_t);
+ break;
+ case DN_SCH: /* sched show */
+ need += dn_cfg.schk_count *
+ (sizeof(struct dn_fs) + profile_size) / 2;
+ need += dn_cfg.fsk_count * sizeof(uint32_t);
+ x = DN_C_SCH | DN_C_LINK | DN_C_FLOW;
+ break;
+ case DN_FS: /* queue show */
+ x = DN_C_FS | DN_C_QUEUE;
+ break;
+ case DN_GET_COMPAT: /* compatibility mode */
+ need = dn_compat_calc_size();
+ break;
+ }
+ a->flags = x;
+ if (x & DN_C_SCH) {
+ need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
+		/* NOTE also, each fs might be attached to a sched */
+ need += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
+ }
+ if (x & DN_C_FS)
+ need += dn_cfg.fsk_count * sizeof(struct dn_fs);
+ if (x & DN_C_LINK) {
+ need += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
+ }
+ /*
+ * When exporting a queue to userland, only pass up the
+ * struct dn_flow, which is the only visible part.
+ */
- if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
- return EINVAL ;
- if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
- return EINVAL ;
- if (p->pipe_nr != 0) { /* this is an old-style pipe */
- struct dn_pipe *pipe;
- struct dn_flow_set *fs;
- int i;
-
- DUMMYNET_LOCK();
- pipe = locate_pipe(p->pipe_nr); /* locate pipe */
+ if (x & DN_C_QUEUE)
+ need += dn_cfg.queue_count * sizeof(struct dn_flow);
+ if (x & DN_C_FLOW)
+ need += dn_cfg.si_count * (sizeof(struct dn_flow));
+ return need;
+}
- if (pipe == NULL) {
- DUMMYNET_UNLOCK();
- return (ENOENT); /* not found */
+/*
+ * If compat != NULL dummynet_get is called in compatibility mode.
+ * *compat will be the pointer to the buffer to pass to ipfw
+ */
+int
+dummynet_get(struct sockopt *sopt, void **compat)
+{
+ int have, i, need, error;
+ char *start = NULL, *buf;
+ size_t sopt_valsize;
+ struct dn_id *cmd;
+ struct copy_args a;
+ struct copy_range r;
+ int l = sizeof(struct dn_id);
+
+ bzero(&a, sizeof(a));
+ bzero(&r, sizeof(r));
+
+ /* save and restore original sopt_valsize around copyin */
+ sopt_valsize = sopt->sopt_valsize;
+
+ cmd = &r.o;
+
+ if (!compat) {
+ /* copy at least an oid, and possibly a full object */
+ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ l = cmd->len;
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation. */
+ if (cmd->type == DN_SYSCTL_GET)
+ return kesysctl_emu_get(sopt);
+#endif
+ if (l > sizeof(r)) {
+ /* request larger than default, allocate buffer */
+ cmd = malloc(l, M_DUMMYNET, M_WAIT);
+ if (cmd == NULL)
+ return ENOMEM; //XXX
+ error = sooptcopyin(sopt, cmd, l, l);
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ }
+ } else { /* compatibility */
+ error = 0;
+ cmd->type = DN_CMD_GET;
+ cmd->len = sizeof(struct dn_id);
+ cmd->subtype = DN_GET_COMPAT;
+ // cmd->id = sopt_valsize;
+ D("compatibility mode");
}
+ a.extra = (struct copy_range *)cmd;
+ if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
+ uint32_t *rp = (uint32_t *)(cmd + 1);
+ cmd->len += 2 * sizeof(uint32_t);
+ rp[0] = 1;
+ rp[1] = DN_MAX_ID - 1;
+ if (cmd->subtype == DN_LINK) {
+ rp[0] += DN_MAX_ID;
+ rp[1] += DN_MAX_ID;
+ }
+ }
+ /* Count space (under lock) and allocate (outside lock).
+ * Exit with lock held if we manage to get enough buffer.
+ * Try a few times then give up.
+ */
+ for (have = 0, i = 0; i < 10; i++) {
+ DN_BH_WLOCK();
+ need = compute_space(cmd, &a);
+
+ /* if there is a range, ignore value from compute_space() */
+ if (l > sizeof(*cmd))
+ need = sopt_valsize - sizeof(*cmd);
+
+ if (need < 0) {
+ DN_BH_WUNLOCK();
+ error = EINVAL;
+ goto done;
+ }
+ need += sizeof(*cmd);
+ cmd->id = need;
+ if (have >= need)
+ break;
- /* Unlink from list of pipes. */
- SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
+ DN_BH_WUNLOCK();
+ if (start)
+ free(start, M_DUMMYNET);
+ start = NULL;
+ if (need > sopt_valsize)
+ break;
- /* Remove all references to this pipe from flow_sets. */
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_FOREACH(fs, &flowsethash[i], next) {
- if (fs->pipe == pipe) {
- printf("dummynet: ++ ref to pipe %d from fs %d\n",
- p->pipe_nr, fs->fs_nr);
- fs->pipe = NULL ;
- purge_flow_set(fs, 0);
+ have = need;
+ start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
+ if (start == NULL) {
+ error = ENOMEM;
+ goto done;
}
- }
}
- fs_remove_from_heap(&ready_heap, &(pipe->fs));
- purge_pipe(pipe); /* remove all data associated to this pipe */
- /* remove reference to here from extract_heap and wfq_ready_heap */
- pipe_remove_from_heap(&extract_heap, pipe);
- pipe_remove_from_heap(&wfq_ready_heap, pipe);
- DUMMYNET_UNLOCK();
-
- free_pipe(pipe);
- } else { /* this is a WF2Q queue (dn_flow_set) */
- struct dn_flow_set *fs;
- DUMMYNET_LOCK();
- fs = locate_flowset(p->fs.fs_nr); /* locate set */
+ if (start == NULL) {
+ if (compat) {
+ *compat = NULL;
+ error = 1; // XXX
+ } else {
+ error = sooptcopyout(sopt, cmd, sizeof(*cmd));
+ }
+ goto done;
+ }
+ ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
+ "%d:%d si %d, %d:%d queues %d",
+ dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
+ dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
+ dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
+ dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
+ dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
+ sopt->sopt_valsize = sopt_valsize;
+ a.type = cmd->subtype;
+
+ if (compat == NULL) {
+ bcopy(cmd, start, sizeof(*cmd));
+ ((struct dn_id*)(start))->len = sizeof(struct dn_id);
+ buf = start + sizeof(*cmd);
+ } else
+ buf = start;
+ a.start = &buf;
+ a.end = start + have;
+ /* start copying other objects */
+ if (compat) {
+ a.type = DN_COMPAT_PIPE;
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
+ a.type = DN_COMPAT_QUEUE;
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
+ } else if (a.type == DN_FS) {
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
+ } else {
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
+ }
+ DN_BH_WUNLOCK();
- if (fs == NULL) {
- DUMMYNET_UNLOCK();
- return (ENOENT); /* not found */
+ if (compat) {
+ *compat = start;
+ sopt->sopt_valsize = buf - start;
+ /* free() is done by ip_dummynet_compat() */
+ start = NULL; //XXX hack
+ } else {
+ error = sooptcopyout(sopt, start, buf - start);
}
+done:
+ if (cmd && cmd != &r.o)
+ free(cmd, M_DUMMYNET);
+ if (start)
+ free(start, M_DUMMYNET);
+ return error;
+}
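The sizing loop above is an instance of a common kernel pattern: count under the lock, allocate with a sleeping malloc outside it, then recount, because the counters may change while we sleep. A minimal sketch of the pattern (all names illustrative):

    for (have = 0, i = 0; i < 10; i++) {
        LOCK();
        need = compute_need();  /* counters are stable under the lock */
        if (have >= need)
            break;              /* success: exit with the lock held */
        UNLOCK();
        free(buf);
        buf = malloc(need);     /* may sleep; counters can change */
        have = need;
    }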
- /* Unlink from list of flowsets. */
- SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
+/* Callback called on scheduler instance to delete it if idle */
+static int
+drain_scheduler_cb(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
- if (fs->pipe != NULL) {
- /* Update total weight on parent pipe and cleanup parent heaps. */
- fs->pipe->sum -= fs->weight * fs->backlogged ;
- fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
- fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
-#if 1 /* XXX should i remove from idle_heap as well ? */
- fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
-#endif
+ if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
+ return 0;
+
+ if (si->sched->fp->flags & DN_MULTIQUEUE) {
+ if (si->q_count == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
+ } else { /* !DN_MULTIQUEUE */
+ if ((si+1)->ni.length == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
}
- purge_flow_set(fs, 1);
- DUMMYNET_UNLOCK();
- }
- return 0 ;
+ return 0; /* unreachable */
}
-/*
- * helper function used to copy data from kernel in DUMMYNET_GET
- */
-static char *
-dn_copy_set(struct dn_flow_set *set, char *bp)
-{
- int i, copied = 0 ;
- struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
-
- DUMMYNET_LOCK_ASSERT();
-
- for (i = 0 ; i <= set->rq_size ; i++) {
- for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
- if (q->hash_slot != i)
- printf("dummynet: ++ at %d: wrong slot (have %d, "
- "should be %d)\n", copied, q->hash_slot, i);
- if (q->fs != set)
- printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
- i, q->fs, set);
- copied++ ;
- bcopy(q, qp, sizeof( *q ) );
- /* cleanup pointers */
- qp->next = NULL ;
- qp->head = qp->tail = NULL ;
- qp->fs = NULL ;
+/* Callback called on scheduler to check if it has instances */
+static int
+drain_scheduler_sch_cb(void *_s, void *arg)
+{
+ struct dn_schk *s = _s;
+
+ if (s->sch.flags & DN_HAVE_MASK) {
+ dn_ht_scan_bucket(s->siht, &s->drain_bucket,
+ drain_scheduler_cb, NULL);
+ s->drain_bucket++;
+ } else {
+ if (s->siht) {
+ if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL)
+ s->siht = NULL;
+ }
}
- }
- if (copied != set->rq_elements)
- printf("dummynet: ++ wrong count, have %d should be %d\n",
- copied, set->rq_elements);
- return (char *)qp ;
-}
-
-static size_t
-dn_calc_size(void)
-{
- struct dn_flow_set *fs;
- struct dn_pipe *pipe;
- size_t size = 0;
- int i;
-
- DUMMYNET_LOCK_ASSERT();
- /*
- * Compute size of data structures: list of pipes and flow_sets.
- */
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_FOREACH(pipe, &pipehash[i], next)
- size += sizeof(*pipe) +
- pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
- SLIST_FOREACH(fs, &flowsethash[i], next)
- size += sizeof (*fs) +
- fs->rq_elements * sizeof(struct dn_flow_queue);
- }
- return size;
+ return 0;
}
-static int
-dummynet_get(struct sockopt *sopt)
-{
- char *buf, *bp ; /* bp is the "copy-pointer" */
- size_t size ;
- struct dn_flow_set *fs;
- struct dn_pipe *pipe;
- int error=0, i ;
-
- /* XXX lock held too long */
- DUMMYNET_LOCK();
- /*
- * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
- * cannot use this flag while holding a mutex.
- */
- for (i = 0; i < 10; i++) {
- size = dn_calc_size();
- DUMMYNET_UNLOCK();
- buf = malloc(size, M_TEMP, M_WAITOK);
- DUMMYNET_LOCK();
- if (size >= dn_calc_size())
- break;
- free(buf, M_TEMP);
- buf = NULL;
- }
- if (buf == NULL) {
- DUMMYNET_UNLOCK();
- return ENOBUFS ;
- }
- bp = buf;
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_FOREACH(pipe, &pipehash[i], next) {
- struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
+/* Called every tick; try to delete a 'bucket' of schedulers */
+void
+dn_drain_scheduler(void)
+{
+ dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch,
+ drain_scheduler_sch_cb, NULL);
+ dn_cfg.drain_sch++;
+}
- /*
- * Copy pipe descriptor into *bp, convert delay back to ms,
- * then copy the flow_set descriptor(s) one at a time.
- * After each flow_set, copy the queue descriptor it owns.
- */
- bcopy(pipe, bp, sizeof(*pipe));
- pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
- pipe_bp->burst = div64(pipe_bp->burst, 8 * hz);
- /*
- * XXX the following is a hack based on ->next being the
- * first field in dn_pipe and dn_flow_set. The correct
- * solution would be to move the dn_flow_set to the beginning
- * of struct dn_pipe.
- */
- pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
- /* Clean pointers. */
- pipe_bp->head = pipe_bp->tail = NULL;
- pipe_bp->fs.next.sle_next = NULL;
- pipe_bp->fs.pipe = NULL;
- pipe_bp->fs.rq = NULL;
- pipe_bp->samples = NULL;
+/* Callback called on a queue to delete it if idle */
+static int
+drain_queue_cb(void *_q, void *arg)
+{
+ struct dn_queue *q = _q;
- bp += sizeof(*pipe) ;
- bp = dn_copy_set(&(pipe->fs), bp);
+ if (q->ni.length == 0) {
+ dn_delete_queue(q, DN_DESTROY);
+ return DNHT_SCAN_DEL; /* queue is deleted */
}
- }
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_FOREACH(fs, &flowsethash[i], next) {
- struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
+ return 0; /* queue isn't deleted */
+}
- bcopy(fs, bp, sizeof(*fs));
- /* XXX same hack as above */
- fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
- fs_bp->pipe = NULL;
- fs_bp->rq = NULL;
- bp += sizeof(*fs);
- bp = dn_copy_set(fs, bp);
- }
- }
+/* Callback called on a flowset to check if it has queues */
+static int
+drain_queue_fs_cb(void *_fs, void *arg)
+{
+ struct dn_fsk *fs = _fs;
- DUMMYNET_UNLOCK();
+ if (fs->fs.flags & DN_QHT_HASH) {
+ /* Flowset has a hash table for queues */
+ dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
+ drain_queue_cb, NULL);
+ fs->drain_bucket++;
+ } else {
+ /* No hash table for this flowset, null the pointer
+ * if the queue is deleted
+ */
+ if (fs->qht) {
+ if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL)
+ fs->qht = NULL;
+ }
+ }
+ return 0;
+}
- error = sooptcopyout(sopt, buf, size);
- free(buf, M_TEMP);
- return error ;
+/* Called every tick; try to delete a 'bucket' of queues */
+void
+dn_drain_queue(void)
+{
+ /* scan a bucket of flowsets */
+ dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs,
+ drain_queue_fs_cb, NULL);
+ dn_cfg.drain_fs++;
}
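Both drain routines amortize cleanup: each tick visits a single hash bucket, driven by a cursor (drain_sch, drain_fs) that just keeps incrementing; dn_ht_scan_bucket() presumably reduces it modulo the table size. A sketch of the idea with hypothetical types:

    /* visit one bucket per tick, wrapping around the table */
    static void
    drain_one_bucket(struct ht *h, int *cursor)
    {
        scan_bucket(h, (*cursor) % h->nbuckets, delete_if_idle, NULL);
        (*cursor)++;    /* the next tick resumes where we left off */
    }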
/*
- * Handler for the various dummynet socket options (get, flush, config, del)
+ * Handler for the various dummynet socket options
*/
static int
ip_dn_ctl(struct sockopt *sopt)
{
- int error;
- struct dn_pipe *p = NULL;
-
- error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
- if (error)
- return (error);
+ void *p = NULL;
+ int error, l;
- /* Disallow sets in really-really secure mode. */
- if (sopt->sopt_dir == SOPT_SET) {
-#if __FreeBSD_version >= 500034
- error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
if (error)
- return (error);
-#else
- if (securelevel >= 3)
- return (EPERM);
-#endif
- }
+ return (error);
- switch (sopt->sopt_name) {
- default :
- printf("dummynet: -- unknown option %d", sopt->sopt_name);
- error = EINVAL ;
- break;
-
- case IP_DUMMYNET_GET :
- error = dummynet_get(sopt);
- break ;
-
- case IP_DUMMYNET_FLUSH :
- dummynet_flush() ;
- break ;
-
- case IP_DUMMYNET_CONFIGURE :
- p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p);
- if (error)
- break ;
- if (p->samples_no > 0)
- p->samples = &(((struct dn_pipe_max *)p)->samples[0]);
+ /* Disallow sets in really-really secure mode. */
+ if (sopt->sopt_dir == SOPT_SET) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
- error = config_pipe(p);
- break ;
+ switch (sopt->sopt_name) {
+ default :
+ D("dummynet: unknown option %d", sopt->sopt_name);
+ error = EINVAL;
+ break;
- case IP_DUMMYNET_DEL : /* remove a pipe or queue */
- p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p);
- if (error)
- break ;
+ case IP_DUMMYNET_FLUSH:
+ case IP_DUMMYNET_CONFIGURE:
+ case IP_DUMMYNET_DEL: /* remove a pipe or queue */
+ case IP_DUMMYNET_GET:
+ D("dummynet: compat option %d", sopt->sopt_name);
+ error = ip_dummynet_compat(sopt);
+ break;
- error = delete_pipe(p);
- break ;
- }
+ case IP_DUMMYNET3 :
+ if (sopt->sopt_dir == SOPT_GET) {
+ error = dummynet_get(sopt, NULL);
+ break;
+ }
+ l = sopt->sopt_valsize;
+ if (l < sizeof(struct dn_id) || l > 12000) {
+ D("argument len %d invalid", l);
+ break;
+ }
+ p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
+ error = sooptcopyin(sopt, p, l, l);
+ if (error)
+ break ;
+ error = do_config(p, l);
+ break;
+ }
- if (p != NULL)
- free(p, M_TEMP);
+ if (p != NULL)
+ free(p, M_TEMP);
- return error ;
+ return error ;
}
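From userland, the IP_DUMMYNET3 option carries a struct dn_id header in both directions: the GET path above copies the request in via sooptcopyin() and the reply out via sooptcopyout(). A hedged sketch of a request (error handling omitted; field values are illustrative):

    /* hypothetical userland sketch: 'ipfw pipe show' via IP_DUMMYNET3 */
    char buf[65536];
    struct dn_id oid = { .len = sizeof(oid), .type = DN_CMD_GET,
                         .subtype = DN_LINK };
    socklen_t len = sizeof(buf);
    memcpy(buf, &oid, sizeof(oid));
    if (getsockopt(s, IPPROTO_IP, IP_DUMMYNET3, buf, &len) == 0) {
        /* buf now holds a sequence of dn_id-prefixed objects */
    }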
+
static void
ip_dn_init(void)
{
- int i;
-
- if (bootverbose)
- printf("DUMMYNET with IPv6 initialized (040826)\n");
-
- DUMMYNET_LOCK_INIT();
-
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_INIT(&pipehash[i]);
- SLIST_INIT(&flowsethash[i]);
- }
- ready_heap.size = ready_heap.elements = 0;
- ready_heap.offset = 0;
+ if (dn_cfg.init_done)
+ return;
+ printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet);
+ dn_cfg.init_done = 1;
+ /* Set defaults here. MSVC does not accept initializers,
+ * and this is also useful for vimages
+ */
+ /* queue limits */
+ dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
+ dn_cfg.byte_limit = 1024 * 1024;
+ dn_cfg.expire = 1;
+
+ /* RED parameters */
+ dn_cfg.red_lookup_depth = 256; /* default lookup table depth */
+ dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */
+ dn_cfg.red_max_pkt_size = 1500; /* default max packet size */
+
+ /* hash tables */
+ dn_cfg.max_hash_size = 1024; /* max in the hash tables */
+ dn_cfg.hash_size = 64; /* default hash size */
+
+ /* create hash tables for schedulers and flowsets.
+ * In both we search by key and by pointer.
+ */
+ dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_schk, schk_next),
+ schk_hash, schk_match, schk_new);
+ dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_fsk, fsk_next),
+ fsk_hash, fsk_match, fsk_new);
- wfq_ready_heap.size = wfq_ready_heap.elements = 0;
- wfq_ready_heap.offset = 0;
+ /* bucket index to drain object */
+ dn_cfg.drain_fs = 0;
+ dn_cfg.drain_sch = 0;
- extract_heap.size = extract_heap.elements = 0;
- extract_heap.offset = 0;
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+ SLIST_INIT(&dn_cfg.fsu);
+ SLIST_INIT(&dn_cfg.schedlist);
- ip_dn_ctl_ptr = ip_dn_ctl;
- ip_dn_io_ptr = dummynet_io;
+ DN_LOCK_INIT();
- TASK_INIT(&dn_task, 0, dummynet_task, NULL);
+ TASK_INIT(&dn_task, 0, dummynet_task, curvnet);
dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
taskqueue_thread_enqueue, &dn_tq);
taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
@@ -2270,25 +2164,32 @@ ip_dn_init(void)
callout_reset(&dn_timeout, 1, dummynet, NULL);
/* Initialize curr_time adjustment mechanics. */
- getmicrouptime(&prev_t);
+ getmicrouptime(&dn_cfg.prev_t);
}
#ifdef KLD_MODULE
static void
-ip_dn_destroy(void)
+ip_dn_destroy(int last)
{
- ip_dn_ctl_ptr = NULL;
- ip_dn_io_ptr = NULL;
+ callout_drain(&dn_timeout);
+
+ DN_BH_WLOCK();
+ if (last) {
+ printf("%s removing last instance\n", __FUNCTION__);
+ ip_dn_ctl_ptr = NULL;
+ ip_dn_io_ptr = NULL;
+ }
- DUMMYNET_LOCK();
- callout_stop(&dn_timeout);
- DUMMYNET_UNLOCK();
+ dummynet_flush();
+ DN_BH_WUNLOCK();
taskqueue_drain(dn_tq, &dn_task);
taskqueue_free(dn_tq);
- dummynet_flush();
+ dn_ht_free(dn_cfg.schedhash, 0);
+ dn_ht_free(dn_cfg.fshash, 0);
+ heap_free(&dn_cfg.evheap);
- DUMMYNET_LOCK_DESTROY();
+ DN_LOCK_DESTROY();
}
#endif /* KLD_MODULE */
@@ -2296,36 +2197,116 @@ static int
dummynet_modevent(module_t mod, int type, void *data)
{
- switch (type) {
- case MOD_LOAD:
+ if (type == MOD_LOAD) {
if (ip_dn_io_ptr) {
- printf("DUMMYNET already loaded\n");
- return EEXIST ;
+ printf("DUMMYNET already loaded\n");
+ return EEXIST ;
}
ip_dn_init();
- break;
-
- case MOD_UNLOAD:
+ ip_dn_ctl_ptr = ip_dn_ctl;
+ ip_dn_io_ptr = dummynet_io;
+ return 0;
+ } else if (type == MOD_UNLOAD) {
#if !defined(KLD_MODULE)
printf("dummynet statically compiled, cannot unload\n");
return EINVAL ;
#else
- ip_dn_destroy();
+ ip_dn_destroy(1 /* last */);
+ return 0;
#endif
- break ;
- default:
+ } else
return EOPNOTSUPP;
- break ;
+}
+
+/* modevent helpers for the modules */
+static int
+load_dn_sched(struct dn_alg *d)
+{
+ struct dn_alg *s;
+
+ if (d == NULL)
+ return 1; /* error */
+ ip_dn_init(); /* just in case, we need the lock */
+
+ /* Check that the mandatory functions exist */
+ if (d->enqueue == NULL || d->dequeue == NULL) {
+ D("missing enqueue or dequeue for %s", d->name);
+ return 1;
+ }
+
+ /* Check whether the scheduler already exists */
+ DN_BH_WLOCK();
+ SLIST_FOREACH(s, &dn_cfg.schedlist, next) {
+ if (strcmp(s->name, d->name) == 0) {
+ D("%s already loaded", d->name);
+ break; /* scheduler already exists */
+ }
}
- return 0 ;
+ if (s == NULL)
+ SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sloaded", d->name, s ? "not ":"");
+ return s ? 1 : 0;
+}
+
+static int
+unload_dn_sched(struct dn_alg *s)
+{
+ struct dn_alg *tmp, *r;
+ int err = EINVAL;
+
+ D("called for %s", s->name);
+
+ DN_BH_WLOCK();
+ SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) {
+ if (strcmp(s->name, r->name) != 0)
+ continue;
+ D("ref_count = %d", r->ref_count);
+ err = (r->ref_count != 0) ? EBUSY : 0;
+ if (err == 0)
+ SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next);
+ break;
+ }
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
+ return err;
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+ struct dn_alg *sch = arg;
+
+ if (cmd == MOD_LOAD)
+ return load_dn_sched(sch);
+ else if (cmd == MOD_UNLOAD)
+ return unload_dn_sched(sch);
+ else
+ return EINVAL;
}
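A scheduler module therefore only needs to fill in a struct dn_alg, with at least enqueue and dequeue set (load_dn_sched() rejects it otherwise), and point its moduledata at dn_sched_modevent(). A sketch with hypothetical names:

    static int  myfifo_enqueue(struct dn_sch_inst *, struct dn_queue *,
                    struct mbuf *);
    static struct mbuf *myfifo_dequeue(struct dn_sch_inst *);

    static struct dn_alg myfifo_desc = {
        .type = DN_SCHED_FIFO,
        .name = "MYFIFO",
        .enqueue = myfifo_enqueue,      /* mandatory */
        .dequeue = myfifo_dequeue,      /* mandatory */
    };
    static moduledata_t myfifo_mod =
        { "dn_myfifo", dn_sched_modevent, &myfifo_desc };
    DECLARE_MODULE(dn_myfifo, myfifo_mod, DN_SI_SUB, DN_MODEV_ORD + 1);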
static moduledata_t dummynet_mod = {
- "dummynet",
- dummynet_modevent,
- NULL
+ "dummynet", dummynet_modevent, NULL
};
-DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+
+#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN
+#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */
+DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD);
MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
MODULE_VERSION(dummynet, 1);
+
+/*
+ * Starting up. Done in order after dummynet_modevent() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL);
+
+/*
+ * Shutting down. These are done in REVERSE ORDER, but still
+ * after dummynet_modevent() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
+ * or when the module is unloaded.
+ */
+//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL);
+
/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 724536c40e7d..959ad8e3ad99 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -142,6 +142,11 @@ ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
#ifdef SYSCTL_NODE
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
+
+SYSBEGIN(f3)
+
SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
@@ -156,10 +161,10 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
"Set upper limit of matches of ipfw rules logged");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
- NULL, IPFW_DEFAULT_RULE,
+ &dummy_def, 0,
"The default/max possible rule number.");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
- NULL, IPFW_TABLES_MAX,
+ &dummy_tables_max, 0,
"The maximum number of tables.");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
&default_to_accept, 0,
@@ -177,6 +182,8 @@ SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
"Deny packets with unknown IPv6 Extension Headers");
#endif /* INET6 */
+SYSEND
+
#endif /* SYSCTL_NODE */
@@ -344,6 +351,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
return(1);
}
} else {
+#ifdef __FreeBSD__ /* and OSX too ? */
struct ifaddr *ia;
if_addr_rlock(ifp);
@@ -357,6 +365,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
}
}
if_addr_runlock(ifp);
+#endif /* __FreeBSD__ */
}
return(0); /* no match, fail ... */
}
@@ -385,6 +394,9 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
static int
verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
{
+#ifndef __FreeBSD__
+ return 0;
+#else
struct route ro;
struct sockaddr_in *dst;
@@ -427,6 +439,7 @@ verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
/* found valid route */
RTFREE(ro.ro_rt);
return 1;
+#endif /* __FreeBSD__ */
}
#ifdef INET6
@@ -634,9 +647,14 @@ send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
static int
check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
- u_int16_t src_port, struct ucred **uc, int *ugid_lookupp,
- struct inpcb *inp)
+ u_int16_t src_port, int *ugid_lookupp,
+ struct ucred **uc, struct inpcb *inp)
{
+#ifndef __FreeBSD__
+ return cred_check(insn, proto, oif,
+ dst_ip, dst_port, src_ip, src_port,
+ (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else /* FreeBSD */
struct inpcbinfo *pi;
int wildcard;
struct inpcb *pcb;
@@ -703,6 +721,7 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
else if (insn->o.opcode == O_JAIL)
match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
return match;
+#endif /* __FreeBSD__ */
}
/*
@@ -794,7 +813,11 @@ ipfw_chk(struct ip_fw_args *args)
* these types of constraints, as well as decrease contention
* on pcb related locks.
*/
+#ifndef __FreeBSD__
+ struct bsd_ucred ucred_cache;
+#else
struct ucred *ucred_cache = NULL;
+#endif
int ucred_lookup = 0;
/*
@@ -863,10 +886,13 @@ ipfw_chk(struct ip_fw_args *args)
* ulp is NULL if not found.
*/
void *ulp = NULL; /* upper layer protocol pointer. */
+
/* XXX ipv6 variables */
int is_ipv6 = 0;
- u_int16_t ext_hd = 0; /* bits vector for extension header filtering */
+ uint8_t icmp6_type = 0;
+ uint16_t ext_hd = 0; /* bits vector for extension header filtering */
/* end of ipv6 variables */
+
int is_ipv4 = 0;
int done = 0; /* flag to exit the outer loop */
@@ -918,14 +944,15 @@ do { \
switch (proto) {
case IPPROTO_ICMPV6:
PULLUP_TO(hlen, ulp, struct icmp6_hdr);
- args->f_id.flags = ICMP6(ulp)->icmp6_type;
+ icmp6_type = ICMP6(ulp)->icmp6_type;
break;
case IPPROTO_TCP:
PULLUP_TO(hlen, ulp, struct tcphdr);
dst_port = TCP(ulp)->th_dport;
src_port = TCP(ulp)->th_sport;
- args->f_id.flags = TCP(ulp)->th_flags;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
break;
case IPPROTO_SCTP:
@@ -989,7 +1016,7 @@ do { \
return (IP_FW_DENY);
break;
}
- args->f_id.frag_id6 =
+ args->f_id.extra =
ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
ulp = NULL;
break;
@@ -1092,7 +1119,8 @@ do { \
PULLUP_TO(hlen, ulp, struct tcphdr);
dst_port = TCP(ulp)->th_dport;
src_port = TCP(ulp)->th_sport;
- args->f_id.flags = TCP(ulp)->th_flags;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
break;
case IPPROTO_UDP:
@@ -1103,7 +1131,7 @@ do { \
case IPPROTO_ICMP:
PULLUP_TO(hlen, ulp, struct icmphdr);
- args->f_id.flags = ICMP(ulp)->icmp_type;
+ //args->f_id.flags = ICMP(ulp)->icmp_type;
break;
default:
@@ -1233,8 +1261,13 @@ do { \
(ipfw_insn_u32 *)cmd,
proto, oif,
dst_ip, dst_port,
- src_ip, src_port, &ucred_cache,
- &ucred_lookup, args->inp);
+ src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache, args->inp);
+#else
+ (void *)&ucred_cache,
+ (struct inpcb *)args->m);
+#endif
break;
case O_RECV:
@@ -1334,6 +1367,8 @@ do { \
key = dst_ip.s_addr;
else if (v == 1)
key = src_ip.s_addr;
+ else if (v == 6) /* dscp */
+ key = (ip->ip_tos >> 2) & 0x3f;
else if (offset != 0)
break;
else if (proto != IPPROTO_TCP &&
@@ -1348,12 +1383,21 @@ do { \
(ipfw_insn_u32 *)cmd,
proto, oif,
dst_ip, dst_port,
- src_ip, src_port, &ucred_cache,
- &ucred_lookup, args->inp);
+ src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache, args->inp);
if (v == 4 /* O_UID */)
key = ucred_cache->cr_uid;
else if (v == 5 /* O_JAIL */)
key = ucred_cache->cr_prison->pr_id;
+#else /* !__FreeBSD__ */
+ (void *)&ucred_cache,
+ (struct inpcb *)args->m);
+ if (v == 4 /* O_UID */)
+ key = ucred_cache.uid;
+ else if (v == 5 /* O_JAIL */)
+ key = ucred_cache.xid;
+#endif /* !__FreeBSD__ */
key = htonl(key);
} else
break;
@@ -1392,11 +1436,10 @@ do { \
match = (tif != NULL);
break;
}
- /* FALLTHROUGH */
#ifdef INET6
+ /* FALLTHROUGH */
case O_IP6_SRC_ME:
- match = is_ipv6 &&
- search_ip6_addr_net(&args->f_id.src_ip6);
+ match = is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
#endif
break;
@@ -1432,14 +1475,14 @@ do { \
match = (tif != NULL);
break;
}
- /* FALLTHROUGH */
#ifdef INET6
+ /* FALLTHROUGH */
case O_IP6_DST_ME:
- match = is_ipv6 &&
- search_ip6_addr_net(&args->f_id.dst_ip6);
+ match = is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
#endif
break;
+
case O_IP_SRCPORT:
case O_IP_DSTPORT:
/*
@@ -1998,7 +2041,7 @@ do { \
if (hlen > 0 && is_ipv6 &&
((offset & IP6F_OFF_MASK) == 0) &&
(proto != IPPROTO_ICMPV6 ||
- (is_icmp6_query(args->f_id.flags) == 1)) &&
+ (is_icmp6_query(icmp6_type) == 1)) &&
!(m->m_flags & (M_BCAST|M_MCAST)) &&
!IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
send_reject6(
@@ -2164,8 +2207,10 @@ do { \
printf("ipfw: ouch!, skip past end of rules, denying packet\n");
}
IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
if (ucred_cache != NULL)
crfree(ucred_cache);
+#endif
return (retval);
pullup_failed:
@@ -2354,7 +2399,7 @@ vnet_ipfw_uninit(const void *unused)
IPFW_WLOCK(chain);
ipfw_dyn_uninit(0); /* run the callout_drain */
- ipfw_flush_tables(chain);
+ ipfw_destroy_tables(chain);
reap = NULL;
for (i = 0; i < chain->n_rules; i++) {
rule = chain->map[i];
diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c
index ad5599af43d1..69475828e1e7 100644
--- a/sys/netinet/ipfw/ip_fw_dynamic.c
+++ b/sys/netinet/ipfw/ip_fw_dynamic.c
@@ -128,7 +128,11 @@ static VNET_DEFINE(struct callout, ipfw_timeout);
#define V_ipfw_timeout VNET(ipfw_timeout)
static uma_zone_t ipfw_dyn_rule_zone;
+#ifndef __FreeBSD__
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */
+#endif
#define IPFW_DYN_LOCK_INIT() \
mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
@@ -183,6 +187,9 @@ static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */
#define V_dyn_max VNET(dyn_max)
#ifdef SYSCTL_NODE
+
+SYSBEGIN(f2)
+
SYSCTL_DECL(_net_inet_ip_fw);
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
@@ -217,6 +224,9 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
"Enable keepalives for dyn. rules");
+
+SYSEND
+
#endif /* SYSCTL_NODE */
@@ -466,7 +476,7 @@ next:
V_ipfw_dyn_v[i] = q;
}
if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
- u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
+ u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST);
#define BOTH_SYN (TH_SYN | (TH_SYN << 8))
#define BOTH_FIN (TH_FIN | (TH_FIN << 8))
@@ -884,6 +894,9 @@ struct mbuf *
ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
u_int32_t ack, int flags)
{
+#ifndef __FreeBSD__
+ return NULL;
+#else
struct mbuf *m;
int len, dir;
struct ip *h = NULL; /* stupid compiler */
@@ -1020,6 +1033,7 @@ ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
}
return (m);
+#endif /* __FreeBSD__ */
}
/*
diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c
index a5178dbc5c76..93bd19b22e1c 100644
--- a/sys/netinet/ipfw/ip_fw_log.c
+++ b/sys/netinet/ipfw/ip_fw_log.c
@@ -395,7 +395,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
snprintf(SNPARGS(fragment, 0),
" (frag %08x:%d@%d%s)",
- args->f_id.frag_id6,
+ args->f_id.extra,
ntohs(ip6->ip6_plen) - hlen,
ntohs(offset & IP6F_OFF_MASK) << 3,
(offset & IP6F_MORE_FRAG) ? "+" : "");
@@ -413,6 +413,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
(ipoff & IP_MF) ? "+" : "");
}
}
+#ifdef __FreeBSD__
if (oif || m->m_pkthdr.rcvif)
log(LOG_SECURITY | LOG_INFO,
"ipfw: %d %s %s %s via %s%s\n",
@@ -421,6 +422,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
fragment);
else
+#endif
log(LOG_SECURITY | LOG_INFO,
"ipfw: %d %s %s [no if info]%s\n",
f ? f->rulenum : -1,
diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c
index a7aa5aa4f3a0..e87a4c973fe4 100644
--- a/sys/netinet/ipfw/ip_fw_pfil.c
+++ b/sys/netinet/ipfw/ip_fw_pfil.c
@@ -77,6 +77,9 @@ int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
#ifdef SYSCTL_NODE
+
+SYSBEGIN(f1)
+
SYSCTL_DECL(_net_inet_ip_fw);
SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
@@ -87,6 +90,9 @@ SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
ipfw_chg_hook, "I", "Enable ipfw+6");
#endif /* INET6 */
+
+SYSEND
+
#endif /* SYSCTL_NODE */
/*
@@ -94,7 +100,7 @@ SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
* dummynet, divert, netgraph or other modules.
* The packet may be consumed.
*/
-static int
+int
ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
struct inpcb *inp)
{
@@ -141,8 +147,8 @@ again:
switch (ipfw) {
case IP_FW_PASS:
/* next_hop may be set by ipfw_chk */
- if (args.next_hop == NULL)
- break; /* pass */
+ if (args.next_hop == NULL)
+ break; /* pass */
#ifndef IPFIREWALL_FORWARD
ret = EACCES;
#else
@@ -341,14 +347,14 @@ ipfw_attach_hooks(int arg)
if (arg == 0) /* detach */
ipfw_hook(0, AF_INET);
- else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
+ else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
printf("ipfw_hook() error\n");
}
#ifdef INET6
if (arg == 0) /* detach */
ipfw_hook(0, AF_INET6);
- else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
+ else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
error = ENOENT;
printf("ipfw6_hook() error\n");
}
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 92508f14dc45..ac55433750a1 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -35,6 +35,18 @@
#ifdef _KERNEL
+/*
+ * For platforms that do not have SYSCTL support, we wrap the
+ * SYSCTL_* into a function (one per file) to collect the values
+ * into an array at module initialization. The wrapping macros,
+ * SYSBEGIN() and SYSEND, are empty in the default case.
+ */
+#ifndef SYSBEGIN
+#define SYSBEGIN(x)
+#endif
+#ifndef SYSEND
+#define SYSEND
+#endif
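On a platform without sysctl, one possible shim (purely hypothetical, just to show the intent of the comment above) redefines the SYSCTL_* macros so that SYSBEGIN/SYSEND bracket a per-file table collected at module init:

    /* hypothetical shim for platforms without sysctl */
    struct sysctl_ent { const char *name; void *ptr; const char *desc; };
    #define SYSBEGIN(f)     static struct sysctl_ent f##_sysctl[] = {
    #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \
            { #name, (ptr), (descr) },
    #define SYSEND          { NULL, NULL, NULL } };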
/* Return values from ipfw_chk() */
enum {
@@ -119,7 +131,13 @@ enum {
};
/* wrapper for freeing a packet, in case we need to do more work */
+#ifndef FREE_PKT
+#if defined(__linux__) || defined(_WIN32)
+#define FREE_PKT(m) netisr_dispatch(-1, m)
+#else
#define FREE_PKT(m) m_freem(m)
+#endif
+#endif /* !FREE_PKT */
/*
* Function definitions.
@@ -196,11 +214,16 @@ struct ip_fw_chain {
struct ip_fw *default_rule;
int n_rules; /* number of static rules */
int static_len; /* total len of static rules */
- struct ip_fw **map; /* array of rule ptrs to ease lookup */
+ struct ip_fw **map; /* array of rule ptrs to ease lookup */
LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */
struct radix_node_head *tables[IPFW_TABLES_MAX];
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t rwmtx;
+ spinlock_t uh_lock;
+#else
struct rwlock rwmtx;
struct rwlock uh_lock; /* lock for upper half */
+#endif
uint32_t id; /* ruleset id */
};
@@ -240,13 +263,17 @@ int ipfw_ctl(struct sockopt *sopt);
int ipfw_chk(struct ip_fw_args *args);
void ipfw_reap_rules(struct ip_fw *head);
+/* In ip_fw_pfil */
+int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+ struct inpcb *inp);
+
/* In ip_fw_table.c */
struct radix_node;
int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
uint32_t *val);
int ipfw_init_tables(struct ip_fw_chain *ch);
+void ipfw_destroy_tables(struct ip_fw_chain *ch);
int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
-void ipfw_flush_tables(struct ip_fw_chain *ch);
int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
uint8_t mlen, uint32_t value);
int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index 3d0fc5505cb6..c50572873a16 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -115,7 +115,8 @@ get_map(struct ip_fw_chain *chain, int extra, int locked)
int i;
i = chain->n_rules + extra;
- map = malloc(i * sizeof(struct ip_fw *), M_IPFW, M_WAITOK);
+ map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
+ locked ? M_NOWAIT : M_WAITOK);
if (map == NULL) {
printf("%s: cannot allocate map\n", __FUNCTION__);
return NULL;
@@ -231,61 +232,103 @@ ipfw_reap_rules(struct ip_fw *head)
}
}
+/*
+ * Used by del_entry() to check if a rule should be kept.
+ * Returns 1 if the rule must be kept, 0 otherwise.
+ *
+ * Called with cmd = {0,1,5}.
+ * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0;
+ * cmd == 1 matches on set numbers only, rule numbers are ignored;
+ * cmd == 5 matches on rule and set numbers.
+ *
+ * n == 0 is a wildcard for rule numbers, there is no wildcard for sets.
+ *
+ * Rules to keep are
+ * (default || reserved || !match_set || !match_number)
+ * where
+ * default ::= (rule->rulenum == IPFW_DEFAULT_RULE)
+ * // the default rule is always protected
+ *
+ * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET)
+ * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush")
+ *
+ * match_set ::= (cmd == 0 || rule->set == set)
+ * // set number is ignored for cmd == 0
+ *
+ * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum)
+ * // number is ignored for cmd == 1 or n == 0
+ *
+ */
+static int
+keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n)
+{
+ return
+ (rule->rulenum == IPFW_DEFAULT_RULE) ||
+ (cmd == 0 && n == 0 && rule->set == RESVD_SET) ||
+ !(cmd == 0 || rule->set == set) ||
+ !(cmd == 1 || n == 0 || n == rule->rulenum);
+}
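Plugging in the three cmd values shows the intended behaviour (rule numbers hypothetical):

    /* cmd == 0, n == 0   ("ipfw flush"):  keeps the default rule and
     *                                     everything in RESVD_SET;
     * cmd == 0, n == 100 ("delete 100"):  keeps every rule with
     *                                     rulenum != 100;
     * cmd == 1, set == 3 (delete set 3):  keeps every rule with
     *                                     set != 3.
     */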
+
/**
- * Remove all rules with given number, and also do set manipulation.
+ * Remove all rules with given number, or do set manipulation.
* Assumes chain != NULL && *chain != NULL.
*
- * The argument is an u_int32_t. The low 16 bit are the rule or set number,
- * the next 8 bits are the new set, the top 8 bits are the command:
+ * The argument is an uint32_t. The low 16 bit are the rule or set number;
+ * the next 8 bits are the new set; the top 8 bits indicate the command:
*
- * 0 delete rules with given number
- * 1 delete rules with given set number
- * 2 move rules with given number to new set
- * 3 move rules with given set number to new set
- * 4 swap sets with given numbers
- * 5 delete rules with given number and with given set number
+ * 0 delete rules numbered "rulenum"
+ * 1 delete rules in set "rulenum"
+ * 2 move rules "rulenum" to set "new_set"
+ * 3 move rules from set "rulenum" to set "new_set"
+ * 4 swap sets "rulenum" and "new_set"
+ * 5 delete rules "rulenum" and set "new_set"
*/
static int
-del_entry(struct ip_fw_chain *chain, u_int32_t arg)
+del_entry(struct ip_fw_chain *chain, uint32_t arg)
{
struct ip_fw *rule;
- uint32_t rulenum; /* rule or old_set */
+ uint32_t num; /* rule number or old_set */
uint8_t cmd, new_set;
- int start, end = 0, i, ofs, n;
+ int start, end, i, ofs, n;
struct ip_fw **map = NULL;
int error = 0;
- rulenum = arg & 0xffff;
+ num = arg & 0xffff;
cmd = (arg >> 24) & 0xff;
new_set = (arg >> 16) & 0xff;
if (cmd > 5 || new_set > RESVD_SET)
return EINVAL;
if (cmd == 0 || cmd == 2 || cmd == 5) {
- if (rulenum >= IPFW_DEFAULT_RULE)
+ if (num >= IPFW_DEFAULT_RULE)
return EINVAL;
} else {
- if (rulenum > RESVD_SET) /* old_set */
+ if (num > RESVD_SET) /* old_set */
return EINVAL;
}
- IPFW_UH_WLOCK(chain); /* prevent conflicts among the writers */
+ IPFW_UH_WLOCK(chain); /* arbitrate writers */
chain->reap = NULL; /* prepare for deletions */
switch (cmd) {
- case 0: /* delete rules with given number (0 is special means all) */
- case 1: /* delete all rules with given set number, rule->set == rulenum */
- case 5: /* delete rules with given number and with given set number.
- * rulenum - given rule number;
- * new_set - given set number.
- */
- /* locate first rule to delete (start), the one after the
- * last one (end), and count how many rules to delete (n)
+ case 0: /* delete rules "num" (num == 0 matches all) */
+ case 1: /* delete all rules in set N */
+ case 5: /* delete rules with number N and set "new_set". */
+
+ /*
+ * Locate first rule to delete (start), the rule after
+ * the last one to delete (end), and count how many
+ * rules to delete (n). Always use keep_rule() to
+ * determine which rules to keep.
*/
n = 0;
- if (cmd == 1) { /* look for a specific set, must scan all */
- for (start = -1, i = 0; i < chain->n_rules; i++) {
- if (chain->map[start]->set != rulenum)
+ if (cmd == 1) {
+ /* look for a specific set including RESVD_SET.
+ * Must scan the entire range, ignore num.
+ */
+ new_set = num;
+ for (start = -1, end = i = 0; i < chain->n_rules; i++) {
+ if (keep_rule(chain->map[i], cmd, new_set, 0))
continue;
if (start < 0)
start = i;
@@ -294,80 +337,94 @@ del_entry(struct ip_fw_chain *chain, u_int32_t arg)
}
end++; /* first non-matching */
} else {
- start = ipfw_find_rule(chain, rulenum, 0);
+ /* Optimized search on rule numbers */
+ start = ipfw_find_rule(chain, num, 0);
for (end = start; end < chain->n_rules; end++) {
rule = chain->map[end];
- if (rulenum > 0 && rule->rulenum != rulenum)
+ if (num > 0 && rule->rulenum != num)
break;
- if (rule->set != RESVD_SET &&
- (cmd == 0 || rule->set == new_set) )
+ if (!keep_rule(rule, cmd, new_set, num))
n++;
}
}
- if (n == 0 && arg == 0)
- break; /* special case, flush on empty ruleset */
- /* allocate the map, if needed */
- if (n > 0)
- map = get_map(chain, -n, 1 /* locked */);
- if (n == 0 || map == NULL) {
+
+ if (n == 0) {
+ /* A flush request (arg == 0) on an empty ruleset
+ * returns with no error. Conversely,
+ * if a specific request matches nothing,
+ * we return EINVAL.
+ */
+ error = (arg == 0) ? 0 : EINVAL;
+ break;
+ }
+
+ /* We have something to delete. Allocate the new map */
+ map = get_map(chain, -n, 1 /* locked */);
+ if (map == NULL) {
error = EINVAL;
break;
}
- /* copy the initial part of the map */
+
+ /* 1. bcopy the initial part of the map */
if (start > 0)
bcopy(chain->map, map, start * sizeof(struct ip_fw *));
- /* copy active rules between start and end */
+ /* 2. copy active rules between start and end */
for (i = ofs = start; i < end; i++) {
rule = chain->map[i];
- if (!(rule->set != RESVD_SET &&
- (cmd == 0 || rule->set == new_set) ))
- map[ofs++] = chain->map[i];
+ if (keep_rule(rule, cmd, new_set, num))
+ map[ofs++] = rule;
}
- /* finally the tail */
+ /* 3. copy the final part of the map */
bcopy(chain->map + end, map + ofs,
(chain->n_rules - end) * sizeof(struct ip_fw *));
+ /* 4. swap the maps (under BH_LOCK) */
map = swap_map(chain, map, chain->n_rules - n);
- /* now remove the rules deleted */
+ /* 5. now remove the rules deleted from the old map */
for (i = start; i < end; i++) {
+ int l;
rule = map[i];
- if (rule->set != RESVD_SET &&
- (cmd == 0 || rule->set == new_set) ) {
- int l = RULESIZE(rule);
-
- chain->static_len -= l;
- ipfw_remove_dyn_children(rule);
- rule->x_next = chain->reap;
- chain->reap = rule;
- }
+ if (keep_rule(rule, cmd, new_set, num))
+ continue;
+ l = RULESIZE(rule);
+ chain->static_len -= l;
+ ipfw_remove_dyn_children(rule);
+ rule->x_next = chain->reap;
+ chain->reap = rule;
}
break;
- case 2: /* move rules with given number to new set */
- for (i = 0; i < chain->n_rules; i++) {
+ /*
+ * In the next 3 cases the loop stops at (n_rules - 1)
+ * because the default rule is never eligible.
+ */
+
+ case 2: /* move rules with given RULE number to new set */
+ for (i = 0; i < chain->n_rules - 1; i++) {
rule = chain->map[i];
- if (rule->rulenum == rulenum)
+ if (rule->rulenum == num)
rule->set = new_set;
}
break;
- case 3: /* move rules with given set number to new set */
- for (i = 0; i < chain->n_rules; i++) {
+ case 3: /* move rules with given SET number to new set */
+ for (i = 0; i < chain->n_rules - 1; i++) {
rule = chain->map[i];
- if (rule->set == rulenum)
+ if (rule->set == num)
rule->set = new_set;
}
break;
case 4: /* swap two sets */
- for (i = 0; i < chain->n_rules; i++) {
+ for (i = 0; i < chain->n_rules - 1; i++) {
rule = chain->map[i];
- if (rule->set == rulenum)
+ if (rule->set == num)
rule->set = new_set;
else if (rule->set == new_set)
- rule->set = rulenum;
+ rule->set = num;
}
break;
}
+
rule = chain->reap;
chain->reap = NULL;
IPFW_UH_WUNLOCK(chain);
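For reference, the encoding decoded at the top of del_entry() packs into the 32-bit argument as follows (this mirrors the comment before the function):

    /* cmd in the top byte, new_set in the next byte,
     * rule or set number in the low 16 bits */
    uint32_t arg = ((uint32_t)cmd << 24) | ((uint32_t)new_set << 16) | num;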
@@ -445,7 +502,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
break;
}
if (!cleared) { /* we did not find any matching rules */
- IPFW_WUNLOCK(chain);
+ IPFW_UH_RUNLOCK(chain);
return (EINVAL);
}
msg = log_only ? "logging count reset" : "cleared";
@@ -771,6 +828,44 @@ bad_size:
return EINVAL;
}
+
+/*
+ * Translation of requests for compatibility with FreeBSD 7.2/8.
+ * A static variable tells us whether we have an old client from userland,
+ * and if necessary we translate requests and responses between the
+ * two formats.
+ */
+static int is7 = 0;
+
+struct ip_fw7 {
+ struct ip_fw7 *next; /* linked list of rules */
+ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */
+ /* 'next_rule' is used to pass up 'set_disable' status */
+
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t rulenum; /* rule number */
+ uint8_t set; /* rule set (0..31) */
+ // #define RESVD_SET 31 /* set for default and persistent rules */
+ uint8_t _pad; /* padding */
+ // uint32_t id; /* rule id, only in v.8 */
+ /* These fields are present in all rules. */
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
+
+int convert_rule_to_7(struct ip_fw *rule);
+int convert_rule_to_8(struct ip_fw *rule);
+
+#ifndef RULESIZE7
+#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \
+ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
+#endif
+
+
/*
* Copy the static and dynamic rules to the supplied buffer
* and return the amount of space actually used.
@@ -788,6 +883,32 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
boot_seconds = boottime.tv_sec;
for (i = 0; i < chain->n_rules; i++) {
rule = chain->map[i];
+
+ if (is7) {
+ /* Convert rule to FreeBSD 7.2 format */
+ l = RULESIZE7(rule);
+ if (bp + l + sizeof(uint32_t) <= ep) {
+ int error;
+ bcopy(rule, bp, l + sizeof(uint32_t));
+ error = convert_rule_to_7((struct ip_fw *) bp);
+ if (error)
+ return 0; /*XXX correct? */
+ /*
+ * XXX HACK. Store the disable mask in the "next"
+ * pointer in a wild attempt to keep the ABI the same.
+ * Why do we do this on EVERY rule?
+ */
+ bcopy(&V_set_disable,
+ &(((struct ip_fw7 *)bp)->next_rule),
+ sizeof(V_set_disable));
+ if (((struct ip_fw7 *)bp)->timestamp)
+ ((struct ip_fw7 *)bp)->timestamp += boot_seconds;
+ bp += l;
+ }
+ continue; /* go to next rule */
+ }
+
+ /* normal mode, don't touch rules */
l = RULESIZE(rule);
if (bp + l > ep) { /* should not happen */
printf("overflow dumping static rules\n");
@@ -886,16 +1007,43 @@ ipfw_ctl(struct sockopt *sopt)
case IP_FW_ADD:
rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
- sizeof(struct ip_fw) );
+ sizeof(struct ip_fw7) );
+
+ /*
+ * If the size of commands equals RULESIZE7 then we assume
+ * a FreeBSD 7.2 binary is talking to us (set is7=1).
+ * is7 is persistent so the next 'ipfw list' command
+ * will use this format.
+ * NOTE: If the wrong version is guessed (this can happen if
+ * the first ipfw command is 'ipfw [pipe] list')
+ * the ipfw binary may crash or loop infinitely...
+ */
+ if (sopt->sopt_valsize == RULESIZE7(rule)) {
+ is7 = 1;
+ error = convert_rule_to_8(rule);
+ if (error)
+ return error;
+ if (error == 0)
+ error = check_ipfw_struct(rule, RULESIZE(rule));
+ } else {
+ is7 = 0;
if (error == 0)
error = check_ipfw_struct(rule, sopt->sopt_valsize);
+ }
if (error == 0) {
/* locking is done within ipfw_add_rule() */
error = ipfw_add_rule(chain, rule);
size = RULESIZE(rule);
- if (!error && sopt->sopt_dir == SOPT_GET)
+ if (!error && sopt->sopt_dir == SOPT_GET) {
+ if (is7) {
+ error = convert_rule_to_7(rule);
+ size = RULESIZE7(rule);
+ if (error)
+ return error;
+ }
error = sooptcopyout(sopt, rule, size);
}
+ }
free(rule, M_TEMP);
break;
@@ -1078,4 +1226,118 @@ ipfw_ctl(struct sockopt *sopt)
return (error);
#undef RULE_MAXSIZE
}
+
+
+#define RULE_MAXSIZE (256*sizeof(u_int32_t))
+
+/* Functions to convert rules 7.2 <==> 8.0 */
+int
+convert_rule_to_7(struct ip_fw *rule)
+{
+ /* Used to modify original rule */
+ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+ /* copy of original rule, version 8 */
+ struct ip_fw *tmp;
+
+ /* Used to copy commands */
+ ipfw_insn *ccmd, *dst;
+ int ll = 0, ccmdlen = 0;
+
+ tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+ if (tmp == NULL) {
+ return 1; //XXX error
+ }
+ bcopy(rule, tmp, RULE_MAXSIZE);
+
+ /* Copy fields */
+ rule7->_pad = tmp->_pad;
+ rule7->set = tmp->set;
+ rule7->rulenum = tmp->rulenum;
+ rule7->cmd_len = tmp->cmd_len;
+ rule7->act_ofs = tmp->act_ofs;
+ rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+ rule7->next = (struct ip_fw7 *)tmp->x_next;
+ rule7->cmd_len = tmp->cmd_len;
+ rule7->pcnt = tmp->pcnt;
+ rule7->bcnt = tmp->bcnt;
+ rule7->timestamp = tmp->timestamp;
+
+ /* Copy commands */
+ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
+ ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+ ccmdlen = F_LEN(ccmd);
+
+ bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+ if (dst->opcode > O_NAT)
+ /* O_REASS doesn't exist in the 7.2 version, so
+ * decrement the opcode if it comes after O_REASS
+ */
+ dst->opcode--;
+
+ if (ccmdlen > ll) {
+ printf("ipfw: opcode %d size truncated\n",
+ ccmd->opcode);
+ return EINVAL;
+ }
+ }
+ free(tmp, M_TEMP);
+
+ return 0;
+}
+
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+ /* Used to modify original rule */
+ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
+
+ /* Used to copy commands */
+ ipfw_insn *ccmd, *dst;
+ int ll = 0, ccmdlen = 0;
+
+ /* Copy of original rule */
+ struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+ if (tmp == NULL) {
+ return 1; //XXX error
+ }
+
+ bcopy(rule7, tmp, RULE_MAXSIZE);
+
+ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
+ ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+ ccmdlen = F_LEN(ccmd);
+
+ bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+ if (dst->opcode > O_NAT)
+ /* O_REASS doesn't exist in the 7.2 version, so
+ * increment the opcode if it comes after O_REASS
+ */
+ dst->opcode++;
+
+ if (ccmdlen > ll) {
+ printf("ipfw: opcode %d size truncated\n",
+ ccmd->opcode);
+ return EINVAL;
+ }
+ }
+
+ rule->_pad = tmp->_pad;
+ rule->set = tmp->set;
+ rule->rulenum = tmp->rulenum;
+ rule->cmd_len = tmp->cmd_len;
+ rule->act_ofs = tmp->act_ofs;
+ rule->next_rule = (struct ip_fw *)tmp->next_rule;
+ rule->x_next = (struct ip_fw *)tmp->next;
+ rule->cmd_len = tmp->cmd_len;
+ rule->id = 0; /* XXX check whether 0 is ok here */
+ rule->pcnt = tmp->pcnt;
+ rule->bcnt = tmp->bcnt;
+ rule->timestamp = tmp->timestamp;
+
+ free(tmp, M_TEMP);
+ return 0;
+}
+
/* end of file */
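Both converters walk the rule body the same way; the pattern generalizes to any pass over a rule's microinstructions (sketch, assuming the usual F_LEN accessor that returns an instruction's length in 32-bit words):

    ipfw_insn *cmd;
    int l, cmdlen;

    for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
         l -= cmdlen, cmd += cmdlen) {
        cmdlen = F_LEN(cmd);            /* length in 32-bit words */
        /* ... inspect or rewrite cmd->opcode here ... */
    }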
diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c
index 0d8625af9715..517622f02171 100644
--- a/sys/netinet/ipfw/ip_fw_table.c
+++ b/sys/netinet/ipfw/ip_fw_table.c
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/in.h>
#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
#include <netinet/ip_fw.h>
+#include <sys/queue.h> /* LIST_HEAD */
#include <netinet/ipfw/ip_fw_private.h>
#ifdef MAC
@@ -175,14 +176,18 @@ ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
}
void
-ipfw_flush_tables(struct ip_fw_chain *ch)
+ipfw_destroy_tables(struct ip_fw_chain *ch)
{
uint16_t tbl;
+ struct radix_node_head *rnh;
IPFW_WLOCK_ASSERT(ch);
- for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
+ for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) {
ipfw_flush_table(ch, tbl);
+ rnh = ch->tables[tbl];
+ rn_detachhead((void **)&rnh);
+ }
}
int
diff --git a/sys/netinet/ipfw/test/Makefile b/sys/netinet/ipfw/test/Makefile
new file mode 100644
index 000000000000..c556a4bf3d51
--- /dev/null
+++ b/sys/netinet/ipfw/test/Makefile
@@ -0,0 +1,51 @@
+#
+# $FreeBSD$
+#
+# Makefile for building userland tests
+# this is written in a form compatible with gmake
+
+SCHED_SRCS = test_dn_sched.c
+SCHED_SRCS += dn_sched_fifo.c
+SCHED_SRCS += dn_sched_prio.c
+SCHED_SRCS += dn_sched_qfq.c
+SCHED_SRCS += dn_sched_rr.c
+SCHED_SRCS += dn_sched_wf2q.c
+SCHED_SRCS += dn_heap.c
+SCHED_SRCS += main.c
+
+SCHED_OBJS=$(SCHED_SRCS:.c=.o)
+
+HEAP_SRCS = dn_heap.c test_dn_heap.c
+HEAP_OBJS=$(HEAP_SRCS:.c=.o)
+
+VPATH= .:..
+
+CFLAGS = -I.. -I. -Wall -Werror -O3 -DIPFW
+TARGETS= test_sched # no test_heap by default
+
+all: $(TARGETS)
+
+test_heap : $(HEAP_OBJS)
+ $(CC) -o $@ $(HEAP_OBJS)
+
+test_sched : $(SCHED_OBJS)
+ $(CC) -o $@ $(SCHED_OBJS)
+
+$(SCHED_OBJS): dn_test.h
+main.o: mylist.h
+
+clean:
+ - rm *.o $(TARGETS) *.core
+
+ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \
+ dn_sched.h dn_heap.h ip_dn_private.h Makefile
+TMPBASE = /tmp/testXYZ
+TMPDIR = $(TMPBASE)/test
+
+tgz:
+ -rm -rf $(TMPDIR)
+ mkdir -p $(TMPDIR)
+ -cp -p $(ALLSRCS) $(TMPDIR)
+ -(cd ..; cp -p $(ALLSRCS) $(TMPDIR))
+ ls -la $(TMPDIR)
+ (cd $(TMPBASE); tar cvzf /tmp/test.tgz test)
diff --git a/sys/netinet/ipfw/test/dn_test.h b/sys/netinet/ipfw/test/dn_test.h
new file mode 100644
index 000000000000..4e079bc4d68b
--- /dev/null
+++ b/sys/netinet/ipfw/test/dn_test.h
@@ -0,0 +1,175 @@
+/*
+ * $FreeBSD$
+ *
+ * userspace compatibility code for dummynet schedulers
+ */
+
+#ifndef _DN_TEST_H
+#define _DN_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h> /* bzero, ffs, ... */
+#include <string.h> /* strcmp */
+#include <errno.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+
+extern int debug;
+#define ND(fmt, args...) do {} while (0)
+#define D1(fmt, args...) do {} while (0)
+#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \
+ __FUNCTION__, ## args)
+#define DX(lev, fmt, args...) do { \
+ if (debug > lev) D(fmt, ## args); } while (0)
+
+
+#ifndef offsetof
+#define offsetof(t,m) (int)((&((t *)0L)->m))
+#endif
+
+#include <mylist.h>
+
+/* prevent include of other system headers */
+#define _NETINET_IP_VAR_H_ /* ip_fw_args */
+#define _IPFW2_H
+#define _SYS_MBUF_H_
+
+enum {
+ DN_QUEUE,
+};
+
+enum {
+ DN_SCHED_FIFO,
+ DN_SCHED_WF2QP,
+};
+
+struct dn_id {
+ int type, subtype, len, id;
+};
+
+struct dn_fs {
+ int par[4]; /* flowset parameters */
+
+ /* simulation entries.
+ * 'index' is not strictly necessary;
+ * y is used for the inverse mapping.
+ */
+ int index;
+ int y; /* inverse mapping */
+ int base_y; /* inverse mapping */
+ int next_y; /* inverse mapping */
+ int n_flows;
+ int first_flow;
+ int next_flow; /* first_flow + n_flows */
+ /*
+ * when generating, let 'cur' go from 0 to n_flows-1,
+ * then point to flow first_flow + cur
+ */
+ int cur;
+};
+
+struct dn_sch {
+};
+
+struct dn_flow {
+ struct dn_id oid;
+ int length;
+ int len_bytes;
+ int drops;
+ uint64_t tot_bytes;
+ uint32_t flow_id;
+ struct list_head h; /* used by the generator */
+};
+
+struct dn_link {
+};
+
+struct ip_fw_args {
+};
+
+struct mbuf {
+ struct {
+ int len;
+ } m_pkthdr;
+ struct mbuf *m_nextpkt;
+ int flow_id; /* for testing, index of a flow */
+ //int flowset_id; /* for testing, index of a flowset */
+ void *cfg; /* config args */
+};
+
+#define MALLOC_DECLARE(x)
+#define KASSERT(x, y) do { if (!(x)) printf y ; exit(0); } while (0)
+struct ipfw_flow_id {
+};
+
+typedef void * module_t;
+
+struct _md_t {
+ const char *name;
+ int (*f)(module_t, int, void *);
+ void *p;
+};
+
+typedef struct _md_t moduledata_t;
+
+#define DECLARE_MODULE(name, b, c, d) \
+ moduledata_t *_g_##name = & b
+#define MODULE_DEPEND(a, b, c, d, e)
+
+#ifdef IPFW
+#include <dn_heap.h>
+#include <ip_dn_private.h>
+#include <dn_sched.h>
+#else
+struct dn_queue {
+ struct dn_fsk *fs; /* parent flowset. */
+ struct dn_sch_inst *_si; /* parent sched instance. */
+};
+struct dn_schk {
+};
+struct dn_fsk {
+ struct dn_fs fs;
+ struct dn_schk *sched;
+};
+struct dn_sch_inst {
+ struct dn_schk *sched;
+};
+struct dn_alg {
+ int type;
+ const char *name;
+ void *enqueue, *dequeue;
+ int q_datalen, si_datalen, schk_datalen;
+ int (*config)(struct dn_schk *);
+ int (*new_sched)(struct dn_sch_inst *);
+ int (*new_fsk)(struct dn_fsk *);
+ int (*new_queue)(struct dn_queue *q);
+};
+
+#endif
+
+#ifndef __FreeBSD__
+int fls(int);
+#endif
+
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DN_TEST_H */
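mq_append() above maintains a singly linked tail queue of mbufs. A quick usage sketch (struct mq is assumed to be the plain head/tail pair the function implies):

    struct mq q = { .head = NULL, .tail = NULL };
    mq_append(&q, m1);
    mq_append(&q, m2);              /* q.head == m1, q.tail == m2 */
    struct mbuf *m = q.head;        /* dequeue from the head */
    if (m != NULL)
        q.head = m->m_nextpkt;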
diff --git a/sys/netinet/ipfw/test/main.c b/sys/netinet/ipfw/test/main.c
new file mode 100644
index 000000000000..be9fdf53612c
--- /dev/null
+++ b/sys/netinet/ipfw/test/main.c
@@ -0,0 +1,636 @@
+/*
+ * $FreeBSD$
+ *
+ * Testing program for schedulers
+ *
+ * The framework includes a simple controller which, at each
+ * iteration, decides whether we can enqueue and/or dequeue.
+ * Then the mainloop runs the required number of tests,
+ * keeping track of statistics.
+ */
+
+#include "dn_test.h"
+
+struct q_list {
+ struct list_head h;
+};
+
+struct cfg_s {
+ int ac;
+ char * const *av;
+
+ const char *name;
+ int loops;
+ struct timeval time;
+
+ /* running counters */
+ uint32_t _enqueue;
+ uint32_t drop;
+ uint32_t pending;
+ uint32_t dequeue;
+
+ /* generator parameters */
+ int th_min, th_max;
+ int maxburst;
+ int lmin, lmax; /* packet len */
+ int flows; /* number of flows */
+ int flowsets; /* number of flowsets */
+ int wsum; /* sum of weights of all flows */
+ int max_y; /* max random number in the generation */
+ int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */
+ const char *fs_config; /* flowset config */
+ int can_dequeue;
+ int burst; /* count of packets sent in a burst */
+ struct mbuf *tosend; /* packet to send -- also flag to enqueue */
+
+ struct mbuf *freelist;
+
+ struct mbuf *head, *tail; /* a simple tailq */
+
+ /* scheduler hooks */
+ int (*enq)(struct dn_sch_inst *, struct dn_queue *,
+ struct mbuf *);
+ struct mbuf * (*deq)(struct dn_sch_inst *);
+ /* size of the three fields including sched-specific areas */
+ int schk_len;
+ int q_len; /* size of a queue including sched-fields */
+ int si_len; /* size of a sch_inst including sched-fields */
+ char *q; /* array of flow queues */
+ /* use a char* because size is variable */
+ struct dn_fsk *fs; /* array of flowsets */
+ struct dn_sch_inst *si;
+ struct dn_schk *sched;
+
+ /* generator state */
+ int state; /* 0 = going up, 1: going down */
+
+ /*
+ * We keep lists for each backlog level, and always serve
+ * the one with shortest backlog. llmask contains a bitmap
+ * of lists, and ll are the heads of the lists. The last
+ * entry (BACKLOG) contains all entries considered 'full'
+ * XXX to optimize things, entry i could contain queues with
+ * 2^{i-1}+1 .. 2^i entries.
+ */
+#define BACKLOG 30
+ uint32_t llmask;
+ struct list_head ll[BACKLOG + 10];
+};
+
+/* FI2Q and Q2FI convert from a flow_id to its dn_queue and back.
+ * We cannot use plain pointer arithmetic because the queue size is variable.
+ */
+#define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i)))
+#define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len)
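Because each queue carries scheduler-private state, queue i lives exactly q_len bytes past queue i-1, and the macros simply scale by that stride. For instance:

    struct dn_queue *q = FI2Q(c, 5);    /* c->q + 5 * c->q_len */
    int id = Q2FI(c, q);                /* recovers 5 */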
+
+int debug = 0;
+
+struct dn_parms dn_cfg;
+
+static void controller(struct cfg_s *c);
+
+/* release a packet: put the mbuf back in the freelist, and move the
+ * queue to the list matching its new backlog.
+ */
+int
+drop(struct cfg_s *c, struct mbuf *m)
+{
+ struct dn_queue *q;
+ int i;
+
+ c->drop++;
+ q = FI2Q(c, m->flow_id);
+ i = q->ni.length; // XXX or ffs...
+
+ ND("q %p id %d current length %d", q, m->flow_id, i);
+ if (i < BACKLOG) {
+ struct list_head *h = &q->ni.h;
+ c->llmask &= ~(1<<(i+1));
+ c->llmask |= (1<<(i));
+ list_del(h);
+ list_add_tail(h, &c->ll[i]);
+ }
+ m->m_nextpkt = c->freelist;
+ c->freelist = m;
+ return 0;
+}
+
+/* enqueue returns non-zero when a packet is dropped */
+static int
+enqueue(struct cfg_s *c, void *_m)
+{
+ struct mbuf *m = _m;
+ if (c->enq)
+ return c->enq(c->si, FI2Q(c, m->flow_id), m);
+ if (c->head == NULL)
+ c->head = m;
+ else
+ c->tail->m_nextpkt = m;
+ c->tail = m;
+ return 0; /* default - success */
+}
+
+/* dequeue returns NON-NULL when a packet is available */
+static void *
+dequeue(struct cfg_s *c)
+{
+ struct mbuf *m;
+ if (c->deq)
+ return c->deq(c->si);
+ if ((m = c->head)) {
+ c->head = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ }
+ return m;
+}
+
+static int
+mainloop(struct cfg_s *c)
+{
+ int i;
+ struct mbuf *m;
+
+ for (i=0; i < c->loops; i++) {
+		/* implement hysteresis */
+ controller(c);
+ DX(3, "loop %d enq %d send %p rx %d",
+ i, c->_enqueue, c->tosend, c->can_dequeue);
+ if ( (m = c->tosend) ) {
+ c->_enqueue++;
+ if (enqueue(c, m)) {
+ drop(c, m);
+ ND("loop %d enqueue fail", i );
+ } else {
+ ND("enqueue ok");
+ c->pending++;
+ }
+ }
+ if (c->can_dequeue) {
+ c->dequeue++;
+ if ((m = dequeue(c))) {
+ c->pending--;
+ drop(c, m);
+ c->drop--; /* compensate */
+ }
+ }
+ }
+ DX(1, "mainloop ends %d", i);
+ return 0;
+}
+
+int
+dump(struct cfg_s *c)
+{
+ int i;
+ struct dn_queue *q;
+
+ for (i=0; i < c->flows; i++) {
+ q = FI2Q(c, i);
+ DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes);
+ }
+ DX(1, "done %d loops\n", c->loops);
+ return 0;
+}
+
+/* interpret a number in human-readable form */
+static long
+getnum(const char *s, char **next, const char *key)
+{
+ char *end = NULL;
+ long l;
+
+ if (next) /* default */
+ *next = NULL;
+ if (s && *s) {
+ DX(3, "token is <%s> %s", s, key ? key : "-");
+ l = strtol(s, &end, 0);
+ } else {
+ DX(3, "empty string");
+ l = -1;
+ }
+ if (l < 0) {
+ DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") );
+ return 0; // invalid
+ }
+ if (!end || !*end)
+ return l;
+ if (*end == 'n')
+ l = -l; /* multiply by n */
+ else if (*end == 'K')
+ l = l*1000;
+ else if (*end == 'M')
+ l = l*1000000;
+ else if (*end == 'k')
+ l = l*1024;
+ else if (*end == 'm')
+ l = l*1024*1024;
+ else if (*end == 'w')
+ ;
+ else {/* not recognized */
+ D("suffix %s for %s, next %p", end, key, next);
+ end--;
+ }
+ end++;
+ DX(3, "suffix now %s for %s, next %p", end, key, next);
+ if (next && *end) {
+ DX(3, "setting next to %s for %s", end, key);
+ *next = end;
+ }
+ return l;
+}
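+/*
+ * Examples of the suffixes accepted above (hypothetical inputs):
+ * "5K" -> 5000, "64k" -> 65536, "2M" -> 2000000, "8m" -> 8388608,
+ * "3n" -> -3 (a negative value means "multiply by the number of
+ * flows", resolved later in init()).
+ */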
+
+/*
+ * flowsets are a comma-separated list of
+ *	weight:maxlen:flows
+ * entries, each indicating how many flows are hooked to that fs.
+ * Both weight and maxlen can be given as min-max-steps ranges.
+ * In a first pass we just count the number of flowsets and flows,
+ * in a second pass we complete the setup.
+ */
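+/*
+ * Example (hypothetical values): "2:512:10,1-4-4:1000:5" defines one
+ * flowset with weight 2, maxlen 512 and 10 flows, plus four flowsets
+ * with weights 1,2,3,4 (four steps from 1 to 4), maxlen 1000 and
+ * 5 flows each: 30 flows in 5 flowsets overall.
+ */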
+static void
+parse_flowsets(struct cfg_s *c, const char *fs, int pass)
+{
+ char *s, *cur, *next;
+ int n_flows = 0, n_fs = 0, wsum = 0;
+ int i, j;
+ struct dn_fs *prev = NULL;
+
+ DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets);
+ if (pass == 0)
+ c->fs_config = fs;
+ s = c->fs_config ? strdup(c->fs_config) : NULL;
+ if (s == NULL) {
+ if (pass == 0)
+ D("no fsconfig");
+ return;
+ }
+ for (next = s; (cur = strsep(&next, ","));) {
+ char *p = NULL;
+ int w, w_h, w_steps, wi;
+ int len, len_h, l_steps, li;
+ int flows;
+
+ w = getnum(strsep(&cur, ":"), &p, "weight");
+ if (w <= 0)
+ w = 1;
+ w_h = p ? getnum(p+1, &p, "weight_max") : w;
+		w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ? 1 : 2);
+ len = getnum(strsep(&cur, ":"), &p, "len");
+ if (len <= 0)
+ len = 1000;
+ len_h = p ? getnum(p+1, &p, "len_max") : len;
+ l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2);
+ flows = getnum(strsep(&cur, ":"), NULL, "flows");
+ if (flows == 0)
+ flows = 1;
+ DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d",
+ w, w_h, w_steps, len, len_h, l_steps, flows);
+ if (w == 0 || w_h < w || len == 0 || len_h < len ||
+ flows == 0) {
+ DX(4,"wrong parameters %s", fs);
+ return;
+ }
+ n_flows += flows * w_steps * l_steps;
+ for (i = 0; i < w_steps; i++) {
+ wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1));
+ for (j = 0; j < l_steps; j++, n_fs++) {
+ struct dn_fs *fs = &c->fs[n_fs].fs; // tentative
+ int x;
+
+ li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1));
+ x = (wi*2048)/li;
+ DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d",
+ n_fs, wi, li, x, flows);
+ if (pass == 0)
+ continue;
+ if (c->fs == NULL || c->flowsets <= n_fs) {
+ D("error in number of flowsets");
+ return;
+ }
+ wsum += wi * flows;
+ fs->par[0] = wi;
+ fs->par[1] = li;
+ fs->index = n_fs;
+ fs->n_flows = flows;
+ fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow;
+ fs->next_flow = fs->first_flow + fs->n_flows;
+ fs->y = x * flows;
+ fs->base_y = (prev == NULL) ? 0 : prev->next_y;
+ fs->next_y = fs->base_y + fs->y;
+ prev = fs;
+ }
+ }
+ }
+ c->max_y = prev ? prev->base_y + prev->y : 0;
+ c->flows = n_flows;
+ c->flowsets = n_fs;
+ c->wsum = wsum;
+ if (pass == 0)
+ return;
+
+ /* now link all flows to their parent flowsets */
+ DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y);
+ for (i=0; i < c->flowsets; i++) {
+ struct dn_fs *fs = &c->fs[i].fs;
+ DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d",
+ i, fs->par[0], fs->par[1],
+ fs->first_flow, fs->next_flow,
+ fs->base_y, fs->next_y);
+ for (j = fs->first_flow; j < fs->next_flow; j++) {
+ struct dn_queue *q = FI2Q(c, j);
+ q->fs = &c->fs[i];
+ }
+ }
+}
+
+static int
+init(struct cfg_s *c)
+{
+ int i;
+ int ac = c->ac;
+ char * const *av = c->av;
+
+	moduledata_t *mod = NULL;
+	struct dn_alg *p = NULL;
+
+	c->si_len = sizeof(struct dn_sch_inst);
+	c->q_len = sizeof(struct dn_queue);
+
+ c->th_min = 0;
+ c->th_max = -20;/* 20 packets per flow */
+ c->lmin = c->lmax = 1280; /* packet len */
+ c->flows = 1;
+ c->flowsets = 1;
+ c->name = "null";
+ ac--; av++;
+ while (ac > 1) {
+ if (!strcmp(*av, "-n")) {
+ c->loops = getnum(av[1], NULL, av[0]);
+ } else if (!strcmp(*av, "-d")) {
+ debug = atoi(av[1]);
+ } else if (!strcmp(*av, "-alg")) {
+ extern moduledata_t *_g_dn_fifo;
+ extern moduledata_t *_g_dn_wf2qp;
+ extern moduledata_t *_g_dn_rr;
+ extern moduledata_t *_g_dn_qfq;
+#ifdef WITH_KPS
+ extern moduledata_t *_g_dn_kps;
+#endif
+ if (!strcmp(av[1], "rr"))
+ mod = _g_dn_rr;
+ else if (!strcmp(av[1], "wf2qp"))
+ mod = _g_dn_wf2qp;
+ else if (!strcmp(av[1], "fifo"))
+ mod = _g_dn_fifo;
+ else if (!strcmp(av[1], "qfq"))
+ mod = _g_dn_qfq;
+#ifdef WITH_KPS
+ else if (!strcmp(av[1], "kps"))
+ mod = _g_dn_kps;
+#endif
+ else
+ mod = NULL;
+ c->name = mod ? mod->name : "NULL";
+ DX(3, "using scheduler %s", c->name);
+ } else if (!strcmp(*av, "-len")) {
+ c->lmin = getnum(av[1], NULL, av[0]);
+ c->lmax = c->lmin;
+			DX(3, "setting len to %d", c->lmin);
+ } else if (!strcmp(*av, "-burst")) {
+ c->maxburst = getnum(av[1], NULL, av[0]);
+			DX(3, "setting burst to %d", c->maxburst);
+ } else if (!strcmp(*av, "-qmax")) {
+ c->th_max = getnum(av[1], NULL, av[0]);
+ DX(3, "setting max to %d", c->th_max);
+ } else if (!strcmp(*av, "-qmin")) {
+ c->th_min = getnum(av[1], NULL, av[0]);
+ DX(3, "setting min to %d", c->th_min);
+ } else if (!strcmp(*av, "-flows")) {
+ c->flows = getnum(av[1], NULL, av[0]);
+ DX(3, "setting flows to %d", c->flows);
+ } else if (!strcmp(*av, "-flowsets")) {
+ parse_flowsets(c, av[1], 0);
+ DX(3, "setting flowsets to %d", c->flowsets);
+ } else {
+ D("option %s not recognised, ignore", *av);
+ }
+ ac -= 2; av += 2;
+ }
+ if (c->maxburst <= 0)
+ c->maxburst = 1;
+ if (c->loops <= 0)
+ c->loops = 1;
+ if (c->flows <= 0)
+ c->flows = 1;
+ if (c->flowsets <= 0)
+ c->flowsets = 1;
+ if (c->lmin <= 0)
+ c->lmin = 1;
+ if (c->lmax <= 0)
+ c->lmax = 1;
+ /* multiply by N */
+ if (c->th_min < 0)
+ c->th_min = c->flows * -c->th_min;
+ if (c->th_max < 0)
+ c->th_max = c->flows * -c->th_max;
+ if (c->th_max <= c->th_min)
+ c->th_max = c->th_min + 1;
+ if (mod) {
+ p = mod->p;
+ DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p);
+ DX(3, "modname %s ty %d", p->name, p->type);
+ c->enq = p->enqueue;
+ c->deq = p->dequeue;
+ c->si_len += p->si_datalen;
+ c->q_len += p->q_datalen;
+ c->schk_len += p->schk_datalen;
+ }
+ /* allocate queues, flowsets and one scheduler */
+ c->q = calloc(c->flows, c->q_len);
+ c->fs = calloc(c->flowsets, sizeof(struct dn_fsk));
+ c->si = calloc(1, c->si_len);
+ c->sched = calloc(c->flows, c->schk_len);
+	if (c->q == NULL || c->fs == NULL || c->si == NULL) {
+ D("error allocating memory for flows");
+ exit(1);
+ }
+ c->si->sched = c->sched;
+ if (p) {
+ if (p->config)
+ p->config(c->sched);
+ if (p->new_sched)
+ p->new_sched(c->si);
+ }
+ /* parse_flowsets links queues to their flowsets */
+	parse_flowsets(c, c->fs_config, 1);
+ /* complete the work calling new_fsk */
+ for (i = 0; i < c->flowsets; i++) {
+ if (c->fs[i].fs.par[1] == 0)
+ c->fs[i].fs.par[1] = 1000; /* default pkt len */
+ c->fs[i].sched = c->sched;
+ if (p && p->new_fsk)
+ p->new_fsk(&c->fs[i]);
+ }
+
+ /* initialize the lists for the generator, and put
+ * all flows in the list for backlog = 0
+ */
+ for (i=0; i <= BACKLOG+5; i++)
+ INIT_LIST_HEAD(&c->ll[i]);
+
+ for (i = 0; i < c->flows; i++) {
+ struct dn_queue *q = FI2Q(c, i);
+ if (q->fs == NULL)
+ q->fs = &c->fs[0]; /* XXX */
+ q->_si = c->si;
+ if (p && p->new_queue)
+ p->new_queue(q);
+ INIT_LIST_HEAD(&q->ni.h);
+ list_add_tail(&q->ni.h, &c->ll[0]);
+ }
+ c->llmask = 1;
+ return 0;
+}
+
+
+int
+main(int ac, char *av[])
+{
+ struct cfg_s c;
+ struct timeval end;
+ double ll;
+ int i;
+ char msg[40];
+
+ bzero(&c, sizeof(c));
+ c.ac = ac;
+ c.av = av;
+ init(&c);
+ gettimeofday(&c.time, NULL);
+ mainloop(&c);
+ gettimeofday(&end, NULL);
+ end.tv_sec -= c.time.tv_sec;
+ end.tv_usec -= c.time.tv_usec;
+ if (end.tv_usec < 0) {
+ end.tv_usec += 1000000;
+ end.tv_sec--;
+ }
+ c.time = end;
+ ll = end.tv_sec*1000000 + end.tv_usec;
+ ll *= 1000; /* convert to nanoseconds */
+ ll /= c._enqueue;
+ sprintf(msg, "1::%d", c.flows);
+ D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d",
+ c.name, c._enqueue, c.loops,
+ (int)c.time.tv_sec, (int)c.time.tv_usec, ll,
+ c.th_min, c.th_max,
+ c.fs_config ? c.fs_config : msg, c.drop);
+ dump(&c);
+ DX(1, "done ac %d av %p", ac, av);
+ for (i=0; i < ac; i++)
+ DX(1, "arg %d %s", i, av[i]);
+ return 0;
+}
+
+/*
+ * The controller decides whether in this iteration we should send
+ * (the packet is in c->tosend) and/or receive (flag c->can_dequeue)
+ */
+static void
+controller(struct cfg_s *c)
+{
+ struct mbuf *m;
+ struct dn_fs *fs;
+ int flow_id;
+
+	/* hysteresis between max and min */
+ if (c->state == 0 && c->pending >= c->th_max)
+ c->state = 1;
+ else if (c->state == 1 && c->pending <= c->th_min)
+ c->state = 0;
+ ND(1, "state %d pending %2d", c->state, c->pending);
+ c->can_dequeue = c->state;
+ c->tosend = NULL;
+ if (c->state)
+ return;
+
+ if (1) {
+ int i;
+ struct dn_queue *q;
+ struct list_head *h;
+
+ i = ffs(c->llmask) - 1;
+ if (i < 0) {
+ DX(2, "no candidate");
+ c->can_dequeue = 1;
+ return;
+ }
+ h = &c->ll[i];
+ ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next);
+ q = list_first_entry(h, struct dn_queue, ni.h);
+ list_del(&q->ni.h);
+ flow_id = Q2FI(c, q);
+ DX(2, "extracted flow %p %d backlog %d", q, flow_id, i);
+ if (list_empty(h)) {
+ ND(2, "backlog %d empty", i);
+ c->llmask &= ~(1<<i);
+ }
+ ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+ list_add_tail(&q->ni.h, h+1);
+ ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+ if (i < BACKLOG) {
+ ND(2, "backlog %d full", i+1);
+ c->llmask |= 1<<(1+i);
+ }
+ fs = &q->fs->fs;
+ c->cur_fs = q->fs - c->fs;
+ fs->cur = flow_id;
+ } else {
+ /* XXX this does not work ? */
+ /* now decide whom to send the packet, and the length */
+ /* lookup in the flow table */
+ if (c->cur_y >= c->max_y) { /* handle wraparound */
+ c->cur_y = 0;
+ c->cur_fs = 0;
+ }
+ fs = &c->fs[c->cur_fs].fs;
+ flow_id = fs->cur++;
+ if (fs->cur >= fs->next_flow)
+ fs->cur = fs->first_flow;
+ c->cur_y++;
+ if (c->cur_y >= fs->next_y)
+ c->cur_fs++;
+ }
+
+ /* construct a packet */
+ if (c->freelist) {
+ m = c->tosend = c->freelist;
+ c->freelist = c->freelist->m_nextpkt;
+ } else {
+ m = c->tosend = calloc(1, sizeof(struct mbuf));
+ }
+ if (m == NULL)
+ return;
+
+ m->cfg = c;
+ m->m_nextpkt = NULL;
+ m->m_pkthdr.len = fs->par[1]; // XXX maxlen
+ m->flow_id = flow_id;
+
+ ND(2,"y %6d flow %5d fs %3d weight %4d len %4d",
+ c->cur_y, m->flow_id, c->cur_fs,
+ fs->par[0], m->m_pkthdr.len);
+
+}
+
+/*
+Packet allocation:
+to achieve a distribution that matches weights, for each X=w/lmax class
+we should generate a number of packets proportional to Y = X times the number
+of flows in the class.
+So we construct an array with the cumulative distribution of Y's,
+and use it to identify the flow via inverse mapping (if the Y's are
+not too many we can use an array for the lookup). In practice,
+each flow will have X entries [virtually] pointing to it.
+
+*/
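+/*
+Worked example (hypothetical numbers): two flowsets, fs0 with
+weight 1, lmax 1024 and 3 flows, fs1 with weight 2, lmax 1024 and
+1 flow. Then X0 = 1*2048/1024 = 2 and Y0 = 2*3 = 6, while
+X1 = 2*2048/1024 = 4 and Y1 = 4*1 = 4, so max_y = 10.
+A random y in 0..5 selects one of the fs0 flows, y in 6..9 selects
+the fs1 flow, which is therefore served twice as often as each
+fs0 flow.
+*/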
diff --git a/sys/netinet/ipfw/test/mylist.h b/sys/netinet/ipfw/test/mylist.h
new file mode 100644
index 000000000000..6247f32ea4e4
--- /dev/null
+++ b/sys/netinet/ipfw/test/mylist.h
@@ -0,0 +1,49 @@
+/*
+ * $FreeBSD$
+ *
+ * linux-like bidirectional lists
+ */
+
+#ifndef _MYLIST_H
+#define _MYLIST_H
+struct list_head {
+ struct list_head *prev, *next;
+};
+
+#define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0)
+#define list_empty(l)	( (l)->next == (l) )
+static inline void
+__list_add(struct list_head *o, struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = o;
+ o->next = next;
+ o->prev = prev;
+ prev->next = o;
+}
+
+static inline void
+list_add_tail(struct list_head *o, struct list_head *head)
+{
+ __list_add(o, head->prev, head);
+}
+
+#define list_first_entry(pL, ty, member) \
+ (ty *)((char *)((pL)->next) - offsetof(ty, member))
+
+static inline void
+__list_del(struct list_head *prev, struct list_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+static inline void
+list_del(struct list_head *entry)
+{
+ ND("called on %p", entry);
+ __list_del(entry->prev, entry->next);
+ entry->next = entry->prev = NULL;
+}
+
+#endif /* _MYLIST_H */
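+/*
+ * Usage sketch (illustrative only, names are hypothetical):
+ *
+ *	struct item { int v; struct list_head h; };
+ *	struct list_head head;
+ *	struct item a;
+ *
+ *	INIT_LIST_HEAD(&head);
+ *	list_add_tail(&a.h, &head);
+ *	struct item *p = list_first_entry(&head, struct item, h);
+ *	list_del(&p->h);	// p == &a
+ */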
diff --git a/sys/netinet/ipfw/test/test_dn_heap.c b/sys/netinet/ipfw/test/test_dn_heap.c
new file mode 100644
index 000000000000..d460cf2ff36b
--- /dev/null
+++ b/sys/netinet/ipfw/test/test_dn_heap.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Userland code for testing binary heaps and hash tables
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x...) fprintf(stderr, ## x), exit(1)
+
+#include <string.h>
+
+struct x {
+ struct x *ht_link;
+	char buf[];	/* flexible array member, sized at malloc time */
+};
+
+uint32_t hf(uintptr_t key, int flags, void *arg)
+{
+ return (flags & DNHT_KEY_IS_OBJ) ?
+ ((struct x *)key)->buf[0] : *(char *)key;
+}
+
+int matchf(void *obj, uintptr_t key, int flags, void *arg)
+{
+ char *s = (flags & DNHT_KEY_IS_OBJ) ?
+ ((struct x *)key)->buf : (char *)key;
+ return (strcmp(((struct x *)obj)->buf, s) == 0);
+}
+
+void *newfn(uintptr_t key, int flags, void *arg)
+{
+ char *s = (char *)key;
+ struct x *p = malloc(sizeof(*p) + 1 + strlen(s));
+ if (p)
+ strcpy(p->buf, s);
+ return p;
+}
+
+char *strings[] = {
+ "undici", "unico", "doppio", "devoto",
+ "uno", "due", "tre", "quattro", "cinque", "sei",
+ "uno", "due", "tre", "quattro", "cinque", "sei",
+ NULL,
+};
+
+int doprint(void *_x, void *arg)
+{
+ struct x *x = _x;
+ printf("found element <%s>\n", x->buf);
+	return (int)(uintptr_t)arg;
+}
+
+static void
+test_hash()
+{
+ char **p;
+ struct dn_ht *h;
+ uintptr_t x = 0;
+ uintptr_t x1 = 0;
+
+ /* first, find and allocate */
+ h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn);
+
+ for (p = strings; *p; p++) {
+ dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL);
+ }
+ dn_ht_scan(h, doprint, 0);
+ printf("/* second -- find without allocate */\n");
+ h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL);
+ for (p = strings; *p; p++) {
+		struct x *y = newfn((uintptr_t)*p, 0, NULL);
+ if (x == 0)
+ x = (uintptr_t)y;
+ else {
+ if (x1 == 0)
+ x1 = (uintptr_t)*p;
+ }
+ dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
+ }
+ dn_ht_scan(h, doprint, 0);
+ printf("remove %p gives %p\n", (void *)x,
+ dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+ printf("remove %p gives %p\n", (void *)x,
+ dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x1,
+ dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x1,
+ dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+ dn_ht_scan(h, doprint, 0);
+}
+
+int
+main(int argc, char *argv[])
+{
+ struct dn_heap h;
+ int i, n, n2, n3;
+
+ test_hash();
+	return 0;	/* XXX the heap test below is currently unreachable */
+
+ /* n = elements, n2 = cycles */
+ n = (argc > 1) ? atoi(argv[1]) : 0;
+ if (n <= 0 || n > 1000000)
+ n = 100;
+ n2 = (argc > 2) ? atoi(argv[2]) : 0;
+ if (n2 <= 0)
+		n2 = 1000000;
+ n3 = (argc > 3) ? atoi(argv[3]) : 0;
+ bzero(&h, sizeof(h));
+ heap_init(&h, n, -1);
+ while (n2-- > 0) {
+ uint64_t prevk = 0;
+ for (i=0; i < n; i++)
+			heap_insert(&h, n3 ? n - i : random(), (void *)(uintptr_t)(100 + i));
+
+ for (i=0; h.elements > 0; i++) {
+ uint64_t k = h.p[0].key;
+ if (k < prevk)
+ panic("wrong sequence\n");
+ prevk = k;
+ if (0)
+			printf("%d key %llu, val %p\n",
+			    i, (unsigned long long)h.p[0].key, h.p[0].object);
+ heap_extract(&h, NULL);
+ }
+ }
+ return 0;
+}
diff --git a/sys/netinet/ipfw/test/test_dn_sched.c b/sys/netinet/ipfw/test/test_dn_sched.c
new file mode 100644
index 000000000000..ee46c95ed868
--- /dev/null
+++ b/sys/netinet/ipfw/test/test_dn_sched.c
@@ -0,0 +1,89 @@
+/*
+ * $FreeBSD$
+ *
+ * library functions for userland testing of dummynet schedulers
+ */
+
+#include "dn_test.h"
+
+void
+m_freem(struct mbuf *m)
+{
+ printf("free %p\n", m);
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+ return 0;
+}
+
+void
+dn_free_pkts(struct mbuf *m)
+{
+ struct mbuf *x;
+ while ( (x = m) ) {
+ m = m->m_nextpkt;
+ m_freem(x);
+ }
+}
+
+int
+dn_delete_queue(void *_q, void *do_free)
+{
+ struct dn_queue *q = _q;
+ if (q->mq.head)
+ dn_free_pkts(q->mq.head);
+ free(q);
+ return 0;
+}
+
+/*
+ * This is a simplified function for testing purposes, which does
+ * not implement statistics or random loss.
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyway.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+ if (drop)
+ goto drop;
+ if (q->ni.length >= 200)
+ goto drop;
+ mq_append(&q->mq, m);
+ q->ni.length++;
+ q->ni.tot_bytes += m->m_pkthdr.len;
+ return 0;
+
+drop:
+ q->ni.drops++;
+ return 1;
+}
+
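+/*
+ * Simplified bound check: values below lo fall back to dflt,
+ * values above hi are clamped to hi; msg is unused in this
+ * userland stub.
+ */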
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+ if (*v < lo) {
+ *v = dflt;
+ } else if (*v > hi) {
+ *v = hi;
+ }
+ return *v;
+}
+
+#ifndef __FreeBSD__
+int
+fls(int mask)
+{
+ int bit;
+
+ if (mask == 0)
+ return (0);
+ for (bit = 1; mask != 1; bit++)
+ mask = (unsigned int)mask >> 1;
+ return (bit);
+}
+#endif
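+/*
+ * Sanity examples for the fallback fls() above: fls(0) == 0,
+ * fls(1) == 1, fls(0x80) == 8 and fls(-1) == 32 for a 32-bit int.
+ */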