/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: src/sys/geom/sched/gs_rr.c,v 1.2.2.2.4.1 2010/12/21 17:09:25 kensmith Exp $
 *
 * A round-robin (RR) anticipatory scheduler, with per-client queues.
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and to ensure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reduce seeks, and each queue is never
 * served continuously for more than a given amount of time or data.
 * Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *     READY	immediately serve the first pending request;
 *     BUSY	one request is under service, wait for completion;
 *     IDLING	do not serve incoming requests immediately, unless
 * 		they come from the client we are anticipating.
 *
 * Scheduling is done by looking at the status of all queues,
 * and the first one in round-robin order is privileged.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"

/* possible states of a queue */
enum g_rr_state {
	G_QUEUE_READY = 0,	/* Ready to dispatch. */
	G_QUEUE_BUSY,		/* Waiting for a completion. */
	G_QUEUE_IDLING		/* Waiting for a new request. */
};

/* possible queue flags */
enum g_rr_flags {
	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
};

struct g_rr_softc;

/*
 * Queue descriptor, containing reference count, scheduling
 * state, a queue of pending requests, configuration parameters.
 * Queues with pending request(s) and not under service are also
 * stored in a Round Robin (RR) list.
 */
struct g_rr_queue {
	struct g_rr_softc *q_sc;	/* link to the parent */

	enum g_rr_state	q_status;
	unsigned int	q_service;	/* service received so far */
	int		q_slice_end;	/* actual slice end in ticks */
	enum g_rr_flags	q_flags;	/* queue flags */
	struct bio_queue_head q_bioq;

	/* Scheduling parameters */
	unsigned int	q_budget;	/* slice size in bytes */
	unsigned int	q_slice_duration; /* slice size in ticks */
	unsigned int	q_wait_ticks;	/* wait time for anticipation */

	/* Stats to drive the various heuristics. */
	struct g_savg	q_thinktime;	/* Thinktime average. */
	struct g_savg	q_seekdist;	/* Seek distance average. */

	int		q_bionum;	/* Number of requests. */

	off_t		q_lastoff;	/* Last submitted req. offset. */
	int		q_lastsub;	/* Last submitted req. time. */

	/* Expiration deadline for an empty queue. */
	int		q_expire;

	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
};

/* List types. */
TAILQ_HEAD(g_rr_tailq, g_rr_queue);

/* list of scheduler instances */
LIST_HEAD(g_scheds, g_rr_softc);

/* Default quantum (byte budget) for RR between queues: 8 MiB. */
#define	G_RR_DEFAULT_BUDGET	0x00800000

/*
 * Per device descriptor, holding the Round Robin list of queues
 * accessing the disk, a reference to the geom, and the timer.
 */
struct g_rr_softc {
	struct g_geom	*sc_geom;

	/*
	 * sc_active is the queue we are anticipating for.
	 * It is set only in g_rr_next(), and possibly cleared
	 * only in g_rr_next() or on a timeout.
	 * The active queue is never in the Round Robin list
	 * even if it has requests queued.
	 */
	struct g_rr_queue *sc_active;
	struct callout	sc_wait;	/* timer for sc_active */

	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
	int		sc_nqueues;	/* number of queues */

	/* Statistics */
	int		sc_in_flight;	/* requests in the driver */

	LIST_ENTRY(g_rr_softc)	sc_next;
};

/* Descriptor for bounded values, min and max are constant. */
struct x_bound {		
	const int	x_min;
	int		x_cur;
	const int	x_max;
};

/*
 * parameters, config and stats
 */
struct g_rr_params {
	int	queues;			/* total number of queues */
	int	w_anticipate;		/* anticipate writes */
	int	bypass;			/* bypass the scheduler */

	int	units;			/* how many instances */
	/* sc_head is used for debugging */
	struct g_scheds	sc_head;	/* first scheduler instance */

	struct x_bound queue_depth;	/* max parallel requests */
	struct x_bound wait_ms;		/* wait time, milliseconds */
	struct x_bound quantum_ms;	/* quantum size, milliseconds */
	struct x_bound quantum_kb;	/* quantum size, KiB (1024 bytes) */

	/* statistics */
	int	wait_hit;		/* success in anticipation */
	int	wait_miss;		/* failure in anticipation */
};

/*
 * Default parameters for the scheduler.  The quantum sizes target
 * an 80 MB/s disk; if the hardware is faster or slower, the smaller
 * of the time and byte budgets takes effect: clients are still
 * isolated, but fairness may be limited.  A complete solution would
 * measure the actual disk throughput on-line to derive these
 * parameters.  Or we may just choose to ignore service-domain
 * fairness and accept what can be achieved with time-only budgets.
 */
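/*
 * Worked example with the defaults below: at 80 MB/s, the 100 ms
 * quantum_ms slice moves about 8 MB, which matches the 8192 KiB
 * quantum_kb byte budget.  On a faster disk the byte budget expires
 * first, on a slower one the time budget does (see
 * g_rr_queue_expired()).
 */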
static struct g_rr_params me = {
	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
	.w_anticipate =	1,
	.queue_depth =	{ 1,	1,	50 },
	.wait_ms =	{ 1, 	10,	30 },
	.quantum_ms =	{ 1, 	100,	500 },
	.quantum_kb =	{ 16, 	8192,	65536 },
};

struct g_rr_params *gs_rr_me = &me;

SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
    "GEOM_SCHED ROUND ROBIN stuff");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total rr queues");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
    &me.bypass, 0, "Bypass scheduler");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
    &me.w_anticipate, 0, "Do anticipation on writes");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
    &me.quantum_kb.x_cur, 0, "Quantum size KiB");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
    &me.wait_hit, 0, "Hits in anticipation");
SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
    &me.wait_miss, 0, "Misses in anticipation");

#ifdef DEBUG_QUEUES
/* print the status of a queue */
static void
gs_rr_dump_q(struct g_rr_queue *qp, int index)
{
	int l = 0;
	struct bio *bp;

	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
		l++;
	}
	printf("--- rr queue %d %p status %d len %d ---\n",
	    index, qp, qp->q_status, l);
}

/*
 * Dump the scheduler status when writing to this sysctl variable.
 * XXX only the active queue and the queues waiting in the RR list
 * are reachable from here; queues with no pending requests are not
 * dumped.  Not a severe issue because this is only for debugging.
 */
static int
gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
{
	struct g_rr_softc *sc;
	int error, val = 0;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	printf("called %s\n", __func__);

	LIST_FOREACH(sc, &me.sc_head, sc_next) {
		struct g_rr_queue *qp;
		int tot = 0;

		printf("--- sc %p active %p nqueues %d "
		    "callout %d in_flight %d ---\n",
		    sc, sc->sc_active, sc->sc_nqueues,
		    callout_active(&sc->sc_wait),
		    sc->sc_in_flight);
		if (sc->sc_active != NULL)
			gs_rr_dump_q(sc->sc_active, tot++);
		TAILQ_FOREACH(qp, &sc->sc_rr_tailq, q_tailq)
			gs_rr_dump_q(qp, tot++);
	}
	return (0);
}

SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
    CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), gs_rr_sysctl_status, "I", "status");

#endif	/* DEBUG_QUEUES */

/*
 * Return the configured value clamped to [x_min, x_max].  If t_min
 * is non-zero, also convert the value from milliseconds to ticks,
 * enforcing a minimum of t_min ticks.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int x;

	x = v->x_cur;
	if (x < v->x_min)
		x = v->x_min;
	else if (x > v->x_max)
		x = v->x_max;
	if (t_min) {
		x = x * hz / 1000;	/* convert to ticks */
		if (x < t_min)
			x = t_min;
	}
	return (x);
}

/*
 * Get a reference to the queue for bp, using the generic
 * classification mechanism.
 */
static struct g_rr_queue *
g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
{

	return (g_sched_get_class(sc->sc_geom, bp));
}

static int
g_rr_init_class(void *data, void *priv)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp = priv;

	gs_bioq_init(&qp->q_bioq);

	/*
	 * Set the initial parameters for the client:
	 * slice size in bytes and ticks, and wait ticks.
	 * Right now these are constant, but we could have
	 * autoconfiguration code to adjust the values based on
	 * the actual workload.
	 */
	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);

	qp->q_sc = sc;		/* link to the parent */
	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}

/*
 * Release a reference to the queue.
 */
static void
g_rr_queue_put(struct g_rr_queue *qp)
{

	g_sched_put_class(qp->q_sc->sc_geom, qp);
}

static void
g_rr_fini_class(void *data, void *priv)
{
	struct g_rr_queue *qp = priv;

	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
			("released nonempty queue"));
	qp->q_sc->sc_nqueues--;
	me.queues--;
}

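/*
 * A queue's slice is over when it has consumed its byte budget, or
 * when it has completed at least one request in this slice and the
 * time budget has elapsed.  The "ticks - q_slice_end >= 0" form is
 * the usual wraparound-tolerant way to compare tick counts.
 */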
static inline int
g_rr_queue_expired(struct g_rr_queue *qp)
{

	if (qp->q_service >= qp->q_budget)
		return (1);

	if ((qp->q_flags & G_FLAG_COMPLETED) &&
	    ticks - qp->q_slice_end >= 0)
		return (1);

	return (0);
}

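/*
 * Decide whether it is worth idling (anticipating) after dispatching
 * the last queued request of this client: skip anticipation after a
 * write unless w_anticipate is set, and skip it for clients whose
 * average thinktime exceeds the anticipation wait or whose average
 * seek distance exceeds 8 KiB, since waiting for them is unlikely
 * to save a seek.
 */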
static inline int
g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
{
	int wait = get_bounded(&me.wait_ms, 2);

	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
		return (0);

	if (g_savg_valid(&qp->q_thinktime) &&
	    g_savg_read(&qp->q_thinktime) > wait)
		return (0);

	if (g_savg_valid(&qp->q_seekdist) &&
	    g_savg_read(&qp->q_seekdist) > 8192)
		return (0);

	return (1);
}

/*
 * Called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 */
static struct bio *
g_rr_next(void *data, int force)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;
	struct bio *bp, *next;
	int expired;

	qp = sc->sc_active;
	if (me.bypass == 0 && !force) {
		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
			return (NULL);

		/* Try with the queue under service first. */
		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
			/*
			 * Queue is anticipating, ignore request.
			 * We should check that we are not past
			 * the timeout, but in that case the timeout
			 * will fire immediately afterwards so we
			 * don't bother.
			 */
			return (NULL);
		}
	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
		g_rr_queue_put(qp);
		sc->sc_active = qp = NULL;
	}

	/*
	 * No queue under service, look for the first in RR order.
	 * If we find one, select it as sc_active and reset its service;
	 * the end time of the slice is recorded in g_rr_done() when the
	 * first request of the new slice completes.
	 */
	if (qp == NULL) {
		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
		if (qp == NULL)
			return (NULL); /* no queues at all, return */
		/* otherwise select the new queue for service. */
		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
		sc->sc_active = qp;
		qp->q_service = 0;
		qp->q_flags &= ~G_FLAG_COMPLETED;
	}

	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
	qp->q_service += bp->bio_length;	/* charge the service */

	/*
	 * The request at the head of the active queue is always
	 * dispatched, and g_rr_next() will be called again
	 * immediately.
	 * We need to prepare for what to do next:
	 *
	 * 1. have we reached the end of the (time or service) slice ?
	 *    If so, clear sc_active and possibly requeue the previous
	 *    active queue if it has more requests pending;
	 * 2. do we have more requests in sc_active ?
	 *    If yes, do not anticipate, as g_rr_next() will run again;
	 *    if no, decide whether or not to anticipate depending
	 *    on reads or writes (e.g., anticipate only on reads).
	 */
	expired = g_rr_queue_expired(qp);	/* are we expired ? */
	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
	if (expired) {
		sc->sc_active = NULL;
		/* Either requeue or release reference. */
		if (next != NULL)
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		else
			g_rr_queue_put(qp);
	} else if (next != NULL) {
		qp->q_status = G_QUEUE_READY;
	} else {
		if (!force && g_rr_should_anticipate(qp, bp)) {
			/* anticipate */
			qp->q_status = G_QUEUE_BUSY;
		} else {
			/* do not anticipate, release reference */
			g_rr_queue_put(qp);
			sc->sc_active = NULL;
		}
	}
	/* If sc_active != NULL, its q_status is always correct. */

	sc->sc_in_flight++;

	return (bp);
}

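/*
 * Update the average time between consecutive requests ("thinktime")
 * for the active queue.  The sample is clamped to twice the
 * anticipation wait so a single long pause does not dominate the
 * average, and samples are only collected once the queue has issued
 * enough requests (q_bionum > 7).
 */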
static inline void
g_rr_update_thinktime(struct g_rr_queue *qp)
{
	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);

	if (qp->q_sc->sc_active != qp)
		return;

	qp->q_lastsub = ticks;
	delta = (delta > 2 * wait) ? 2 * wait : delta;
	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_thinktime, delta);
}

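/*
 * Update the average seek distance: the gap between the new request
 * and the end of the last one submitted for this queue, clamped to
 * 64 KiB.  As above, samples are only collected once q_bionum > 7.
 */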
static inline void
g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
{
	off_t dist;

	if (qp->q_lastoff > bp->bio_offset)
		dist = qp->q_lastoff - bp->bio_offset;
	else
		dist = bp->bio_offset - qp->q_lastoff;

	if (dist > (8192 * 8))
		dist = 8192 * 8;

	qp->q_lastoff = bp->bio_offset + bp->bio_length;

	if (qp->q_bionum > 7)
		g_savg_add_sample(&qp->q_seekdist, dist);
}

/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client.
 * If the queue is the one we are anticipating for, reset its timeout;
 * if the queue is not in the round robin list, insert it in the list.
 * On any error, do not queue the request and return -1; the caller
 * will take care of it.
 */
static int
g_rr_start(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	if (me.bypass)
		return (-1);	/* bypass the scheduler */

	/* Get the queue for the request. */
	qp = g_rr_queue_get(sc, bp);
	if (qp == NULL)
		return (-1); /* allocation failed, tell upstream */

	if (gs_bioq_first(&qp->q_bioq) == NULL) {
		/*
		 * We are inserting into an empty queue.
		 * Reset its state if it is sc_active,
		 * otherwise insert it in the RR list.
		 */
		if (qp == sc->sc_active) {
			qp->q_status = G_QUEUE_READY;
			callout_stop(&sc->sc_wait);
		} else {
			g_sched_priv_ref(qp);
			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
		}
	}

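	/*
	 * q_bionum counts requests with a mild decay (+1 per request,
	 * minus 1/8 of its value), so it climbs to 8 after the first
	 * eight requests and then stays there.  The thinktime/seekdist
	 * heuristics only trust their averages once q_bionum > 7.
	 */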
	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);

	g_rr_update_thinktime(qp);
	g_rr_update_seekdist(qp, bp);

	/* Inherit the reference returned by g_rr_queue_get(). */
	bp->bio_caller1 = qp;
	gs_bioq_disksort(&qp->q_bioq, bp);

	return (0);
}

/*
 * Callout executed when a queue times out anticipating a new request.
 */
static void
g_rr_wait_timeout(void *data)
{
	struct g_rr_softc *sc = data;
	struct g_geom *geom = sc->sc_geom;

	g_sched_lock(geom);
	/*
	 * We can race with other events, so check if
	 * sc_active is still valid.
	 */
	if (sc->sc_active != NULL) {
		/* Release the reference to the queue. */
		g_rr_queue_put(sc->sc_active);
		sc->sc_active = NULL;
		me.wait_hit--;
		me.wait_miss++;	/* record the miss */
	}
	g_sched_dispatch(geom);
	g_sched_unlock(geom);
}

/*
 * Module glue: allocate descriptor, initialize its fields.
 */
static void *
g_rr_init(struct g_geom *geom)
{
	struct g_rr_softc *sc;

	/* XXX check whether we can sleep */
	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	sc->sc_geom = geom;
	TAILQ_INIT(&sc->sc_rr_tailq);
	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
	me.units++;

	return (sc);
}

/*
 * Module glue -- drain the callout structure and free the
 * descriptor.
 */
static void
g_rr_fini(void *data)
{
	struct g_rr_softc *sc = data;

	callout_drain(&sc->sc_wait);
	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));

	LIST_REMOVE(sc, sc_next);
	me.units--;
	free(sc, M_GEOM_SCHED);
}

/*
 * Called when the request under service terminates.
 * Start the anticipation timer if needed.
 */
static void
g_rr_done(void *data, struct bio *bp)
{
	struct g_rr_softc *sc = data;
	struct g_rr_queue *qp;

	sc->sc_in_flight--;

	qp = bp->bio_caller1;
	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
		if (!(qp->q_flags & G_FLAG_COMPLETED)) {
			qp->q_flags |= G_FLAG_COMPLETED;
			/* in case we want to make the slice adaptive */
			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
			qp->q_slice_end = ticks + qp->q_slice_duration;
		}

		/* The queue is trying anticipation, start the timer. */
		qp->q_status = G_QUEUE_IDLING;
		/* may make this adaptive */
		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
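		/*
		 * Count the anticipation as a hit right away; if the
		 * timer below fires before a new request shows up,
		 * g_rr_wait_timeout() turns it back into a miss.
		 */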
		me.wait_hit++;
		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
		    g_rr_wait_timeout, sc);
	} else
		g_sched_dispatch(sc->sc_geom);

	/* Release a reference to the queue. */
	g_rr_queue_put(qp);
}

static void
g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	if (indent == NULL) {   /* plaintext */
		sbuf_printf(sb, " units %d queues %d",
			me.units, me.queues);
	}
}

static struct g_gsched g_rr = {
	.gs_name = "rr",
	.gs_priv_size = sizeof(struct g_rr_queue),
	.gs_init = g_rr_init,
	.gs_fini = g_rr_fini,
	.gs_start = g_rr_start,
	.gs_done = g_rr_done,
	.gs_next = g_rr_next,
	.gs_dumpconf = g_rr_dumpconf,
	.gs_init_class = g_rr_init_class,
	.gs_fini_class = g_rr_fini_class,
};

DECLARE_GSCHED_MODULE(rr, &g_rr);