<?xml version="1.0" encoding="iso-8859-1"?>
<!--
	The Vinum Volume Manager
	By Greg Lehey (grog at lemis dot com)

	Added to the Handbook by Hiten Pandya <hmp@FreeBSD.org>
	and Tom Rhodes <trhodes@FreeBSD.org>

	For the FreeBSD Documentation Project
	$FreeBSD$
-->
<chapter xmlns="http://docbook.org/ns/docbook" xmlns:xlink="http://www.w3.org/1999/xlink" version="5.0" xml:id="vinum-vinum">
  <info><title>The <filename>vinum</filename> Volume Manager</title>
    <authorgroup>
      <author><personname><firstname>Greg</firstname><surname>Lehey</surname></personname><contrib>Originally written by </contrib></author>
    </authorgroup>
  </info>

  

  <sect1 xml:id="vinum-synopsis">
    <title>Synopsis</title>

    <para>No matter the type of disks, there are always potential
      problems.  The disks can be too small, too slow, or too
      unreliable to meet the system's requirements.  While disks are
      getting bigger, so are data storage requirements.  Often a file
      system is needed that is bigger than a disk's capacity.  Various
      solutions to these problems have been proposed and
      implemented.</para>

    <para>One method is through the use of multiple, and sometimes
      redundant, disks.  In addition to supporting various cards and
      controllers for hardware Redundant Array of Independent
      Disks <acronym>RAID</acronym> systems, the base &os; system
      includes the <filename>vinum</filename> volume manager, a
      block device driver that implements virtual disk drives and
      addresses these three problems.  <filename>vinum</filename>
      provides more flexibility, performance, and reliability than
      traditional disk storage and implements
      <acronym>RAID</acronym>-0, <acronym>RAID</acronym>-1, and
      <acronym>RAID</acronym>-5 models, both individually and in
      combination.</para>

    <para>This chapter provides an overview of potential problems with
      traditional disk storage, and an introduction to the
      <filename>vinum</filename> volume manager.</para>

    <note>
      <para>Starting with &os;&nbsp;5, <filename>vinum</filename>
	has been rewritten in order to fit into the <link linkend="GEOM">GEOM architecture</link>, while retaining the
	original ideas, terminology, and on-disk metadata.  This
	rewrite is called <emphasis>gvinum</emphasis> (for <emphasis>
	  GEOM vinum</emphasis>).  While this chapter uses the term
	<filename>vinum</filename>, any command invocations should
	be performed with <command>gvinum</command>.  The name of the
	kernel module has changed from the original
	<filename>vinum.ko</filename> to
	<filename>geom_vinum.ko</filename>, and all device nodes
	reside under <filename>/dev/gvinum</filename> instead of
	<filename>/dev/vinum</filename>.  As of
	&os;&nbsp;6, the original <filename>vinum</filename>
	implementation is no longer available in the code base.</para>
    </note>
  </sect1>

  <sect1 xml:id="vinum-access-bottlenecks">
    <title>Access Bottlenecks</title>

    <para>Modern systems frequently need to access data in a highly
      concurrent manner.  For example, large FTP or HTTP servers can
      maintain thousands of concurrent sessions and have multiple
      100&nbsp;Mbit/s connections to the outside world, well beyond
      the sustained transfer rate of most disks.</para>

    <para>Current disk drives can transfer data sequentially at up to
      70&nbsp;MB/s, but this figure is of little importance in an
      environment where many independent processes access a drive,
      each achieving only a fraction of that rate.  In such
      cases, it is more interesting to view the problem from the
      viewpoint of the disk subsystem.  The important parameter is the
      load that a transfer places on the subsystem, or the time for
      which a transfer occupies the drives involved in the
      transfer.</para>

    <para>In any disk transfer, the drive must first position the
      heads, wait for the first sector to pass under the read head,
      and then perform the transfer.  These actions can be considered
      to be atomic as it does not make any sense to interrupt
      them.</para>

    <para><anchor xml:id="vinum-latency"/> Consider a typical transfer of
      about 10&nbsp;kB: the current generation of high-performance
      disks can position the heads in an average of 3.5&nbsp;ms.  The
      fastest drives spin at 15,000&nbsp;rpm, so the average
      rotational latency (half a revolution) is 2&nbsp;ms.  At
      70&nbsp;MB/s, the transfer itself takes about 150&nbsp;&mu;s,
      almost nothing compared to the positioning time.  In such a
      case, the effective transfer rate drops to a little over
      1&nbsp;MB/s and is clearly highly dependent on the transfer
      size.</para>
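
    <para>As a rough sketch, the arithmetic behind this estimate
      (using the figures above, which vary from drive to drive)
      looks like this:</para>

    <programlisting>positioning (average seek)           3.50 ms
rotational latency (half turn)       2.00 ms
transfer (10 kB at 70 MB/s)          0.15 ms
                                     -------
total per 10 kB transfer             5.65 ms

effective rate: 10 kB / 5.65 ms, on the order of 1-2 MB/s</programlisting>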

    <para>The traditional and obvious solution to this bottleneck is
      <quote>more spindles</quote>:  rather than using one large disk,
      use several smaller disks with the same aggregate storage
      space.  Each disk is capable of positioning and transferring
      independently, so the effective throughput increases by a factor
      close to the number of disks used.</para>

    <para>The actual throughput improvement is smaller than the
      number of disks involved.  Although each drive is capable of
      transferring in parallel, there is no way to ensure that the
      requests are evenly distributed across the drives.  Inevitably
      the load on one drive will be higher than on another.</para>

    <indexterm>
      <primary>disk concatenation</primary>
    </indexterm>
    <indexterm>
      <primary>Vinum</primary>
      <secondary>concatenation</secondary>
    </indexterm>

    <para>The evenness of the load on the disks is strongly dependent
      on the way the data is shared across the drives.  In the
      following discussion, it is convenient to think of the disk
      storage as a large number of data sectors which are addressable
      by number, rather like the pages in a book.  The most obvious
      method is to divide the virtual disk into groups of consecutive
      sectors, each the size of an individual physical disk, and
      store them in this manner, rather like taking a large book and
      tearing it into smaller sections.  This method is called
      <emphasis>concatenation</emphasis> and has the advantage that
      the disks are not required to have any specific size
      relationships.  It works well when the access to the virtual
      disk is spread evenly about its address space.  When access is
      concentrated on a smaller area, the improvement is less marked.
      <xref linkend="vinum-concat"/> illustrates the sequence in
      which storage units are allocated in a concatenated
      organization.</para>

    <para>
      <figure xml:id="vinum-concat">
	<title>Concatenated Organization</title>

	<mediaobject><imageobject><imagedata fileref="vinum/vinum-concat"/></imageobject></mediaobject>
      </figure></para>

    <indexterm>
      <primary>disk striping</primary>
    </indexterm>
    <indexterm>
      <primary>Vinum</primary>
      <secondary>striping</secondary>
    </indexterm>
    <indexterm>
      <primary><acronym>RAID</acronym></primary>
    </indexterm>

    <para>An alternative mapping is to divide the address space into
      smaller, equal-sized components and store them sequentially on
      different devices.  For example, the first 256 sectors may be
      stored on the first disk, the next 256 sectors on the next disk
      and so on.  After filling the last disk, the process repeats
      until the disks are full.  This mapping is called
      <emphasis>striping</emphasis> or
      <acronym>RAID-0</acronym>.</para>
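
    <para>To make this mapping concrete, assume the 256-sector
      stripes above are spread across four disks (an arbitrary
      count, chosen only for illustration).  The disk holding a
      given virtual sector, and the offset on that disk, can then
      be computed as follows:</para>

    <programlisting>stripe = sector / 256           (integer division)
disk   = stripe mod 4           (which disk holds this stripe)
offset = (stripe / 4) * 256 + (sector mod 256)

e.g., virtual sector 1000: stripe 3, disk 3, offset 232</programlisting>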

    <para><acronym>RAID</acronym> offers various forms of fault
      tolerance, though the name <acronym>RAID-0</acronym> is somewhat
      misleading as it provides no redundancy.  Striping requires
      somewhat more effort to locate the data, and it can cause
      additional I/O load where a transfer is spread over multiple
      disks, but it can also provide a more constant load across the
      disks.  <xref linkend="vinum-striped"/> illustrates the
      sequence in which storage units are allocated in a striped
      organization.</para>

    <para>
      <figure xml:id="vinum-striped">
	<title>Striped Organization</title>

	<mediaobject><imageobject><imagedata fileref="vinum/vinum-striped"/></imageobject></mediaobject>
      </figure></para>
  </sect1>

  <sect1 xml:id="vinum-data-integrity">
    <title>Data Integrity</title>

    <para>The final problem with disks is that they are unreliable.
      Although reliability has increased tremendously over the last
      few years, disk drives are still the most likely core component
      of a server to fail.  When they do, the results can be
      catastrophic and replacing a failed disk drive and restoring
      data can result in server downtime.</para>

    <indexterm>
      <primary>disk mirroring</primary>
    </indexterm>
    <indexterm><primary>vinum</primary>
      <secondary>mirroring</secondary>
    </indexterm>
    <indexterm><primary><acronym>RAID</acronym>-1</primary>
    </indexterm>

    <para>One approach to this problem is
      <emphasis>mirroring</emphasis>, or
      <acronym>RAID-1</acronym>, which keeps two copies of the
      data on different physical hardware.  Any write to the volume
      writes to both disks; a read can be satisfied from either, so if
      one drive fails, the data is still available on the other
      drive.</para>

    <para>Mirroring has two problems:</para>

    <itemizedlist>
      <listitem>
	<para>It requires twice as much disk storage as a
	  non-redundant solution.</para>
      </listitem>

      <listitem>
	<para>Writes must be performed to both drives, so they take up
	  twice the bandwidth of a non-mirrored volume.  Reads do not
	  suffer from a performance penalty and can even be
	  faster.</para>
      </listitem>
    </itemizedlist>

    <indexterm><primary><acronym>RAID</acronym>-5</primary></indexterm>

    <para>An alternative solution is <emphasis>parity</emphasis>,
      implemented in <acronym>RAID</acronym> levels 2, 3, 4 and 5.
      Of these, <acronym>RAID-5</acronym> is the most interesting.
      As implemented in <filename>vinum</filename>, it is a
      variant on a striped organization which dedicates one block of
      each stripe to parity for the other blocks.  As required by
      <acronym>RAID-5</acronym>, the location of this parity block
      changes from one stripe to the next.  In
      <xref linkend="vinum-raid5-org"/>, the numbers in the data
      blocks indicate the relative block numbers.</para>

    <para>
      <figure xml:id="vinum-raid5-org">
	<title><acronym>RAID</acronym>-5 Organization</title>

	<mediaobject><imageobject><imagedata fileref="vinum/vinum-raid5-org"/></imageobject></mediaobject>
      </figure></para>

    <para>Compared to mirroring, <acronym>RAID-5</acronym> has the
      advantage of requiring significantly less storage space.  Read
      access is similar to that of striped organizations, but write
      access is significantly slower, approximately 25% of the read
      performance.  If one drive fails, the array can continue to
      operate in degraded mode where a read from one of the remaining
      accessible drives continues normally, but a read from the
      failed drive is recalculated from the corresponding block from
      all the remaining drives.</para>
  </sect1>

  <sect1 xml:id="vinum-objects">
    <title><filename>vinum</filename> Objects</title>

    <para>In order to address these problems,
      <filename>vinum</filename> implements a four-level hierarchy
      of objects:</para>

    <itemizedlist>
      <listitem>
	<para>The most visible object is the virtual disk, called a
	  <emphasis>volume</emphasis>.  Volumes have essentially the
	  same properties as a &unix; disk drive, though there are
	  some minor differences.  For one, they have no size
	  limitations.</para>
      </listitem>

      <listitem>
	<para>Volumes are composed of <emphasis>plexes</emphasis>,
	  each of which represents the total address space of a
	  volume.  This level in the hierarchy provides redundancy.
	  Think of plexes as individual disks in a mirrored array,
	  each containing the same data.</para>
      </listitem>

      <listitem>
	<para>Since <filename>vinum</filename> exists within the
	  &unix; disk storage framework, it would be possible to use
	  &unix; partitions as the building block for multi-disk
	  plexes.  In fact, this turns out to be too inflexible as
	  &unix; disks can have only a limited number of partitions.
	  Instead, <filename>vinum</filename> subdivides a single
	  &unix; partition, the <emphasis>drive</emphasis>, into
	  contiguous areas called <emphasis>subdisks</emphasis>, which
	  are used as building blocks for plexes.</para>
      </listitem>

      <listitem>
	<para>Subdisks reside on <filename>vinum</filename>
	  <emphasis>drives</emphasis>, currently &unix; partitions.
	  <filename>vinum</filename> drives can contain any
	  number of subdisks.  With the exception of a small area at
	  the beginning of the drive, which is used for storing
	  configuration and state information, the entire drive is
	  available for data storage.</para>
      </listitem>
    </itemizedlist>

    <para>The following sections describe the way these objects
      provide the functionality required of
      <filename>vinum</filename>.</para>

    <sect2>
      <title>Volume Size Considerations</title>

      <para>Plexes can include multiple subdisks spread over all
	drives in the <filename>vinum</filename> configuration.
	As a result, the size of an individual drive does not limit
	the size of a plex or a volume.</para>
    </sect2>

    <sect2>
      <title>Redundant Data Storage</title>

      <para><filename>vinum</filename> implements mirroring by
	attaching multiple plexes to a volume.  Each plex is a
	representation of the data in a volume.  A volume may contain
	between one and eight plexes.</para>

      <para>Although a plex represents the complete data of a volume,
	it is possible for parts of the representation to be
	physically missing, either by design (by not defining a
	subdisk for parts of the plex) or by accident (as a result of
	the failure of a drive).  As long as at least one plex can
	provide the data for the complete address range of the volume,
	the volume is fully functional.</para>
    </sect2>

    <sect2>
      <title>Which Plex Organization?</title>

      <para><filename>vinum</filename> implements both
	concatenation and striping at the plex level:</para>

      <itemizedlist>
	<listitem>
	  <para>A <emphasis>concatenated plex</emphasis> uses the
	    address space of each subdisk in turn.  Concatenated
	    plexes are the most flexible as they can contain any
	    number of subdisks, and the subdisks may be of different
	    length.  The plex may be extended by adding additional
	    subdisks.  They require less <acronym>CPU</acronym>
	    time than striped plexes, though the difference in
	    <acronym>CPU</acronym> overhead is not measurable in
	    practice.  On the other hand, they are the most
	    susceptible to hot spots, where one disk is very active
	    and others are idle.</para>
	</listitem>

	<listitem>
	  <para>A <emphasis>striped plex</emphasis> stripes the data
	    across each subdisk.  The subdisks must all be the same
	    size and there must be at least two subdisks in order to
	    distinguish it from a concatenated plex.  The greatest
	    advantage of striped plexes is that they reduce hot spots.
	    By choosing an optimum-sized stripe, about 256&nbsp;kB,
	    the load can be evened out on the component drives.
	    Extending a plex by adding new subdisks is so complicated
	    that <filename>vinum</filename> does not implement
	    it.</para>
	</listitem>
      </itemizedlist>

      <para><xref linkend="vinum-comparison"/> summarizes the
	advantages and disadvantages of each plex organization.</para>

      <table xml:id="vinum-comparison" frame="none">
	<title><filename>vinum</filename> Plex
	  Organizations</title>

	<tgroup cols="5">
	  <thead>
	    <row>
	      <entry>Plex type</entry>
	      <entry>Minimum subdisks</entry>
	      <entry>Can add subdisks</entry>
	      <entry>Must be equal size</entry>
	      <entry>Application</entry>
	    </row>
	  </thead>

	  <tbody>
	    <row>
	      <entry>concatenated</entry>
	      <entry>1</entry>
	      <entry>yes</entry>
	      <entry>no</entry>
	      <entry>Large data storage with maximum placement
		flexibility and moderate performance</entry>
	    </row>

	    <row>
	      <entry>striped</entry>
	      <entry>2</entry>
	      <entry>no</entry>
	      <entry>yes</entry>
	      <entry>High performance in combination with highly
		concurrent access</entry>
	    </row>
	  </tbody>
	</tgroup>
      </table>
    </sect2>
  </sect1>

  <sect1 xml:id="vinum-examples">
    <title>Some Examples</title>

    <para><filename>vinum</filename> maintains a
      <emphasis>configuration database</emphasis> which describes the
      objects known to an individual system.  Initially, the user
      creates the configuration database from one or more
      configuration files using &man.gvinum.8;.
      <filename>vinum</filename> stores a copy of its
      configuration database on each disk
      <emphasis>device</emphasis> under its control.  This database is
      updated on each state change, so that a restart accurately
      restores the state of each
      <filename>vinum</filename> object.</para>
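
    <para>Depending on the version, &man.gvinum.8; can also write the
      current contents of the configuration database back out in
      configuration-file format with its
      <command>printconfig</command> subcommand:</para>

    <screen>&prompt.root; <userinput>gvinum printconfig</userinput></screen>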

    <sect2>
      <title>The Configuration File</title>

      <para>The configuration file describes individual
	<filename>vinum</filename> objects.  The definition of a
	simple volume might be:</para>

      <programlisting>    drive a device /dev/da3h
    volume myvol
      plex org concat
        sd length 512m drive a</programlisting>

      <para>This file describes four <filename>vinum</filename>
	objects:</para>

      <itemizedlist>
	<listitem>
	  <para>The <emphasis>drive</emphasis> line describes a disk
	    partition (<emphasis>drive</emphasis>) and its location
	    relative to the underlying hardware.  It is given the
	    symbolic name <emphasis>a</emphasis>.  This separation of
	    symbolic names from device names allows disks to be moved
	    from one location to another without confusion.</para>
	</listitem>

	<listitem>
	  <para>The <emphasis>volume</emphasis> line describes a
	    volume.  The only required attribute is the name, in this
	    case <emphasis>myvol</emphasis>.</para>
	</listitem>

	<listitem>
	  <para>The <emphasis>plex</emphasis> line defines a plex.
	    The only required parameter is the organization, in this
	    case <emphasis>concat</emphasis>.  No name is necessary as
	    the system automatically generates a name from the volume
	    name by adding the suffix
	    <emphasis>.p</emphasis><emphasis>x</emphasis>, where
	    <emphasis>x</emphasis> is the number of the plex in the
	    volume.  Thus this plex will be called
	    <emphasis>myvol.p0</emphasis>.</para>
	</listitem>

	<listitem>
	  <para>The <emphasis>sd</emphasis> line describes a subdisk.
	    The minimum specifications are the name of a drive on
	    which to store it, and the length of the subdisk.  No name
	    is necessary as the system automatically assigns names
	    derived from the plex name by adding the suffix
	    <emphasis>.s</emphasis><emphasis>x</emphasis>, where
	    <emphasis>x</emphasis> is the number of the subdisk in
	    the plex.  Thus <filename>vinum</filename> gives this
	    subdisk the name <emphasis>myvol.p0.s0</emphasis>.</para>
	</listitem>
      </itemizedlist>

      <para>After processing this file, &man.gvinum.8; produces the
	following output:</para>

      <programlisting width="97">
      &prompt.root; gvinum -&gt; <userinput>create config1</userinput>
      Configuration summary
      Drives:         1 (4 configured)
      Volumes:        1 (4 configured)
      Plexes:         1 (8 configured)
      Subdisks:       1 (16 configured)

	D a                     State: up       Device /dev/da3h      Avail: 2061/2573 MB (80%)

	V myvol                 State: up       Plexes:       1 Size:      512 MB

	P myvol.p0            C State: up       Subdisks:     1 Size:      512 MB

	S myvol.p0.s0           State: up       PO:        0  B Size:      512 MB</programlisting>

      <para>This output shows the brief listing format of
	&man.gvinum.8;.  It is represented graphically in <xref linkend="vinum-simple-vol"/>.</para>

      <para>
	<figure xml:id="vinum-simple-vol">
	  <title>A Simple <filename>vinum</filename>
	    Volume</title>

	  <mediaobject><imageobject><imagedata fileref="vinum/vinum-simple-vol"/></imageobject></mediaobject>
	</figure></para>

      <para>This figure, and the ones which follow, represent a
	volume, which contains the plexes, which in turn contain the
	subdisks.  In this example, the volume contains one plex, and
	the plex contains one subdisk.</para>

      <para>This particular volume has no specific advantage over a
	conventional disk partition.  It contains a single plex, so it
	is not redundant.  The plex contains a single subdisk, so
	there is no difference in storage allocation from a
	conventional disk partition.  The following sections
	illustrate various more interesting configuration
	methods.</para>
    </sect2>

    <sect2>
      <title>Increased Resilience: Mirroring</title>

      <para>The resilience of a volume can be increased by mirroring.
	When laying out a mirrored volume, it is important to ensure
	that the subdisks of each plex are on different drives, so
	that a drive failure will not take down both plexes.  The
	following configuration mirrors a volume:</para>

      <programlisting>    drive b device /dev/da4h
    volume mirror
      plex org concat
        sd length 512m drive a
      plex org concat
        sd length 512m drive b</programlisting>
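
      <para>Assuming this definition is saved in a file, here
	hypothetically named <filename>config2</filename>, it can be
	applied in the same way as the first example:</para>

      <screen>&prompt.root; <userinput>gvinum create config2</userinput></screen>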

      <para>In this example, it was not necessary to specify a
	definition of drive <emphasis>a</emphasis> again, since
	<filename>vinum</filename> keeps track of all objects in
	its configuration database.  After processing this definition,
	the configuration looks like:</para>

      <programlisting width="97">
	Drives:         2 (4 configured)
	Volumes:        2 (4 configured)
	Plexes:         3 (8 configured)
	Subdisks:       3 (16 configured)

	D a                     State: up       Device /dev/da3h       Avail: 1549/2573 MB (60%)
	D b                     State: up       Device /dev/da4h       Avail: 2061/2573 MB (80%)

    V myvol                 State: up       Plexes:       1 Size:        512 MB
    V mirror                State: up       Plexes:       2 Size:        512 MB

    P myvol.p0            C State: up       Subdisks:     1 Size:        512 MB
    P mirror.p0           C State: up       Subdisks:     1 Size:        512 MB
    P mirror.p1           C State: initializing     Subdisks:     1 Size:        512 MB

    S myvol.p0.s0           State: up       PO:        0  B Size:        512 MB
	S mirror.p0.s0          State: up       PO:        0  B Size:        512 MB
	S mirror.p1.s0          State: empty    PO:        0  B Size:        512 MB</programlisting>

      <para><xref linkend="vinum-mirrored-vol"/> shows the
	structure graphically.</para>

      <para>
	<figure xml:id="vinum-mirrored-vol">
	  <title>A Mirrored <filename>vinum</filename>
	    Volume</title>

	  <mediaobject><imageobject><imagedata fileref="vinum/vinum-mirrored-vol"/></imageobject></mediaobject>
	</figure></para>

      <para>In this example, each plex contains the full 512&nbsp;MB
	of address space.  As in the previous example, each plex
	contains only a single subdisk.</para>
    </sect2>

    <sect2>
      <title>Optimizing Performance</title>

      <para>The mirrored volume in the previous example is more
	resistant to failure than an unmirrored volume, but its
	write performance is lower, as each write to the volume
	requires a write to both drives, using up a greater
	proportion of the total disk bandwidth.  Performance
	considerations demand a
	different approach:  instead of mirroring, the data is striped
	across as many disk drives as possible.  The following
	configuration shows a volume with a plex striped across four
	disk drives:</para>

      <programlisting>    drive c device /dev/da5h
    drive d device /dev/da6h
    volume striped
      plex org striped 512k
        sd length 128m drive a
        sd length 128m drive b
        sd length 128m drive c
        sd length 128m drive d</programlisting>

      <para>As before, it is not necessary to define the drives which
	are already known to <filename>vinum</filename>.  After
	processing this definition, the configuration looks
	like:</para>

      <programlisting width="92">
	Drives:         4 (4 configured)
	Volumes:        3 (4 configured)
	Plexes:         4 (8 configured)
	Subdisks:       7 (16 configured)

    D a                     State: up       Device /dev/da3h        Avail: 1421/2573 MB (55%)
    D b                     State: up       Device /dev/da4h        Avail: 1933/2573 MB (75%)
    D c                     State: up       Device /dev/da5h        Avail: 2445/2573 MB (95%)
    D d                     State: up       Device /dev/da6h        Avail: 2445/2573 MB (95%)

    V myvol                 State: up       Plexes:       1 Size:        512 MB
    V mirror                State: up       Plexes:       2 Size:        512 MB
    V striped               State: up       Plexes:       1 Size:        512 MB

    P myvol.p0            C State: up       Subdisks:     1 Size:        512 MB
    P mirror.p0           C State: up       Subdisks:     1 Size:        512 MB
    P mirror.p1           C State: initializing     Subdisks:     1 Size:        512 MB
    P striped.p1            State: up       Subdisks:     1 Size:        512 MB

    S myvol.p0.s0           State: up       PO:        0  B Size:        512 MB
    S mirror.p0.s0          State: up       PO:        0  B Size:        512 MB
    S mirror.p1.s0          State: empty    PO:        0  B Size:        512 MB
    S striped.p0.s0         State: up       PO:        0  B Size:        128 MB
    S striped.p0.s1         State: up       PO:      512 kB Size:        128 MB
    S striped.p0.s2         State: up       PO:     1024 kB Size:        128 MB
    S striped.p0.s3         State: up       PO:     1536 kB Size:        128 MB</programlisting>

      <para>
	<figure xml:id="vinum-striped-vol">
	  <title>A Striped <filename>vinum</filename>
	    Volume</title>

	  <mediaobject><imageobject><imagedata fileref="vinum/vinum-striped-vol"/></imageobject></mediaobject>
	</figure></para>

      <para>This volume is represented in <xref linkend="vinum-striped-vol"/>.  The darkness of the
	stripes indicates the position within the plex address space,
	where the lightest stripes come first and the darkest
	last.</para>
    </sect2>

    <sect2>
      <title>Resilience and Performance</title>

      <para><anchor xml:id="vinum-resilience"/>With sufficient hardware,
	it is possible to build volumes which show both increased
	resilience and increased performance compared to standard
	&unix; partitions.  A typical configuration file might
	be:</para>

      <programlisting>    volume raid10
      plex org striped 512k
        sd length 102480k drive a
        sd length 102480k drive b
        sd length 102480k drive c
        sd length 102480k drive d
        sd length 102480k drive e
      plex org striped 512k
        sd length 102480k drive c
        sd length 102480k drive d
        sd length 102480k drive e
        sd length 102480k drive a
        sd length 102480k drive b</programlisting>

      <para>The subdisks of the second plex are offset by two drives
	from those of the first plex.  This helps to ensure that
	writes do not go to the same subdisks even if a transfer goes
	over two drives.</para>

      <para><xref linkend="vinum-raid10-vol"/> represents the
	structure of this volume.</para>

      <para>
	<figure xml:id="vinum-raid10-vol">
	  <title>A Mirrored, Striped <filename>vinum</filename>
	    Volume</title>

	  <mediaobject><imageobject><imagedata fileref="vinum/vinum-raid10-vol"/></imageobject></mediaobject>
	</figure></para>
    </sect2>
  </sect1>

  <sect1 xml:id="vinum-object-naming">
    <title>Object Naming</title>

    <para><filename>vinum</filename> assigns default names to
      plexes and subdisks, although they may be overridden.
      Overriding the default names is not recommended as it does not
      bring a significant advantage and it can cause
      confusion.</para>

    <para>Names may contain any non-blank character, but it is
      recommended to restrict them to letters, digits, and the
      underscore character.  The names of volumes, plexes, and
      subdisks may be up to 64 characters long, and the names of
      drives may be up to 32 characters long.</para>

    <para><filename>vinum</filename> objects are assigned device
      nodes in the hierarchy <filename>/dev/gvinum</filename>.  The configuration
      shown above would cause <filename>vinum</filename> to create
      the following device nodes:</para>

    <itemizedlist>
      <listitem>
	<para>Device entries for each volume.  These are the main
	  devices used by <filename>vinum</filename>.  The
	  configuration above would include the devices
	  <filename>/dev/gvinum/myvol</filename>,
	  <filename>/dev/gvinum/mirror</filename>,
	  <filename>/dev/gvinum/striped</filename>,
	  and <filename>/dev/gvinum/raid10</filename>.</para>
      </listitem>

      <listitem>
	<para>All volumes get direct entries under
	  <filename>/dev/gvinum/</filename>.</para>
      </listitem>

      <listitem>
	<para>The directories
	  <filename>/dev/gvinum/plex</filename>, and
	  <filename>/dev/gvinum/sd</filename>, which
	  contain device nodes for each plex and for each subdisk,
	  respectively.</para>
      </listitem>
    </itemizedlist>

    <para>For example, consider the following configuration
      file:</para>

    <programlisting>    drive drive1 device /dev/sd1h
    drive drive2 device /dev/sd2h
    drive drive3 device /dev/sd3h
    drive drive4 device /dev/sd4h
    volume s64 setupstate
      plex org striped 64k
        sd length 100m drive drive1
        sd length 100m drive drive2
        sd length 100m drive drive3
        sd length 100m drive drive4</programlisting>

    <para>After processing this file, &man.gvinum.8; creates the
      following structure in <filename>/dev/gvinum</filename>:</para>

    <programlisting>drwxr-xr-x  2 root  wheel       512 Apr 13 16:46 plex
crwxr-xr--  1 root  wheel   91,   2 Apr 13 16:46 s64
drwxr-xr-x  2 root  wheel       512 Apr 13 16:46 sd

/dev/gvinum/plex:
total 0
crwxr-xr--  1 root  wheel   25, 0x10000002 Apr 13 16:46 s64.p0

/dev/gvinum/sd:
total 0
crwxr-xr--  1 root  wheel   91, 0x20000002 Apr 13 16:46 s64.p0.s0
crwxr-xr--  1 root  wheel   91, 0x20100002 Apr 13 16:46 s64.p0.s1
crwxr-xr--  1 root  wheel   91, 0x20200002 Apr 13 16:46 s64.p0.s2
crwxr-xr--  1 root  wheel   91, 0x20300002 Apr 13 16:46 s64.p0.s3</programlisting>

    <para>Although it is recommended that plexes and subdisks should
      not be given specific names,
      <filename>vinum</filename> drives must be named.  This makes
      it possible to move a drive to a different location and still
      recognize it automatically.</para>

    <sect2>
      <title>Creating File Systems</title>

      <para>Volumes appear to the system to be identical to disks,
	with one exception.  Unlike &unix; drives,
	<filename>vinum</filename> does not partition volumes,
	which thus do not contain a partition table.  This has
	required modification to some disk utilities, notably
	&man.newfs.8;, so that it does not try to interpret the last
	letter of a <filename>vinum</filename> volume name as a
	partition identifier.  For example, a disk drive may have a
	name like <filename>/dev/ad0a</filename>
	or <filename>/dev/da2h</filename>.  These
	names represent the first partition
	(<filename>a</filename>) on the first (0) IDE disk
	(<filename>ad</filename>) and the eighth partition
	(<filename>h</filename>) on the third (2) SCSI disk
	(<filename>da</filename>) respectively.  By contrast, a
	<filename>vinum</filename> volume might be called
	<filename>/dev/gvinum/concat</filename>,
	which has no relationship with a partition name.</para>

      <para>In order to create a file system on this volume, use
	&man.newfs.8;:</para>

      <screen>&prompt.root; <userinput>newfs /dev/gvinum/concat</userinput></screen>
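
      <para>The new file system can then be mounted like any other;
	here <filename>/mnt</filename> is just an arbitrary mount
	point:</para>

      <screen>&prompt.root; <userinput>mount /dev/gvinum/concat /mnt</userinput></screen>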
    </sect2>
  </sect1>

  <sect1 xml:id="vinum-config">
    <title>Configuring <filename>vinum</filename></title>

    <para>The <filename>GENERIC</filename> kernel does not contain
      <filename>vinum</filename>.  It is possible to build a
      custom kernel which includes <filename>vinum</filename>, but
      this is not recommended.  The standard way to start
      <filename>vinum</filename> is as a kernel module.
      &man.kldload.8; is not needed because when &man.gvinum.8;
      starts, it checks whether the module has been loaded, and if it
      is not, it loads it automatically.</para>
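
    <para>To load the module by hand and confirm that
      <filename>vinum</filename> is available, something like the
      following can be used:</para>

    <screen>&prompt.root; <userinput>kldload geom_vinum</userinput>
&prompt.root; <userinput>gvinum list</userinput></screen>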


    <sect2>
      <title>Startup</title>

      <para><filename>vinum</filename> stores configuration
	information on the disk slices in essentially the same form as
	in the configuration files.  When reading from the
	configuration database, <filename>vinum</filename>
	recognizes a number of keywords which are not allowed in the
	configuration files.  For example, a disk configuration might
	contain the following text:</para>

      <programlisting width="119">volume myvol state up
volume bigraid state down
plex name myvol.p0 state up org concat vol myvol
plex name myvol.p1 state up org concat vol myvol
plex name myvol.p2 state init org striped 512b vol myvol
plex name bigraid.p0 state initializing org raid5 512b vol bigraid
sd name myvol.p0.s0 drive a plex myvol.p0 state up len 1048576b driveoffset 265b plexoffset 0b
sd name myvol.p0.s1 drive b plex myvol.p0 state up len 1048576b driveoffset 265b plexoffset 1048576b
sd name myvol.p1.s0 drive c plex myvol.p1 state up len 1048576b driveoffset 265b plexoffset 0b
sd name myvol.p1.s1 drive d plex myvol.p1 state up len 1048576b driveoffset 265b plexoffset 1048576b
sd name myvol.p2.s0 drive a plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 0b
sd name myvol.p2.s1 drive b plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 524288b
sd name myvol.p2.s2 drive c plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 1048576b
sd name myvol.p2.s3 drive d plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 1572864b
sd name bigraid.p0.s0 drive a plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 0b
sd name bigraid.p0.s1 drive b plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 4194304b
sd name bigraid.p0.s2 drive c plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 8388608b
sd name bigraid.p0.s3 drive d plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 12582912b
sd name bigraid.p0.s4 drive e plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 16777216b</programlisting>

	<para>The obvious differences here are the presence of
	  explicit location information and naming, both of which are
	  allowed but discouraged, and the information on the states.
	  <filename>vinum</filename> does not store information
	  about drives in the configuration information.  It finds the
	  drives by scanning the configured disk drives for partitions
	  with a <filename>vinum</filename> label.  This enables
	  <filename>vinum</filename> to identify drives correctly
	  even if they have been assigned different &unix; drive
	  IDs.</para>

	<sect3 xml:id="vinum-rc-startup">
	  <title>Automatic Startup</title>

	  <para><emphasis>Gvinum</emphasis> starts automatically
	    as soon as the kernel module is loaded, so it is normally
	    started via &man.loader.conf.5;.  To load the
	    <emphasis>Gvinum</emphasis> module at boot time, add
	    <literal>geom_vinum_load="YES"</literal> to
	    <filename>/boot/loader.conf</filename>.</para>

	  <para>When <filename>vinum</filename> is started with
	    <command>gvinum start</command>,
	    <filename>vinum</filename> reads the configuration
	    database from one of the <filename>vinum</filename>
	    drives.  Under normal circumstances, each drive contains
	    an identical copy of the configuration database, so it
	    does not matter which drive is read.  After a crash,
	    however, <filename>vinum</filename> must determine
	    which drive was updated most recently and read the
	    configuration from this drive.  It then updates the
	    configuration, if necessary, from progressively older
	    drives.</para>
	</sect3>
      </sect2>
    </sect1>

    <sect1 xml:id="vinum-root">
      <title>Using <filename>vinum</filename> for the Root
	File System</title>

      <para>For a machine that has fully-mirrored file systems using
	<filename>vinum</filename>, it is desirable to also
	mirror the root file system.  Setting up such a configuration
	is less trivial than mirroring an arbitrary file system
	because:</para>

      <itemizedlist>
	<listitem>
	  <para>The root file system must be available very early
	    during the boot process, so the
	    <filename>vinum</filename> infrastructure must
	    already be available at this time.</para>
	</listitem>
	<listitem>
	  <para>The volume containing the root file system also
	    contains the system bootstrap and the kernel.  These must
	    be read using the host system's native utilities, such as
	    the BIOS, which often cannot be taught about the details
	    of <filename>vinum</filename>.</para>
	</listitem>
      </itemizedlist>

      <para>In the following sections, the term <quote>root
	  volume</quote> is generally used to describe the
	<filename>vinum</filename> volume that contains the root
	file system.</para>

      <sect2>
	<title>Starting up <filename>vinum</filename> Early
	  Enough for the Root File System</title>

	<para><filename>vinum</filename> must be available early
	  in the system boot as &man.loader.8; must be able to load
	  the vinum kernel module before starting the kernel.  This
	  can be accomplished by putting this line in
	  <filename>/boot/loader.conf</filename>:</para>

	    <programlisting>geom_vinum_load="YES"</programlisting>

    </sect2>

    <sect2>
      <title>Making a <filename>vinum</filename>-based Root
	Volume Accessible to the Bootstrap</title>

      <para>The current &os; bootstrap is only 7.5 KB of code and
	does not understand the internal
	<filename>vinum</filename> structures.  This means that it
	cannot parse the <filename>vinum</filename> configuration
	data or figure out the elements of a boot volume.  Thus, some
	workarounds are necessary to provide the bootstrap code with
	the illusion of a standard <literal>a</literal> partition
	that contains the root file system.</para>

      <para>For this to be possible, the following requirements must
	be met for the root volume:</para>

      <itemizedlist>
	<listitem>
	  <para>The root volume must not be a striped or
	    <acronym>RAID</acronym>-5 plex.</para>
	</listitem>

	<listitem>
	  <para>The root volume must not contain more than one
	    concatenated subdisk per plex.</para>
	</listitem>
      </itemizedlist>

      <para>Note that it is desirable and possible to use multiple
	plexes, each containing one replica of the root file system.
	The bootstrap process will only use one replica for finding
	the bootstrap and all boot files, until the kernel mounts the
	root file system.  Each single subdisk within these plexes
	needs its own <literal>a</literal> partition illusion, for
	the respective device to be bootable.  It is not strictly
	needed that each of these faked <literal>a</literal>
	partitions is located at the same offset within its device,
	compared with other devices containing plexes of the root
	volume.  However, it is probably a good idea to create the
	<filename>vinum</filename> volumes that way so the
	resulting mirrored devices are symmetric, to avoid
	confusion.</para>

      <para>In order to set up these <literal>a</literal>
	partitions for each device containing part of the root
	volume, the following is required:</para>

      <procedure>
	<step>
	  <para>The location (offset from the beginning of the
	    device) and size of this device's subdisk that is part
	    of the root volume need to be examined, using the
	    command:</para>

	  <screen>&prompt.root; <userinput>gvinum l -rv root</userinput></screen>

	  <para><filename>vinum</filename> offsets and sizes are
	    measured in bytes.  They must be divided by 512 in order
	    to obtain the block numbers that are to be used by
	    <command>bsdlabel</command>.</para>
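
	  <para>For example, using the figures from the sample setup
	    shown later in this chapter, an offset of 135680 bytes
	    and a size of 125829120 bytes translate to:</para>

	  <programlisting>135680 / 512    =    265 blocks  (offset)
125829120 / 512 = 245760 blocks  (size)</programlisting>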
	</step>

	<step>
	  <para>Run this command for each device that participates in
	    the root volume:</para>

	  <screen>&prompt.root; <userinput>bsdlabel -e devname</userinput></screen>

	  <para><replaceable>devname</replaceable> must be either the
	    name of the disk, like <filename>da0</filename> for
	    disks without a slice table, or the name of the
	    slice, like <filename>ad0s1</filename>.</para>

	  <para>If there is already an <literal>a</literal>
	    partition on the device from a
	    pre-<filename>vinum</filename> root file system, it
	    should be renamed to something else so that it remains
	    accessible (just in case), but will no longer be used by
	    default to bootstrap the system.  A currently mounted root
	    file system cannot be renamed, so this must be done
	    either while booted from <quote>Fixit</quote>
	    media, or in a two-step process where, in a mirror, the
	    disk that is not currently being booted from is
	    manipulated first.</para>

	  <para>The offset of the <filename>vinum</filename>
	    partition on this device (if any) must be added to the
	    offset of the respective root volume subdisk on this
	    device.  The resulting value will become the
	    <literal>offset</literal> value for the new
	    <literal>a</literal> partition.  The
	    <literal>size</literal> value for this partition can be
	    taken verbatim from the calculation above.  The
	    <literal>fstype</literal> should be
	    <literal>4.2BSD</literal>.  The
	    <literal>fsize</literal>, <literal>bsize</literal>,
	    and <literal>cpg</literal> values should be chosen
	    to match the actual file system, though they are fairly
	    unimportant within this context.</para>

	  <para>That way, a new <literal>a</literal> partition will
	    be established that overlaps the
	    <filename>vinum</filename> partition on this device.
	    <command>bsdlabel</command> will only allow for this
	    overlap if the <filename>vinum</filename> partition
	    has properly been marked using the
	    <literal>vinum</literal> fstype.</para>
	</step>

	<step>
	  <para>A faked <literal>a</literal> partition now exists
	    on each device that has one replica of the root volume.
	    It is highly recommended to verify the result using a
	    command like:</para>

	  <screen>&prompt.root; <userinput>fsck -n /dev/devnamea</userinput></screen>
	</step>
      </procedure>

      <para>It should be remembered that all files containing control
	information must be relative to the root file system in the
	<filename>vinum</filename> volume which, when setting up
	a new <filename>vinum</filename> root volume, might not
	match the root file system that is currently active.  So in
	particular, <filename>/etc/fstab</filename> and
	<filename>/boot/loader.conf</filename> need to be taken care
	of.</para>

      <para>At next reboot, the bootstrap should figure out the
	appropriate control information from the new
	<filename>vinum</filename>-based root file system, and act
	accordingly.  At the end of the kernel initialization process,
	after all devices have been announced, the prominent notice
	that shows the success of this setup is a message like:</para>

      <screen>Mounting root from ufs:/dev/gvinum/root</screen>
    </sect2>

    <sect2>
      <title>Example of a <filename>vinum</filename>-based Root
	Setup</title>

      <para>After the <filename>vinum</filename> root volume has
	been set up, the output of <command>gvinum l -rv
	  root</command> could look like:</para>

      <screen>...
Subdisk root.p0.s0:
		Size:        125829120 bytes (120 MB)
		State: up
		Plex root.p0 at offset 0 (0  B)
		Drive disk0 (/dev/da0h) at offset 135680 (132 kB)

Subdisk root.p1.s0:
		Size:        125829120 bytes (120 MB)
		State: up
		Plex root.p1 at offset 0 (0  B)
		Drive disk1 (/dev/da1h) at offset 135680 (132 kB)</screen>

      <para>The value to note is <literal>135680</literal>, the
	offset of the subdisk relative to partition
	<filename>/dev/da0h</filename>.  This
	translates to 265 512-byte disk blocks in
	<command>bsdlabel</command>'s terms.  Likewise, the size of
	this root volume is 245760 512-byte blocks.
	<filename>/dev/da1h</filename>, which contains the
	second replica of this root volume, has a symmetric
	setup.</para>

      <para>The bsdlabel for these devices might look like:</para>

      <screen>...
8 partitions:
#        size   offset    fstype   [fsize bsize bps/cpg]
  a:   245760      281    4.2BSD     2048 16384     0   # (Cyl.    0*- 15*)
  c: 71771688        0    unused        0     0         # (Cyl.    0 - 4467*)
  h: 71771672       16     vinum                        # (Cyl.    0*- 4467*)</screen>

      <para>It can be observed that the <literal>size</literal>
	parameter for the faked <literal>a</literal> partition
	matches the value outlined above, while the
	<literal>offset</literal> parameter is the sum of the offset
	within the <filename>vinum</filename> partition
	<literal>h</literal>, and the offset of this partition
	within the device or slice.  This is a typical setup that is
	necessary to avoid the problem described in <xref linkend="vinum-root-panic"/>.  The entire
	<literal>a</literal> partition is completely within the
	<literal>h</literal> partition containing all the
	<filename>vinum</filename> data for this device.</para>

      <para>In the above example, the entire device is dedicated to
	<filename>vinum</filename> and there is no leftover
	pre-<filename>vinum</filename> root partition.</para>
    </sect2>

    <sect2>
      <title>Troubleshooting</title>

      <para>The following list contains a few known pitfalls and
	solutions.</para>

      <sect3>
	<title>System Bootstrap Loads, but System Does Not
	  Boot</title>

	<para>If for any reason the system does not continue to boot,
	  the bootstrap can be interrupted by pressing
	  <keycap>space</keycap> at the 10-second warning.  The
	  loader variable <literal>vinum.autostart</literal> can be
	  examined by typing <command>show</command> and manipulated
	  using <command>set</command> or
	  <command>unset</command>.</para>

	<para>If the <filename>vinum</filename> kernel module was
	  not yet in the list of modules to load automatically, type
	  <command>load geom_vinum</command>.</para>

	<para>When ready, the boot process can be continued by typing
	  <command>boot -as</command>.  The options
	  <option>-as</option> request the kernel to ask for the
	  root file system to mount (<option>-a</option>) and to stop
	  the boot process in single-user mode (<option>-s</option>),
	  where the root file system is mounted read-only.  That way,
	  even if only one plex of a multi-plex volume has been
	  mounted, no data inconsistency between plexes is being
	  risked.</para>
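
	<para>Put together, a minimal recovery session at the loader
	  prompt might look like this sketch (<literal>OK</literal>
	  is the loader's prompt):</para>

	<screen>OK <userinput>load geom_vinum</userinput>
OK <userinput>boot -as</userinput></screen>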

	<para>At the prompt asking for a root file system to mount,
	  any device that contains a valid root file system can be
	  entered.  If <filename>/etc/fstab</filename> is set up
	  correctly, the default should be something like
	  <literal>ufs:/dev/gvinum/root</literal>.  A typical
	  alternate choice would be something like
	  <literal>ufs:da0d</literal> which could be a
	  hypothetical partition containing the
	  pre-<filename>vinum</filename> root file system.  Care
	  should be taken if one of the alias
	  <literal>a</literal> partitions is entered here, that it
	  actually references the subdisks of the
	  <filename>vinum</filename> root device, because in a
	  mirrored setup, this would only mount one piece of a
	  mirrored root device.  If this file system is to be mounted
	  read-write later on, it is necessary to remove the other
	  plex(es) of the <filename>vinum</filename> root volume
	  since these plexes would otherwise carry inconsistent
	  data.</para>
      </sect3>

      <sect3>
	<title>Only Primary Bootstrap Loads</title>

	<para>If <filename>/boot/loader</filename> fails to load, but
	  the primary bootstrap still loads (indicated by a single dash
	  in the left column of the screen right after the boot
	  process starts), an attempt can be made to interrupt the
	  primary bootstrap by pressing
	  <keycap>space</keycap>.  This will make the bootstrap stop
	  in <link linkend="boot-boot1">stage two</link>.  An attempt
	  can be made here to boot off an alternate partition, like
	  the partition containing the previous root file system that
	  has been moved away from <literal>a</literal>.</para>
      </sect3>

      <sect3 xml:id="vinum-root-panic">
	<title>Nothing Boots, the Bootstrap
	  Panics</title>

	<para>This situation will happen if the bootstrap has been
	  destroyed by the <filename>vinum</filename>
	  installation.  Unfortunately, <filename>vinum</filename>
	  accidentally leaves only 4 KB at the beginning of its
	  partition free before starting to write its
	  <filename>vinum</filename> header information.  However,
	  the stage one and two bootstraps plus the bsdlabel require 8
	  KB.  So if a <filename>vinum</filename> partition was
	  started at offset 0 within a slice or disk that was meant to
	  be bootable, the <filename>vinum</filename> setup will
	  trash the bootstrap.</para>

	<para>Similarly, if the above situation has been recovered
	  from by booting from <quote>Fixit</quote> media, and the
	  bootstrap has been re-installed using
	  <command>bsdlabel -B</command> as described in <xref linkend="boot-boot1"/>, the bootstrap will trash the
	  <filename>vinum</filename> header, and
	  <filename>vinum</filename> will no longer find its
	  disk(s).  Though no actual <filename>vinum</filename>
	  configuration data or data in <filename>vinum</filename>
	  volumes will be trashed, and it would be possible to recover
	  all the data by entering exactly the same
	  <filename>vinum</filename> configuration data again, the
	  situation is hard to fix.  It is necessary to move the
	  entire <filename>vinum</filename> partition by at least
	  4 KB, in order to have the <filename>vinum</filename>
	  header and the system bootstrap no longer collide.</para>
      </sect3>
    </sect2>
  </sect1>
</chapter>