<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<!--
The Vinum Volume Manager
By Greg Lehey (grog at lemis dot com)
Added to the Handbook by Hiten Pandya <hmp@FreeBSD.org>
and Tom Rhodes <trhodes@FreeBSD.org>
For the FreeBSD Documentation Project
$FreeBSD$
-->
<chapter id="vinum-vinum">
<chapterinfo>
<authorgroup>
<author>
<firstname>Greg</firstname>
<surname>Lehey</surname>
<contrib>Originally written by </contrib>
</author>
</authorgroup>
</chapterinfo>
<title>The Vinum Volume Manager</title>
<sect1 id="vinum-synopsis">
<title>Synopsis</title>
<para>No matter what disks you have, there are always potential
problems:</para>
<itemizedlist>
<listitem>
<para>They can be too small.</para>
</listitem>
<listitem>
<para>They can be too slow.</para>
</listitem>
<listitem>
<para>They can be too unreliable.</para>
</listitem>
</itemizedlist>
<para>Various solutions to these problems have been proposed and
implemented. One way some users safeguard themselves against such
issues is through the use of multiple, and sometimes redundant,
disks. In addition to supporting various cards and controllers
for hardware RAID systems, the base &os; system includes the
Vinum Volume Manager, a block device driver that implements
virtual disk drives. <emphasis>Vinum</emphasis> is a
so-called <emphasis>Volume Manager</emphasis>, a virtual disk
driver that addresses these three problems. Vinum provides more
flexibility, performance, and reliability than traditional disk
storage, and implements RAID-0, RAID-1, and RAID-5 models both
individually and in combination.</para>
<para>This chapter provides an overview of potential problems with
traditional disk storage, and an introduction to the Vinum Volume
Manager.</para>
<note>
<para>Starting with &os; 5, Vinum has been rewritten in order
to fit into the GEOM architecture (<xref linkend="GEOM"/>),
retaining the original ideas, terminology, and on-disk
metadata. This rewrite is called <emphasis>gvinum</emphasis>
      (for <emphasis>GEOM vinum</emphasis>).  The following text
      usually refers to <emphasis>Vinum</emphasis> as an abstract
      name, regardless of the implementation variant.  Any command
      invocations should now be done using
      the <command>gvinum</command> command; the kernel module has
      been renamed
      from <filename>vinum.ko</filename>
      to <filename>geom_vinum.ko</filename>; and all device nodes
      reside under <filename class="directory">/dev/gvinum</filename> instead
      of <filename class="directory">/dev/vinum</filename>.  As of &os; 6, the old
Vinum implementation is no longer available in the code
base.</para>
</note>
</sect1>
<sect1 id="vinum-intro">
<title>Disks Are Too Small</title>
<indexterm><primary>Vinum</primary></indexterm>
<indexterm><primary>RAID</primary>
<secondary>software</secondary></indexterm>
<para>Disks are getting bigger, but so are data storage
requirements. Often you will find you want a file system that
is bigger than the disks you have available. Admittedly, this
problem is not as acute as it was ten years ago, but it still
exists. Some systems have solved this by creating an abstract
device which stores its data on a number of disks.</para>
</sect1>
<sect1 id="vinum-access-bottlenecks">
<title>Access Bottlenecks</title>
<para>Modern systems frequently need to access data in a highly
concurrent manner. For example, large FTP or HTTP servers can
maintain thousands of concurrent sessions and have multiple
100 Mbit/s connections to the outside world, well beyond
the sustained transfer rate of most disks.</para>
<para>Current disk drives can transfer data sequentially at up to
      70 MB/s, but this value is of little importance in an
      environment where many independent processes access a drive
      and may each achieve only a fraction of that rate.  In such
cases it is more interesting to view the problem from the
viewpoint of the disk subsystem: the important parameter is the
load that a transfer places on the subsystem, in other words the
time for which a transfer occupies the drives involved in the
transfer.</para>
<para>In any disk transfer, the drive must first position the
heads, wait for the first sector to pass under the read head,
and then perform the transfer. These actions can be considered
to be atomic: it does not make any sense to interrupt
them.</para>
    <para><anchor id="vinum-latency"/> Consider a typical transfer of
      about 10 kB: the current generation of high-performance
      disks can position the heads in an average of 3.5 ms.  The
      fastest drives spin at 15,000 rpm, so the average
      rotational latency (half a revolution) is 2 ms.  At
      70 MB/s, the transfer itself takes about 150 μs,
      almost nothing compared to the positioning time.  The complete
      transfer thus takes about 5.7 ms, so the effective
      transfer rate drops to well under 2 MB/s and is clearly
      highly dependent on the transfer size.</para>
<para>The traditional and obvious solution to this bottleneck is
      <quote>more spindles</quote>: rather than using one large disk,
      several smaller disks with the same aggregate storage
      space are used.  Each disk is capable of positioning and transferring
independently, so the effective throughput increases by a factor
close to the number of disks used.
</para>
<para>The exact throughput improvement is, of course, smaller than
the number of disks involved: although each drive is capable of
transferring in parallel, there is no way to ensure that the
requests are evenly distributed across the drives. Inevitably
the load on one drive will be higher than on another.</para>
<indexterm>
<primary>disk concatenation</primary>
</indexterm>
<indexterm>
<primary>Vinum</primary>
<secondary>concatenation</secondary>
</indexterm>
<para>The evenness of the load on the disks is strongly dependent
on the way the data is shared across the drives. In the
following discussion, it is convenient to think of the disk
storage as a large number of data sectors which are addressable
by number, rather like the pages in a book. The most obvious
method is to divide the virtual disk into groups of consecutive
sectors the size of the individual physical disks and store them
in this manner, rather like taking a large book and tearing it
into smaller sections. This method is called
<emphasis>concatenation</emphasis> and has the advantage that
the disks are not required to have any specific size
relationships. It works well when the access to the virtual
disk is spread evenly about its address space. When access is
concentrated on a smaller area, the improvement is less marked.
<xref linkend="vinum-concat"/> illustrates the sequence in which
storage units are allocated in a concatenated
organization.</para>
<para>
<figure id="vinum-concat">
<title>Concatenated Organization</title>
<graphic fileref="vinum/vinum-concat"/>
</figure>
</para>
<indexterm>
<primary>disk striping</primary>
</indexterm>
<indexterm>
<primary>Vinum</primary>
<secondary>striping</secondary>
</indexterm>
<indexterm>
<primary>RAID</primary>
</indexterm>
<para>An alternative mapping is to divide the address space into
smaller, equal-sized components and store them sequentially on
different devices. For example, the first 256 sectors may be
stored on the first disk, the next 256 sectors on the next disk
and so on. After filling the last disk, the process repeats
until the disks are full. This mapping is called
<emphasis>striping</emphasis> or <acronym>RAID-0</acronym>
<footnote>
<para><acronym>RAID</acronym> stands for <emphasis>Redundant
Array of Inexpensive Disks</emphasis> and offers various forms
      of fault tolerance, though in the case of
      <acronym>RAID-0</acronym> the term is somewhat
      misleading: it provides no redundancy.</para> </footnote>.
Striping requires somewhat more effort to locate the data, and it
can cause additional I/O load where a transfer is spread over
multiple disks, but it can also provide a more constant load
across the disks. <xref linkend="vinum-striped"/> illustrates the
sequence in which storage units are allocated in a striped
organization.</para>
<para>
<figure id="vinum-striped">
<title>Striped Organization</title>
<graphic fileref="vinum/vinum-striped"/>
</figure>
</para>
</sect1>
<sect1 id="vinum-data-integrity">
<title>Data Integrity</title>
<para>The final problem with current disks is that they are
unreliable. Although disk drive reliability has increased
tremendously over the last few years, they are still the most
likely core component of a server to fail. When they do, the
results can be catastrophic: replacing a failed disk drive and
restoring data to it can take days.</para>
<indexterm>
<primary>disk mirroring</primary>
</indexterm>
<indexterm>
<primary>Vinum</primary>
<secondary>mirroring</secondary>
</indexterm>
<indexterm>
<primary>RAID-1</primary>
</indexterm>
<para>The traditional way to approach this problem has been
<emphasis>mirroring</emphasis>, keeping two copies of the data
on different physical hardware. Since the advent of the
<acronym>RAID</acronym> levels, this technique has also been
called <acronym>RAID level 1</acronym> or
<acronym>RAID-1</acronym>. Any write to the volume writes to
both locations; a read can be satisfied from either, so if one
drive fails, the data is still available on the other
drive.</para>
<para>Mirroring has two problems:</para>
<itemizedlist>
<listitem>
<para>The price. It requires twice as much disk storage as
a non-redundant solution.</para>
</listitem>
<listitem>
<para>The performance impact. Writes must be performed to
both drives, so they take up twice the bandwidth of a
        non-mirrored volume.  Reads do not suffer from a
        performance penalty; they can even be faster, since they
        can be satisfied from either drive.</para>
</listitem>
</itemizedlist>
<para><indexterm><primary>RAID-5</primary></indexterm>An
alternative solution is <emphasis>parity</emphasis>,
implemented in the <acronym>RAID</acronym> levels 2, 3, 4 and
5. Of these, <acronym>RAID-5</acronym> is the most
      interesting.  As implemented in Vinum, it is a variant on a
      striped organization which dedicates one block of each stripe
      to parity of the other blocks: a
      <acronym>RAID-5</acronym> plex is similar to a striped plex,
      except that it includes a parity block in each stripe.  As
      required by
      <acronym>RAID-5</acronym>, the location of this parity block
      changes from one stripe to the next.  <xref
      linkend="vinum-raid5-org"/> illustrates this organization; the
      numbers in the data
      blocks indicate the relative block numbers.</para>
<para>
<figure id="vinum-raid5-org">
<title>RAID-5 Organization</title>
<graphic fileref="vinum/vinum-raid5-org"/>
</figure>
</para>
<para>Compared to mirroring, <acronym>RAID-5</acronym> has the
advantage of requiring significantly less storage space. Read
access is similar to that of striped organizations, but write
access is significantly slower, approximately 25% of the read
performance. If one drive fails, the array can continue to
operate in degraded mode: a read from one of the remaining
accessible drives continues normally, but a read from the
failed drive is recalculated from the corresponding block from
all the remaining drives.
</para>
</sect1>
<sect1 id="vinum-objects">
<title>Vinum Objects</title>
<para>In order to address these problems, Vinum implements a four-level
hierarchy of objects:</para>
<itemizedlist>
<listitem>
<para>The most visible object is the virtual disk, called a
<emphasis>volume</emphasis>. Volumes have essentially the same
properties as a &unix; disk drive, though there are some minor
differences. They have no size limitations.</para>
</listitem>
<listitem>
<para>Volumes are composed of <emphasis>plexes</emphasis>,
        each of which represents the total address space of a
volume. This level in the hierarchy thus provides
redundancy. Think of plexes as individual disks in a
mirrored array, each containing the same data.</para>
</listitem>
<listitem>
<para>Since Vinum exists within the &unix; disk storage
framework, it would be possible to use &unix;
partitions as the building block for multi-disk plexes,
but in fact this turns out to be too inflexible:
&unix; disks can have only a limited number of
partitions. Instead, Vinum subdivides a single
&unix; partition (the <emphasis>drive</emphasis>)
into contiguous areas called
<emphasis>subdisks</emphasis>, which it uses as building
blocks for plexes.</para>
</listitem>
<listitem>
<para>Subdisks reside on Vinum <emphasis>drives</emphasis>,
currently &unix; partitions. Vinum drives can
contain any number of subdisks. With the exception of a
small area at the beginning of the drive, which is used
for storing configuration and state information, the
entire drive is available for data storage.</para>
</listitem>
</itemizedlist>
<para>The following sections describe the way these objects provide the
functionality required of Vinum.</para>
<sect2>
<title>Volume Size Considerations</title>
<para>Plexes can include multiple subdisks spread over all
drives in the Vinum configuration. As a result, the size of
an individual drive does not limit the size of a plex, and
thus of a volume.</para>
</sect2>
<sect2>
<title>Redundant Data Storage</title>
<para>Vinum implements mirroring by attaching multiple plexes to
a volume. Each plex is a representation of the data in a
volume. A volume may contain between one and eight
plexes.</para>
<para>Although a plex represents the complete data of a volume,
it is possible for parts of the representation to be
physically missing, either by design (by not defining a
subdisk for parts of the plex) or by accident (as a result of
the failure of a drive). As long as at least one plex can
provide the data for the complete address range of the volume,
the volume is fully functional.</para>
</sect2>
<sect2>
<title>Performance Issues</title>
<para>Vinum implements both concatenation and striping at the
plex level:</para>
<itemizedlist>
<listitem>
<para>A <emphasis>concatenated plex</emphasis> uses the
address space of each subdisk in turn.</para>
</listitem>
<listitem>
<para>A <emphasis>striped plex</emphasis> stripes the data
across each subdisk. The subdisks must all have the same
size, and there must be at least two subdisks in order to
distinguish it from a concatenated plex.</para>
</listitem>
</itemizedlist>
</sect2>
<sect2>
<title>Which Plex Organization?</title>
<para>The version of Vinum supplied with &os; &rel.current; implements
two kinds of plex:</para>
<itemizedlist>
<listitem>
<para>Concatenated plexes are the most flexible: they can
contain any number of subdisks, and the subdisks may be of
different length. The plex may be extended by adding
additional subdisks. They require less
<acronym>CPU</acronym> time than striped plexes, though
the difference in <acronym>CPU</acronym> overhead is not
measurable. On the other hand, they are most susceptible
to hot spots, where one disk is very active and others are
idle.</para>
</listitem>
<listitem>
<para>The greatest advantage of striped
(<acronym>RAID-0</acronym>) plexes is that they reduce hot
spots: by choosing an optimum sized stripe (about
256 kB), you can even out the load on the component
drives. The disadvantages of this approach are
(fractionally) more complex code and restrictions on
subdisks: they must be all the same size, and extending a
plex by adding new subdisks is so complicated that Vinum
currently does not implement it. Vinum imposes an
additional, trivial restriction: a striped plex must have
at least two subdisks, since otherwise it is
indistinguishable from a concatenated plex.</para>
</listitem>
</itemizedlist>
<para><xref linkend="vinum-comparison"/> summarizes the advantages
and disadvantages of each plex organization.</para>
<table id="vinum-comparison" frame="none">
<title>Vinum Plex Organizations</title>
<tgroup cols="5">
<thead>
<row>
<entry>Plex type</entry>
<entry>Minimum subdisks</entry>
<entry>Can add subdisks</entry>
<entry>Must be equal size</entry>
<entry>Application</entry>
</row>
</thead>
<tbody>
<row>
<entry>concatenated</entry>
<entry>1</entry>
<entry>yes</entry>
<entry>no</entry>
<entry>Large data storage with maximum placement flexibility
and moderate performance</entry>
</row>
<row>
<entry>striped</entry>
<entry>2</entry>
<entry>no</entry>
<entry>yes</entry>
<entry>High performance in combination with highly concurrent
access</entry>
</row>
</tbody>
</tgroup>
</table>
</sect2>
</sect1>
<sect1 id="vinum-examples">
<title>Some Examples</title>
<para>Vinum maintains a <emphasis>configuration
database</emphasis> which describes the objects known to an
individual system. Initially, the user creates the
configuration database from one or more configuration files with
the aid of the &man.gvinum.8; utility program. Vinum stores a
      copy of its configuration database on each disk slice (which
      Vinum calls a <emphasis>drive</emphasis>) under its control.
This database is updated on each state change, so that a restart
accurately restores the state of each Vinum object.</para>
<sect2>
<title>The Configuration File</title>
<para>The configuration file describes individual Vinum objects. The
definition of a simple volume might be:</para>
<programlisting>
drive a device /dev/da3h
volume myvol
plex org concat
sd length 512m drive a</programlisting>
<para>This file describes four Vinum objects:</para>
<itemizedlist>
<listitem>
<para>The <emphasis>drive</emphasis> line describes a disk
partition (<emphasis>drive</emphasis>) and its location
relative to the underlying hardware. It is given the
symbolic name <emphasis>a</emphasis>. This separation of
the symbolic names from the device names allows disks to
be moved from one location to another without
confusion.</para>
</listitem>
<listitem>
<para>The <emphasis>volume</emphasis> line describes a volume.
The only required attribute is the name, in this case
<emphasis>myvol</emphasis>.</para>
</listitem>
<listitem>
<para>The <emphasis>plex</emphasis> line defines a plex.
The only required parameter is the organization, in this
case <emphasis>concat</emphasis>. No name is necessary:
the system automatically generates a name from the volume
name by adding the suffix
<emphasis>.p</emphasis><emphasis>x</emphasis>, where
<emphasis>x</emphasis> is the number of the plex in the
volume. Thus this plex will be called
<emphasis>myvol.p0</emphasis>.</para>
</listitem>
<listitem>
<para>The <emphasis>sd</emphasis> line describes a subdisk.
The minimum specifications are the name of a drive on
which to store it, and the length of the subdisk. As with
plexes, no name is necessary: the system automatically
assigns names derived from the plex name by adding the
suffix <emphasis>.s</emphasis><emphasis>x</emphasis>,
where <emphasis>x</emphasis> is the number of the subdisk
in the plex. Thus Vinum gives this subdisk the name
<emphasis>myvol.p0.s0</emphasis>.</para>
</listitem>
</itemizedlist>
<para>After processing this file, &man.gvinum.8; produces the following
output:</para>
<programlisting width="97">
&prompt.root; gvinum -> <userinput>create config1</userinput>
Configuration summary
Drives:         1 (4 configured)
Volumes:        1 (4 configured)
Plexes:         1 (8 configured)
Subdisks:       1 (16 configured)
D a                     State: up       Device /dev/da3h        Avail: 2061/2573 MB (80%)
V myvol                 State: up       Plexes:       1 Size:        512 MB
P myvol.p0            C State: up       Subdisks:     1 Size:        512 MB
S myvol.p0.s0           State: up       PO:        0  B Size:        512 MB
<para>This output shows the brief listing format of &man.gvinum.8;. It
is represented graphically in <xref linkend="vinum-simple-vol"/>.</para>
<para>
<figure id="vinum-simple-vol">
<title>A Simple Vinum Volume</title>
<graphic fileref="vinum/vinum-simple-vol"/>
</figure>
</para>
<para>This figure, and the ones which follow, represent a
volume, which contains the plexes, which in turn contain the
subdisks. In this trivial example, the volume contains one
plex, and the plex contains one subdisk.</para>
<para>This particular volume has no specific advantage over a
conventional disk partition. It contains a single plex, so it
is not redundant. The plex contains a single subdisk, so
there is no difference in storage allocation from a
conventional disk partition. The following sections
illustrate various more interesting configuration
methods.</para>
</sect2>
<sect2>
<title>Increased Resilience: Mirroring</title>
<para>The resilience of a volume can be increased by mirroring.
When laying out a mirrored volume, it is important to ensure
that the subdisks of each plex are on different drives, so
that a drive failure will not take down both plexes. The
following configuration mirrors a volume:</para>
<programlisting>
drive b device /dev/da4h
volume mirror
plex org concat
sd length 512m drive a
plex org concat
sd length 512m drive b</programlisting>
<para>In this example, it was not necessary to specify a
definition of drive <emphasis>a</emphasis> again, since Vinum
keeps track of all objects in its configuration database.
After processing this definition, the configuration looks
like:</para>
<programlisting width="97">
Drives:         2 (4 configured)
Volumes:        2 (4 configured)
Plexes:         3 (8 configured)
Subdisks:       3 (16 configured)
D a                     State: up       Device /dev/da3h        Avail: 1549/2573 MB (60%)
D b                     State: up       Device /dev/da4h        Avail: 2061/2573 MB (80%)
V myvol                 State: up       Plexes:       1 Size:        512 MB
V mirror                State: up       Plexes:       2 Size:        512 MB
P myvol.p0            C State: up       Subdisks:     1 Size:        512 MB
P mirror.p0           C State: up       Subdisks:     1 Size:        512 MB
P mirror.p1           C State: initializing    Subdisks:     1 Size:        512 MB
S myvol.p0.s0           State: up       PO:        0  B Size:        512 MB
S mirror.p0.s0          State: up       PO:        0  B Size:        512 MB
S mirror.p1.s0          State: empty    PO:        0  B Size:        512 MB
<para><xref linkend="vinum-mirrored-vol"/> shows the structure
graphically.</para>
<para>
<figure id="vinum-mirrored-vol">
<title>A Mirrored Vinum Volume</title>
<graphic fileref="vinum/vinum-mirrored-vol"/>
</figure>
</para>
<para>In this example, each plex contains the full 512 MB
of address space. As in the previous example, each plex
contains only a single subdisk.</para>
</sect2>
<sect2>
<title>Optimizing Performance</title>
<para>The mirrored volume in the previous example is more
resistant to failure than an unmirrored volume, but its
performance is less: each write to the volume requires a write
to both drives, using up a greater proportion of the total
disk bandwidth. Performance considerations demand a different
approach: instead of mirroring, the data is striped across as
many disk drives as possible. The following configuration
shows a volume with a plex striped across four disk
drives:</para>
<programlisting>
drive c device /dev/da5h
drive d device /dev/da6h
volume striped
plex org striped 512k
sd length 128m drive a
sd length 128m drive b
sd length 128m drive c
sd length 128m drive d</programlisting>
<para>As before, it is not necessary to define the drives which are
already known to Vinum. After processing this definition, the
configuration looks like:</para>
<programlisting width="92">
Drives:         4 (4 configured)
Volumes:        3 (4 configured)
Plexes:         4 (8 configured)
Subdisks:       7 (16 configured)
D a                     State: up       Device /dev/da3h        Avail: 1421/2573 MB (55%)
D b                     State: up       Device /dev/da4h        Avail: 1933/2573 MB (75%)
D c                     State: up       Device /dev/da5h        Avail: 2445/2573 MB (95%)
D d                     State: up       Device /dev/da6h        Avail: 2445/2573 MB (95%)
V myvol                 State: up       Plexes:       1 Size:        512 MB
V mirror                State: up       Plexes:       2 Size:        512 MB
V striped               State: up       Plexes:       1 Size:        512 MB
P myvol.p0            C State: up       Subdisks:     1 Size:        512 MB
P mirror.p0           C State: up       Subdisks:     1 Size:        512 MB
P mirror.p1           C State: initializing    Subdisks:     1 Size:        512 MB
P striped.p0            State: up       Subdisks:     4 Size:        512 MB
S myvol.p0.s0           State: up       PO:        0  B Size:        512 MB
S mirror.p0.s0          State: up       PO:        0  B Size:        512 MB
S mirror.p1.s0          State: empty    PO:        0  B Size:        512 MB
S striped.p0.s0         State: up       PO:        0  B Size:        128 MB
S striped.p0.s1         State: up       PO:      512 kB Size:        128 MB
S striped.p0.s2         State: up       PO:     1024 kB Size:        128 MB
S striped.p0.s3         State: up       PO:     1536 kB Size:        128 MB
<para>
<figure id="vinum-striped-vol">
<title>A Striped Vinum Volume</title>
<graphic fileref="vinum/vinum-striped-vol"/>
</figure>
</para>
<para>This volume is represented in
<xref linkend="vinum-striped-vol"/>. The darkness of the stripes
indicates the position within the plex address space: the lightest stripes
come first, the darkest last.</para>
</sect2>
<sect2>
<title>Resilience and Performance</title>
<para><anchor id="vinum-resilience"/>With sufficient hardware, it
is possible to build volumes which show both increased
resilience and increased performance compared to standard
&unix; partitions. A typical configuration file might
be:</para>
<programlisting>
volume raid10
plex org striped 512k
sd length 102480k drive a
sd length 102480k drive b
sd length 102480k drive c
sd length 102480k drive d
sd length 102480k drive e
plex org striped 512k
sd length 102480k drive c
sd length 102480k drive d
sd length 102480k drive e
sd length 102480k drive a
sd length 102480k drive b</programlisting>
<para>The subdisks of the second plex are offset by two drives from those
of the first plex: this helps ensure that writes do not go to the same
subdisks even if a transfer goes over two drives.</para>
<para><xref linkend="vinum-raid10-vol"/> represents the structure
of this volume.</para>
<para>
<figure id="vinum-raid10-vol">
<title>A Mirrored, Striped Vinum Volume</title>
<graphic fileref="vinum/vinum-raid10-vol"/>
</figure>
</para>
</sect2>
</sect1>
<sect1 id="vinum-object-naming">
<title>Object Naming</title>
<para>As described above, Vinum assigns default names to plexes
and subdisks, although they may be overridden. Overriding the
default names is not recommended: experience with the VERITAS
volume manager, which allows arbitrary naming of objects, has
shown that this flexibility does not bring a significant
advantage, and it can cause confusion.</para>
<para>Names may contain any non-blank character, but it is
recommended to restrict them to letters, digits and the
      underscore character.  The names of volumes, plexes and
subdisks may be up to 64 characters long, and the names of
drives may be up to 32 characters long.</para>
<para>Vinum objects are assigned device nodes in the hierarchy
<filename class="directory">/dev/gvinum</filename>. The configuration shown above
would cause Vinum to create the following device nodes:</para>
<itemizedlist>
<listitem>
        <para>Device entries for each volume, directly under
          <filename class="directory">/dev/gvinum/</filename>.
          These are the main devices used by Vinum.  Thus the configuration
          above would include the devices
          <filename class="devicefile">/dev/gvinum/myvol</filename>,
          <filename class="devicefile">/dev/gvinum/mirror</filename>,
          <filename class="devicefile">/dev/gvinum/striped</filename> and
          <filename class="devicefile">/dev/gvinum/raid10</filename>.</para>
      </listitem>
<listitem>
<para>The directories
<filename class="directory">/dev/gvinum/plex</filename>, and
<filename class="directory">/dev/gvinum/sd</filename>, which contain
device nodes for each plex and for each subdisk,
respectively.</para>
</listitem>
</itemizedlist>
<para>For example, consider the following configuration file:</para>
<programlisting>
drive drive1 device /dev/da1h
drive drive2 device /dev/da2h
drive drive3 device /dev/da3h
drive drive4 device /dev/da4h
volume s64 setupstate
plex org striped 64k
sd length 100m drive drive1
sd length 100m drive drive2
sd length 100m drive drive3
sd length 100m drive drive4</programlisting>
<para>After processing this file, &man.gvinum.8; creates the following
structure in <filename class="directory">/dev/gvinum</filename>:</para>
<programlisting>
drwxr-xr-x 2 root wheel 512 Apr 13 16:46 plex
crwxr-xr-- 1 root wheel 91, 2 Apr 13 16:46 s64
drwxr-xr-x 2 root wheel 512 Apr 13 16:46 sd
/dev/gvinum/plex:
total 0
crwxr-xr-- 1 root wheel 25, 0x10000002 Apr 13 16:46 s64.p0
/dev/gvinum/sd:
total 0
crwxr-xr-- 1 root wheel 91, 0x20000002 Apr 13 16:46 s64.p0.s0
crwxr-xr-- 1 root wheel 91, 0x20100002 Apr 13 16:46 s64.p0.s1
crwxr-xr-- 1 root wheel 91, 0x20200002 Apr 13 16:46 s64.p0.s2
crwxr-xr-- 1 root wheel 91, 0x20300002 Apr 13 16:46 s64.p0.s3</programlisting>
    <para>Although it is recommended that plexes and subdisks not
      be given specific names, Vinum drives must be named.
      This makes it possible to move a drive to a different location
      and still recognize it automatically.</para>
<sect2>
<title>Creating File Systems</title>
<para>Volumes appear to the system to be identical to disks,
with one exception. Unlike &unix; drives, Vinum does
not partition volumes, which thus do not contain a partition
table. This has required modification to some disk
utilities, notably &man.newfs.8;, which previously tried to
interpret the last letter of a Vinum volume name as a
partition identifier. For example, a disk drive may have a
name like <filename class="devicefile">/dev/ad0a</filename> or
<filename class="devicefile">/dev/da2h</filename>. These names represent
the first partition (<devicename>a</devicename>) on the
first (0) IDE disk (<devicename>ad</devicename>) and the
eighth partition (<devicename>h</devicename>) on the third
(2) SCSI disk (<devicename>da</devicename>) respectively.
By contrast, a Vinum volume might be called
<filename class="devicefile">/dev/gvinum/concat</filename>, a name which has
no relationship with a partition name.</para>
<para>In order to create a file system on this volume, use
&man.newfs.8;:</para>
<screen>&prompt.root; <userinput>newfs /dev/gvinum/concat</userinput></screen>
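      <para>The new file system can then be mounted like any other;
        the mount point used here,
        <filename class="directory">/mnt/vinum</filename>, is purely
        illustrative:</para>
      <screen>&prompt.root; <userinput>mkdir /mnt/vinum</userinput>
&prompt.root; <userinput>mount /dev/gvinum/concat /mnt/vinum</userinput></screen>
      <para>To mount the volume automatically at boot time, an entry
        of the usual form can be added to
        <filename>/etc/fstab</filename>, for example:</para>
      <programlisting>/dev/gvinum/concat   /mnt/vinum   ufs   rw   2   2</programlisting>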
</sect2>
</sect1>
<sect1 id="vinum-config">
<title>Configuring Vinum</title>
<para>The <filename>GENERIC</filename> kernel does not contain
Vinum. It is possible to build a special kernel which includes
Vinum, but this is not recommended. The standard way to start
Vinum is as a kernel module (<acronym>kld</acronym>). You do
not even need to use &man.kldload.8; for Vinum: when you start
&man.gvinum.8;, it checks whether the module has been loaded, and
if it is not, it loads it automatically.</para>
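    <para>If desired, the module can also be loaded and the existing
      configuration started by hand.  The following commands are one
      way to do this; they have the same effect as letting
      &man.gvinum.8; load the module itself:</para>
    <screen>&prompt.root; <userinput>kldload geom_vinum</userinput>
&prompt.root; <userinput>gvinum start</userinput></screen>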
<sect2>
<title>Startup</title>
<para>Vinum stores configuration information on the disk slices
in essentially the same form as in the configuration files.
When reading from the configuration database, Vinum recognizes
a number of keywords which are not allowed in the
configuration files. For example, a disk configuration might
contain the following text:</para>
<programlisting width="119">volume myvol state up
volume bigraid state down
plex name myvol.p0 state up org concat vol myvol
plex name myvol.p1 state up org concat vol myvol
plex name myvol.p2 state init org striped 512b vol myvol
plex name bigraid.p0 state initializing org raid5 512b vol bigraid
sd name myvol.p0.s0 drive a plex myvol.p0 state up len 1048576b driveoffset 265b plexoffset 0b
sd name myvol.p0.s1 drive b plex myvol.p0 state up len 1048576b driveoffset 265b plexoffset 1048576b
sd name myvol.p1.s0 drive c plex myvol.p1 state up len 1048576b driveoffset 265b plexoffset 0b
sd name myvol.p1.s1 drive d plex myvol.p1 state up len 1048576b driveoffset 265b plexoffset 1048576b
sd name myvol.p2.s0 drive a plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 0b
sd name myvol.p2.s1 drive b plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 524288b
sd name myvol.p2.s2 drive c plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 1048576b
sd name myvol.p2.s3 drive d plex myvol.p2 state init len 524288b driveoffset 1048841b plexoffset 1572864b
sd name bigraid.p0.s0 drive a plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 0b
sd name bigraid.p0.s1 drive b plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 4194304b
sd name bigraid.p0.s2 drive c plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 8388608b
sd name bigraid.p0.s3 drive d plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 12582912b
sd name bigraid.p0.s4 drive e plex bigraid.p0 state initializing len 4194304b driveoffset 1573129b plexoffset 16777216b
<para>The obvious differences here are the presence of
explicit location information and naming (both of which are
also allowed, but discouraged, for use by the user) and the
information on the states (which are not available to the
user). Vinum does not store information about drives in the
configuration information: it finds the drives by scanning
the configured disk drives for partitions with a Vinum
label. This enables Vinum to identify drives correctly even
if they have been assigned different &unix; drive
IDs.</para>
<sect3 id="vinum-rc-startup">
<title>Automatic Startup</title>
      <para><emphasis>Gvinum</emphasis> always starts up
        automatically once its kernel module is loaded.  To load the
        <emphasis>Gvinum</emphasis> module at boot time, add
        <literal>geom_vinum_load="YES"</literal> to
        <filename>/boot/loader.conf</filename> (see
        &man.loader.conf.5;).</para>
<para>When you start Vinum with the <command>gvinum
start</command> command, Vinum reads the configuration
database from one of the Vinum drives. Under normal
circumstances, each drive contains an identical copy of the
configuration database, so it does not matter which drive is
read. After a crash, however, Vinum must determine which
drive was updated most recently and read the configuration
from this drive. It then updates the configuration if
necessary from progressively older drives.</para>
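      <para>After a reboot, the state of all Vinum objects can be
        checked with the brief listing command, for example:</para>
      <screen>&prompt.root; <userinput>gvinum l</userinput></screen>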
</sect3>
</sect2>
</sect1>
<sect1 id="vinum-root">
<title>Using Vinum for the Root Filesystem</title>
<para>For a machine that has fully-mirrored filesystems using
Vinum, it is desirable to also mirror the root filesystem.
    Setting up such a configuration is more involved than mirroring
an arbitrary filesystem because:</para>
<itemizedlist>
<listitem>
<para>The root filesystem must be available very early during
the boot process, so the Vinum infrastructure must already be
available at this time.</para>
</listitem>
<listitem>
<para>The volume containing the root filesystem also contains
the system bootstrap and the kernel, which must be read
        using the host system's native utilities (e.g., the BIOS on
PC-class machines) which often cannot be taught about the
details of Vinum.</para>
</listitem>
</itemizedlist>
<para>In the following sections, the term <quote>root
volume</quote> is generally used to describe the Vinum volume
that contains the root filesystem. It is probably a good idea
to use the name <literal>"root"</literal> for this volume, but
    this is not technically required in any way.  All command
    examples in the following sections assume this name.</para>
<sect2>
<title>Starting up Vinum Early Enough for the Root
Filesystem</title>
<para>There are several measures to take for this to
happen:</para>
<itemizedlist>
<listitem>
        <para>Vinum must be available in the kernel very early in the
          boot process, before the root filesystem is mounted.  One
          option would be to compile Vinum statically into the kernel,
          so it is available all the time, but this is usually not
          desirable.  The other option is to have
          <filename>/boot/loader</filename> (<xref
          linkend="boot-loader"/>) load the Vinum kernel module early,
          before starting the kernel, as already described in
          <xref linkend="vinum-rc-startup"/>.  This can be
          accomplished by putting the line:</para>
<programlisting>geom_vinum_load="YES"</programlisting>
<para>into the file
<filename>/boot/loader.conf</filename>.</para>
</listitem>
<listitem>
<para>For <emphasis>Gvinum</emphasis>, all startup
is done automatically once the kernel module has been
loaded, so the procedure described above is all that is
needed.</para>
</listitem>
</itemizedlist>
</sect2>
<sect2>
<title>Making a Vinum-based Root Volume Accessible to the
Bootstrap</title>
<para>Since the current &os; bootstrap is only 7.5 KB of
code, and already has the burden of reading files (like
<filename>/boot/loader</filename>) from the UFS filesystem, it
      is simply impossible to also teach it about internal Vinum
      structures so that it could parse the Vinum configuration data
      and figure out the elements of a boot volume itself.  Thus,
some tricks are necessary to provide the bootstrap code with
the illusion of a standard <literal>"a"</literal> partition
that contains the root filesystem.</para>
<para>For this to be possible at all, the following requirements
must be met for the root volume:</para>
<itemizedlist>
<listitem>
<para>The root volume must not be striped or RAID-5.</para>
</listitem>
<listitem>
<para>The root volume must not contain more than one
concatenated subdisk per plex.</para>
</listitem>
</itemizedlist>
<para>Note that it is desirable and possible that there are
multiple plexes, each containing one replica of the root
filesystem. The bootstrap process will, however, only use one
      of these replicas for finding the bootstrap and all the files,
      until the kernel eventually mounts the root filesystem
      itself.  Each subdisk within these plexes will then
      need its own <literal>"a"</literal> partition illusion, for
      the respective device to become bootable.  It is not strictly
      necessary that each of these faked <literal>"a"</literal>
partitions is located at the same offset within its device,
compared with other devices containing plexes of the root
volume. However, it is probably a good idea to create the
Vinum volumes that way so the resulting mirrored devices are
symmetric, to avoid confusion.</para>
<para>In order to set up these <literal>"a"</literal> partitions,
for each device containing part of the root volume, the
following needs to be done:</para>
<procedure>
<step>
<para>The location (offset from the beginning of the device)
and size of this device's subdisk that is part of the root
volume need to be examined, using the command:</para>
<screen>&prompt.root; <userinput>gvinum l -rv root</userinput></screen>
<para>Note that Vinum offsets and sizes are measured in
bytes. They must be divided by 512 in order to obtain the
block numbers that are to be used in the
<command>bsdlabel</command> command.</para>
</step>
<step>
<para>Run the command:</para>
<screen>&prompt.root; <userinput>bsdlabel -e <replaceable>devname</replaceable></userinput></screen>
<para>for each device that participates in the root volume.
<replaceable>devname</replaceable> must be either the name
of the disk (like <devicename>da0</devicename>) for disks
        without a slice (also known as fdisk) table, or the name of the
slice (like <devicename>ad0s1</devicename>).</para>
<para>If there is already an <literal>"a"</literal>
partition on the device (presumably, containing a
pre-Vinum root filesystem), it should be renamed to
something else, so it remains accessible (just in case),
but will no longer be used by default to bootstrap the
system. Note that active partitions (like a root
filesystem currently mounted) cannot be renamed, so this
        must be done either while booted from a
        <quote>Fixit</quote> medium, or in a two-step process
        where (in a mirrored situation) the disk that the system was
        not booted from is modified first.</para>
<para>Then, the offset of the Vinum partition on this
device (if any) must be added to the offset of the
respective root volume subdisk on this device. The
resulting value will become the
<literal>"offset"</literal> value for the new
<literal>"a"</literal> partition. The
<literal>"size"</literal> value for this partition can be
taken verbatim from the calculation above. The
<literal>"fstype"</literal> should be
<literal>4.2BSD</literal>. The
<literal>"fsize"</literal>, <literal>"bsize"</literal>,
and <literal>"cpg"</literal> values should best be chosen
to match the actual filesystem, though they are fairly
unimportant within this context.</para>
<para>That way, a new <literal>"a"</literal> partition will
be established that overlaps the Vinum partition on this
        device.  Note that <command>bsdlabel</command> will
        only allow this overlap if the Vinum partition has
        been properly marked using the <literal>"vinum"</literal>
        fstype.</para>
</step>
<step>
<para>That's all! A faked <literal>"a"</literal> partition
        now exists on each device that has one replica of the
        root volume.  It is highly recommended to verify the
        result, using a command like:</para>
<screen>&prompt.root; <userinput>fsck -n /dev/<replaceable>devname</replaceable>a</userinput></screen>
</step>
</procedure>
<para>It should be remembered that all files containing control
information must be relative to the root filesystem in the
Vinum volume which, when setting up a new Vinum root volume,
might not match the root filesystem that is currently active.
So in particular, the files <filename>/etc/fstab</filename>
and <filename>/boot/loader.conf</filename> need to be taken
care of.</para>
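    <para>Assuming the root volume is named <literal>root</literal>,
      the relevant entries might look like the following (the exact
      values are, of course, illustrative).  In the
      <filename>/etc/fstab</filename> on the new root
      filesystem:</para>
    <programlisting>/dev/gvinum/root   /   ufs   rw   1   1</programlisting>
    <para>And in <filename>/boot/loader.conf</filename>, in addition
      to loading the module, the root filesystem can be named
      explicitly.  This second line is optional when
      <filename>/etc/fstab</filename> is correct, since the loader
      takes the root device from there:</para>
    <programlisting>geom_vinum_load="YES"
vfs.root.mountfrom="ufs:/dev/gvinum/root"</programlisting>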
<para>At next reboot, the bootstrap should figure out the
appropriate control information from the new Vinum-based root
filesystem, and act accordingly. At the end of the kernel
initialization process, after all devices have been announced,
the prominent notice that shows the success of this setup is a
message like:</para>
<screen>Mounting root from ufs:/dev/gvinum/root</screen>
</sect2>
<sect2>
<title>Example of a Vinum-based Root Setup</title>
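    <para>A configuration file that would create such a mirrored root
      volume might look like the following; the drive names, devices,
      and subdisk sizes are simply taken from the example output
      below:</para>
    <programlisting>
drive disk0 device /dev/da0h
drive disk1 device /dev/da1h
volume root
plex org concat
sd length 120m drive disk0
plex org concat
sd length 120m drive disk1</programlisting>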
<para>After the Vinum root volume has been set up, the output of
<command>gvinum l -rv root</command> could look like:</para>
<screen>
...
Subdisk root.p0.s0:
        Size: 125829120 bytes (120 MB)
        State: up
        Plex root.p0 at offset 0 (0 B)
        Drive disk0 (/dev/da0h) at offset 135680 (132 kB)
Subdisk root.p1.s0:
        Size: 125829120 bytes (120 MB)
        State: up
        Plex root.p1 at offset 0 (0 B)
        Drive disk1 (/dev/da1h) at offset 135680 (132 kB)
</screen>
    <para>The value to note is <literal>135680</literal> for the
      offset of the subdisk (relative to partition
      <filename class="devicefile">/dev/da0h</filename>).  This translates to 265
      512-byte disk blocks in <command>bsdlabel</command>'s terms
      (135680 / 512 = 265).  Likewise, the size of this root
      volume, 125829120 bytes, is 245760 512-byte
      blocks.  <filename class="devicefile">/dev/da1h</filename>, containing the
      second replica of this root volume, has a symmetric
      setup.</para>
<para>The bsdlabel for these devices might look like:</para>
<screen>
...
8 partitions:
#        size   offset    fstype   [fsize bsize bps/cpg]
  a:   245760      281    4.2BSD     2048 16384     0   # (Cyl.    0*- 15*)
  c: 71771688        0    unused        0     0         # (Cyl.    0 - 4467*)
  h: 71771672       16     vinum                        # (Cyl.    0*- 4467*)
</screen>
<para>It can be observed that the <literal>"size"</literal>
parameter for the faked <literal>"a"</literal> partition
matches the value outlined above, while the
<literal>"offset"</literal> parameter is the sum of the offset
within the Vinum partition <literal>"h"</literal>, and the
offset of this partition within the device (or slice). This
is a typical setup that is necessary to avoid the problem
described in <xref linkend="vinum-root-panic"/>. It can also
be seen that the entire <literal>"a"</literal> partition is
completely within the <literal>"h"</literal> partition
containing all the Vinum data for this device.</para>
<para>Note that in the above example, the entire device is
dedicated to Vinum, and there is no leftover pre-Vinum root
      partition, since this was a newly set-up disk that was
      only ever meant to be part of a Vinum configuration.</para>
</sect2>
<sect2>
<title>Troubleshooting</title>
<para>If something goes wrong, a way is needed to recover from
      the situation.  The following list contains a few known
      pitfalls and solutions.</para>
<sect3>
<title>System Bootstrap Loads, but System Does Not Boot</title>
<para>If for any reason the system does not continue to boot,
        the bootstrap can be interrupted by pressing the
        <keycap>space</keycap> key at the 10-second warning.  The
        loader variables (like <literal>vinum.autostart</literal>)
        can be examined using the <command>show</command> command, and
        manipulated using the <command>set</command> or
        <command>unset</command> commands.</para>
<para>If the only problem was that the Vinum kernel module was
not yet in the list of modules to load automatically, a
simple <command>load geom_vinum</command> will help.</para>
<para>When ready, the boot process can be continued with a
<command>boot -as</command>. The options
<option>-as</option> will request the kernel to ask for the
root filesystem to mount (<option>-a</option>), and make the
boot process stop in single-user mode (<option>-s</option>),
where the root filesystem is mounted read-only. That way,
even if only one plex of a multi-plex volume has been
        mounted, no data inconsistency between plexes is
        risked.</para>
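      <para>Putting these steps together, a typical interactive
        session at the loader prompt might look like this (the exact
        prompt and messages vary):</para>
      <screen>OK <userinput>load geom_vinum</userinput>
OK <userinput>boot -as</userinput></screen>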
<para>At the prompt asking for a root filesystem to mount, any
device that contains a valid root filesystem can be entered.
If <filename>/etc/fstab</filename> had been set up
correctly, the default should be something like
<literal>ufs:/dev/gvinum/root</literal>. A typical alternate
choice would be something like
<literal>ufs:da0d</literal> which could be a
hypothetical partition that contains the pre-Vinum root
        filesystem.  Care should be taken if one of the alias
        <literal>"a"</literal> partitions is entered here, since these
        actually refer to the subdisks of the Vinum root volume:
        in a mirrored setup, this would mount only one piece of the
        mirrored root device.  If this filesystem is to be
        mounted read-write later on, it is necessary to remove the
        other plex(es) of the Vinum root volume since these plexes
        would otherwise carry inconsistent data.</para>
</sect3>
<sect3>
<title>Only Primary Bootstrap Loads</title>
<para>If <filename>/boot/loader</filename> fails to load, but
the primary bootstrap still loads (visible by a single dash
in the left column of the screen right after the boot
process starts), an attempt can be made to interrupt the
primary bootstrap at this point, using the
<keycap>space</keycap> key. This will make the bootstrap
stop in stage two, see <xref linkend="boot-boot1"/>. An
attempt can be made here to boot off an alternate partition,
like the partition containing the previous root filesystem
that has been moved away from <literal>"a"</literal>
above.</para>
</sect3>
<sect3 id="vinum-root-panic">
<title>Nothing Boots, the Bootstrap
Panics</title>
        <para>This situation will happen if the bootstrap has been
          destroyed by the Vinum installation.  Unfortunately, Vinum
          currently leaves only 4 KB at the beginning of
its partition free before starting to write its Vinum header
information. However, the stage one and two bootstraps plus
the bsdlabel embedded between them currently require 8 KB.
So if a Vinum partition was started at offset 0 within a
slice or disk that was meant to be bootable, the Vinum setup
will trash the bootstrap.</para>
<para>Similarly, if the above situation has been recovered,
for example by booting from a <quote>Fixit</quote> medium,
and the bootstrap has been re-installed using
<command>bsdlabel -B</command> as described in <xref
linkend="boot-boot1"/>, the bootstrap will trash the Vinum
header, and Vinum will no longer find its disk(s). Though
no actual Vinum configuration data or data in Vinum volumes
will be trashed by this, and it would be possible to recover
        all the data by entering exactly the same Vinum configuration
        data again, the situation is hard to fix.  It would
        be necessary to move the entire Vinum partition by at least
        4 KB, in order to have the Vinum header and the system
        bootstrap no longer collide.</para>
</sect3>
</sect2>
</sect1>
</chapter>