1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
|
<?xml version="1.0" encoding="iso-8859-1"?>
<!-- Copyright (c) 1998, 1999 Nik Clayton, All rights reserved.
Redistribution and use in source (SGML DocBook) and 'compiled' forms
(SGML, HTML, PDF, PostScript, RTF and so forth) with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code (SGML DocBook) must retain the above
copyright notice, this list of conditions and the following
disclaimer as the first lines of this file unmodified.
2. Redistributions in compiled form (transformed to other DTDs,
converted to PDF, PostScript, RTF and other formats) must reproduce
the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials
provided with the distribution.
THIS DOCUMENTATION IS PROVIDED BY NIK CLAYTON "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL NIK CLAYTON BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS DOCUMENTATION, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-->
<chapter xmlns="http://docbook.org/ns/docbook" xmlns:xlink="http://www.w3.org/1999/xlink" version="5.0" xml:id="xml-primer">
<title>XML Primer</title>
<para>Most <acronym>FDP</acronym> documentation is written with
markup languages based on <acronym>XML</acronym>. This chapter
explains what that means, how to read and understand the
documentation source, and the <acronym>XML</acronym> techniques
used.</para>
<para>Portions of this section were inspired by Mark Galassi's
<link xlink:href="http://www.galassi.org/mark/mydocs/docbook-intro/docbook-intro.html">Get
Going With DocBook</link>.</para>
<sect1 xml:id="xml-primer-overview">
<title>Overview</title>
<para>In the early days of computing, electronic text was
simple. There were a few character sets like
<acronym>ASCII</acronym> or <acronym>EBCDIC</acronym>, but that
was about it. Text was text, and what you saw really was what
you got. No frills, no formatting, no intelligence.</para>
<para>Inevitably, this was not enough. When text is in a
machine-usable format, machines are expected to be able to use
and manipulate it intelligently. Authors want to indicate that
certain phrases should be emphasized, or added to a glossary, or
made into hyperlinks. Filenames could be shown in a
<quote>typewriter</quote> style font for viewing on screen, but
as <quote>italics</quote> when printed, or any of a myriad of
other options for presentation.</para>
<para>It was once hoped that Artificial Intelligence (AI) would
make this easy. The computer would read the document and
automatically identify key phrases, filenames, text that the
reader should type in, examples, and more. Unfortunately, real
life has not happened quite like that, and computers still
require assistance before they can meaningfully process
text.</para>
<para>More precisely, they need help identifying what is what.
Consider this text:</para>
<blockquote>
<para>To remove <filename>/tmp/foo</filename>, use
&man.rm.1;.</para>
<screen>&prompt.user; <userinput>rm /tmp/foo</userinput></screen>
</blockquote>
<para>It is easy to see which parts are filenames, which are
commands to be typed in, which parts are references to manual
pages, and so on. But the computer processing the document
cannot. For this we need markup.</para>
<para><quote>Markup</quote> is commonly used to describe
<quote>adding value</quote> or <quote>increasing cost</quote>.
The term takes on both these meanings when applied to text.
Markup is additional text included in the document,
distinguished from the document's content in some way, so that
programs that process the document can read the markup and use
it when making decisions about the document. Editors can hide
the markup from the user, so the user is not distracted by
it.</para>
<para>The extra information stored in the markup
<emphasis>adds value</emphasis> to the document. Adding the
markup to the document must typically be done by a
person&#8212;after all, if computers could recognize the text
sufficiently well to add the markup then there would be no need
to add it in the first place. This
<emphasis>increases the cost</emphasis> (the effort required) to
create the document.</para>
<para>The previous example is actually represented in this
document like this:</para>
<programlisting><tag class="starttag">para</tag>To remove <tag class="starttag">filename</tag>/tmp/foo<tag class="endtag">filename</tag>, use &amp;man.rm.1;.<tag class="endtag">para</tag>
<tag class="starttag">screen</tag>&amp;prompt.user; <tag class="starttag">userinput</tag>rm /tmp/foo<tag class="endtag">userinput</tag><tag class="endtag">screen</tag></programlisting>
<para>The markup is clearly separate from the content.</para>
<para>Markup languages define what the markup means and how it
should be interpreted.</para>
<para>Of course, one markup language might not be enough. A
markup language for technical documentation has very different
requirements than a markup language that is intended for cookery
recipes. This, in turn, would be very different from a markup
language used to describe poetry. What is really needed is a
first language used to write these other markup languages. A
<emphasis>meta markup language</emphasis>.</para>
<para>This is exactly what the eXtensible Markup
Language (<acronym>XML</acronym>) is. Many markup languages
have been written in <acronym>XML</acronym>, including the two
most used by the <acronym>FDP</acronym>,
<acronym>XHTML</acronym> and DocBook.</para>
<para>Each language definition is more properly called a grammar,
vocabulary, schema or Document Type Definition
(<acronym>DTD</acronym>). There are various languages to
specify an <acronym>XML</acronym> grammar, or
<emphasis>schema</emphasis>.</para>
<para xml:id="xml-primer-validating">A schema is a
<emphasis>complete</emphasis> specification of all the elements
that are allowed to appear, the order in which they should
appear, which elements are mandatory, which are optional, and so
forth. This makes it possible to write an
<acronym>XML</acronym> <emphasis>parser</emphasis> which reads
in both the schema and a document which claims to conform to the
schema. The parser can then confirm whether or not all the
elements required by the vocabulary are in the document in the
right order, and whether there are any errors in the markup.
This is normally referred to as
<quote>validating the document</quote>.</para>
<note>
<para>Validation confirms that the choice of
elements, their ordering, and so on, conforms to that listed
in the grammar. It does <emphasis>not</emphasis> check
whether <emphasis>appropriate</emphasis> markup has been used
for the content. If all the filenames in a document were
marked up as function names, the parser would not flag this as
an error (assuming, of course, that the schema defines
elements for filenames and functions, and that they are
allowed to appear in the same place).</para>
</note>
<para>Most contributions to the Documentation
Project will be content marked up in either
<acronym>XHTML</acronym> or DocBook, rather than alterations to
the schemas. For this reason, this book will not touch on how
to write a vocabulary.</para>
</sect1>
<sect1 xml:id="xml-primer-elements">
<title>Elements, Tags, and Attributes</title>
<para>All the vocabularies written in <acronym>XML</acronym> share
certain characteristics. This is hardly surprising, as the
philosophy behind <acronym>XML</acronym> will inevitably show
through. One of the most obvious manifestations of this
philosophy is that of <emphasis>content</emphasis> and
<emphasis>elements</emphasis>.</para>
<para>Documentation, whether it is a single web page, or a lengthy
book, is considered to consist of content. This content is then
divided and further subdivided into elements. The purpose of
adding markup is to name and identify the boundaries of these
elements for further processing.</para>
<para>For example, consider a typical book. At the very top
level, the book is itself an element. This <quote>book</quote>
element obviously contains chapters, which can be considered to
be elements in their own right. Each chapter will contain more
elements, such as paragraphs, quotations, and footnotes. Each
paragraph might contain further elements, identifying content
that was direct speech, or the name of a character in the
story.</para>
<para>It may be helpful to think of this as
<quote>chunking</quote> content. At the very top level is one
chunk, the book. Look a little deeper, and there are more
chunks, the individual chapters. These are chunked further into
paragraphs, footnotes, character names, and so on.</para>
<para>Notice how this differentiation between different elements
of the content can be made without resorting to any
<acronym>XML</acronym> terms. It really is surprisingly
straightforward. This could be done with a highlighter pen and
a printout of the book, using different colors to indicate
different chunks of content.</para>
<para>Of course, we do not have an electronic highlighter pen, so
we need some other way of indicating which element each piece of
content belongs to. In languages written in
<acronym>XML</acronym> (<acronym>XHTML</acronym>, DocBook, et
al.) this is done by means of <emphasis>tags</emphasis>.</para>
<para>A tag is used to identify where a particular element starts,
and where the element ends. <emphasis>The tag is not part of
the element itself</emphasis>. Because each grammar was
normally written to mark up specific types of information, each
one will recognize different elements, and will therefore have
different names for the tags.</para>
<para>For an element called
<replaceable>element-name</replaceable> the start tag will
normally look like <tag class="starttag"><replaceable>element-name</replaceable></tag>.
The corresponding closing tag for this element is <tag class="endtag"><replaceable>element-name</replaceable></tag>.</para>
<example>
<title>Using an Element (Start and End Tags)</title>
<para><acronym>XHTML</acronym> has an element for indicating
that the content enclosed by the element is a paragraph,
called <tag>p</tag>.</para>
<programlisting><tag class="starttag">p</tag>This is a paragraph. It starts with the start tag for
the 'p' element, and it will end with the end tag for the 'p'
element.<tag class="endtag">p</tag>
<tag class="starttag">p</tag>This is another paragraph. But this one is much shorter.<tag class="endtag">p</tag></programlisting>
</example>
<para>Some elements have no content. For example, in
<acronym>XHTML</acronym>, a horizontal line can be included in
the document. For these <quote>empty</quote> elements,
<acronym>XML</acronym> introduced a shorthand form that is
completely equivalent to the two-tag version:</para>
<example>
<title>Using an Element Without Content</title>
<para><acronym>XHTML</acronym> has an element for indicating a
horizontal rule, called <tag>hr</tag>. This element
does not wrap content, so it looks like this:</para>
<programlisting><tag class="starttag">p</tag>One paragraph.<tag class="endtag">p</tag>
<tag class="starttag">hr</tag><tag class="endtag">hr</tag>
<tag class="starttag">p</tag>This is another paragraph. A horizontal rule separates this
from the previous paragraph.<tag class="endtag">p</tag></programlisting>
<para>The shorthand version consists of a single tag:</para>
<programlisting><tag class="starttag">p</tag>One paragraph.<tag class="endtag">p</tag>
<tag class="emptytag">hr</tag>
<tag class="starttag">p</tag>This is another paragraph. A horizontal rule separates this
from the previous paragraph.<tag class="endtag">p</tag></programlisting>
</example>
<para>As shown above, elements can contain other elements. In the
book example earlier, the book element contained all the chapter
elements, which in turn contained all the paragraph elements,
and so on.</para>
<example>
<title>Elements Within Elements; <tag>em</tag></title>
<programlisting><tag class="starttag">p</tag>This is a simple <tag class="starttag">em</tag>paragraph<tag class="endtag">em</tag> where some
of the <tag class="starttag">em</tag>words<tag class="endtag">em</tag> have been <tag class="starttag">em</tag>emphasized<tag class="endtag">em</tag>.<tag class="endtag">p</tag></programlisting>
</example>
<para>The grammar consists of rules that describe which elements
can contain other elements, and exactly what they can
contain.</para>
<important>
<para>People often confuse the terms tags and elements, and use
the terms as if they were interchangeable. They are
not.</para>
<para>An element is a conceptual part of your document. An
element has a defined start and end. The tags mark where the
element starts and ends.</para>
<para>When this document (or anyone else knowledgeable about
<acronym>XML</acronym>) refers to
<quote>the <tag class="starttag">p</tag> tag</quote>
they mean the literal text consisting of the three characters
<literal>&lt;</literal>, <literal>p</literal>, and
<literal>&gt;</literal>. But the phrase
<quote>the <tag>p</tag> element</quote> refers to the
whole element.</para>
<para>This distinction <emphasis>is</emphasis> very subtle. But
keep it in mind.</para>
</important>
<para>Elements can have attributes. An attribute has a name and a
value, and is used for adding extra information to the element.
This might be information that indicates how the content should
be rendered, or might be something that uniquely identifies that
occurrence of the element, or it might be something else.</para>
<para>An element's attributes are written
<emphasis>inside</emphasis> the start tag for that element, and
take the form
<literal><replaceable>attribute-name</replaceable>="<replaceable>attribute-value</replaceable>"</literal>.</para>
<para>In <acronym>XHTML</acronym>, the <tag>p</tag>
element has an attribute called
<tag class="attribute">align</tag>, which suggests an
alignment (justification) for the paragraph to the program
displaying the <acronym>XHTML</acronym>.</para>
<para>The <tag class="attribute">align</tag> attribute can
take one of four defined values, <literal>left</literal>,
<literal>center</literal>, <literal>right</literal> and
<literal>justify</literal>. If the attribute is not specified
then the default is <literal>left</literal>.</para>
<example>
<title>Using an Element with an Attribute</title>
<programlisting><tag class="starttag">p align="left"</tag>The inclusion of the align attribute
on this paragraph was superfluous, since the default is left.<tag class="endtag">p</tag>
<tag class="starttag">p align="center"</tag>This may appear in the center.<tag class="endtag">p</tag></programlisting>
</example>
<para>Some attributes only take specific values, such as
<literal>left</literal> or <literal>justify</literal>. Others
allow any value.</para>
<example>
<title>Single Quotes Around Attributes</title>
<programlisting><tag class="starttag">p align='right'</tag>I am on the right!<tag class="endtag">p</tag></programlisting>
</example>
<para>Attribute values in <acronym>XML</acronym> must be enclosed
in either single or double quotes. Double quotes are
traditional. Single quotes are useful when the attribute value
contains double quotes.</para>
<para>Information about attributes, elements, and tags is stored
in catalog files. The Documentation Project uses standard
DocBook catalogs and includes additional catalogs for
&os;-specific features. Paths to the catalog files are defined
in an environment variable so they can be found by the document
build tools.</para>
<sect2 xml:id="xml-primer-elements-to-do">
<title>To Do&#8230;</title>
<para>Before running the examples in this document, install
<package>textproc/docproj</package> from
the &os; Ports Collection. This is a
<emphasis>meta-port</emphasis> that downloads and installs
the standard programs and supporting files needed by the
Documentation Project. &man.csh.1; users must use
<command>rehash</command> for the shell to recognize new
programs after they have been installed, or log out
and then log back in again.</para>
<procedure>
<step>
<para>Create <filename>example.xml</filename>, and enter
this text:</para>
<programlisting><tag class="starttag">!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"</tag>
<tag class="starttag">html xmlns="http://www.w3.org/1999/xhtml"</tag>
<tag class="starttag">head</tag>
<tag class="starttag">title</tag>An Example XHTML File<tag class="endtag">title</tag>
<tag class="endtag">head</tag>
<tag class="starttag">body</tag>
<tag class="starttag">p</tag>This is a paragraph containing some text.<tag class="endtag">p</tag>
<tag class="starttag">p</tag>This paragraph contains some more text.<tag class="endtag">p</tag>
<tag class="starttag">p align="right"</tag>This paragraph might be right-justified.<tag class="endtag">p</tag>
<tag class="endtag">body</tag>
<tag class="endtag">html</tag></programlisting>
</step>
<step>
<para>Try to validate this file using an
<acronym>XML</acronym> parser.</para>
<para><package>textproc/docproj</package>
includes the <command>xmllint</command>
<link linkend="xml-primer-validating">validating
parser</link>.</para>
<para>Use <command>xmllint</command> to validate the
document:</para>
<screen>&prompt.user; <userinput>xmllint --valid --noout example.xml</userinput></screen>
<para><command>xmllint</command> returns without displaying
any output, showing that the document validated
successfully.</para>
</step>
<step>
<para>See what happens when required elements are omitted.
Delete the line with the
<tag class="starttag">title</tag> and
<tag class="endtag">title</tag> tags, and re-run
the validation.</para>
<screen>&prompt.user; <userinput>xmllint --valid --noout example.xml</userinput>
example.xml:5: element head: validity error : Element head content does not follow the DTD, expecting ((script | style | meta | link | object | isindex)* , ((title , (script | style | meta | link | object | isindex)* , (base , (script | style | meta | link | object | isindex)*)?) | (base , (script | style | meta | link | object | isindex)* , title , (script | style | meta | link | object | isindex)*))), got ()</screen>
<para>This shows that the validation error comes from the
<replaceable>fifth</replaceable> line of the
<filename>example.xml</filename> file and that the
content of the <tag>head</tag> element is
the part which does not follow the rules of the
<acronym>XHTML</acronym> grammar.</para>
<para>Then <command>xmllint</command> shows the line where
the error was found and marks the exact character position
with a <literal>^</literal> sign.</para>
</step>
<step>
<para>Replace the <tag>title</tag> element.</para>
</step>
</procedure>
</sect2>
</sect1>
<sect1 xml:id="xml-primer-doctype-declaration">
<title>The DOCTYPE Declaration</title>
<para>The beginning of each document can specify the name of the
<acronym>DTD</acronym> to which the document conforms. This
DOCTYPE declaration is used by <acronym>XML</acronym> parsers to
identify the <acronym>DTD</acronym> and ensure that the document
does conform to it.</para>
<para>A typical declaration for a document written to conform with
version 1.0 of the <acronym>XHTML</acronym>
<acronym>DTD</acronym> looks like this:</para>
<programlisting><tag class="starttag">!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"</tag></programlisting>
<para>That line contains a number of different components.</para>
<variablelist>
<varlistentry>
<term><literal>&lt;!</literal></term>
<listitem>
<para>The <emphasis>indicator</emphasis> shows
this is an <acronym>XML</acronym> declaration.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>DOCTYPE</literal></term>
<listitem>
<para>Shows that this is an <acronym>XML</acronym>
declaration of the document type.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>html</literal></term>
<listitem>
<para>Names the first
<link linkend="xml-primer-elements">element</link> that
will appear in the document.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"</literal></term>
<listitem>
<para>Lists the Formal Public Identifier
(<acronym>FPI</acronym>)
<indexterm>
<primary>Formal Public Identifier</primary>
</indexterm>
for the <acronym>DTD</acronym> to which this document
conforms. The <acronym>XML</acronym> parser uses this to
find the correct <acronym>DTD</acronym> when processing
this document.</para>
<para><literal>PUBLIC</literal> is not a part of the
<acronym>FPI</acronym>, but indicates to the
<acronym>XML</acronym> processor how to find the
<acronym>DTD</acronym> referenced in the
<acronym>FPI</acronym>. Other ways of telling the
<acronym>XML</acronym> parser how to find the
<acronym>DTD</acronym> are shown <link linkend="xml-primer-fpi-alternatives">later</link>.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"</literal></term>
<listitem>
<para>A local filename or a <acronym>URL</acronym> to find
the <acronym>DTD</acronym>.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>&gt;</literal></term>
<listitem>
<para>Ends the declaration and returns to the
document.</para>
</listitem>
</varlistentry>
</variablelist>
<sect2 xml:id="doctype-declaration-fpi">
<title>Formal Public Identifiers
(<acronym>FPI</acronym>s)</title>
<indexterm significance="preferred">
<primary>Formal Public Identifier</primary>
</indexterm>
<note>
<para>It is not necessary to know this, but it is useful
background, and might help debug problems when the
<acronym>XML</acronym> processor cannot locate the
<acronym>DTD</acronym>.</para>
</note>
<para><acronym>FPI</acronym>s must follow a specific
syntax:</para>
<programlisting>"<replaceable>Owner</replaceable>//<replaceable>Keyword</replaceable> <replaceable>Description</replaceable>//<replaceable>Language</replaceable>"</programlisting>
<variablelist>
<varlistentry>
<term><replaceable>Owner</replaceable></term>
<listitem>
<para>The owner of the <acronym>FPI</acronym>.</para>
<para>The beginning of the string identifies the owner of
the <acronym>FPI</acronym>. For example, the
<acronym>FPI</acronym>
<literal>"ISO 8879:1986//ENTITIES Greek
Symbols//EN"</literal> lists
<literal>ISO 8879:1986</literal> as being the owner for
the set of entities for Greek symbols.
<acronym>ISO</acronym> 8879:1986 is the International
Organization for Standardization
(<acronym>ISO</acronym>) number for the
<acronym>SGML</acronym> standard, the predecessor (and a
superset) of <acronym>XML</acronym>.</para>
<para>Otherwise, this string will either look like
<literal>-//<replaceable>Owner</replaceable></literal>
or
<literal>+//<replaceable>Owner</replaceable></literal>
(notice the only difference is the leading
<literal>+</literal> or <literal>-</literal>).</para>
<para>If the string starts with <literal>-</literal> then
the owner information is unregistered, with a
<literal>+</literal> identifying it as
registered.</para>
<para><acronym>ISO</acronym> 9070:1991 defines how
registered names are generated. A name might be derived
from the number of an <acronym>ISO</acronym>
publication, an <acronym>ISBN</acronym> code, or an
organization code assigned according to
<acronym>ISO</acronym> 6523. Additionally, a
registration authority could be created in order to
assign registered names. The <acronym>ISO</acronym>
council delegated this to the American National
Standards Institute (<acronym>ANSI</acronym>).</para>
<para>Because the &os; Project has not been registered,
the owner string is <literal>-//&os;</literal>. As seen
in the example, the <acronym>W3C</acronym> is not a
registered owner either.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><replaceable>Keyword</replaceable></term>
<listitem>
<para>There are several keywords that indicate the type of
information in the file. Some of the most common
keywords are <literal>DTD</literal>,
<literal>ELEMENT</literal>, <literal>ENTITIES</literal>,
and <literal>TEXT</literal>. <literal>DTD</literal> is
used only for <acronym>DTD</acronym> files, while
<literal>ELEMENT</literal> is usually used for
<acronym>DTD</acronym> fragments that contain only
entity or element declarations. <literal>TEXT</literal>
is used for <acronym>XML</acronym> content (text and
tags).</para>
</listitem>
</varlistentry>
<varlistentry>
<term><replaceable>Description</replaceable></term>
<listitem>
<para>Any description can be given for the contents
of this file. This may include version numbers or any
short text that is meaningful and unique for the
<acronym>XML</acronym> system.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><replaceable>Language</replaceable></term>
<listitem>
<para>An <acronym>ISO</acronym> two-character code that
identifies the native language for the file.
<literal>EN</literal> is used for English.</para>
</listitem>
</varlistentry>
</variablelist>
<sect3 xml:id="doctype-declaration-fpi-catalog">
<title><filename>catalog</filename> Files</title>
<para>With the syntax above, an <acronym>XML</acronym>
processor needs to have some way of turning the
<acronym>FPI</acronym> into the name of the file containing
the <acronym>DTD</acronym>. A catalog file (typically
called <filename>catalog</filename>) contains lines that map
<acronym>FPI</acronym>s to filenames. For example, if the
catalog file contained the line:</para>
<!-- XXX: mention XML catalog or maybe replace this totally and only cover XML catalog -->
<programlisting>PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "1.0/transitional.dtd"</programlisting>
<para>The <acronym>XML</acronym> processor knows that the
<acronym>DTD</acronym> is called
<filename>transitional.dtd</filename> in the
<filename>1.0</filename> subdirectory of the directory that
held <filename>catalog</filename>.</para>
<para>Examine the contents of
<filename>/usr/local/share/xml/dtd/xhtml/catalog.xml</filename>.
This is the catalog file for the <acronym>XHTML</acronym>
<acronym>DTD</acronym>s that were installed as part of the
<package>textproc/docproj</package> port.</para>
</sect3>
</sect2>
<sect2 xml:id="xml-primer-fpi-alternatives">
<title>Alternatives to <acronym>FPI</acronym>s</title>
<para>Instead of using an <acronym>FPI</acronym> to indicate the
<acronym>DTD</acronym> to which the document conforms (and
therefore, which file on the system contains the
<acronym>DTD</acronym>), the filename can be explicitly
specified.</para>
<para>The syntax is slightly different:</para>
<programlisting><tag class="starttag">!DOCTYPE html SYSTEM "/path/to/file.dtd"</tag></programlisting>
<para>The <literal>SYSTEM</literal> keyword indicates that the
<acronym>XML</acronym> processor should locate the
<acronym>DTD</acronym> in a system specific fashion. This
typically (but not always) means the <acronym>DTD</acronym>
will be provided as a filename.</para>
<para>Using <acronym>FPI</acronym>s is preferred for reasons of
portability. If the <literal>SYSTEM</literal> identifier is
used, then the <acronym>DTD</acronym> must be provided and
kept in the same location for everyone.</para>
</sect2>
</sect1>
<sect1 xml:id="xml-primer-xml-escape">
<title>Escaping Back to <acronym>XML</acronym></title>
<para>Some of the underlying <acronym>XML</acronym> syntax can be
useful within documents. For example, comments can be included
in the document, and will be ignored by the parser. Comments
are entered using <acronym>XML</acronym> syntax. Other uses for
<acronym>XML</acronym> syntax will be shown later.</para>
<para><acronym>XML</acronym> sections begin with a
<literal>&lt;!</literal> tag and end with a
<literal>&gt;</literal>. These sections contain instructions
for the parser rather than elements of the document. Everything
between these tags is <acronym>XML</acronym> syntax. The
<link linkend="xml-primer-doctype-declaration">DOCTYPE
declaration</link> shown earlier is an example of
<acronym>XML</acronym> syntax included in the document.</para>
</sect1>
<sect1 xml:id="xml-primer-comments">
<title>Comments</title>
<para>An <acronym>XML</acronym> document may contain comments.
They may appear anywhere as long as they are not inside tags.
They are even allowed in some locations inside the
<acronym>DTD</acronym> (e.g., between <link
linkend="xml-primer-entities">entity
declarations</link>).</para>
<para><acronym>XML</acronym> comments start with the string
<quote><literal>&lt;!--</literal></quote> and end with the
string <quote><literal>--&gt;</literal></quote>.</para>
<para>Here are some examples of valid <acronym>XML</acronym>
comments:</para>
<example>
<title><acronym>XML</acronym> Generic Comments</title>
<programlisting><!-- This is inside the comment -->
<!--This is another comment-->
<!-- This is how you
write multiline comments -->
<p>A simple <!-- Comment inside an element's content --> paragraph.</p></programlisting>
</example>
<para><acronym>XML</acronym> comments may contain any strings
except <quote><literal>--</literal></quote>:</para>
<example>
<title>Erroneous <acronym>XML</acronym> Comment</title>
<programlisting><!-- This comment--is wrong --></programlisting>
</example>
<sect2 xml:id="xml-primer-comments-to-do">
<title>To Do…</title>
<procedure>
<step>
<para>Add some comments to
<filename>example.xml</filename>, and check that the file
still validates using <command>xmllint</command>.</para>
</step>
<step>
<para>Add some invalid comments to
<filename>example.xml</filename>, and see the error
messages that <command>xmllint</command> gives when it
encounters an invalid comment.</para>
</step>
</procedure>
</sect2>
</sect1>
<sect1 xml:id="xml-primer-entities">
<title>Entities</title>
<para>Entities are a mechanism for assigning names to chunks of
content. As an <acronym>XML</acronym> parser processes a
document, any entities it finds are replaced by the content of
the entity.</para>
<para>This is a good way to have re-usable, easily changeable
chunks of content in <acronym>XML</acronym> documents. It is
also the only way to include one marked up file inside another
using <acronym>XML</acronym>.</para>
<para>There are two types of entities for two different
situations: <emphasis>general entities</emphasis> and
<emphasis>parameter entities</emphasis>.</para>
<sect2 xml:id="xml-primer-general-entities">
<title>General Entities</title>
<para>General entities are used to assign names to reusable
chunks of text. These entities can only be used in the
document. They cannot be used in an
<acronym>XML</acronym> context.</para>
<para>To include the text of a general entity in the document,
include
<literal>&amp;<replaceable>entity-name</replaceable>;</literal>
in the text. For example, consider a general entity called
<literal>current.version</literal> which expands to the
current version number of a product. To use it in the
document, write:</para>
<programlisting><tag class="starttag">para</tag>The current version of our product is
&current.version;.<tag class="endtag">para</tag></programlisting>
<para>When the version number changes, edit the definition of
the general entity, replacing the value. Then reprocess the
document.</para>
<para>General entities can also be used to enter characters that
could not otherwise be included in an <acronym>XML</acronym>
document. For example, <literal>&lt;</literal> and
<literal>&amp;</literal> cannot normally appear in an
<acronym>XML</acronym> document. The <acronym>XML</acronym>
parser sees the <literal>&lt;</literal> symbol as the start of
a tag. Likewise, when the <literal>&amp;</literal> symbol is
seen, the next text is expected to be an entity name.</para>
<para>These symbols can be included by using two predefined
general entities: <literal>&amp;lt;</literal> and
<literal>&amp;amp;</literal>.</para>
<para>General entities can only be defined within an
<acronym>XML</acronym> context. Such definitions are usually
done immediately after the DOCTYPE declaration.</para>
<example>
<title>Defining General Entities</title>
<programlisting><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [
<!ENTITY current.version "3.0-RELEASE">
<!ENTITY last.version "2.2.7-RELEASE">
]></programlisting>
<para>The DOCTYPE declaration has been extended by adding an
opening square bracket after the system identifier. The two
entities are then defined over the next two lines, the
square bracket is closed, and then the DOCTYPE declaration
is closed.</para>
<para>The square brackets are necessary to indicate that the
DTD indicated by the DOCTYPE declaration is being
extended.</para>
</example>
</sect2>
<sect2 xml:id="xml-primer-parameter-entities">
<title>Parameter Entities</title>
<para>Parameter entities, like
<link linkend="xml-primer-general-entities">general
entities</link>, are used to assign names to reusable chunks
of text. But parameter entities can only be used within an
<link linkend="xml-primer-xml-escape">XML
context</link>.</para>
<para>Parameter entity definitions are similar to those for
general entities. However, parameter entities are included
with
<literal>%<replaceable>entity-name</replaceable>;</literal>.
The definition also includes the <literal>%</literal> between
the <literal>ENTITY</literal> keyword and the name of the
entity.</para>
<para>For a mnemonic, think
<quote><emphasis>P</emphasis>arameter entities use the
<emphasis>P</emphasis>ercent symbol</quote>.</para>
<example>
<title>Defining Parameter Entities</title>
<programlisting><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [
<!ENTITY % param.some "some">
<!ENTITY % param.text "text">
<!ENTITY % param.new "%param.some more %param.text">
<!-- %param.new now contains "some more text" -->
]></programlisting>
</example>
</sect2>
<sect2 xml:id="xml-primer-to-do">
<title>To Do…</title>
<procedure>
<step>
<para>Add a general entity to
<filename>example.xml</filename>.</para>
<programlisting><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [
<!ENTITY version "1.1">
]>
<tag class="starttag">html xmlns="http://www.w3.org/1999/xhtml"</tag>
<tag class="starttag">head</tag>
<tag class="starttag">title</tag>An Example XHTML File<tag class="endtag">title</tag>
<tag class="endtag">head</tag>
<!-- There may be some comments in here as well -->
<tag class="starttag">body</tag>
<tag class="starttag">p</tag>This is a paragraph containing some text.<tag class="endtag">p</tag>
<tag class="starttag">p</tag>This paragraph contains some more text.<tag class="endtag">p</tag>
<tag class="starttag">p align="right"</tag>This paragraph might be right-justified.<tag class="endtag">p</tag>
<tag class="starttag">p</tag>The current version of this document is: &version;<tag class="endtag">p</tag>
<tag class="endtag">body</tag>
<tag class="endtag">html</tag></programlisting>
</step>
<step>
<para>Validate the document using
<command>xmllint</command>.</para>
</step>
<step>
<para>Load <filename>example.xml</filename> into a web
browser. It may have to be copied to
<filename>example.html</filename> before the browser
recognizes it as an <acronym>XHTML</acronym>
document.</para>
<para>Older browsers with simple parsers may not render this
file as expected. The entity reference
<literal>&version;</literal> may not be replaced by
the version number, or the <acronym>XML</acronym> context
closing <literal>]></literal> may not be recognized and
instead shown in the output.</para>
</step>
<step>
<para>The solution is to <emphasis>normalize</emphasis> the
document with an <acronym>XML</acronym> normalizer. The
normalizer reads valid <acronym>XML</acronym> and writes
equally valid <acronym>XML</acronym> which has been
transformed in some way. One way the normalizer
transforms the input is by expanding all the entity
references in the document, replacing the entities with
the text that they represent.</para>
<para><command>xmllint</command> can be used for this. It
also has an option to drop the initial
<acronym>DTD</acronym> section so that the closing
<literal>]></literal> does not confuse browsers:</para>
<screen>&prompt.user; <userinput>xmllint --noent --dropdtd example.xml > example.html</userinput></screen>
<para>A normalized copy of the document with entities
expanded is produced in <filename>example.html</filename>,
ready to load into a web browser.</para>
</step>
</procedure>
</sect2>
</sect1>
<sect1 xml:id="xml-primer-include">
<title>Using Entities to Include Files</title>
<para>Both
<link linkend="xml-primer-general-entities">general</link> and
<link linkend="xml-primer-parameter-entities">parameter</link>
entities are particularly useful for including one file inside
another.</para>
<sect2 xml:id="xml-primer-include-using-gen-entities">
<title>Using General Entities to Include Files</title>
<para>Consider some content for an <acronym>XML</acronym> book
organized into files, one file per chapter, called
<filename>chapter1.xml</filename>,
<filename>chapter2.xml</filename>, and so forth, with a
<filename>book.xml</filename> that will contain these
chapters.</para>
<para>In order to use the contents of these files as the values
for entities, they are declared with the
<literal>SYSTEM</literal> keyword. This directs the
<acronym>XML</acronym> parser to include the contents of the
named file as the value of the entity.</para>
<example>
<title>Using General Entities to Include Files</title>
<programlisting><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [
<!ENTITY chapter.1 SYSTEM "chapter1.xml">
<!ENTITY chapter.2 SYSTEM "chapter2.xml">
<!ENTITY chapter.3 SYSTEM "chapter3.xml">
<!-- And so forth -->
]>
<tag class="starttag">html xmlns="http://www.w3.org/1999/xhtml"</tag>
<!-- Use the entities to load in the chapters -->
&chapter.1;
&chapter.2;
&chapter.3;
<tag class="endtag">html</tag></programlisting>
</example>
<warning>
<para>When using general entities to include other files
within a document, the files being included
(<filename>chapter1.xml</filename>,
<filename>chapter2.xml</filename>, and so on)
<emphasis>must not</emphasis> start with a DOCTYPE
declaration. This is a syntax error because entities are
low-level constructs and they are resolved before any
parsing happens.</para>
</warning>
</sect2>
<sect2 xml:id="xml-primer-include-parameter">
<title>Using Parameter Entities to Include Files</title>
<para>Parameter entities can only be used inside an
<acronym>XML</acronym> context. Including a file in an
<acronym>XML</acronym> context can be used
to ensure that general entities are reusable.</para>
<para>Suppose that there are many chapters in the document, and
these chapters were reused in two different books, each book
organizing the chapters in a different fashion.</para>
<para>The entities could be listed at the top of each book, but
that quickly becomes cumbersome to manage.</para>
<para>Instead, place the general entity definitions inside one
file, and use a parameter entity to include that file within
the document.</para>
<example>
<title>Using Parameter Entities to Include Files</title>
<para>Place the entity definitions in a separate file
called <filename>chapters.ent</filename> and
containing this text:</para>
<programlisting><!ENTITY chapter.1 SYSTEM "chapter1.xml">
<!ENTITY chapter.2 SYSTEM "chapter2.xml">
<!ENTITY chapter.3 SYSTEM "chapter3.xml"></programlisting>
<para>Create a parameter entity to refer to the contents
of the file. Then use the parameter entity to load the file
into the document, which will then make all the general
entities available for use. Then use the general entities
as before:</para>
<programlisting><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [
<!-- Define a parameter entity to load in the chapter general entities -->
<!ENTITY % chapters SYSTEM "chapters.ent">
<!-- Now use the parameter entity to load in this file -->
%chapters;
]>
<tag class="starttag">html xmlns="http://www.w3.org/1999/xhtml"</tag>
&chapter.1;
&chapter.2;
&chapter.3;
<tag class="endtag">html</tag></programlisting>
</example>
</sect2>
<sect2 xml:id="xml-primer-include-parameter-to-do">
<title>To Do…</title>
<sect3 xml:id="xml-primer-include-general-entities-include">
<title>Use General Entities to Include Files</title>
<procedure>
<step>
<para>Create three files, <filename>para1.xml</filename>,
<filename>para2.xml</filename>, and
<filename>para3.xml</filename>.</para>
<para>Put content like this in each file:</para>
<programlisting><tag class="starttag">p</tag>This is the first paragraph.<tag class="endtag">p</tag></programlisting>
</step>
<step>
<para>Edit <filename>example.xml</filename> so that it
looks like this:</para>
<programlisting><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [
<!ENTITY version "1.1">
<!ENTITY para1 SYSTEM "para1.xml">
<!ENTITY para2 SYSTEM "para2.xml">
<!ENTITY para3 SYSTEM "para3.xml">
]>
<tag class="starttag">html xmlns="http://www.w3.org/1999/xhtml"</tag>
<tag class="starttag">head</tag>
<tag class="starttag">title</tag>An Example XHTML File<tag class="endtag">title</tag>
<tag class="endtag">head</tag>
<tag class="starttag">body</tag>
<tag class="starttag">p</tag>The current version of this document is: &version;<tag class="endtag">p</tag>
&para1;
&para2;
&para3;
<tag class="endtag">body</tag>
<tag class="endtag">html</tag></programlisting>
</step>
<step>
<para>Produce <filename>example.html</filename> by
normalizing <filename>example.xml</filename>.</para>
<screen>&prompt.user; <userinput>xmllint --dropdtd --noent example.xml > example.html</userinput></screen>
</step>
<step>
<para>Load <filename>example.html</filename> into the web
browser and confirm that the
<filename>para<replaceable>n</replaceable>.xml</filename>
files have been included in
<filename>example.html</filename>.</para>
</step>
</procedure>
</sect3>
<sect3 xml:id="xml-primer-include-parameter-entities-include">
<title>Use Parameter Entities to Include Files</title>
<note>
<para>The previous steps must be completed before this
step.</para>
</note>
<procedure>
<step>
<para>Edit <filename>example.xml</filename> so that it
looks like this:</para>
<programlisting><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [
<!ENTITY % entities SYSTEM "entities.ent"> %entities;
]>
<tag class="starttag">html xmlns="http://www.w3.org/1999/xhtml"</tag>
<tag class="starttag">head</tag>
<tag class="starttag">title</tag>An Example XHTML File<tag class="endtag">title</tag>
<tag class="endtag">head</tag>
<tag class="starttag">body</tag>
<tag class="starttag">p</tag>The current version of this document is: &version;<tag class="endtag">p</tag>
&para1;
&para2;
&para3;
<tag class="endtag">body</tag>
<tag class="endtag">html</tag></programlisting>
</step>
<step>
<para>Create a new file called
<filename>entities.ent</filename> with this
content:</para>
<programlisting><!ENTITY version "1.1">
<!ENTITY para1 SYSTEM "para1.xml">
<!ENTITY para2 SYSTEM "para2.xml">
<!ENTITY para3 SYSTEM "para3.xml"></programlisting>
</step>
<step>
<para>Produce <filename>example.html</filename> by
normalizing <filename>example.xml</filename>.</para>
<screen>&prompt.user; <userinput>xmllint --dropdtd --noent example.xml > example.html</userinput></screen>
</step>
<step>
<para>Load <filename>example.html</filename> into the web
browser and confirm that the
<filename>para<replaceable>n</replaceable>.xml</filename>
files have been included in
<filename>example.html</filename>.</para>
</step>
</procedure>
</sect3>
</sect2>
</sect1>
<sect1 xml:id="xml-primer-marked-sections">
<title>Marked Sections</title>
<para><acronym>XML</acronym> provides a mechanism to indicate that
particular pieces of the document should be processed in a
special way. These are called
<quote>marked sections</quote>.</para>
<example>
<title>Structure of a Marked Section</title>
<programlisting><![<replaceable>KEYWORD</replaceable>[
Contents of marked section
]]></programlisting>
</example>
<para>As expected of an <acronym>XML</acronym> construct, a marked
section starts with <literal>&lt;!</literal>.</para>
<para>The first square bracket begins the marked section.</para>
<para><replaceable>KEYWORD</replaceable> describes how this marked
section is to be processed by the parser.</para>
<para>The second square bracket indicates the start of the
marked section's content.</para>
<para>The marked section is finished by closing the two square
brackets, and then returning to the document context from the
<acronym>XML</acronym> context with
<literal>></literal>.</para>
<sect2 xml:id="xml-primer-marked-section-keywords">
<title>Marked Section Keywords</title>
<sect3 xml:id="xml-primer-cdata">
<title><literal>CDATA</literal></title>
<para>These keywords denote the marked section's
<emphasis>content model</emphasis>, and allow you to change
it from the default.</para>
<para>When an <acronym>XML</acronym> parser is processing a
document, it keeps track of the
<quote>content model</quote>.</para>
<para>The content model describes the
content the parser is expecting to see and what it will do
with that content.</para>
<para>The <literal>CDATA</literal> content model is one of the
most useful.</para>
<para><literal>CDATA</literal> is for
<quote>Character Data</quote>. When the parser is in this
content model, it expects to see only characters. In this
model the <literal>&lt;</literal> and
<literal>&amp;</literal> symbols lose their special status,
and will be treated as ordinary characters.</para>
<note>
<para>When using <literal>CDATA</literal> in examples of
text marked up in <acronym>XML</acronym>, remember that
the content of <literal>CDATA</literal> is not validated.
The included text must be checked by other means. For
example, the content could be written in another document,
validated, and then pasted into the
<literal>CDATA</literal> section.</para>
</note>
<example>
<title>Using a <literal>CDATA</literal> Marked
Section</title>
<programlisting><tag class="starttag">para</tag>Here is an example of how to include some text that contains
many <tag class="starttag">literal</tag>&lt;<tag class="endtag">literal</tag> and <tag class="starttag">literal</tag>&amp;<tag class="endtag">literal</tag>
symbols. The sample text is a fragment of
<tag class="starttag">acronym</tag>XHTML<tag class="endtag">acronym</tag>. The surrounding text (<tag class="starttag">para</tag> and
<tag class="starttag">programlisting</tag>) are from DocBook.<tag class="endtag">para</tag>
<tag class="starttag">programlisting</tag><![CDATA[<tag class="starttag">p</tag>This is a sample that shows some of the
elements within <tag class="starttag">acronym</tag>XHTML<tag class="endtag">acronym</tag>. Since the angle
brackets are used so many times, it is simpler to say the whole
example is a CDATA marked section than to use the entity names for
the left and right angle brackets throughout.<tag class="endtag">p</tag>
<tag class="starttag">ul</tag>
<tag class="starttag">li</tag>This is a listitem<tag class="endtag">li</tag>
<tag class="starttag">li</tag>This is a second listitem<tag class="endtag">li</tag>
<tag class="starttag">li</tag>This is a third listitem<tag class="endtag">li</tag>
<tag class="endtag">ul</tag>
<tag class="starttag">p</tag>This is the end of the example.<tag class="endtag">p</tag>]]><tag class="endtag">programlisting</tag></programlisting>
</example>
</sect3>
<sect3 xml:id="xml-primer-include-ignore">
<title><literal>INCLUDE</literal> and
<literal>IGNORE</literal></title>
<para>When the keyword is <literal>INCLUDE</literal>, then the
contents of the marked section will be processed. When the
keyword is <literal>IGNORE</literal>, the marked section
is ignored and will not be processed. It will not appear in
the output.</para>
<example>
<title>Using <literal>INCLUDE</literal> and
<literal>IGNORE</literal> in Marked Sections</title>
<programlisting><![INCLUDE[
This text will be processed and included.
]]>
<![IGNORE[
This text will not be processed or included.
]]></programlisting>
</example>
<para>By itself, this is not too useful. Text to be
removed from the document could be cut out, or wrapped
in comments.</para>
<para>It becomes more useful when controlled by
<link linkend="xml-primer-parameter-entities">parameter
entities</link>, yet this usage is limited
to entity files.</para>
<para>For example, suppose that documentation was produced in
a hard-copy version and an electronic version. Some extra
text is desired in the electronic version that should
not appear in the hard-copy version.</para>
<para>Create an entity file that defines general entities to
include each chapter and guard these definitions with a
parameter entity that can be set to either
<literal>INCLUDE</literal> or <literal>IGNORE</literal> to
control whether the entity is defined. After these
conditional general entity definitions, place one more
definition for each general entity to set them to an empty
value. This technique makes use of the fact that entity
definitions cannot be overridden; the first definition
always takes effect. So the inclusion of the chapter is
controlled with the corresponding parameter entity. Set to
<literal>INCLUDE</literal>, the first general entity
definition will be read and the second one will be ignored.
Set to <literal>IGNORE</literal>, the first definition will
be ignored and the second one will take effect.</para>
<example>
<title>Using a Parameter Entity to Control a Marked
Section</title>
<programlisting><!ENTITY % electronic.copy "INCLUDE">
<![%electronic.copy;[
<!ENTITY chap.preface SYSTEM "preface.xml">
]]>
<!ENTITY chap.preface ""></programlisting>
<para>When producing the hard-copy version, change the
parameter entity's definition to:</para>
<programlisting><!ENTITY % electronic.copy "IGNORE"></programlisting>
</example>
</sect3>
</sect2>
<sect2 xml:id="xml-primer-marked-section-keywords-to-do">
<title>To Do…</title>
<procedure>
<step>
<para>Modify <filename>entities.ent</filename> to
contain the following:</para>
<programlisting><!ENTITY version "1.1">
<!ENTITY % conditional.text "IGNORE">
<![%conditional.text;[
<!ENTITY para1 SYSTEM "para1.xml">
]]>
<!ENTITY para1 "">
<!ENTITY para2 SYSTEM "para2.xml">
<!ENTITY para3 SYSTEM "para3.xml"></programlisting>
</step>
<step>
<para>Normalize <filename>example.xml</filename>
and notice that the conditional text is not present in the
output document. Set the parameter entity
guard to <literal>INCLUDE</literal> and regenerate the
normalized document and the text will appear again.
This method makes sense if there are more
conditional chunks depending on the same condition. For
example, to control generating printed or online
text.</para>
</step>
</procedure>
</sect2>
</sect1>
<sect1 xml:id="xml-primer-conclusion">
<title>Conclusion</title>
<para>That is the conclusion of this <acronym>XML</acronym>
primer. For reasons of space and complexity, several things
have not been covered in depth (or at all). However, the
previous sections cover enough <acronym>XML</acronym> to
introduce the organization of the <acronym>FDP</acronym>
documentation.</para>
</sect1>
</chapter>
|