# path: root/contrib/file/magic/Magdir/ispell
# blob: 4bcb9f062e4f505c4983fc2ad205c6951c38175b

#------------------------------------------------------------------------------
# $File: ispell,v 1.10 2023/10/23 19:49:58 christos Exp $
# ispell:  file(1) magic for ispell, MySpell, Hunspell and aspell
#
# Ispell 3.0 has a magic of 0x9601 and ispell 3.1 has 0x9602.  This magic
# will match 0x9600 through 0x9603 in *both* little endian and big endian.
# (No other current magic entries collide.)
#
# Updated by Daniel Quinlan (quinlan@yggdrasil.com)
#
# Little-endian ispell 3.x hash file: the 16-bit little-endian word at offset 0,
# with its two low-order bits masked off, must equal 0x9600 (0x9600..0x9603 match).
0	leshort&0xFFFC	0x9600		little endian ispell
# In little-endian order the low-order byte of the magic word sits at offset 0;
# its value (0..3) selects the version text.
>0	byte		0		hash file (?),
>0	byte		1		3.0 hash file,
>0	byte		2		3.1 hash file,
>0	byte		3		hash file (?),
# Option word at offset 2.  Reading the table below: bit 0 set means 7-bit
# (clear means 8-bit), bit 1 set means capitalization, and bits 2-3 select the
# flag count (26, 52, 128 or 256).  Values above 0x0F print nothing.
>2	leshort		0x00		8-bit, no capitalization, 26 flags
>2	leshort		0x01		7-bit, no capitalization, 26 flags
>2	leshort		0x02		8-bit, capitalization, 26 flags
>2	leshort		0x03		7-bit, capitalization, 26 flags
>2	leshort		0x04		8-bit, no capitalization, 52 flags
>2	leshort		0x05		7-bit, no capitalization, 52 flags
>2	leshort		0x06		8-bit, capitalization, 52 flags
>2	leshort		0x07		7-bit, capitalization, 52 flags
>2	leshort		0x08		8-bit, no capitalization, 128 flags
>2	leshort		0x09		7-bit, no capitalization, 128 flags
>2	leshort		0x0A		8-bit, capitalization, 128 flags
>2	leshort		0x0B		7-bit, capitalization, 128 flags
>2	leshort		0x0C		8-bit, no capitalization, 256 flags
>2	leshort		0x0D		7-bit, no capitalization, 256 flags
>2	leshort		0x0E		8-bit, capitalization, 256 flags
>2	leshort		0x0F		7-bit, capitalization, 256 flags
# Word at offset 4 is reported as the string character count, only when > 0.
>4	leshort		>0		and %d string characters
# Big-endian ispell 3.x hash file: same test as the little-endian entry, but the
# 16-bit word at offset 0 is read in big-endian order.
0	beshort&0xFFFC	0x9600		big endian ispell
# In big-endian order the low-order byte of the magic word sits at offset 1;
# its value (0..3) selects the version text.
>1	byte		0		hash file (?),
>1	byte		1		3.0 hash file,
>1	byte		2		3.1 hash file,
>1	byte		3		hash file (?),
# Option word at offset 2.  Reading the table below: bit 0 set means 7-bit
# (clear means 8-bit), bit 1 set means capitalization, and bits 2-3 select the
# flag count (26, 52, 128 or 256).  Values above 0x0F print nothing.
>2	beshort		0x00		8-bit, no capitalization, 26 flags
>2	beshort		0x01		7-bit, no capitalization, 26 flags
>2	beshort		0x02		8-bit, capitalization, 26 flags
>2	beshort		0x03		7-bit, capitalization, 26 flags
>2	beshort		0x04		8-bit, no capitalization, 52 flags
>2	beshort		0x05		7-bit, no capitalization, 52 flags
>2	beshort		0x06		8-bit, capitalization, 52 flags
>2	beshort		0x07		7-bit, capitalization, 52 flags
>2	beshort		0x08		8-bit, no capitalization, 128 flags
>2	beshort		0x09		7-bit, no capitalization, 128 flags
>2	beshort		0x0A		8-bit, capitalization, 128 flags
>2	beshort		0x0B		7-bit, capitalization, 128 flags
>2	beshort		0x0C		8-bit, no capitalization, 256 flags
>2	beshort		0x0D		7-bit, no capitalization, 256 flags
>2	beshort		0x0E		8-bit, capitalization, 256 flags
>2	beshort		0x0F		7-bit, capitalization, 256 flags
# Word at offset 4 is reported as the string character count, only when > 0.
>4	beshort		>0		and %d string characters
# ispell 4.0 hash files  kromJx <kromJx@crosswinds.net>
# Ispell 4.0
# Matches the literal "ISPL" at offset 0, then prints five consecutive 4-byte
# header fields (offsets 4, 8, 12, 16, 20) unconditionally ("x" always matches).
# NOTE(review): the plain "long" type is read in host byte order per magic(5),
# so the printed numbers depend on the machine running file(1) - confirm
# whether a fixed-endian type (lelong/belong) would be more appropriate.
0       string          ISPL            ispell
>4      long            x               hash file version %d,
>8      long            x               lexletters %d,
>12     long            x               lexsize %d,
>16     long            x               hashsize %d,
>20     long            x               stblsize %d

# Summary:	affixes definition text files for Ispell/MySpell/Hunspell
# From:		Joerg Jenderek
# URL:		https://www.openoffice.org/lingucomponent/affix.readme
#		https://man.archlinux.org/man/hunspell.5.en
# Reference:	http://mark0.net/download/triddefs_xml.7z/defs/a/affix.trid.xml
# Note:		called "Affix file" by TrID
# variant starting with comment character
# Entry for text files whose first byte is '#' (0x23).  Nothing is printed
# here; the file is reported only if one of the nested checks below hands over
# to the named routine "spell-aff".
0		ubyte		0x23
# look for SET character command followed by whitespace (seems to be often 1 space character) like in:
# /usr/share/calibre/dictionaries/en-GB/en-GB.aff
>0		search/60459	SET\040
# skip scripts like /bin/affixcompress /bin/setupcon /bin/imdbpy2sql.py by checking for valid character SET argument
# (the "&0" offsets below test the text immediately following the matched "SET ")
# character SET argument like: UTF-8
>>&0		string		UTF-8
>>>0		use					spell-aff
# character SET argument like: ISO8859-1 - ISO8859-10 ISO8859-13 - ISO8859-15
>>&0		string		ISO8859-
>>>0		use				spell-aff
# character SET argument for Russian with Cyrillic alphabet like: KOI8-R KOI8-U
# no russian support until war against ukraine
# (the "use" line is commented out, so this branch prints nothing; since the
# SET search above has already matched, the level-1 "default" further down is
# presumably not tried for such files either - see magic(5) on "default")
>>&0		string		KOI8-
#>>>0		use				spell-aff
# character SET argument for languages with Cyrillic alphabet like: cp1251
# no cyrillic support until russia war against ukraine
# (same effect as for KOI8- above: the "use" line is commented out)
>>&0		string		cp1251
#>>>0		use				spell-aff
# character SET argument for Indian Script Code for Information Interchange (ISCII) like: ISCII-DEVANAGARI
>>&0		string		ISCII-
# no example found
>>>0		use				spell-aff
# not "real" affix rule files but found as unit tests inside thunderbird sources like:
# 1463589.aff 1695964.aff 2970240.aff
# (fallback branch: taken only when the "SET " search above did not match)
>0		default		x
# look for suffix SFX command followed by whitespace like in:
# 1695964.aff
>>0		search/164	SFX\040
>>>0		use				spell-aff
# if not real Hunspell/MySpell affix look for ispell variant
>>0		default		x
# URL:		https://manpages.debian.org/testing/ispell/ispell.5.en.html
# look for ispell declaration like in: /usr/lib/ispell/espanol.aff
>>>0		search/8251	defstringtype
# defstringtype declaration start with unique name (like "list" "lat" "utf8" "iso" "nroff" often like formatter name)
# followed by formatter name (like "nroff" "tex")
# followed by suffix list (like ".mm" ".ms" ".me" ".man" ".NeXT" ".txt" ".list")
#>>>>&1		string		x		DECLARATION=%s
>>>>0		use				spell-aff
# ispell variant without declaration like in: /usr/lib/ispell/bulgarian.aff /usr/lib/ispell/russian.aff
>>>0		default		x
# skip /etc/nilfs_cleanerd.conf by looking for ispell suffix section
>>>>0		search/3233	suffixes\n
>>>>>0		use				spell-aff
# variant starting with empty line and comment character at the beginning of 2nd line like in: /usr/lib/ispell/polish.aff
# The first two bytes must be 0x0A 0x23, i.e. a newline followed by '#'.
0		ubeshort	0x0a23
# skip /etc/discover-modprobe.conf by looking for ispell declaration
# (the search starts at offset 2, right after the two bytes tested above)
>2		search/3118	defstringtype
>>0		use				spell-aff
# starting with UTF-8 Byte Order Mark (BOM) https://en.wikipedia.org/wiki/Byte_order_mark
# Entry for files beginning with the three-byte UTF-8 BOM; prints nothing
# unless the SET search below hands over to "spell-aff".
0		string		\xEF\xBB\xBF
# starting with UTF-8 Byte Order Mark (BOM) followed by comment starting character
# NOTE(review): this test has no message and no nested continuation, and the
# search below is at the same level (">"), so as written it does not appear to
# influence the result - confirm whether the search was meant to be nested
# under it (">>3").
>3		string		\x23
# starting with UTF-8 BOM and with SET character command followed by whitespace
# like in: /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/lt.aff
# look for character SET command used in MySpell and Hunspell
>3		search/9883	SET\040
>>0		use				spell-aff
# look for FLAG type command used in MySpell and Hunspell
# Entry for files beginning with the keyword "FLAG"; the byte right after it
# (offset 4) must be a space or a tab before "spell-aff" is invoked.
0		string		FLAG
# followed by space character like in
# /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/en_US.aff
>4		ubyte		0x20
>>0		use				spell-aff
# or followed by tabulator character like in
# /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/ar.aff
>4		ubyte		0x09
>>0		use				spell-aff
# starting with character SET command used in MySpell and Hunspell like in: org/languagetool/resource/sv/hunspell/sv_SE.aff
# Entry for files beginning with "SET" plus one space (\040); hands over to
# "spell-aff" without any further check.
0		string		SET\040
>0		use				spell-aff
# starting with language code LANG used in MySpell and Hunspell like in: /usr/share/hunspell/tr_TR.aff
# Entry for files beginning with "LANG" plus one space (\040); hands over to
# "spell-aff" without any further check.
0		string		LANG\040
>0		use				spell-aff
# starting with affix flag command AF used in MySpell and Hunspell like in: /usr/lib/thunderbird/extensions/langpack-hu@thunderbird.mozilla.org/dictionaries/hu.aff
# Entry for files beginning with "AF" plus one space (\040).
0		string		AF\040
# look for number of flag vector aliases
# (offset 3 is the first byte after "AF "; one to four decimal digits expected)
>3		regex		[0-9]{1,4}
>>0		use				spell-aff
#	display information (encoding,language,...) about affixes rules text for Ispell/MySpell/Hunspell
#	display information (encoding,language,...) about affixes rules text for Ispell/MySpell/Hunspell
# Named routine, invoked through "use spell-aff" by the detection entries of
# this file.  It prints the type name, sets MIME type and extension, then
# appends the flavour (Ispell vs. MySpell/Hunspell), the language, the
# character set and the first few text lines.
0		name				spell-aff
# "x" always matches, so the description is printed unconditionally
>1		ubeshort	x		affix definition
#!:mime		text/plain
!:mime		text/x-affix
!:ext		aff
# GRR: need extra test so that default clause works
>0		ubyte		x
# look for ispell declaration
>>0		search/8251	defstringtype	for Ispell
# ispell variant without declaration
>>0		default		x
# look for ispell suffixes command
>>>0		search/3233	suffixes
# skip "suffixes used to create first part of a compound" by checking for flag argument like in: languagetool\resource\sv\hunspell\sv_SE.aff
>>>>&0		search/2	flag		for Ispell
>>>>&0		default		x		for MySpell/Hunspell
# without suffixes keyword
>>>0		default		x		for MySpell/Hunspell
# look for language code command used in MySpell and Hunspell
# like in: /usr/share/hunspell/de_AT.aff /usr/share/hunspell/it_IT.aff /usr/share/hunspell/tr_TR.aff /usr/lib/firefox/browser/extensions/langpack-hu@firefox.mozilla.org/dictionaries/hu.aff
>>0		search/1117643	LANG\040	\b, language
# language code argument like: de_DE hu_HU it_IT mn_MN tr_TR
>>>&0		string		x		%s
# look for character SET command used in MySpell and Hunspell
>>0		search/1117729	SET
# skip SETTINGS like in /usr/lib/ispell/ngerman.aff
# SET command followed often by space character (0x20) or tabulator (0x09) like in
# /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/ar.aff
# (0x20 and 0x09 have no bit in common with the mask 0xD6, so both pass; the
# mask also lets through 0x00, 0x01, 0x08, 0x21, 0x28 and 0x29)
>>>&0	ubyte&0xD6	=0x00
# skip SSET	#     schosS in /usr/lib/ispell/ogerman.aff
# (the byte after the whitespace must be greater than 0x48 'H')
>>>>&0		ubyte		>0x48		\b,
# character SET argument like: cp1251 ISCII-DEVANAGARI ISO8859-1 - ISO8859-10 ISO8859-13 - ISO8859-15 KOI8-R KOI8-U UTF-8
# ("&-1" steps back onto the byte just tested so the whole argument is printed)
>>>>>&-1	string	x			"%s" encoded
# for control reasons show first non empty lines for ASCII or ISO-8859 text variant
# (bytes 1-2 differ from 0xBB 0xBF, i.e. the file does not start with a UTF-8 BOM)
>1		ubeshort	!0xBBBF
# 1st line starting with 0x0A like in /usr/src/dicts/sjp-ispell-pl-20140213/polish.aff
>>0		ubyte		=0x0A
>>>1		ubyte		!0x0A		\b, 2nd line
>>>>&-1		string		x		"%s"
# 3rd line starting with 0x0A like in polish.aff
>>>>>&1		ubyte		=0x0A
>>>>>>&0	string		x		\b, 4th line "%s"
# 1st line starting with ASCII text like: 
# this is the affix file of the de_DE Hunspell dictionary
>>0		ubyte		!0x0A
>>>0		string		x		\b, 1st line "%s"
>>>>&1		ubyte		>0x1F		\b, 2nd line
>>>>>&-1	string		x		"%s"
# 2nd line starting with 0x0A like in /usr/lib/ispell/bulgarian.aff
>>>>&1		ubyte		=0x0A		\b, 3rd line
>>>>>&0		string		x		"%s"
# for control reasons show first lines for variant starting with ByteOrderMark (BOM=\xEF\xBB\xBF)
# (the text is read from offset 3, just past the three BOM bytes)
>1		ubeshort	=0xBBBF	   	\b, with BOM
>>3		string		x		\b, 1st line "%s"
>>>&1		ubyte		>0x1F		\b, 2nd line
>>>>&-1		string		x		"%s"

# From:		Joerg Jenderek
# URL:		https://en.wikipedia.org/wiki/GNU_Aspell
#		https://manpages.ubuntu.com/manpages/trusty/en/man8/aspell-autobuildhash.8.html
# Reference:	http://mark0.net/download/triddefs_xml.7z/defs/r/rws-aspell.trid.xml
#		https://ftp.gnu.org/gnu/aspell/aspell-0.60.8.tar.gz
#		aspell-0.60.8/modules/speller/default/data.cpp
#		aspell-0.60.8/modules/speller/default/readonly_ws.cpp
# Note:		called "aspell dictionary" by TrID
# Compiled aspell word list: matches the literal text
# "aspell default speller rowl" (\040 is a space) at offset 0.
0	string	aspell\040default\040speller\040rowl	aspell dictionary
#!:mime	application/octet-stream
!:mime	application/x-aspell-dictionary
!:ext	rws
# version like: 1.10 1.4
>28	string	x					\b, version %s
# u32int endian_check; 12345678=00BC614Eh
# (the tests below sit one level under the version line; the 32-bit value at
# offset 64 is compared with 12345678 in each byte order in turn)
#>64	ulelong	x					\b, endian_check=%u
>>64	ulelong	12345678				\b, little endian
# not tested
>>64	ubelong	12345678				\b, big endian
# older aspell version not like 0.60.8
# (reached only when neither byte-order test above matched)
>>64	default	x					\b, old
# URL:		https://en.wikipedia.org/wiki/GNU_Aspell
# Reference:	http://aspell.net/man-html/Format-of-the-Personal-and-Replacement-Dictionaries.html
# personal_ws-1.1 lang num [encoding]
# Text header beginning with "personal_"; the text at offset 9 (right after
# that prefix) selects between the personal word list ("ws-") and the
# personal replacement list ("repl-").
0	string	personal_				aspell personal
# Reference:	http://mark0.net/download/triddefs_xml.7z/defs/p/pws-aspell.trid.xml
# Note:		called "aspell Personal dictionary" by TrID
>9	string	ws-					dictionary
#!:mime	text/plain
!:mime	text/x-aspell-dictionary
# like: ~/.aspell.en.pws ~/.aspell.de_DE.pws ~/.aspell.it.pws
!:ext	pws
# Reference:	http://mark0.net/download/triddefs_xml.7z/defs/p/prepl-aspell.trid.xml
# Note:		called "aspell Personal Replacement dictionary" by TrID
# personal_repl-1.1 lang num [encoding]
>9	string	repl-					replacement dictionary
#!:mime	text/plain
!:mime	text/x-aspell-dictionary
# like: ~/.aspell.en.prepl ~/.aspell.de_DE.prepl ~/.aspell.it.prepl
!:ext	prepl