aboutsummaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorHiroki Sato <hrs@FreeBSD.org>2020-12-29 19:21:19 +0000
committerHiroki Sato <hrs@FreeBSD.org>2020-12-29 19:40:27 +0000
commit916806472a8a245e8f2ddfeea4a1db652879a6f6 (patch)
treef09d8f1c2511840bdb1ffea2aedc155597b21820 /tools
parentf3f16c31fea258b2b1ec51ddd1bceb6207b66198 (diff)
downloadsrc-916806472a8a245e8f2ddfeea4a1db652879a6f6.tar.gz
src-916806472a8a245e8f2ddfeea4a1db652879a6f6.zip
Fix generation of colldef source files for non-UTF-8 locales
- Files for colldef were generated by duplicating UTF-8 collation files for each language and included invalid characters in the non-UTF-8 encodings. localedef(1) does not allow those characters. cldr2def.pl now checks if the characters are valid based on charmap files. TODO: ja_JP.UTF-8 locale should not be generated solely from CLDR because it was standardized in a document "UI-OSF Application Platform Profile for Japanese Environment" which was incompatible with information in CLDR. Most commercial Unix vendors adopt this pre-Unicode-era document as the reference even for the UTF-8 locale. Newer versions of Solaris have added a CLDR version as ja_JP.UTF-8@cldr, and IBM AIX has used JA_JP.UTF-8 for the UI-OSF specification and ja_JP.UTF-8 for CLDR. Note that this commit does not change generation of ja_JP.UTF-8. Changes related to this issue will be committed separately later. - Generate POSIX charmap UTF-32 as a reference. It was confusing that charmaps.xml used Unicode names defined in UnicodeData.txt though POSIX charmap used slightly different names for the same code points. cldr2def.pl now uses UTF-32.cm as the single information source for Unicode symbol names and code points. charmaps.xml is also updated to use them. - Fix a bug in get_encodings() in cldr2def.pl which did not understand 0x00+0x00 notation correctly in charmaps/ISCII-DEV.TXT. - Do not regenerate posix/xx_Comm_C.UTF-8.src every time "make build" is run. Reviewed by: bapt Differential Revision: https://reviews.freebsd.org/D27809
Diffstat (limited to 'tools')
-rw-r--r--tools/tools/locale/Makefile6
-rw-r--r--tools/tools/locale/README9
-rw-r--r--tools/tools/locale/etc/charmaps.xml421
-rwxr-xr-xtools/tools/locale/tools/cldr2def.pl210
4 files changed, 363 insertions, 283 deletions
diff --git a/tools/tools/locale/Makefile b/tools/tools/locale/Makefile
index 27ff255d7f9a..92f890b2f4d3 100644
--- a/tools/tools/locale/Makefile
+++ b/tools/tools/locale/Makefile
@@ -168,7 +168,8 @@ ENCODINGS= Big5 \
KOI8-U \
SJIS \
US-ASCII \
- UTF-8
+ UTF-8 \
+ UTF-32
# CLDR files
CLDRFILES_CORE= https://unicode.org/Public/cldr/35/core.zip
@@ -211,9 +212,10 @@ ${UNIDIR}/posix:
ln -s -f ../posix ${.TARGET}
clean-posix:
rm -rf posix ${UNIDIR}/posix
-post-posixcm: ${UNIDIR}/posix
+${UNIDIR}/posix/xx_Comm_C.UTF-8.src: ${UNIDIR}/posix
perl -I ${TOOLSDIR} ${TOOLSDIR}/utf8-rollup.pl \
--unidir=${UNIDIR}
+post-posixcm: ${UNIDIR}/posix/xx_Comm_C.UTF-8.src
.for enc in ${ENCODINGS}
posixcm: build-tools posix/${enc}.cm
.ORDER: build-tools posix/${enc}.cm
diff --git a/tools/tools/locale/README b/tools/tools/locale/README
index 0b5ce24b51cd..380786929b7c 100644
--- a/tools/tools/locale/README
+++ b/tools/tools/locale/README
@@ -19,7 +19,7 @@ More details are as follows:
Variables:
LOCALESRCDIR
Destination path for the generated locale files.
- Default: $DESTDIR/usr/src/share.
+ Default: ${SRCTOP}/share.
TMPDIR
Temporary directory.
Default: /tmp
@@ -29,7 +29,12 @@ Targets:
Create a temporary directory for building.
make clean
- Clean up the obj directories.
+ Clean up the obj directories. Note that this does not
+ clean up tools or posix locale source files generated
+ from the CLDR files because it takes a long time to generate
+ them and they do not change as long as the same
+ CLDR files are used. "make clean && make build" will
+ regenerate the locale source files for src/share/*def.
make cleandir
Remove the obj directories completely.
diff --git a/tools/tools/locale/etc/charmaps.xml b/tools/tools/locale/etc/charmaps.xml
index 78a344d6929e..52e80f2dee05 100644
--- a/tools/tools/locale/etc/charmaps.xml
+++ b/tools/tools/locale/etc/charmaps.xml
@@ -195,395 +195,404 @@
</languages>
<translations>
+ <!--
+ encoding: Space-separated list of encodings
+ cldr: Symbol to be replaced with hex, string, unicode, or ucc.
+ The symbol name should be defined in posix/*.cm files.
+ string: raw code in string.
+ hex: raw code in hex.
+ unicode: Symbol name in Unicode.
+ ucc: Unicode code point in hex.
+ -->
<!-- These don't have a special Euro sign so just use Eu for it -->
- <translation encoding="ISO8859-1" cldr="EURO SIGN" string="Eu" />
- <translation encoding="ISO8859-2" cldr="EURO SIGN" string="Eu" />
- <translation encoding="ISO8859-4" cldr="EURO SIGN" string="Eu" />
- <translation encoding="ISO8859-13" cldr="EURO SIGN" string="Eu" />
+ <translation encoding="ISO8859-1" cldr="EURO_SIGN" string="Eu" />
+ <translation encoding="ISO8859-2" cldr="EURO_SIGN" string="Eu" />
+ <translation encoding="ISO8859-4" cldr="EURO_SIGN" string="Eu" />
+ <translation encoding="ISO8859-13" cldr="EURO_SIGN" string="Eu" />
<!-- Minus and dashes -->
<translation encoding="ISO8859-1 ISO8859-2 ISO8859-4 ISO8859-13 ISO8859-15"
- cldr="MINUS SIGN" unicode="HYPHEN-MINUS" />
+ cldr="MINUS_SIGN" unicode="HYPHEN-MINUS" />
<translation encoding="ISO8859-2"
- cldr="EN DASH" unicode="HYPHEN-MINUS" />
+ cldr="EN_DASH" unicode="HYPHEN-MINUS" />
<!-- Got these from http://www.decodeunicode.org/en/u+0400.
Where possible use the international or ISO translation!
-->
<translation encoding="ISO8859-2" ucc="0408"
- cldr="CYRILLIC CAPITAL LETTER JE"
- unicode="LATIN CAPITAL LETTER J" />
+ cldr="CYRILLIC_CAPITAL_LETTER_JE"
+ unicode="LATIN_CAPITAL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0458"
- cldr="CYRILLIC SMALL LETTER JE" unicode="LATIN SMALL LETTER J" />
+ cldr="CYRILLIC_SMALL_LETTER_JE" unicode="LATIN_SMALL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0409"
- cldr="CYRILLIC CAPITAL LETTER LJE" string="lj" />
+ cldr="CYRILLIC_CAPITAL_LETTER_LJE" string="lj" />
<translation encoding="ISO8859-2" ucc="0459"
- cldr="CYRILLIC SMALL LETTER LJE" string="lj" />
+ cldr="CYRILLIC_SMALL_LETTER_LJE" string="lj" />
<translation encoding="ISO8859-2" ucc="0410"
- cldr="CYRILLIC CAPITAL LETTER A" unicode="LATIN CAPITAL LETTER A" />
+ cldr="CYRILLIC_CAPITAL_LETTER_A" unicode="LATIN_CAPITAL_LETTER_A" />
<translation encoding="ISO8859-2" ucc="0430"
- cldr="CYRILLIC SMALL LETTER A" unicode="LATIN SMALL LETTER A" />
+ cldr="CYRILLIC_SMALL_LETTER_A" unicode="LATIN_SMALL_LETTER_A" />
<translation encoding="ISO8859-2" ucc="0411"
- cldr="CYRILLIC CAPITAL LETTER BE"
- unicode="LATIN CAPITAL LETTER B" />
+ cldr="CYRILLIC_CAPITAL_LETTER_BE"
+ unicode="LATIN_CAPITAL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0431"
- cldr="CYRILLIC SMALL LETTER BE" unicode="LATIN SMALL LETTER B" />
+ cldr="CYRILLIC_SMALL_LETTER_BE" unicode="LATIN_SMALL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0412"
- cldr="CYRILLIC CAPITAL LETTER VE"
- unicode="LATIN CAPITAL LETTER B" />
+ cldr="CYRILLIC_CAPITAL_LETTER_VE"
+ unicode="LATIN_CAPITAL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0432"
- cldr="CYRILLIC SMALL LETTER VE" unicode="LATIN SMALL LETTER B" />
+ cldr="CYRILLIC_SMALL_LETTER_VE" unicode="LATIN_SMALL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0413"
- cldr="CYRILLIC CAPITAL LETTER GHE"
- unicode="LATIN CAPITAL LETTER G" />
+ cldr="CYRILLIC_CAPITAL_LETTER_GHE"
+ unicode="LATIN_CAPITAL_LETTER_G" />
<translation encoding="ISO8859-2" ucc="0433"
- cldr="CYRILLIC SMALL LETTER GHE" unicode="LATIN SMALL LETTER G" />
+ cldr="CYRILLIC_SMALL_LETTER_GHE" unicode="LATIN_SMALL_LETTER_G" />
<translation encoding="ISO8859-2" ucc="0414"
- cldr="CYRILLIC CAPITAL LETTER DE" string="D" />
+ cldr="CYRILLIC_CAPITAL_LETTER_DE" string="D" />
<translation encoding="ISO8859-2" ucc="0434"
- cldr="CYRILLIC SMALL LETTER DE" string="d" />
+ cldr="CYRILLIC_SMALL_LETTER_DE" string="d" />
<translation encoding="ISO8859-2" ucc="0415"
- cldr="CYRILLIC CAPITAL LETTER IE"
- unicode="LATIN CAPITAL LETTER E" />
+ cldr="CYRILLIC_CAPITAL_LETTER_IE"
+ unicode="LATIN_CAPITAL_LETTER_E" />
<translation encoding="ISO8859-2" ucc="0435"
- cldr="CYRILLIC SMALL LETTER IE" unicode="LATIN SMALL LETTER E" />
+ cldr="CYRILLIC_SMALL_LETTER_IE" unicode="LATIN_SMALL_LETTER_E" />
<translation encoding="ISO8859-2" ucc="0416"
- cldr="CYRILLIC CAPITAL LETTER ZHE" string="ZH" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ZHE" string="ZH" />
<translation encoding="ISO8859-2" ucc="0436"
- cldr="CYRILLIC SMALL LETTER ZHE" string="zh" />
+ cldr="CYRILLIC_SMALL_LETTER_ZHE" string="zh" />
<translation encoding="ISO8859-2" ucc="0417"
- cldr="CYRILLIC CAPITAL LETTER ZE" string="z" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ZE" string="z" />
<translation encoding="ISO8859-2" ucc="0437"
- cldr="CYRILLIC SMALL LETTER ZE" string="z" />
+ cldr="CYRILLIC_SMALL_LETTER_ZE" string="z" />
<translation encoding="ISO8859-2" ucc="0418"
- cldr="CYRILLIC CAPITAL LETTER I" unicode="LATIN CAPITAL LETTER J" />
+ cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0438"
- cldr="CYRILLIC SMALL LETTER I" unicode="LATIN CAPITAL LETTER J" />
+ cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0419"
- cldr="CYRILLIC CAPITAL LETTER I" unicode="LATIN SMALL LETTER J" />
+ cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0439"
- cldr="CYRILLIC SMALL LETTER I" unicode="LATIN SMALL LETTER J" />
+ cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="041A"
- cldr="CYRILLIC CAPITAL LETTER KA"
- unicode="LATIN CAPITAL LETTER K" />
+ cldr="CYRILLIC_CAPITAL_LETTER_KA"
+ unicode="LATIN_CAPITAL_LETTER_K" />
<translation encoding="ISO8859-2" ucc="043A"
- cldr="CYRILLIC SMALL LETTER KA" unicode="LATIN SMALL LETTER K" />
+ cldr="CYRILLIC_SMALL_LETTER_KA" unicode="LATIN_SMALL_LETTER_K" />
<translation encoding="ISO8859-2" ucc="041B"
- cldr="CYRILLIC CAPITAL LETTER EL"
- unicode="LATIN CAPITAL LETTER L" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EL"
+ unicode="LATIN_CAPITAL_LETTER_L" />
<translation encoding="ISO8859-2" ucc="043B"
- cldr="CYRILLIC SMALL LETTER EL" unicode="LATIN SMALL LETTER L" />
+ cldr="CYRILLIC_SMALL_LETTER_EL" unicode="LATIN_SMALL_LETTER_L" />
<translation encoding="ISO8859-2" ucc="041C"
- cldr="CYRILLIC CAPITAL LETTER EM"
- unicode="LATIN CAPITAL LETTER M" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EM"
+ unicode="LATIN_CAPITAL_LETTER_M" />
<translation encoding="ISO8859-2" ucc="043C"
- cldr="CYRILLIC SMALL LETTER EM" unicode="LATIN SMALL LETTER M" />
+ cldr="CYRILLIC_SMALL_LETTER_EM" unicode="LATIN_SMALL_LETTER_M" />
<translation encoding="ISO8859-2" ucc="041D"
- cldr="CYRILLIC CAPITAL LETTER EN"
- unicode="LATIN CAPITAL LETTER H" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EN"
+ unicode="LATIN_CAPITAL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="043D"
- cldr="CYRILLIC SMALL LETTER EN" unicode="LATIN SMALL LETTER H" />
+ cldr="CYRILLIC_SMALL_LETTER_EN" unicode="LATIN_SMALL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="041E"
- cldr="CYRILLIC CAPITAL LETTER O" unicode="LATIN CAPITAL LETTER O" />
+ cldr="CYRILLIC_CAPITAL_LETTER_O" unicode="LATIN_CAPITAL_LETTER_O" />
<translation encoding="ISO8859-2" ucc="043E"
- cldr="CYRILLIC SMALL LETTER O" unicode="LATIN SMALL LETTER O" />
+ cldr="CYRILLIC_SMALL_LETTER_O" unicode="LATIN_SMALL_LETTER_O" />
<translation encoding="ISO8859-2" ucc="041F"
- cldr="CYRILLIC CAPITAL LETTER PE"
- unicode="LATIN CAPITAL LETTER P" />
+ cldr="CYRILLIC_CAPITAL_LETTER_PE"
+ unicode="LATIN_CAPITAL_LETTER_P" />
<translation encoding="ISO8859-2" ucc="043F"
- cldr="CYRILLIC SMALL LETTER PE" unicode="LATIN SMALL LETTER P" />
+ cldr="CYRILLIC_SMALL_LETTER_PE" unicode="LATIN_SMALL_LETTER_P" />
<translation encoding="ISO8859-2" ucc="0420"
- cldr="CYRILLIC CAPITAL LETTER ER"
- unicode="LATIN CAPITAL LETTER R" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ER"
+ unicode="LATIN_CAPITAL_LETTER_R" />
<translation encoding="ISO8859-2" ucc="0440"
- cldr="CYRILLIC SMALL LETTER ER" unicode="LATIN SMALL LETTER R" />
+ cldr="CYRILLIC_SMALL_LETTER_ER" unicode="LATIN_SMALL_LETTER_R" />
<translation encoding="ISO8859-2" ucc="0421"
- cldr="CYRILLIC CAPITAL LETTER ES"
- unicode="LATIN CAPITAL LETTER C" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ES"
+ unicode="LATIN_CAPITAL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0441"
- cldr="CYRILLIC SMALL LETTER ES" unicode="LATIN SMALL LETTER C" />
+ cldr="CYRILLIC_SMALL_LETTER_ES" unicode="LATIN_SMALL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0422"
- cldr="CYRILLIC CAPITAL LETTER TE"
- unicode="LATIN CAPITAL LETTER T" />
+ cldr="CYRILLIC_CAPITAL_LETTER_TE"
+ unicode="LATIN_CAPITAL_LETTER_T" />
<translation encoding="ISO8859-2" ucc="0442"
- cldr="CYRILLIC SMALL LETTER TE" unicode="LATIN SMALL LETTER T" />
+ cldr="CYRILLIC_SMALL_LETTER_TE" unicode="LATIN_SMALL_LETTER_T" />
<translation encoding="ISO8859-2" ucc="0423"
- cldr="CYRILLIC CAPITAL LETTER U" unicode="LATIN CAPITAL LETTER U" />
+ cldr="CYRILLIC_CAPITAL_LETTER_U" unicode="LATIN_CAPITAL_LETTER_U" />
<translation encoding="ISO8859-2" ucc="0443"
- cldr="CYRILLIC SMALL LETTER U" unicode="LATIN SMALL LETTER U" />
+ cldr="CYRILLIC_SMALL_LETTER_U" unicode="LATIN_SMALL_LETTER_U" />
<translation encoding="ISO8859-2" ucc="0424"
- cldr="CYRILLIC CAPITAL LETTER EF"
- unicode="LATIN CAPITAL LETTER F" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EF"
+ unicode="LATIN_CAPITAL_LETTER_F" />
<translation encoding="ISO8859-2" ucc="0444"
- cldr="CYRILLIC SMALL LETTER EF" unicode="LATIN SMALL LETTER F" />
+ cldr="CYRILLIC_SMALL_LETTER_EF" unicode="LATIN_SMALL_LETTER_F" />
<translation encoding="ISO8859-2" ucc="0425"
- cldr="CYRILLIC CAPITAL LETTER HA"
- unicode="LATIN CAPITAL LETTER H" />
+ cldr="CYRILLIC_CAPITAL_LETTER_HA"
+ unicode="LATIN_CAPITAL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="0445"
- cldr="CYRILLIC SMALL LETTER HA" unicode="LATIN SMALL LETTER H" />
+ cldr="CYRILLIC_SMALL_LETTER_HA" unicode="LATIN_SMALL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="0426"
- cldr="CYRILLIC CAPITAL LETTER TSE"
- unicode="LATIN CAPITAL LETTER C" />
+ cldr="CYRILLIC_CAPITAL_LETTER_TSE"
+ unicode="LATIN_CAPITAL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0446"
- cldr="CYRILLIC SMALL LETTER TSE" unicode="LATIN SMALL LETTER C" />
+ cldr="CYRILLIC_SMALL_LETTER_TSE" unicode="LATIN_SMALL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0427"
- cldr="CYRILLIC CAPITAL LETTER CHE"
- unicode="LATIN CAPITAL LETTER C WITH CARON" />
+ cldr="CYRILLIC_CAPITAL_LETTER_CHE"
+ unicode="LATIN_CAPITAL_LETTER_C_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0447"
- cldr="CYRILLIC SMALL LETTER CHE"
- unicode="LATIN SMALL LETTER C WITH CARON" />
+ cldr="CYRILLIC_SMALL_LETTER_CHE"
+ unicode="LATIN_SMALL_LETTER_C_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0428"
- cldr="CYRILLIC CAPITAL LETTER SHA"
- unicode="LATIN CAPITAL LETTER S WITH CARON" />
+ cldr="CYRILLIC_CAPITAL_LETTER_SHA"
+ unicode="LATIN_CAPITAL_LETTER_S_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0448"
- cldr="CYRILLIC SMALL LETTER SHA"
- unicode="LATIN SMALL LETTER S WITH CARON" />
+ cldr="CYRILLIC_SMALL_LETTER_SHA"
+ unicode="LATIN_SMALL_LETTER_S_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0429"
- cldr="CYRILLIC CAPITAL LETTER SHCHA"
- unicode="LATIN CAPITAL LETTER S WITH CIRCUMFLEX" />
+ cldr="CYRILLIC_CAPITAL_LETTER_SHCHA"
+ unicode="LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2" ucc="0449"
- cldr="CYRILLIC SMALL LETTER SHCHA"
- unicode="LATIN SMALL LETTER S WITH CIRCUMFLEX" />
+ cldr="CYRILLIC_SMALL_LETTER_SHCHA"
+ unicode="LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2" ucc="042A"
- cldr="?CYRILLIC CAPITAL LETTER HARD SIGN" unicode="?" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_HARD_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="044A"
- cldr="?CYRILLIC SMALL LETTER HARD SIGN" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_HARD_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="042B"
- cldr="?CYRILLIC CAPITAL LETTER YERU" unicode="?" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_YERU" unicode="?" />
<translation encoding="ISO8859-2" ucc="044B"
- cldr="?CYRILLIC SMALL LETTER YERU" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_YERU" unicode="?" />
<translation encoding="ISO8859-2" ucc="042C"
- cldr="?CYRILLIC CAPITAL LETTER SOFT SIGN" unicode="?" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_SOFT_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="044C"
- cldr="?CYRILLIC SMALL LETTER SOFT SIGN" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_SOFT_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="042D"
- cldr="CYRILLIC CAPITAL LETTER E"
- unicode="LATIN CAPITAL LETTER E WITH GRAVE" />
+ cldr="CYRILLIC_CAPITAL_LETTER_E"
+ unicode="LATIN_CAPITAL_LETTER_E_WITH_GRAVE" />
<translation encoding="ISO8859-2" ucc="044D"
- cldr="CYRILLIC SMALL LETTER E"
- unicode="LATIN SMALL LETTER E WITH GRAVE" />
+ cldr="CYRILLIC_SMALL_LETTER_E"
+ unicode="LATIN_SMALL_LETTER_E_WITH_GRAVE" />
<translation encoding="ISO8859-2" ucc="042E"
- cldr="?CYRILLIC CAPITAL LETTER YU" unicode="?" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_YU" unicode="?" />
<translation encoding="ISO8859-2" ucc="044E"
- cldr="?CYRILLIC SMALL LETTER YU" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_YU" unicode="?" />
<translation encoding="ISO8859-2" ucc="042F"
- cldr="CYRILLIC CAPITAL LETTER YA"
- unicode="LATIN CAPITAL LETTER A WITH CIRCUMFLEX" />
+ cldr="CYRILLIC_CAPITAL_LETTER_YA"
+ unicode="LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2" ucc="044F"
- cldr="CYRILLIC SMALL LETTER YA"
- unicode="LATIN SMALL LETTER A WITH CIRCUMFLEX" />
+ cldr="CYRILLIC_SMALL_LETTER_YA"
+ unicode="LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2"
- cldr="LATIN SMALL LETTER T WITH COMMA BELOW"
- unicode="LATIN SMALL LETTER T" />
+ cldr="LATIN_SMALL_LETTER_T_WITH_COMMA_BELOW"
+ unicode="LATIN_SMALL_LETTER_T" />
<translation encoding="ISO8859-5"
- cldr="MODIFIER LETTER APOSTROPHE" unicode="APOSTROPHE" />
+ cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
<translation encoding="ISO8859-5"
- cldr="LATIN SMALL LETTER C WITH CARON"
- unicode="LATIN SMALL LETTER C" />
+ cldr="LATIN_SMALL_LETTER_C_WITH_CARON"
+ unicode="LATIN_SMALL_LETTER_C" />
<translation encoding="KOI8-U"
- cldr="MODIFIER LETTER APOSTROPHE" unicode="APOSTROPHE" />
+ cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
<translation encoding="CP1251"
- cldr="MODIFIER LETTER APOSTROPHE" unicode="APOSTROPHE" />
+ cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
<!-- Copied from the original FreeBSD src/share/monetdef -->
- <translation encoding="CP1251" cldr="HRYVNIA SIGN" hex="E3F0ED" />
- <translation encoding="ISO8859-5" cldr="HRYVNIA SIGN" hex="D3E0DD" />
- <translation encoding="KOI8-U" cldr="HRYVNIA SIGN" hex="C7D2CE" />
- <translation encoding="CP866" cldr="RUBLE SIGN" hex="E0E3A1" />
- <translation encoding="ISO8859-5" cldr="RUBLE SIGN" hex="E0E3D1" />
- <translation encoding="CP1251" cldr="RUBLE SIGN" hex="E0E3D1" />
- <translation encoding="KOI8-R" cldr="RUBLE SIGN" hex="D2D5C2" />
+ <translation encoding="CP1251" cldr="HRYVNIA_SIGN" hex="E3F0ED" />
+ <translation encoding="ISO8859-5" cldr="HRYVNIA_SIGN" hex="D3E0DD" />
+ <translation encoding="KOI8-U" cldr="HRYVNIA_SIGN" hex="C7D2CE" />
+ <translation encoding="CP866" cldr="RUBLE_SIGN" hex="E0E3A1" />
+ <translation encoding="ISO8859-5" cldr="RUBLE_SIGN" hex="E0E3D1" />
+ <translation encoding="CP1251" cldr="RUBLE_SIGN" hex="E0E3D1" />
+ <translation encoding="KOI8-R" cldr="RUBLE_SIGN" hex="D2D5C2" />
<!-- These don't have a special Won sign so just use KRW for it -->
- <translation encoding="CP949" cldr="WON SIGN" hex="5C" />
- <translation encoding="eucKR" cldr="WON SIGN" hex="5C" />
+ <translation encoding="CP949" cldr="WON_SIGN" hex="5C" />
+ <translation encoding="eucKR" cldr="WON_SIGN" hex="5C" />
<!-- Asian characters -->
<translation encoding="GB2312 eucCN" cldr="C"
- unicode="FULLWIDTH LATIN CAPITAL LETTER C" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_C" />
<translation encoding="Big5" cldr="D"
- unicode="FULLWIDTH LATIN CAPITAL LETTER D" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_D" />
<translation encoding="GB2312 eucCN Big5" cldr="N"
- unicode="FULLWIDTH LATIN CAPITAL LETTER N" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_N" />
<translation encoding="Big5" cldr="T"
- unicode="FULLWIDTH LATIN CAPITAL LETTER T" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_T" />
<translation encoding="Big5" cldr="W"
- unicode="FULLWIDTH LATIN CAPITAL LETTER W" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_W" />
<translation encoding="GB2312 eucCN" cldr="Y"
- unicode="FULLWIDTH LATIN CAPITAL LETTER Y" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_Y" />
<translation encoding="GB2312 Big5 eucCN" cldr="one"
- unicode="FULLWIDTH DIGIT ONE" />
+ unicode="FULLWIDTH_DIGIT_ONE" />
<translation encoding="GB2312 Big5 eucCN" cldr="two"
- unicode="FULLWIDTH DIGIT TWO" />
+ unicode="FULLWIDTH_DIGIT_TWO" />
<translation encoding="GB2312 Big5 eucCN" cldr="three"
- unicode="FULLWIDTH DIGIT THREE" />
+ unicode="FULLWIDTH_DIGIT_THREE" />
<translation encoding="GB2312 Big5 eucCN" cldr="four"
- unicode="FULLWIDTH DIGIT FOUR" />
+ unicode="FULLWIDTH_DIGIT_FOUR" />
<translation encoding="GB2312 Big5 eucCN" cldr="five"
- unicode="FULLWIDTH DIGIT FIVE" />
+ unicode="FULLWIDTH_DIGIT_FIVE" />
<translation encoding="GB2312 Big5 eucCN" cldr="six"
- unicode="FULLWIDTH DIGIT SIX" />
+ unicode="FULLWIDTH_DIGIT_SIX" />
<translation encoding="GB2312 Big5 eucCN" cldr="seven"
- unicode="FULLWIDTH DIGIT SEVEN" />
+ unicode="FULLWIDTH_DIGIT_SEVEN" />
<translation encoding="GB2312 Big5 eucCN" cldr="eight"
- unicode="FULLWIDTH DIGIT EIGHT" />
+ unicode="FULLWIDTH_DIGIT_EIGHT" />
<translation encoding="GB2312 Big5 eucCN" cldr="nine"
- unicode="FULLWIDTH DIGIT NINE" />
+ unicode="FULLWIDTH_DIGIT_NINE" />
<translation encoding="GB2312 Big5 eucCN" cldr="zero"
- unicode="FULLWIDTH DIGIT ZERO" />
+ unicode="FULLWIDTH_DIGIT_ZERO" />
<translation encoding="GB2312 eucCN Big5" cldr="space"
- unicode="IDEOGRAPHIC SPACE" />
- <translation encoding="GB2312 eucCN Big5" cldr="FULL STOP"
- unicode="FULLWIDTH FULL STOP" />
+ unicode="IDEOGRAPHIC_SPACE" />
+ <translation encoding="GB2312 eucCN Big5" cldr="FULL_STOP"
+ unicode="FULLWIDTH_FULL_STOP" />
<translation encoding="GB2312 eucCN Big5" cldr="SOLIDUS"
- unicode="FULLWIDTH SOLIDUS" />
+ unicode="FULLWIDTH_SOLIDUS" />
<translation encoding="GB2312 eucCN Big5" cldr="COMMA"
- unicode="FULLWIDTH COMMA" />
+ unicode="FULLWIDTH_COMMA" />
<translation encoding="GB2312 eucCN Big5" cldr="HYPHEN-MINUS"
- unicode="FULLWIDTH HYPHEN-MINUS" />
- <translation encoding="Big5" cldr="DOLLAR SIGN"
- unicode="FULLWIDTH DOLLAR SIGN" />
+ unicode="FULLWIDTH_HYPHEN-MINUS" />
+ <translation encoding="Big5" cldr="DOLLAR_SIGN"
+ unicode="FULLWIDTH_DOLLAR_SIGN" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E00" ucc="4E00" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E00" ucc="4E00" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E03" ucc="4E03" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E03" ucc="4E03" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E09" ucc="4E09" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E09" ucc="4E09" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E0A" ucc="4E0A" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E0A" ucc="4E0A" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E0B" ucc="4E0B" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E0B" ucc="4E0B" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E0D" ucc="4E0D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E0D" ucc="4E0D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E5D" ucc="4E5D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E5D" ucc="4E5D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E8C" ucc="4E8C" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E8C" ucc="4E8C" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-4E94" ucc="4E94" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E94" ucc="4E94" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-516B" ucc="516B" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-516B" ucc="516B" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-516D" ucc="516D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-516D" ucc="516D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-5206" ucc="5206" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5206" ucc="5206" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-524D" ucc="524D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-524D" ucc="524D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-5341" ucc="5341" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5341" ucc="5341" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-5348" ucc="5348" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5348" ucc="5348" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-5426" ucc="5426" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5426" ucc="5426" />
<translation encoding="GB2312 GB18030 GBK eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-5468" ucc="5468" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5468" ucc="5468" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-56DB" ucc="56DB" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-56DB" ucc="56DB" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-571F" ucc="571F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-571F" ucc="571F" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-5B9A" ucc="5B9A" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5B9A" ucc="5B9A" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-5E74" ucc="5E74" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5E74" ucc="5E74" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-5F8C" ucc="5F8C" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5F8C" ucc="5F8C" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-65E5" ucc="65E5" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-65E5" ucc="65E5" />
<translation encoding="GB2312 GB18030 GBK eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-65F6" ucc="65F6" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-65F6" ucc="65F6" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-661F" ucc="661F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-661F" ucc="661F" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-662F" ucc="662F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-662F" ucc="662F" />
<translation encoding="Big5 "
- cldr="CJK UNIFIED IDEOGRAPH-6642" ucc="6642" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6642" ucc="6642" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-66DC" ucc="66DC" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-66DC" ucc="66DC" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-6708" ucc="6708" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6708" ucc="6708" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-671F" ucc="671F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-671F" ucc="671F" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-6728" ucc="6728" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6728" ucc="6728" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-6C34" ucc="6C34" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6C34" ucc="6C34" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-706B" ucc="706B" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-706B" ucc="706B" />
<translation encoding="GB2312 GB18030 GBK eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-786E" ucc="786E" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-786E" ucc="786E" />
<translation encoding="Big5 "
- cldr="CJK UNIFIED IDEOGRAPH-78BA" ucc="78BA" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-78BA" ucc="78BA" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
- cldr="CJK UNIFIED IDEOGRAPH-79D2" ucc="79D2" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-79D2" ucc="79D2" />
<translation encoding="Big5 "
- cldr="CJK UNIFIED IDEOGRAPH-9031" ucc="9031" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-9031" ucc="9031" />
<translation encoding="eucJP SJIS"
- cldr="CJK UNIFIED IDEOGRAPH-91D1" ucc="91D1" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-91D1" ucc="91D1" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE GEUM" ucc="AE08" />
+ cldr="HANGUL_SYLLABLE_GEUM" ucc="AE08" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE NYEON" ucc="B144" />
+ cldr="HANGUL_SYLLABLE_NYEON" ucc="B144" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE NI" ucc="B2C8" />
+ cldr="HANGUL_SYLLABLE_NI" ucc="B2C8" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE MOG" ucc="BAA9" />
+ cldr="HANGUL_SYLLABLE_MOG" ucc="BAA9" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE BUN" ucc="BD84" />
+ cldr="HANGUL_SYLLABLE_BUN" ucc="BD84" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE SU" ucc="C218" />
+ cldr="HANGUL_SYLLABLE_SU" ucc="C218" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE SI" ucc="C2DC" />
+ cldr="HANGUL_SYLLABLE_SI" ucc="C2DC" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE A" ucc="C544" />
+ cldr="HANGUL_SYLLABLE_A" ucc="C544" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE YE" ucc="C608" />
+ cldr="HANGUL_SYLLABLE_YE" ucc="C608" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE O" ucc="C624" />
+ cldr="HANGUL_SYLLABLE_O" ucc="C624" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE YO" ucc="C694" />
+ cldr="HANGUL_SYLLABLE_YO" ucc="C694" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE WEOL" ucc="C6D4" />
+ cldr="HANGUL_SYLLABLE_WEOL" ucc="C6D4" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE IL" ucc="C77C" />
+ cldr="HANGUL_SYLLABLE_IL" ucc="C77C" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE JEON" ucc="C804" />
+ cldr="HANGUL_SYLLABLE_JEON" ucc="C804" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE CO" ucc="CD08" />
+ cldr="HANGUL_SYLLABLE_CO" ucc="CD08" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE TO" ucc="D1A0" />
+ cldr="HANGUL_SYLLABLE_TO" ucc="D1A0" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE HWA" ucc="D654" />
+ cldr="HANGUL_SYLLABLE_HWA" ucc="D654" />
<translation encoding="eucKR"
- cldr="HANGUL SYLLABLE HU" ucc="D6C4" />
+ cldr="HANGUL_SYLLABLE_HU" ucc="D6C4" />
<translation encoding="ARMSCII-8"
- cldr="ONE DOT LEADER" unicode="FULL STOP" />
+ cldr="ONE_DOT_LEADER" unicode="FULL_STOP" />
- <translation encoding="US-ASCII" cldr="POUND SIGN" string="GBP" />
+ <translation encoding="US-ASCII" cldr="POUND_SIGN" string="GBP" />
<translation encoding="US-ASCII"
- cldr="NO-BREAK SPACE" unicode="SPACE" />
+ cldr="NO-BREAK_SPACE" unicode="SPACE" />
<translation encoding="ISO8859-1 ISO8859-15"
- cldr="NARROW NO-BREAK SPACE" unicode="NO-BREAK SPACE" />
+ cldr="NARROW_NO-BREAK_SPACE" unicode="NO-BREAK_SPACE" />
<!-- punctuation and currency -->
<translation encoding="ISO8859-1 ISO8859-15"
- cldr="RIGHT SINGLE QUOTATION MARK" unicode="APOSTROPHE" />
+ cldr="RIGHT_SINGLE_QUOTATION_MARK" unicode="APOSTROPHE" />
- <translation encoding="ISCII-DEV" cldr="INDIAN RUPEE SIGN" hex="FC" />
- <translation encoding="ISO8859-1" cldr="PESO SIGN" hex="A4" />
- <translation encoding="ISO8859-1" cldr="COLON SIGN" hex="A4" />
- <translation encoding="ARMSCII-8" cldr="ARMENIAN DRAM SIGN"
+ <translation encoding="ISCII-DEV" cldr="INDIAN_RUPEE_SIGN" hex="FC" />
+ <translation encoding="ISO8859-1" cldr="PESO_SIGN" hex="A4" />
+ <translation encoding="ISO8859-1" cldr="COLON_SIGN" hex="A4" />
+ <translation encoding="ARMSCII-8" cldr="ARMENIAN_DRAM_SIGN"
hex="B9F12E" />
- <translation encoding="ISO8859-9" cldr="TURKISH LIRA SIGN"
+ <translation encoding="ISO8859-9" cldr="TURKISH_LIRA_SIGN"
string="TL" />
</translations>
diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl
index 8617ca81ca40..fd475db714a0 100755
--- a/tools/tools/locale/tools/cldr2def.pl
+++ b/tools/tools/locale/tools/cldr2def.pl
@@ -4,6 +4,7 @@
#
# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
# Copyright 2015 John Marino <draco@marino.st>
+# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -38,7 +39,6 @@ use Getopt::Long;
use Digest::SHA qw(sha1_hex);
require "charmaps.pm";
-
if ($#ARGV < 2) {
print "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";
exit(1);
@@ -69,10 +69,11 @@ my %encodings = ();
my %alternativemonths = ();
get_languages();
-my %utf8map = ();
-my %utf8aliases = ();
-get_unidata($UNIDIR);
-get_utf8map("$UNIDIR/posix/$DEFENCODING.cm");
+my %utfmap = ();
+$utfmap{'UTF-8'} = {};
+$utfmap{'UTF-32'} = {};
+get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
+get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});
get_encodings("$ETCDIR/charmaps");
my %keys = ();
@@ -334,25 +335,8 @@ sub callback_abmon {
############################
-sub get_unidata {
- my $directory = shift;
-
- open(FIN, "$directory/UnicodeData.txt")
- or die("Cannot open $directory/UnicodeData.txt");;
- my @lines = <FIN>;
- chomp(@lines);
- close(FIN);
-
- foreach my $l (@lines) {
- my @a = split(/;/, $l);
-
- $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name
- $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code
- }
-}
-
-sub get_utf8map {
- my $file = shift;
+sub get_utfmap {
+ my ($file, $db) = @_;
open(FIN, $file);
my @lines = <FIN>;
@@ -363,7 +347,7 @@ sub get_utf8map {
my $prev_v = "";
my $incharmap = 0;
foreach my $l (@lines) {
- $l =~ s/\r//;
+ chomp($l);
next if ($l =~ /^\#/);
next if ($l eq "");
@@ -378,17 +362,28 @@ sub get_utf8map {
$l =~ /^<([^\s]+)>\s+(.*)/;
my $k = $1;
my $v = $2;
- $k =~ s/_/ /g; # unicode char string
$v =~ s/\\x//g; # UTF-8 char code
- $utf8map{$k} = $v;
+ $db->{$k} = $v;
+# print STDERR "UTF $k = $v\n";
- $utf8aliases{$k} = $prev_k if ($prev_v eq $v);
+ # XXX: no longer needed
+ # $db_alias->{$k} = $prev_k if ($prev_v eq $v);
$prev_v = $v;
$prev_k = $k;
}
}
+sub resolve_enc_addition {
+ my $ret = '';
+
+ foreach my $t (split(/\+/, $_[0])) {
+ $t =~ s/^0[xX]//;
+ $ret .= $t;
+ }
+ return $ret;
+}
+
sub get_encodings {
my $dir = shift;
foreach my $e (sort(keys(%encodings))) {
@@ -403,14 +398,20 @@ sub get_encodings {
chomp(@lines);
foreach my $l (@lines) {
$l =~ s/\r//;
- next if ($l =~ /^\#/);
next if ($l eq "");
my @a = split(" ", $l);
next if ($#a < 1);
- $a[0] =~ s/^0[xX]//; # local char code
- $a[1] =~ s/^0[xX]//; # unicode char code
- $convertors{$e}{uc($a[1])} = uc($a[0]);
+ next if ($a[0] =~ /^\#/ or $a[1] =~ /^\#/);
+ next if ($a[0] eq '' or $a[1] eq '');
+
+ $a[0] = resolve_enc_addition($a[0]); # local
+ $a[1] = resolve_enc_addition($a[1]); # UTF-32
+ my $u32 = sprintf("%08X", hex($a[1]));
+# print STDERR "$a[1] => $u32\n";
+
+ # Use UTF-32 as the indices.
+ $convertors{$e}{$u32} = uc($a[0]);
}
}
}
@@ -565,8 +566,75 @@ EOF
foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
next if ($enc eq $DEFENCODING);
- copy ("$TYPE.draft/$actfile.$DEFENCODING.src",
- "$TYPE.draft/$actfile.$enc.src");
+
+ open FIN, "<$TYPE.draft/$actfile.$DEFENCODING.src";
+ open FOUT, ">$TYPE.draft/$actfile.$enc.src";
+ my $order_start = 0;
+ my $print_p = 0;
+ #
+ # %c_elem: collation elements
+ #
+ # undef: not defined
+ # 1: defined
+ # 2: invalid in this encoding
+ #
+ my %c_elem = ();
+ while (<FIN>) { # XXX: this loop should be refactored.
+ chomp;
+ $print_p = 1;
+ if ($order_start) {
+ $order_start = 0 if (m/^order_end/);
+ if (m/^<([^>]+)>/) {
+ if (not defined $c_elem{$1}) {
+# print STDERR "$1:\n";
+
+ my $u32 = $utfmap{'UTF-32'}->{$1};
+ die "order, $1\n" if (not defined $u32);
+# print STDERR "u32 for $1 = $u32\n";
+ if (not defined $convertors{$enc}{$u32}) {
+# print STDERR "$1 - $u32 not defined in $enc\n";
+ $print_p = 0;
+ }
+ } elsif ($c_elem{$1} == 2) {
+# print STDERR "$1 is marked as invalid in $enc\n";
+ $print_p = 0;
+ }
+ }
+ } elsif (m/^collating-element/) {
+ my ($elem, $l);
+ if (m/<([^>]+)> from (.+)/) {
+ ($elem, $l) = ($1, $2);
+ }
+# print STDERR "$elem: enter ($print_p, $l,)\n";
+ while ($print_p and
+ defined $l and
+ $l =~ m/<([^>]+)>/g) {
+# print STDERR "$elem: $1\n";
+ my $u32 = $utfmap{'UTF-32'}->{$1};
+ die "collating-element, $1\n" if (not defined $u32);
+# print STDERR "u32 for $1 = $u32\n";
+ if (not $convertors{$enc}{$u32}) {
+# print STDERR "$1 - $u32 not defined in $enc\n";
+ $print_p = 0;
+# print STDERR "Mark $elem as invalid\n";
+ $c_elem{$elem} = 2;
+ }
+ }
+ if ($print_p) {
+# print STDERR "Add $elem\n";
+ $c_elem{$elem} = 1;
+ }
+ } elsif (m/^collating-symbol <([^>]+)>/) {
+# print STDERR "Add $1\n";
+ $c_elem{$1} = 1;
+ } elsif (m/^order_start/) {
+ $order_start = 1;
+ # do nothing
+ }
+ print FOUT $_, "\n" if ($print_p);
+ }
+ close FOUT;
+ close FIN;
$languages{$l}{$f}{data}{$c}{$enc} = $shex;
$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
}
@@ -626,11 +694,11 @@ sub get_fields {
$continue = ($line =~ /\/$/);
$line =~ s/\/$// if ($continue);
- while ($line =~ /_/) {
- $line =~
- s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
- }
- die "_ in data - $line" if ($line =~ /_/);
+# while ($line =~ /_/) {
+# $line =~
+# s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
+# }
+# die "_ in data - $line" if ($line =~ /_/);
$values{$l}{$f}{$c}{$k} .= $line;
last if (!$continue);
@@ -652,56 +720,52 @@ sub decodecldr {
# Conversion to UTF-8 can be done from the Unicode name to
# the UTF-8 character code.
#
- $v = $utf8map{$s};
+ $v = $utfmap{'UTF-8'}->{$s};
die "Cannot convert $s in $e (charmap)" if (!defined $v);
} else {
#
# Conversion to these encodings can be done from the Unicode
# name to Unicode code to the encodings code.
#
- my $ucc = undef;
- $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
- $ucc = $ucd{name2code}{$utf8aliases{$s}}
- if (!defined $ucc
- && $utf8aliases{$s}
- && defined $ucd{name2code}{$utf8aliases{$s}});
-
- if (!defined $ucc) {
- if (defined $translations{$e}{$s}{hex}) {
- $v = $translations{$e}{$s}{hex};
- $ucc = 0;
- } elsif (defined $translations{$e}{$s}{ucc}) {
- $ucc = $translations{$e}{$s}{ucc};
+ # hex - hex or string attr
+ # unicode - unicode attr
+ # ucc - ucc attr
+ my $hex = $translations{$e}{$s}{hex};
+ my $ucc = $utfmap{'UTF-32'}->{$s};
+ my $ucc_attr = $translations{$e}{$s}{ucc};
+ my $unicode = $translations{$e}{$s}{unicode};
+
+ if (defined $hex) { # hex is in local encoding
+ $v = $hex;
+ } elsif (defined $unicode) { # unicode is in name
+ $v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
+ } elsif (defined $ucc_attr) { # ucc is in code point
+ if (defined $ucc) {
+# print STDERR "INFO: ucc=$ucc_attr ",
+# "overrides $ucc in UTF-32\n";
}
- }
-
- die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
- $v = $convertors{$e}{$ucc} if (!defined $v);
-
- $v = $translations{$e}{$s}{hex}
- if (!defined $v && defined $translations{$e}{$s}{hex});
-
- if (!defined $v && defined $translations{$e}{$s}{unicode}) {
- my $ucn = $translations{$e}{$s}{unicode};
- $ucc = $ucd{name2code}{$ucn}
- if (defined $ucd{name2code}{$ucn});
- $ucc = $ucd{name2code}{$utf8aliases{$ucn}}
- if (!defined $ucc
- && defined $ucd{name2code}{$utf8aliases{$ucn}});
+ # normalize
+ $ucc_attr = sprintf("%08X", hex($ucc_attr));
+# print STDERR "convert $ucc_attr into $e\n";
+ $v = $convertors{$e}{$ucc_attr};
+ } elsif (defined $ucc) {
+ # normalize
+ $ucc = sprintf("%08X", hex($ucc));
+# print STDERR "convert $ucc into $e\n";
$v = $convertors{$e}{$ucc};
}
-
- die "Cannot convert $s in $e (charmap)" if (!defined $v);
+ die "Cannot convert $s in $e" if (!defined $v);
}
+ # XXX: length = 8 is not supported yet.
+ $v =~ s/^[0]+//g;
+ $v = "0" . $v if (length($v) % 2);
return pack("C", hex($v)) if (length($v) == 2);
return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
if (length($v) == 4);
return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
hex(substr($v, 4, 2))) if (length($v) == 6);
- print STDERR "Cannot convert $e $s\n";
- return "length = " . length($v);
-
+ die "Cannot convert $s in $e (length = " . length($v) . "\n";
}
sub translate {