aboutsummaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorBaptiste Daroussin <bapt@FreeBSD.org>2016-04-16 17:36:02 +0000
committerBaptiste Daroussin <bapt@FreeBSD.org>2016-04-16 17:36:02 +0000
commitedca0642eea8a5355a4071b26cbb7dc4b6b0810d (patch)
tree76fae602b2751848257c829fd21a58416f4a701d /tools
parent02277afdb15770d13f5d35a790d2d8fb4c518a07 (diff)
downloadsrc-edca0642eea8a5355a4071b26cbb7dc4b6b0810d.tar.gz
src-edca0642eea8a5355a4071b26cbb7dc4b6b0810d.zip
Rework collation generation:
When building the collation database for non-Unicode encodings, use the proper Unicode mapping (this fixes collation not working properly for those encodings). For locales where new characters are added only for Unicode, stop trying to map the new characters and instead extract the collation files for the said encoding directly from CLDR. Stop trying to generate the encoding maps for GB2312 and eucCN from the Unicode version; it was not reliable. Instead, use the maps provided by the CLDR project. Reported by: ache
Notes
Notes: svn path=/head/; revision=298116
Diffstat (limited to 'tools')
-rw-r--r--tools/tools/locale/Makefile69
-rw-r--r--tools/tools/locale/etc/charmaps.xml58
-rw-r--r--tools/tools/locale/etc/charmaps/charmaps.txt1
-rwxr-xr-xtools/tools/locale/tools/cldr2def.pl14
-rwxr-xr-xtools/tools/locale/tools/convert_map.pl3
-rw-r--r--tools/tools/locale/tools/extract-colldef.awk18
-rwxr-xr-xtools/tools/locale/tools/finalize44
7 files changed, 160 insertions, 47 deletions
diff --git a/tools/tools/locale/Makefile b/tools/tools/locale/Makefile
index 2b5aa55d189a..bac5c3e7f63b 100644
--- a/tools/tools/locale/Makefile
+++ b/tools/tools/locale/Makefile
@@ -22,6 +22,23 @@ KNOWN= monetdef numericdef msgdef timedef colldef ctypedef
TYPES?= ${KNOWN}
LOCALE_DESTDIR?= /tmp/generated-locales/
+COLLATION_SPECIAL?= \
+ cs_CZ ISO8859-2 \
+ da_DK ISO8859-1 \
+ da_DK ISO8859-15 \
+ hr_HR ISO8859-2 \
+ hu_HU ISO8859-2 \
+ nb_NO ISO8859-1 \
+ nb_NO ISO8859-15 \
+ sk_SK ISO8859-2 \
+ zh_Hans_CN GB2312 \
+ zh_Hans_CN eucCN \
+
+.for area enc in ${COLLATION_SPECIAL}
+COLLATIONS_SPECIAL_ENV+= ${area}.${enc}
+.endfor
+PASSON+= COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}"
+
.if defined(LC)
LC:= --lc=${LC}
.endif
@@ -55,17 +72,26 @@ post-install:
.endfor
.for t in ${TYPES}
-build-${t}:
+gen-${t}:
mkdir -p ${t} ${t}.draft
perl -I tools tools/cldr2def.pl \
--cldr=$$(realpath ${CLDRDIR}) \
--unidata=$$(realpath ${UNIDATADIR}) \
--etc=$$(realpath ${ETCDIR}) \
--type=${t} ${LC}
+
+build-${t}: gen-${t}
env ${PASSON} tools/finalize ${t}
.endfor
-build-ctypedef: transfer-rollup
+gen-ctypedef: transfer-rollup
+static-colldef: gen-colldef
+build-colldef: static-colldef
+
+static-colldef:
+.for area enc in ${COLLATION_SPECIAL}
+ awk -f tools/extract-colldef.awk ${CLDRDIR}/posix/${area}.${enc}.src > colldef/${area}.${enc}.src
+.endfor
transfer-rollup:
cp ${ETCDIR}/common.UTF-8.src ${CLDRDIR}/posix/xx_Comm_US.UTF-8.src
@@ -93,12 +119,34 @@ BASE_LOCALES_OF_INTEREST?= \
uk_UA \
kk_Cyrl_KZ mn_Cyrl_MN sr_Cyrl_RS sr_Latn_RS \
zh_Hans_CN zh_Hant_HK zh_Hant_TW \
- \
- \
bn_IN gu_IN or_IN ta_IN te_IN kn_IN ml_IN si_LK \
th_TH lo_LA bo_IN my_MM pa_Guru_IN ka_GE chr_US \
km_KH shi_Tfng_MA ii_CN vai_Vaii_LR vi_VN
+ENCODINGS= Big5 \
+ CP1251 \
+ CP866 \
+ CP949 \
+ eucCN \
+ eucJP \
+ eucKR \
+ GB18030 \
+ GB2312 \
+ GBK \
+ ISO8859-1 \
+ ISO8859-13 \
+ ISO8859-15 \
+ ISO8859-2 \
+ ISO8859-5 \
+ ISO8859-7 \
+ ISO8859-9 \
+ KOI8-R \
+ KOI8-U \
+ SJIS \
+ US-ASCII \
+ UTF-8 \
+
+
POSIX:
.if exists (${CLDRDIR}/tools/java/cldr.jar)
mkdir -p ${CLDRDIR}/posix
@@ -109,11 +157,20 @@ POSIX:
-d ${CLDRDIR}/posix -m ${area} -c UTF-8
. endif
. endfor
-. if !exists(${CLDRDIR}/posix/UTF-8.cm)
+. for area encoding in ${COLLATION_SPECIAL}
+. if !exists(${CLDRDIR}/posix/${area}.${encoding}.src)
+ java -DCLDR_DIR=${CLDRDIR:Q} -jar ${CLDRDIR}/tools/java/cldr.jar \
+ org.unicode.cldr.posix.GeneratePOSIX \
+ -d ${CLDRDIR}/posix -m ${area} -c ${encoding}
+. endif
+. endfor
+. for enc in ${ENCODINGS}
+. if !exists(${CLDRDIR}/posix/${enc}.cm)
java -DCLDR_DIR=${CLDRDIR:Q} -jar ${CLDRDIR}/tools/java/cldr.jar \
org.unicode.cldr.posix.GenerateCharmap \
- -d ${CLDRDIR}/posix
+ -d ${CLDRDIR}/posix -c ${enc}
. endif
+. endfor
.else
@echo "Please install CLDR toolset for the desired release"
@echo "It should go at ${CLDRDIR}/tools"
diff --git a/tools/tools/locale/etc/charmaps.xml b/tools/tools/locale/etc/charmaps.xml
index 0b6551aee55f..e0d39b25d576 100644
--- a/tools/tools/locale/etc/charmaps.xml
+++ b/tools/tools/locale/etc/charmaps.xml
@@ -187,10 +187,6 @@
countries="CN" />
<language name="zh"
family="Hant"
- encoding="Big5HKSCS"
- countries="HK" />
- <language name="zh"
- family="Hant"
encoding="Big5"
countries="TW" />
</languages>
@@ -444,69 +440,69 @@
unicode="FULLWIDTH HYPHEN-MINUS" />
<translation encoding="Big5" cldr="DOLLAR SIGN"
unicode="FULLWIDTH DOLLAR SIGN" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E00" ucc="4E00" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E03" ucc="4E03" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E09" ucc="4E09" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E0A" ucc="4E0A" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E0B" ucc="4E0B" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E0D" ucc="4E0D" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E5D" ucc="4E5D" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E8C" ucc="4E8C" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E94" ucc="4E94" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-516B" ucc="516B" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-516D" ucc="516D" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5206" ucc="5206" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-524D" ucc="524D" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5341" ucc="5341" />
<translation
- encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN eucJP SJIS"
+ encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-5348" ucc="5348" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5426" ucc="5426" />
<translation encoding="GB2312 GB18030 GBK eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5468" ucc="5468" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-56DB" ucc="56DB" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-571F" ucc="571F" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5B9A" ucc="5B9A" />
<translation
- encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN eucJP SJIS"
+ encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-5E74" ucc="5E74" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-5F8C" ucc="5F8C" />
<translation
- encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN eucJP SJIS"
+ encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-65E5" ucc="65E5" />
<translation encoding="GB2312 GB18030 GBK eucCN"
cldr="CJK UNIFIED IDEOGRAPH-65F6" ucc="65F6" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-661F" ucc="661F" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-662F" ucc="662F" />
- <translation encoding="Big5 Big5HKSCS"
+ <translation encoding="Big5 "
cldr="CJK UNIFIED IDEOGRAPH-6642" ucc="6642" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-66DC" ucc="66DC" />
<translation
- encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN eucJP SJIS"
+ encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-6708" ucc="6708" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-671F" ucc="671F" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-6728" ucc="6728" />
@@ -516,11 +512,11 @@
cldr="CJK UNIFIED IDEOGRAPH-706B" ucc="706B" />
<translation encoding="GB2312 GB18030 GBK eucCN"
cldr="CJK UNIFIED IDEOGRAPH-786E" ucc="786E" />
- <translation encoding="Big5 Big5HKSCS"
+ <translation encoding="Big5 "
cldr="CJK UNIFIED IDEOGRAPH-78BA" ucc="78BA" />
- <translation encoding="GB2312 GB18030 GBK Big5 Big5HKSCS eucCN"
+ <translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-79D2" ucc="79D2" />
- <translation encoding="Big5 Big5HKSCS"
+ <translation encoding="Big5 "
cldr="CJK UNIFIED IDEOGRAPH-9031" ucc="9031" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-91D1" ucc="91D1" />
diff --git a/tools/tools/locale/etc/charmaps/charmaps.txt b/tools/tools/locale/etc/charmaps/charmaps.txt
index a0791f7f9567..d8f8bb8190b3 100644
--- a/tools/tools/locale/etc/charmaps/charmaps.txt
+++ b/tools/tools/locale/etc/charmaps/charmaps.txt
@@ -8,7 +8,6 @@ haible.de: http://haible.de/bruno/charsets/conversion-tables/
ARMSCII-8 haible.de: Armenian.html
Big5 unicodeorg: OBSOLETE/EASTASIA/OTHER
- Big5HKSCS haible.de: BIG5-HKSCS.html /
CP1131 haible.de: CP1131.html / aix-4.3.2/IBM-1131.TXT
CP1251 unicode.org: VENDORS/MICSFT/WINDOWS
CP866 unicode.org: VENDORS/MICSFT/PC
diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl
index fae7c91b4273..3f61bb127402 100755
--- a/tools/tools/locale/tools/cldr2def.pl
+++ b/tools/tools/locale/tools/cldr2def.pl
@@ -808,14 +808,24 @@ sub make_makefile {
my $SRCOUT;
my $SRCOUT2;
my $SRCOUT3 = "";
+ my $SRCOUT4 = "";
my $MAPLOC;
if ($TYPE eq "colldef") {
$SRCOUT = "localedef -D -U -i \${.IMPSRC} \\\n" .
- "\t-f \${MAPLOC}/map.UTF-8 " .
+ "\t-f \${MAPLOC}/map.\${.TARGET:T:R:E} " .
"\${.OBJDIR}/\${.IMPSRC:T:R}";
$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
"locale/etc/final-maps\n";
$SRCOUT2 = "LC_COLLATE";
+ $SRCOUT3 = "" .
+ ".for f t in \${LOCALES_MAPPED}\n" .
+ "FILES+=\t\$t.LC_COLLATE\n" .
+ "\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
+ "\tlocaledef -D -U -i \${.ALLSRC} \\\n" .
+ "\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E} \\\n" .
+ "\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
+ ".endfor\n\n";
+ $SRCOUT4 = "## LOCALES_MAPPED\n";
}
elsif ($TYPE eq "ctypedef") {
$SRCOUT = "localedef -D -U -c -w \${MAPLOC}/widths.txt \\\n" .
@@ -855,6 +865,8 @@ ${MAPLOC}
## PLACEHOLDER
+${SRCOUT4}
+
EOF
foreach my $hash (keys(%hashtable)) {
diff --git a/tools/tools/locale/tools/convert_map.pl b/tools/tools/locale/tools/convert_map.pl
index e5381f3f3dcc..88222531d064 100755
--- a/tools/tools/locale/tools/convert_map.pl
+++ b/tools/tools/locale/tools/convert_map.pl
@@ -1,5 +1,7 @@
#! /usr/local/bin/perl
#
+# $FreeBSD$
+#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
@@ -167,7 +169,6 @@ elsif ($codeset eq "eucKR") { $max_mb = 2 }
elsif ($codeset eq "GBK") { $max_mb = 2 }
elsif ($codeset eq "GB2312") { $max_mb = 2 }
elsif ($codeset eq "Big5") { $max_mb = 2 }
-elsif ($codeset eq "Big5HKSCS") { $max_mb = 2 }
else { $max_mb = 1 };
print("<code_set_name> \"$codeset\"\n");
print("<mb_cur_min> 1\n");
diff --git a/tools/tools/locale/tools/extract-colldef.awk b/tools/tools/locale/tools/extract-colldef.awk
new file mode 100644
index 000000000000..3f2924922fc1
--- /dev/null
+++ b/tools/tools/locale/tools/extract-colldef.awk
@@ -0,0 +1,18 @@
+# $FreeBSD$
+# Copy the comment_char/escape_char lines and the whole LC_COLLATE section of a locale source to stdout.
+BEGIN {
+	print "# Warning: Do not edit. This is automatically extracted"
+	print "# from CLDR project data, obtained from http://cldr.unicode.org/"
+	print "# -----------------------------------------------------------------------------"
+}
+$1 == "comment_char" { print $0 }
+$1 == "escape_char" { print $0 }
+$1 == "LC_COLLATE" {	# section header: emit it, then stream the section body
+	print $0
+	while ((getline line) > 0) {	# getline gives 0 at EOF and -1 on error; a bare test would spin forever on -1
+		print line
+		if (line == "END LC_COLLATE") {	# section terminator has been printed; resume normal pattern matching
+			break
+		}
+	}
+}
diff --git a/tools/tools/locale/tools/finalize b/tools/tools/locale/tools/finalize
index 7ce3e74bb6ec..b32c52c21d6d 100755
--- a/tools/tools/locale/tools/finalize
+++ b/tools/tools/locale/tools/finalize
@@ -26,12 +26,15 @@ new=${base}/../${1}
TEMP=/tmp/${1}.locales
TEMP2=/tmp/${1}.hashes
TEMP3=/tmp/${1}.symlinks
+TEMP4=/tmp/${1}.mapped
FULLMAP=/tmp/utf8-map
FULLEXTRACT=/tmp/extracted-names
AWKCMD="/## PLACEHOLDER/ { \
while ( getline line < \"${TEMP}\" ) {print line} } \
/## SYMPAIRS/ { \
while ( getline line < \"${TEMP3}\" ) {print line} } \
+ /## LOCALES_MAPPED/ { \
+ while ( getline line < \"${TEMP4}\" ) {print line} } \
!/## / { print \$0 }"
grep '^LOCALES+' ${old}/Makefile > ${TEMP}
@@ -51,21 +54,23 @@ then
/usr/bin/sed -E -e 's/[ ]+/ /g' \
${CLDRDIR}/posix/UTF-8.cm \
> ${base}/../etc/final-maps/map.UTF-8
- CHARMAPS="ARMSCII-8 Big5 Big5HKSCS CP1131 CP1251 \
+ /usr/bin/sed -E -e 's/[ ]+/ /g' \
+ ${CLDRDIR}/posix/eucCN.cm \
+ > ${base}/../etc/final-maps/map.eucCN
+ /usr/bin/sed -E -e 's/[ ]+/ /g' \
+ ${CLDRDIR}/posix/eucCN.cm \
+ > ${base}/../etc/final-maps/map.GB2312
+ CHARMAPS="ARMSCII-8 Big5 CP1131 CP1251 \
CP866 GB2312 GBK ISCII-DEV ISO8859-1 \
ISO8859-13 ISO8859-15 ISO8859-2 ISO8859-4 \
ISO8859-5 ISO8859-7 ISO8859-9 KOI8-R KOI8-U \
- PT154 SJIS US-ASCII eucCN eucJP eucKR"
+ PT154 SJIS US-ASCII eucJP eucKR"
# GB18030 blows up, use pre-generate Illumos version
for map in ${CHARMAPS}
do
encoding=${map}
- if [ ${map} = "Big5HKSCS" ]
- then
- encoding="Big5"
- fi
/usr/local/bin/perl ${base}/convert_map.pl \
${base}/../etc/charmaps/${map}.TXT ${encoding} \
| /usr/bin/sed -E -e 's/ +/ /g' \
@@ -73,6 +78,31 @@ then
echo map ${map} converted.
done
+elif [ $1 = "colldef" ]
+then
+ awk -v tmp4=${TEMP4} '$1 == "SAME+=" && $0 !~ /legacy/ {
+ orig=$2
+ dest=$3
+ gsub(/.*\./, "", orig)
+ gsub(/.*\./, "", dest)
+ if (orig != dest )
+ print "LOCALES_MAPPED+=\t"$2 " "$3 > tmp4
+ }' ${old}/Makefile
+
+ for line in $(awk '{ print $3 }' ${TEMP4}); do
+ sed -i '' "/^SAME.*$line$/d" ${old}/Makefile
+ done
+ echo "" >> ${TEMP4}
+ for enc in ${COLLATIONS_SPECIAL}; do
+ sed -i '' "/^.*${enc}$/d" ${TEMP4}
+ echo "LOCALES+= ${enc}" >> ${TEMP4}
+ done
+
+ keep=$(cat ${TEMP} | awk '{ print $2 }')
+ for original in ${keep}
+ do
+ cp ${old}/${original}.src ${new}/
+ done
else # below is everything but ctypedef
keep=$(cat ${TEMP} | awk '{ print $2 }')
@@ -85,4 +115,4 @@ fi
grep -v '^LOCALES+' ${old}/Makefile | awk "${AWKCMD}" > ${new}/Makefile
-rm -f ${TEMP} ${TEMP3}
+rm -f ${TEMP} ${TEMP3} ${TEMP4}