summaryrefslogtreecommitdiff
path: root/sal/textenc/generate
diff options
context:
space:
mode:
authorStephan Bergmann <sb@openoffice.org>2001-10-12 08:44:51 +0000
committerStephan Bergmann <sb@openoffice.org>2001-10-12 08:44:51 +0000
commite8366e3816b50dbe4e1f9b697abece8813e11b00 (patch)
treeed20bc56ce29ba0fbfe57b4a25a646a50401c6b1 /sal/textenc/generate
parentba4f155598069228bd52b0ab2d12744c231d60dc (diff)
#87140#
Diffstat (limited to 'sal/textenc/generate')
-rw-r--r--sal/textenc/generate/cns116431992.pl786
-rw-r--r--sal/textenc/generate/gb180302000.pl343
2 files changed, 1129 insertions, 0 deletions
diff --git a/sal/textenc/generate/cns116431992.pl b/sal/textenc/generate/cns116431992.pl
new file mode 100644
index 000000000000..4542e08101e4
--- /dev/null
+++ b/sal/textenc/generate/cns116431992.pl
@@ -0,0 +1,786 @@
+#!/usr/bin/perl
+#*************************************************************************
+#
+# $RCSfile: cns116431992.pl,v $
+#
+# $Revision: 1.1 $
+#
+# last change: $Author: sb $ $Date: 2001-10-12 09:44:51 $
+#
+# The Contents of this file are made available subject to the terms of
+# either of the following licenses
+#
+# - GNU Lesser General Public License Version 2.1
+# - Sun Industry Standards Source License Version 1.1
+#
+# Sun Microsystems Inc., October, 2000
+#
+# GNU Lesser General Public License Version 2.1
+# =============================================
+# Copyright 2000 by Sun Microsystems, Inc.
+# 901 San Antonio Road, Palo Alto, CA 94303, USA
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1, as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307 USA
+#
+#
+# Sun Industry Standards Source License Version 1.1
+# =================================================
+# The contents of this file are subject to the Sun Industry Standards
+# Source License Version 1.1 (the "License"); You may not use this file
+# except in compliance with the License. You may obtain a copy of the
+# License at http://www.openoffice.org/license.html.
+#
+# Software provided under this License is provided on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
+# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
+# See the License for the specific provisions governing your rights and
+# obligations concerning the Software.
+#
+# The Initial Developer of the Original Code is: Sun Microsystems, Inc.
+#
+# Copyright: 2000 by Sun Microsystems, Inc.
+#
+# All Rights Reserved.
+#
+# Contributor(s): _______________________________________
+#
+#
+#*************************************************************************
+
+# The following files must be available in a ./input subdir:
+
+# <http://www.unicode.org/Public/UNIDATA/Unihan.txt>:
+# "Unicode version: 3.1.1 Table version: 1.1 Date: 28 June 2001"
+# contains descriptions for:
+# U+3400..4DFF CJK Unified Ideographs Extension A
+# U+4E00..9FFF CJK Unified Ideographs
+# U+F900..FAFF CJK Compatibility Ideographs
+# U+20000..2F7FF CJK Unified Ideographs Extension B
+# U+2F800..2FFFF CJK Compatibility Ideographs Supplement
+
+# <http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/CNS11643.TXT>:
+# "Unicode version: 1.1 Table version: 0.0d1 Date: 21 October 1994"
+# contains mappings for CNS 11643-1986
+
+# <http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/Uni2CNS.Z>:
+# "Unicode version: 1.1 Table version: 0.49 Date: 26 March 1998"
+# contains mappings for CNS 11643-1992 that are incompatible with
+# CNS11643.TXT
+
+$id = "Cns116431992"; # table identifier: embedded in the generated C array names; lc($id) gives the .dat and .pl file names
+
+# Tell whether the argument is an acceptable UTF-32 code point: it must lie
+# in U+0000..U+10FFFF, must not be a surrogate (U+D800..U+DFFF), must not be
+# in U+FDD0..U+FDEF, and its low 16 bits must not be 0xFFFE or 0xFFFF.
+# Returns a true value if all conditions hold, a false value otherwise.
+sub isValidUtf32
+{
+    my ($code) = @_;
+    my $in_range = 0 <= $code && $code <= 0x10FFFF;
+    my $outside_surrogates = $code < 0xD800 || $code > 0xDFFF;
+    my $outside_fdd0_block = $code < 0xFDD0 || $code > 0xFDEF;
+    return $in_range
+        && $outside_surrogates
+        && $outside_fdd0_block
+        && ($code & 0xFFFF) < 0xFFFE;
+}
+
+# Format a UTF-32 code point as "U+" followed by its upper-case hexadecimal
+# value, zero padded to at least four digits.
+sub printUtf32
+{
+    my ($code) = @_;
+    return "U+" . sprintf("%04X", $code);
+}
+
+# Tell whether (plane, row, column) addresses a cell this script accepts:
+# plane 1..16, row 1..94 and column 1..94.
+sub isValidCns116431992
+{
+    my ($plane, $row, $column) = @_;
+    return 1 <= $plane && $plane <= 16
+        && 1 <= $row && $row <= 94
+        && 1 <= $column && $column <= 94;
+}
+
+# Format a CNS 11643-1992 cell as "plane-row/column", with row and column
+# written as two decimal digits each.
+sub printCns116431992
+{
+    return sprintf("%d-%02d/%02d", @_[0 .. 2]);
+}
+
+# Format a usage statistic as "USED/SPACE bytes (PERCENT%)".
+#
+# Parameters: the number of bytes used and the number of bytes available.
+# Returns the formatted string, with the percentage printed to one decimal.
+# When no space is available at all the percentage is reported as 0.0
+# instead of aborting the whole run with a division by zero.
+sub printStats
+{
+    my ($used, $space) = @_;
+    my $percent = $space ? $used * 100 / $space : 0;
+    return sprintf("%d/%d bytes (%.1f%%)", $used, $space, $percent);
+}
+
+# Number of mappings contributed by each of the three input files.
+$count_Unihan_txt = 0;
+$count_CNS11643_TXT = 0;
+$count_Uni2CNS = 0;
+
+# Pass 1: collect mappings from the kCNS1992 and kIRG_TSource fields of
+# input/Unihan.txt into @cns_map, indexed by [plane][row][column], and mark
+# every plane that received a mapping in @cns_plane_used.  The first mapping
+# seen for a cell is kept.  A kCNS1992 entry that contradicts it is fatal;
+# a contradicting kIRG_TSource entry only produces a warning.
+if (1)
+{
+    $filename = "Unihan.txt";
+    open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+    while (<IN>)
+    {
+        if (/^U\+([0-9A-F]+)\t(kCNS1992|kIRG_TSource)\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])$/)
+        {
+            my $field = $2;
+            $utf32 = oct("0x" . $1);
+            $cns_plane = oct("0x" . $3);
+            # Row and column are given offset by 0x20; bring them to 1..94.
+            $cns_row = oct("0x" . $4) - 0x20;
+            $cns_column = oct("0x" . $5) - 0x20;
+            # printUtf32 already supplies the "U+" prefix.
+            isValidUtf32($utf32)
+                or die "Bad UTF32 char " . printUtf32($utf32);
+            isValidCns116431992($cns_plane, $cns_row, $cns_column)
+                or die "Bad CNS11643-1992 char "
+                       . printCns116431992($cns_plane,
+                                           $cns_row,
+                                           $cns_column);
+            if (!defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+            {
+                $cns_map[$cns_plane][$cns_row][$cns_column] = $utf32;
+                $cns_plane_used[$cns_plane] = 1;
+                ++$count_Unihan_txt;
+            }
+            elsif ($cns_map[$cns_plane][$cns_row][$cns_column] != $utf32)
+            {
+                my $conflict
+                    = "Mapping "
+                      . printCns116431992($cns_plane,
+                                          $cns_row,
+                                          $cns_column)
+                      . " to "
+                      . printUtf32($cns_map[$cns_plane]
+                                           [$cns_row]
+                                           [$cns_column])
+                      . ", NOT "
+                      . printUtf32($utf32);
+                die $conflict if $field eq "kCNS1992";
+                print "WARNING! ", $conflict, "\n";
+            }
+        }
+        elsif (/^U\+([0-9A-F]+)\tkCNS1992\t.*$/)
+        {
+            # A kCNS1992 line that did not match the strict pattern above.
+            die "Bad format";
+        }
+    }
+    close IN;
+}
+
+# Pass 2: add the plane 1 and plane 2 mappings of input/CNS11643.TXT to
+# @cns_map.  Every matching line is validated, but lines for other planes
+# are then skipped.  A mapping that contradicts one already recorded is
+# fatal.
+if (1)
+{
+    $filename = "CNS11643.TXT";
+    open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+    while (<IN>)
+    {
+        next unless /0x([0-9A-F])([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])\t0x([0-9A-F]+)\t\#.*$/;
+        $utf32 = oct("0x" . $4);
+        $cns_plane = oct("0x" . $1);
+        # Row and column are given offset by 0x20; bring them to 1..94.
+        $cns_row = oct("0x" . $2) - 0x20;
+        $cns_column = oct("0x" . $3) - 0x20;
+        # printUtf32 already supplies the "U+" prefix.
+        isValidUtf32($utf32)
+            or die "Bad UTF32 char " . printUtf32($utf32);
+        isValidCns116431992($cns_plane, $cns_row, $cns_column)
+            or die "Bad CNS11643-1992 char "
+                   . printCns116431992($cns_plane,
+                                       $cns_row,
+                                       $cns_column);
+        # Only planes 1 and 2 are taken from this file.
+        next if $cns_plane > 2;
+        if (!defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+        {
+            $cns_map[$cns_plane][$cns_row][$cns_column] = $utf32;
+            $cns_plane_used[$cns_plane] = 1;
+            ++$count_CNS11643_TXT;
+        }
+        elsif ($cns_map[$cns_plane][$cns_row][$cns_column] != $utf32)
+        {
+            die "Mapping "
+                . printCns116431992($cns_plane,
+                                    $cns_row,
+                                    $cns_column)
+                . " to "
+                . printUtf32($cns_map[$cns_plane]
+                                     [$cns_row]
+                                     [$cns_column])
+                . ", NOT "
+                . printUtf32($utf32);
+        }
+    }
+    close IN;
+}
+
+if (0) # disabled: this pass over input/Uni2CNS is never executed
+{
+ $filename = "Uni2CNS";
+ open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+ while (<IN>)
+ {
+ if (/([0-9A-F]+)\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])\t.*$/)
+ {
+ $utf32 = oct("0x" . $1);
+ $cns_plane = oct("0x" . $2);
+ $cns_row = oct("0x" . $3) - 0x20; # row is given offset by 0x20
+ $cns_column = oct("0x" . $4) - 0x20; # column likewise
+ isValidUtf32($utf32)
+ or die "Bad UTF32 char U+" . printUtf32($utf32);
+ isValidCns116431992($cns_plane, $cns_row, $cns_column)
+ or die "Bad CNS11643-1992 char "
+ . printCns116431992($cns_plane,
+ $cns_row,
+ $cns_column);
+ if (!defined($cns_map[$cns_plane][$cns_row][$cns_column])) # only cells without a mapping are filled
+ {
+ $cns_map[$cns_plane][$cns_row][$cns_column] = $utf32;
+ $cns_plane_used[$cns_plane] = 1;
+ ++$count_Uni2CNS;
+ }
+ else # an existing mapping is kept; the consistency check is commented out
+ {
+# ($cns_map[$cns_plane][$cns_row][$cns_column] == $utf32)
+# or die "Mapping "
+# . printCns116431992($cns_plane,
+# $cns_row,
+# $cns_column)
+# . " to "
+# . printUtf32($cns_map[$cns_plane]
+# [$cns_row]
+# [$cns_column])
+# . ", NOT "
+# . printUtf32($utf32);
+ }
+ if ($cns_plane == 1) # echo every plane 1 cell read from this file
+ {
+ print printCns116431992($cns_plane, $cns_row, $cns_column),
+ "\n";
+ }
+ }
+ }
+ close IN;
+}
+
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane) # invert @cns_map into @uni_map[plane][page][index]
+{
+ if (defined($cns_plane_used[$cns_plane]))
+ {
+ for ($cns_row = 1; $cns_row <= 94; ++$cns_row)
+ {
+ for ($cns_column = 1; $cns_column <= 94; ++$cns_column)
+ {
+ if (defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+ {
+ $utf32 = $cns_map[$cns_plane][$cns_row][$cns_column];
+ $uni_plane = $utf32 >> 16; # split the code point into plane / page / index
+ $uni_page = ($utf32 >> 8) & 0xFF;
+ $uni_index = $utf32 & 0xFF;
+ if (!defined($uni_plane_used[$uni_plane])
+ || !defined($uni_page_used[$uni_plane][$uni_page])
+ || !defined($uni_map[$uni_plane]
+ [$uni_page]
+ [$uni_index])) # no CNS cell recorded yet for this code point
+ {
+ $uni_map[$uni_plane][$uni_page][$uni_index]
+ = ($cns_plane << 16)
+ | ($cns_row << 8)
+ | $cns_column; # packed as plane << 16 | row << 8 | column
+ $uni_plane_used[$uni_plane] = 1;
+ $uni_page_used[$uni_plane][$uni_page] = 1;
+ }
+ else # the code point is already mapped from another cell
+ {
+ $cns1 = $uni_map[$uni_plane][$uni_page][$uni_index]; # unpack the cell recorded earlier
+ $cns1_plane = $cns1 >> 16;
+ $cns1_row = ($cns1 >> 8) & 0xFF;
+ $cns1_column = $cns1 & 0xFF;
+
+ # Do not map from Unicode to Fictitious Character Set
+ # Extensions (Lunde, p. 131), if possible:
+ if ($cns_plane == 3
+ && ($cns_row == 66 && $cns_column > 38
+ || $cns_row > 66)) # new cell lies in plane 3 after 66/38: keep the earlier cell
+ {
+ print " (",
+ printUtf32($utf32),
+ " to fictious ",
+ printCns116431992($cns_plane,
+ $cns_row,
+ $cns_column),
+ " ignored, favouring ",
+ printCns116431992($cns1_plane,
+ $cns1_row,
+ $cns1_column),
+ ")\n";
+ }
+ elsif ($cns1_plane == 3
+ && ($cns1_row == 66 && $cns1_column > 38
+ || $cns1_row > 66)) # earlier cell lies in that region: replace it with the new cell
+ {
+ $uni_map[$uni_plane][$uni_page][$uni_index]
+ = ($cns_plane << 16)
+ | ($cns_row << 8)
+ | $cns_column;
+ print " (",
+ printUtf32($utf32),
+ " to fictious ",
+ printCns116431992($cns1_plane,
+ $cns1_row,
+ $cns1_column),
+ " ignored, favouring ",
+ printCns116431992($cns_plane,
+ $cns_row,
+ $cns_column),
+ ")\n";
+ }
+ else # neither cell lies in that region: keep the earlier cell and warn
+ {
+ print "WARNING! Mapping ",
+ printUtf32($utf32),
+ " to ",
+ printCns116431992($cns1_plane,
+ $cns1_row,
+ $cns1_column),
+ ", NOT ",
+ printCns116431992($cns_plane,
+ $cns_row,
+ $cns_column),
+ "\n";
+ }
+ }
+ }
+ }
+ }
+ }
+}
+# Sanity check: abort if any code point in U+0000..U+007F has received a
+# CNS mapping.  The test must index @uni_map with the loop variable; using
+# the index left over from the preceding loop would examine one arbitrary
+# entry 128 times and never look at the ASCII range.
+if (defined($uni_plane_used[0]) && defined($uni_page_used[0][0]))
+{
+    for ($utf32 = 0; $utf32 <= 0x7F; ++$utf32)
+    {
+        if (defined($uni_map[0][0][$utf32]))
+        {
+            $cns = $uni_map[0][0][$utf32];
+            die "Mapping "
+                . printUtf32($utf32)
+                . " to "
+                . printCns116431992($cns >> 16,
+                                    ($cns >> 8) & 0xFF,
+                                    $cns & 0xFF);
+        }
+    }
+}
+
+# Create (or truncate) the output file, named after the lower-cased table
+# identifier.  Opening it once is sufficient.
+$filename = lc($id) . ".dat";
+open OUT, ("> " . $filename) or die "Cannot write " . $filename;
+
+# Copy the leading comment header of the script named lc($id) . ".pl" into
+# the output as a C block comment: a "#!" line is dropped, the first "#*..."
+# line opens the comment as "/*...", any later "#*..." line closes it by
+# replacing its last character with "/", and every other "#" line becomes
+# " *" followed by its text.  Copying stops at the first line that does not
+# start with "#".
+{
+    $filename = lc($id). ".pl";
+    open IN, $filename or die "Cannot read ". $filename;
+    $first = 1;
+    while (<IN>)
+    {
+        next if /^\#!.*$/;
+        if (/^\#(\*.*)$/)
+        {
+            if ($first == 1)
+            {
+                print OUT "/", $1, "\n";
+                $first = 0;
+            }
+            else
+            {
+                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
+            }
+        }
+        elsif (/^\# (.*)$/)
+        {
+            print OUT " *", $1, "\n";
+        }
+        elsif (/^\#(.*)$/)
+        {
+            print OUT " *", $1, "\n";
+        }
+        else
+        {
+            last;
+        }
+    }
+    # Nothing else is read from the script, so release the handle here.
+    close IN;
+}
+
+print OUT "\n", # emit the guarded include of sal/types.h
+ "#ifndef _SAL_TYPES_H_\n",
+ "#include \"sal/types.h\"\n",
+ "#endif\n",
+ "\n";
+
+print OUT "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n";
+$cns_data_offset = 0; # number of 94-entry rows written so far
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane)
+{
+ if (defined($cns_plane_used[$cns_plane]))
+ {
+ $cns_rows = 0; # per-plane totals for the console summary below
+ $cns_chars = 0;
+ for ($cns_row = 1; $cns_row <= 94; ++$cns_row)
+ {
+ $cns_row_used = 0;
+ for ($cns_column = 1; $cns_column <= 94; ++$cns_column) # does this row contain any mapping?
+ {
+ if (defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+ {
+ $cns_row_used = 1;
+ goto found;
+ }
+ }
+ found:
+ if ($cns_row_used == 1)
+ {
+ ++$cns_rows;
+ print OUT " /* plane ", $cns_plane, ", row ", $cns_row,
+ " */\n ";
+ $chars_in_row = 0;
+ $surrogates_in_row = 0;
+ for ($cns_column = 1; $cns_column <= 94; ++$cns_column) # first row of 94 entries
+ {
+ if (defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+ {
+ $utf32 = $cns_map[$cns_plane][$cns_row][$cns_column];
+ ++$chars_in_row;
+ if ($utf32 <= 0xFFFF) # BMP code point: written as is
+ {
+ printf OUT "0x%04X,", $utf32;
+ }
+ else # beyond the BMP: this row gets the high surrogate
+ {
+ printf OUT "0x%04X,",
+ (0xD800 | (($utf32 - 0x10000) >> 10));
+ ++$surrogates_in_row;
+ }
+ }
+ else # unmapped cell
+ {
+ printf OUT "0xffff,";
+ }
+ if ($cns_column % 10 == 9) # break the output line after columns 9, 19, ...
+ {
+ print OUT "\n ";
+ }
+ }
+ print OUT "\n";
+ $cns_data_offsets[$cns_plane][$cns_row] = $cns_data_offset++; # remember where this row starts
+ if ($surrogates_in_row > 0) # second row of 94 entries holding the low surrogates
+ {
+ print OUT " ";
+ for ($cns_column = 1; $cns_column <= 94; ++$cns_column)
+ {
+ $utf32 = 0;
+ if (defined($cns_map[$cns_plane]
+ [$cns_row]
+ [$cns_column]))
+ {
+ $utf32
+ = $cns_map[$cns_plane][$cns_row][$cns_column];
+ }
+ if ($utf32 <= 0xFFFF) # BMP or unmapped cell: no low surrogate
+ {
+ printf OUT " 0,";
+ }
+ else
+ {
+ printf OUT "0x%04X,",
+ (0xDC00
+ | (($utf32 - 0x10000) & 0x3FF));
+ }
+ if ($cns_column % 10 == 9)
+ {
+ print OUT "\n ";
+ }
+ }
+ print OUT "\n";
+ ++$cns_data_offset; # the surrogate row occupies a row slot of its own
+ }
+ $cns_chars += $chars_in_row;
+ $cns_data_space[$cns_plane][$cns_row]
+ = ($surrogates_in_row == 0 ? 94 : 2 * 94) * 2; # bytes occupied, at 2 bytes per entry
+ $cns_data_used[$cns_plane][$cns_row]
+ = ($chars_in_row + $surrogates_in_row) * 2; # bytes carrying actual mappings
+ }
+ else
+ {
+ print OUT " /* plane ", $cns_plane, ", row ", $cns_row,
+ ": --- */\n";
+ $cns_data_offsets[$cns_plane][$cns_row] = -1; # -1 marks a row without data
+ }
+ }
+ print "cns plane ",
+ $cns_plane,
+ ": rows = ",
+ $cns_rows,
+ ", chars = ",
+ $cns_chars,
+ "\n";
+ }
+}
+print OUT "};\n\n";
+
+# Emit the row offset table: for every used plane one entry per row, either
+# -1 (row without data) or the row's position in the data table written as
+# "<row number> * 94".  @cns_rowoffsets_used accumulates 4 bytes per row
+# that has data.
+print OUT "static sal_Int32 const aImpl", $id, "ToUnicodeRowOffsets[] = {\n";
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane)
+{
+    unless (defined ($cns_plane_used[$cns_plane]))
+    {
+        print OUT " /* plane ", $cns_plane, ": --- */\n";
+        next;
+    }
+    $cns_rowoffsets_used[$cns_plane] = 0;
+    for ($cns_row = 1; $cns_row <= 94; ++$cns_row)
+    {
+        my $row_offset = $cns_data_offsets[$cns_plane][$cns_row];
+        if ($row_offset != -1)
+        {
+            my $stats = printStats($cns_data_used[$cns_plane][$cns_row],
+                                   $cns_data_space[$cns_plane][$cns_row]);
+            print OUT " ", $row_offset, " * 94, /* plane ", $cns_plane,
+                      ", row ", $cns_row, "; ", $stats, " */\n";
+            $cns_rowoffsets_used[$cns_plane] += 4;
+        }
+        else
+        {
+            print OUT " -1, /* plane ", $cns_plane,
+                      ", row ", $cns_row, " */\n";
+        }
+    }
+}
+print OUT "};\n\n";
+
+# Emit the plane offset table: -1 for an unused plane, otherwise the running
+# count of used planes written as "<count> * 94".
+print OUT "static sal_Int32 const aImpl", $id, "ToUnicodePlaneOffsets[] = {\n";
+$cns_row_offset = 0;
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane)
+{
+    if (!defined ($cns_plane_used[$cns_plane]))
+    {
+        print OUT " -1, /* plane ", $cns_plane, " */\n";
+        next;
+    }
+    my $stats = printStats($cns_rowoffsets_used[$cns_plane], 94 * 4);
+    print OUT " ", $cns_row_offset, " * 94, /* plane ", $cns_plane,
+              "; ", $stats, " */\n";
+    ++$cns_row_offset;
+}
+print OUT "};\n\n";
+
+print OUT "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n";
+$uni_data_offset = 0; # number of 256-entry pages written so far
+for ($uni_plane = 0; $uni_plane <= 16; ++$uni_plane)
+{
+ if (defined($uni_plane_used[$uni_plane]))
+ {
+ for ($uni_page = 0; $uni_page <= 255; ++$uni_page)
+ {
+ if (defined($uni_page_used[$uni_plane][$uni_page]))
+ {
+ $uni_data_used[$uni_plane][$uni_page] = 0; # bytes of this page that carry a mapping
+ print OUT " /* plane ", $uni_plane, ", page ", $uni_page,
+ " */\n ";
+ for ($uni_index = 0; $uni_index <= 255; ++$uni_index)
+ {
+ if (defined($uni_map[$uni_plane][$uni_page][$uni_index]))
+ {
+ $cns = $uni_map[$uni_plane][$uni_page][$uni_index];
+ printf OUT "0x%02X%02X%02X,",
+ $cns >> 16,
+ 0xA0 + ($cns >> 8 & 0xFF),
+ 0xA0 + ($cns & 0xFF); # plane as is; row and column are written with 0xA0 added
+ $uni_data_used[$uni_plane][$uni_page] += 4;
+ }
+ else # no mapping for this code point
+ {
+ print OUT " 0,";
+ }
+ if ($uni_index % 8 == 7 && $uni_index != 255) # eight entries per output line
+ {
+ print OUT "\n ";
+ }
+ }
+ print OUT "\n";
+ $uni_data_offsets[$uni_plane][$uni_page] = $uni_data_offset++; # remember where this page starts
+ }
+ else
+ {
+ print OUT " /* plane ", $uni_plane, ", page ", $uni_page,
+ ": --- */\n";
+ $uni_data_offsets[$uni_plane][$uni_page] = -1; # -1 marks a page without data
+ }
+ }
+ }
+ else
+ {
+ print OUT " /* plane ", $uni_plane, ": --- */\n";
+ }
+}
+print OUT "};\n\n";
+
+# Emit the page offset table: for every used Unicode plane one entry per
+# page, either -1 (page without data) or the page's position in the data
+# table written as "<page number> * 256".  The per-plane usage totals are
+# accumulated for the statistics printed with the plane offset table.
+print OUT "static sal_Int32 const aImplUnicodeTo", $id, "PageOffsets[] = {\n";
+for ($uni_plane = 0; $uni_plane <= 16; ++$uni_plane)
+{
+    if (!defined($uni_plane_used[$uni_plane]))
+    {
+        print OUT " /* plane ", $uni_plane, ": --- */\n";
+        next;
+    }
+    $uni_pageoffsets_used[$uni_plane] = 0;
+    $uni_data_used_sum[$uni_plane] = 0;
+    $uni_data_space_sum[$uni_plane] = 0;
+    for ($uni_page = 0; $uni_page <= 255; ++$uni_page)
+    {
+        $offset = $uni_data_offsets[$uni_plane][$uni_page];
+        if ($offset == -1)
+        {
+            print OUT " -1, /* plane ", $uni_plane,
+                      ", page ", $uni_page, " */\n";
+            next;
+        }
+        my $page_used = $uni_data_used[$uni_plane][$uni_page];
+        print OUT " ", $offset, " * 256, /* plane ", $uni_plane,
+                  ", page ", $uni_page, "; ",
+                  printStats($page_used, 256 * 4), " */\n";
+        $uni_pageoffsets_used[$uni_plane] += 4;
+        $uni_data_used_sum[$uni_plane] += $page_used;
+        $uni_data_space_sum[$uni_plane] += 256 * 4;
+    }
+}
+print OUT "};\n\n";
+
+# Emit the Unicode plane offset table: -1 for an unused plane, otherwise the
+# running count of used planes written as "<count> * 256".  A closing
+# comment summarises the space used by the plane, page and data tables.
+print OUT "static sal_Int32 const aImplUnicodeTo", $id, "PlaneOffsets[] = {\n";
+$uni_page_offset = 0;
+$uni_planeoffsets_used = 0;
+$uni_pageoffsets_used_sum = 0;
+$uni_pageoffsets_space_sum = 0;
+$uni_data_used_sum2 = 0;
+$uni_data_space_sum2 = 0;
+for ($uni_plane = 0; $uni_plane <= 16; ++$uni_plane)
+{
+    if (!defined ($uni_plane_used[$uni_plane]))
+    {
+        print OUT " -1, /* plane ", $uni_plane, " */\n";
+        next;
+    }
+    my $page_stats = printStats($uni_pageoffsets_used[$uni_plane], 256 * 4);
+    my $data_stats = printStats($uni_data_used_sum[$uni_plane],
+                                $uni_data_space_sum[$uni_plane]);
+    print OUT " ", $uni_page_offset, " * 256, /* plane ", $uni_plane,
+              "; ", $page_stats, ", ", $data_stats, " */\n";
+    ++$uni_page_offset;
+    $uni_planeoffsets_used += 4;
+    $uni_pageoffsets_used_sum += $uni_pageoffsets_used[$uni_plane];
+    $uni_pageoffsets_space_sum += 256 * 4;
+    $uni_data_used_sum2 += $uni_data_used_sum[$uni_plane];
+    $uni_data_space_sum2 += $uni_data_space_sum[$uni_plane];
+}
+print OUT " /* ",
+          printStats($uni_planeoffsets_used, 17 * 4),
+          ", ",
+          printStats($uni_pageoffsets_used_sum, $uni_pageoffsets_space_sum),
+          ", ",
+          printStats($uni_data_used_sum2, $uni_data_space_sum2),
+          " */\n};\n";
+
+# Finish the output file and report how many mappings each input supplied.
+close OUT;
+
+my $count_total = $count_Unihan_txt + $count_CNS11643_TXT + $count_Uni2CNS;
+print "Unihan.txt = ", $count_Unihan_txt;
+print ", CNS11643.TXT = ", $count_CNS11643_TXT;
+print ", Uni2CNS = ", $count_Uni2CNS;
+print ", total = ", $count_total, "\n";
diff --git a/sal/textenc/generate/gb180302000.pl b/sal/textenc/generate/gb180302000.pl
new file mode 100644
index 000000000000..a7d925c61000
--- /dev/null
+++ b/sal/textenc/generate/gb180302000.pl
@@ -0,0 +1,343 @@
+#!/usr/bin/perl
+#*************************************************************************
+#
+# $RCSfile: gb180302000.pl,v $
+#
+# $Revision: 1.1 $
+#
+# last change: $Author: sb $ $Date: 2001-10-12 09:44:51 $
+#
+# The Contents of this file are made available subject to the terms of
+# either of the following licenses
+#
+# - GNU Lesser General Public License Version 2.1
+# - Sun Industry Standards Source License Version 1.1
+#
+# Sun Microsystems Inc., October, 2000
+#
+# GNU Lesser General Public License Version 2.1
+# =============================================
+# Copyright 2000 by Sun Microsystems, Inc.
+# 901 San Antonio Road, Palo Alto, CA 94303, USA
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1, as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307 USA
+#
+#
+# Sun Industry Standards Source License Version 1.1
+# =================================================
+# The contents of this file are subject to the Sun Industry Standards
+# Source License Version 1.1 (the "License"); You may not use this file
+# except in compliance with the License. You may obtain a copy of the
+# License at http://www.openoffice.org/license.html.
+#
+# Software provided under this License is provided on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
+# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
+# See the License for the specific provisions governing your rights and
+# obligations concerning the Software.
+#
+# The Initial Developer of the Original Code is: Sun Microsystems, Inc.
+#
+# Copyright: 2000 by Sun Microsystems, Inc.
+#
+# All Rights Reserved.
+#
+# Contributor(s): _______________________________________
+#
+#
+#*************************************************************************
+
+# The following files must be available in a ./input subdir:
+
+# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
+# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
+# "modified version="3" date="2001-02-21""
+
+$id = "Gb180302000"; # table identifier: embedded in the generated C array names; lc($id) gives the .dat and .pl file names
+
+# Format a UTF-32 code point as "U+" followed by its upper-case hexadecimal
+# value, zero padded to at least four digits.
+sub printUtf32
+{
+    my ($code) = @_;
+    return "U+" . sprintf("%04X", $code);
+}
+
+# Format a GB 18030 byte sequence as upper-case hex, two digits per byte.
+# The length is taken from the arguments: four bytes if a third argument is
+# defined, two bytes if only a second one is defined, one byte otherwise.
+sub printGb
+{
+    my $length = defined($_[2]) ? 4
+               : defined($_[1]) ? 2
+               :                  1;
+    return sprintf("%02X" x $length, @_[0 .. $length - 1]);
+}
+
+$gb_map_2_count = 0; # number of two-byte mappings read
+$gb_map_4_count = 0; # number of individually listed four-byte mappings read
+$gb_map_4_ranges = 0; # number of four-byte codes covered by recorded ranges
+$gb_map_4_max = 0; # highest linear four-byte code seen
+$uni_map_count = 0; # number of code points entered into @uni_map
+
+$range_count = 0; # number of recorded ranges
+
+if (1)
+{
+ $filename = "gb-18030-2000.xml";
+ open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+ while (<IN>)
+ {
+ if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
+ { # single byte 00..7F: only verified to equal its code point, nothing is stored
+ $utf32 = oct("0x" . $1);
+ $gb1 = oct("0x" . $2);
+ ($utf32 == $gb1)
+ or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
+ }
+ elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
+ { # two-byte code
+ $utf32 = oct("0x" . $1);
+ $gb1 = oct("0x" . $2);
+ $gb2 = oct("0x" . $3);
+ $gb_code = ($gb1 - 0x81) * 190
+ + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63); # linear index: 190 trail bytes per lead byte, 63 of them up to 0x7E
+ !defined($gb_map_2[$gb_code])
+ or die "Redefined " . printGb($gb1, $gb2);
+ $gb_map_2[$gb_code] = $utf32;
+ ++$gb_map_2_count;
+
+ !defined($uni_map[$utf32]) or die "Double Unicode mapping";
+ $uni_map[$utf32] = $gb1 << 8 | $gb2; # reverse mapping stores the two bytes packed into one number
+ ++$uni_map_count;
+ }
+ elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
+ { # individually listed four-byte code
+ $utf32 = oct("0x" . $1);
+ $gb1 = oct("0x" . $2);
+ $gb2 = oct("0x" . $3);
+ $gb3 = oct("0x" . $4);
+ $gb4 = oct("0x" . $5);
+ $gb_code = ($gb1 - 0x81) * 12600
+ + ($gb2 - 0x30) * 1260
+ + ($gb3 - 0x81) * 10
+ + ($gb4 - 0x30); # linear index over the four bytes
+ !defined($gb_map_4[$gb_code])
+ or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
+ $gb_map_4[$gb_code] = $utf32;
+ ++$gb_map_4_count;
+ $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);
+
+ !defined($uni_map[$utf32]) or die "Double Unicode mapping";
+ $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4; # reverse mapping stores the four bytes packed into one number
+ ++$uni_map_count;
+ }
+ elsif (/<a /) # any other <a> element is not understood
+ {
+ die "Bad format";
+ }
+ elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
+ { # contiguous range of four-byte codes
+ $utf32_first = oct("0x" . $1);
+ $utf32_last = oct("0x" . $2);
+ $gb1_first = oct("0x" . $3);
+ $gb2_first = oct("0x" . $4);
+ $gb3_first = oct("0x" . $5);
+ $gb4_first = oct("0x" . $6);
+ $gb1_last = oct("0x" . $7);
+ $gb2_last = oct("0x" . $8);
+ $gb3_last = oct("0x" . $9);
+ $gb4_last = oct("0x" . $10);
+ $linear_first
+ = ($gb1_first - 0x81) * 12600
+ + ($gb2_first - 0x30) * 1260
+ + ($gb3_first - 0x81) * 10
+ + ($gb4_first - 0x30);
+ $linear_last
+ = ($gb1_last - 0x81) * 12600
+ + ($gb2_last - 0x30) * 1260
+ + ($gb3_last - 0x81) * 10
+ + ($gb4_last - 0x30);
+ ($utf32_last - $utf32_first == $linear_last - $linear_first)
+ or die "Bad range"; # both ends of the range must span the same number of codes
+ if ($linear_first != 189000 || $linear_last != 1237575) # the range 189000..1237575 is not recorded; NOTE(review): presumably the one for U+10000..U+10FFFF - confirm against the XML
+ {
+ $range_uni_first[$range_count] = $utf32_first;
+ $range_uni_last[$range_count]
+ = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last); # a range ending at U+D7FF is extended to U+DFFF
+ $range_linear_first[$range_count] = $linear_first;
+ $range_linear_last[$range_count] = $linear_last;
+ ++$range_count;
+ $gb_map_4_ranges += $linear_last - $linear_first + 1;
+ $gb_map_4_max = $linear_last
+ if ($linear_last > $gb_map_4_max);
+ }
+ }
+ elsif (/<range /) # any other <range> element is not understood
+ {
+ die "Bad format";
+ }
+ }
+ close IN;
+}
+
+# Report the counters gathered while parsing, then verify them: the number
+# of two-byte mappings, the contiguity of the four-byte linear codes, and
+# the coverage of the Unicode code points by mappings and ranges.
+print "gb_map_2_count = ", $gb_map_2_count;
+print ", gb_map_4_count = ", $gb_map_4_count;
+print ", gb_map_4_ranges = ", $gb_map_4_ranges;
+print ", gb_map_4_max = ", $gb_map_4_max;
+print ", uni_map_count = ", $uni_map_count, "\n";
+die "Bad gb_map_2_count != 23940"
+    unless $gb_map_2_count == 23940;
+die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges"
+    unless $gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1;
+die "Bad uni_map_count"
+    unless $uni_map_count + $gb_map_4_ranges
+           == 0x10000 - (0xE000 - 0xD800) - 0x80;
+
+# Append the individually listed four-byte mappings to @gb_map_2, in order
+# of their linear code, directly after the two-byte mappings.  A linear code
+# without a mapping must be the start of the next recorded range; the range
+# is skipped and @gb_nonrangedataindex notes the data index at which the
+# mappings following it continue.
+$range_index = 0;
+$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
+# $gb_map_4_max is the highest linear code that occurs, so the loop has to
+# include it; otherwise a final individually listed mapping would be lost.
+for ($gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
+{
+    if (defined($gb_map_4[$gb_code]))
+    {
+        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
+    }
+    else
+    {
+        ($gb_code == $range_linear_first[$range_index]) or die "Bad input";
+        # Continue after the range; the loop increment steps past its end.
+        $gb_code = $range_linear_last[$range_index];
+        ++$range_index;
+        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
+    }
+}
+($range_index == $range_count) or die "Bad input";
+
+$filename = lc($id) . ".dat"; # output file is named after the lower-cased table identifier
+open OUT, ("> " . $filename) or die "Cannot write " . $filename; # create or truncate it for writing
+
+# Copy the leading comment header of the script named lc($id) . ".pl" into
+# the output as a C block comment: a "#!" line is dropped, the first "#*..."
+# line opens the comment as "/*...", any later "#*..." line closes it by
+# replacing its last character with "/", and every other "#" line becomes
+# " *" followed by its text.  Copying stops at the first line that does not
+# start with "#".
+{
+    $filename = lc($id). ".pl";
+    open IN, $filename or die "Cannot read ". $filename;
+    $first = 1;
+    while (<IN>)
+    {
+        next if /^\#!.*$/;
+        if (/^\#(\*.*)$/)
+        {
+            if ($first == 1)
+            {
+                print OUT "/", $1, "\n";
+                $first = 0;
+            }
+            else
+            {
+                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
+            }
+        }
+        elsif (/^\# (.*)$/)
+        {
+            print OUT " *", $1, "\n";
+        }
+        elsif (/^\#(.*)$/)
+        {
+            print OUT " *", $1, "\n";
+        }
+        else
+        {
+            last;
+        }
+    }
+    # Nothing else is read from the script, so release the handle here.
+    close IN;
+}
+
+# Emit the guarded includes, then the GB 18030 to Unicode data table: all
+# entries of @gb_map_2 up to $gb_map_2_count, eight per output line.
+print OUT "\n";
+print OUT "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n";
+print OUT "#include \"convertgb18030.h\"\n";
+print OUT "#endif\n";
+print OUT "\n";
+print OUT "#ifndef _SAL_TYPES_H_\n";
+print OUT "#include \"sal/types.h\"\n";
+print OUT "#endif\n";
+print OUT "\n";
+
+print OUT "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n ";
+for ($gb_code = 0; $gb_code < $gb_map_2_count; ++$gb_code)
+{
+    printf OUT "0x%04X,", $gb_map_2[$gb_code];
+    # Start a new line after every eighth entry, except after the last one.
+    print OUT "\n "
+        if $gb_code % 8 == 7 && $gb_code != $gb_map_2_count - 1;
+}
+print OUT "\n};\n\n";
+
+# Emit one entry per recorded range: the data index at which the mappings
+# preceding the range begin, the first linear code of the range, the linear
+# code one past its end, and its first Unicode code point.  A { -1, 0, 0, 0 }
+# entry terminates the table.
+print OUT "static ImplGb180302000ToUnicodeRange const\n aImpl";
+print OUT $id;
+print OUT "ToUnicodeRanges[] = {\n";
+for ($range_index = 0; $range_index < $range_count; ++$range_index)
+{
+    my $linear_end = $range_linear_last[$range_index] + 1;
+    printf OUT " { %d, %d, %d, 0x%04X },\n",
+               $gb_nonrangedataindex[$range_index],
+               $range_linear_first[$range_index],
+               $linear_end,
+               $range_uni_first[$range_index];
+}
+print OUT " { -1, 0, 0, 0 }\n};\n\n";
+
+print OUT "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n ";
+$index = 0; # number of entries written so far
+$range_index = 0;
+$uni_nonrangedataindex[$range_index] = $index;
+for ($utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32) # U+0000..U+007F are not part of this table
+{
+ if (defined($uni_map[$utf32]))
+ {
+ if ($index > 0 && ($index - 1) % 6 == 5) # start a new output line before every sixth entry
+ {
+ print OUT "\n ";
+ }
+ $bytes = $uni_map[$utf32];
+ printf OUT ($bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,"), $bytes; # two-byte codes as 4 hex digits, four-byte codes as 8
+ ++$index;
+ }
+ else # an unmapped code point must be the start of the next recorded range
+ {
+ ($utf32 == $range_uni_first[$range_index]) or die "Bad input";
+ $utf32 = $range_uni_last[$range_index]; # continue after the range; the loop increment steps past its end
+ ++$range_index;
+ $uni_nonrangedataindex[$range_index] = $index; # data index at which the mappings after this range begin
+ }
+}
+($range_index == $range_count) or die "Bad input"; # every recorded range must have been encountered
+print OUT "\n};\n\n";
+
+# Emit one entry per recorded range: the data index at which the mappings
+# preceding the range begin, the first and last Unicode code point of the
+# range, and its first linear code.  Then finish the output file.
+print OUT "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo";
+print OUT $id;
+print OUT "Ranges[] = {\n";
+for ($range_index = 0; $range_index < $range_count; ++$range_index)
+{
+    my @entry = ($uni_nonrangedataindex[$range_index],
+                 $range_uni_first[$range_index],
+                 $range_uni_last[$range_index],
+                 $range_linear_first[$range_index]);
+    printf OUT " { %d, 0x%04X, 0x%04X, %d },\n", @entry;
+}
+print OUT "};\n";
+
+close OUT;