diff options
author | Stephan Bergmann <sb@openoffice.org> | 2001-10-12 08:44:51 +0000 |
---|---|---|
committer | Stephan Bergmann <sb@openoffice.org> | 2001-10-12 08:44:51 +0000 |
commit | e8366e3816b50dbe4e1f9b697abece8813e11b00 (patch) | |
tree | ed20bc56ce29ba0fbfe57b4a25a646a50401c6b1 | |
parent | ba4f155598069228bd52b0ab2d12744c231d60dc (diff) |
#87140#
-rw-r--r-- | sal/textenc/generate/cns116431992.pl | 786 | ||||
-rw-r--r-- | sal/textenc/generate/gb180302000.pl | 343 |
2 files changed, 1129 insertions, 0 deletions
diff --git a/sal/textenc/generate/cns116431992.pl b/sal/textenc/generate/cns116431992.pl new file mode 100644 index 000000000000..4542e08101e4 --- /dev/null +++ b/sal/textenc/generate/cns116431992.pl @@ -0,0 +1,786 @@
#!/usr/bin/perl
#*************************************************************************
#
# $RCSfile: cns116431992.pl,v $
#
# $Revision: 1.1 $
#
# last change: $Author: sb $ $Date: 2001-10-12 09:44:51 $
#
# The Contents of this file are made available subject to the terms of
# either of the following licenses
#
# - GNU Lesser General Public License Version 2.1
# - Sun Industry Standards Source License Version 1.1
#
# Sun Microsystems Inc., October, 2000
#
# GNU Lesser General Public License Version 2.1
# =============================================
# Copyright 2000 by Sun Microsystems, Inc.
# 901 San Antonio Road, Palo Alto, CA 94303, USA
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 2.1, as published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307 USA
#
#
# Sun Industry Standards Source License Version 1.1
# =================================================
# The contents of this file are subject to the Sun Industry Standards
# Source License Version 1.1 (the "License"); You may not use this file
# except in compliance with the License. You may obtain a copy of the
# License at http://www.openoffice.org/license.html.
#
# Software provided under this License is provided on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
# See the License for the specific provisions governing your rights and
# obligations concerning the Software.
#
# The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
# Copyright: 2000 by Sun Microsystems, Inc.
#
# All Rights Reserved.
#
# Contributor(s): _______________________________________
#
#
#*************************************************************************

# NOTE: the comment block above must stay the first thing in this file and
# must be followed by a non-comment line.  The script reads its own source
# (see writeLicenseHeader) and copies every leading "#" line into the
# generated .dat file as a C comment.

use strict;
use warnings;

# The following files must be available in a ./input subdir:

# <http://www.unicode.org/Public/UNIDATA/Unihan.txt>:
# "Unicode version: 3.1.1 Table version: 1.1 Date: 28 June 2001"
# contains descriptions for:
# U+3400..4DFF CJK Unified Ideographs Extension A
# U+4E00..9FFF CJK Unified Ideographs
# U+F900..FAFF CJK Compatibility Ideographs
# U+20000..2F7FF CJK Unified Ideographs Extension B
# U+2F800..2FFFF CJK Compatibility Ideographs Supplement

# <http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/CNS11643.TXT>:
# "Unicode version: 1.1 Table version: 0.0d1 Date: 21 October 1994"
# contains mappings for CNS 11643-1986

# <http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/Uni2CNS.Z>:
# "Unicode version: 1.1 Table version: 0.49 Date: 26 March 1998"
# contains mappings for CNS 11643-1992 that are incompatible with
# CNS11643.TXT

# Identifier used for the output file name (lower-cased, plus ".dat"), for
# this script's own file name (lower-cased, plus ".pl") and inside the names
# of the generated C arrays.
my $id = "Cns116431992";

# $cns_map[plane][row][column] holds the UTF-32 code point a CNS character
# maps to; $cns_plane_used[plane] is defined once a plane has any mapping.
my @cns_map;
my @cns_plane_used;

# Returns true if the argument lies in 0..0x10FFFF and is neither a
# surrogate (U+D800..DFFF), nor in U+FDD0..FDEF, nor one of the last two
# code points (xxFFFE, xxFFFF) of a plane.
sub isValidUtf32
{
    my $utf32 = $_[0];
    return $utf32 >= 0 && $utf32 <= 0x10FFFF
        && !($utf32 >= 0xD800 && $utf32 <= 0xDFFF)
        && !($utf32 >= 0xFDD0 && $utf32 <= 0xFDEF)
        && ($utf32 & 0xFFFF) < 0xFFFE;
}

# Formats a code point as "U+XXXX" (at least four hex digits).
sub printUtf32
{
    my $utf32 = $_[0];
    return sprintf("U+%04X", $utf32);
}

# Returns true if plane is in 1..16 and row and column are both in 1..94.
sub isValidCns116431992
{
    my ($plane, $row, $column) = @_;
    return $plane >= 1 && $plane <= 16
        && $row >= 1 && $row <= 94
        && $column >= 1 && $column <= 94;
}

# Formats a CNS position as "plane-row/column" (row and column two digits).
sub printCns116431992
{
    my ($plane, $row, $column) = @_;
    return sprintf("%d-%02d/%02d", $plane, $row, $column);
}

# Formats a "used/space bytes (percent%)" statistic.  An empty space is
# reported as 0.0% instead of dividing by zero.
sub printStats
{
    my ($used, $space) = @_;
    return sprintf("%d/%d bytes (%.1f%%)",
                   $used,
                   $space,
                   $space ? $used * 100 / $space : 0);
}

# Opens the named file from the ./input subdirectory for reading and
# returns the file handle; dies if the file cannot be opened.
sub openInput
{
    my ($name) = @_;
    open(my $in, '<', "input/" . $name)
        or die "Cannot read " . $name . ": " . $!;
    return $in;
}

# Converts the hexadecimal text fields of one input record into numbers and
# validates them.  Row and column are given as bytes with an offset of 0x20.
# Returns (utf32, plane, row, column); dies on an invalid code point or an
# invalid CNS position.
sub decodeCnsRecord
{
    my ($hex_utf32, $hex_plane, $hex_row, $hex_column) = @_;
    my $utf32 = hex($hex_utf32);
    my $plane = hex($hex_plane);
    my $row = hex($hex_row) - 0x20;
    my $column = hex($hex_column) - 0x20;
    # printUtf32 already emits the "U+" prefix.
    isValidUtf32($utf32)
        or die "Bad UTF32 char " . printUtf32($utf32);
    isValidCns116431992($plane, $row, $column)
        or die "Bad CNS11643-1992 char "
               . printCns116431992($plane, $row, $column);
    return ($utf32, $plane, $row, $column);
}

# Records a CNS -> Unicode mapping unless the CNS position is already taken.
# Returns 1 if the mapping was newly stored, 0 if the identical mapping was
# already present, and -1 if the position already maps to a different code
# point (the existing mapping is kept in that case).
sub storeCnsMapping
{
    my ($utf32, $plane, $row, $column) = @_;
    my $known = $cns_map[$plane][$row][$column];
    if (!defined($known))
    {
        $cns_map[$plane][$row][$column] = $utf32;
        $cns_plane_used[$plane] = 1;
        return 1;
    }
    return $known == $utf32 ? 0 : -1;
}

# Builds the message for a conflict reported by storeCnsMapping: names the
# CNS position, the code point it already maps to, and the rejected one.
sub describeCnsConflict
{
    my ($utf32, $plane, $row, $column) = @_;
    return "Mapping "
           . printCns116431992($plane, $row, $column)
           . " to "
           . printUtf32($cns_map[$plane][$row][$column])
           . ", NOT "
           . printUtf32($utf32);
}

# True for plane 3 positions from row 66, column 39 onwards: the
# "Fictitious Character Set Extensions" (Lunde, p. 131).
sub isFictitiousCns
{
    my ($plane, $row, $column) = @_;
    return $plane == 3
        && ($row == 66 && $column > 38 || $row > 66);
}

# Packs a CNS position into one integer (plane << 16 | row << 8 | column).
sub packCns
{
    my ($plane, $row, $column) = @_;
    return ($plane << 16) | ($row << 8) | $column;
}

# Inverse of packCns: returns (plane, row, column).
sub unpackCns
{
    my ($cns) = @_;
    return ($cns >> 16, ($cns >> 8) & 0xFF, $cns & 0xFF);
}

# Copies the leading comment block of the given script into $out as a C
# comment: the "#!" line is dropped, the first "#*..." line opens the
# comment, any later "#*..." line closes it, and other "#" lines become
# " *" lines.  Copying stops at the first line that is not a "#" line.
sub writeLicenseHeader
{
    my ($out, $script) = @_;
    open(my $in, '<', $script)
        or die "Cannot read " . $script . ": " . $!;
    my $first = 1;
    while (<$in>)
    {
        if (/^\#!.*$/)
        {
            next;
        }
        elsif (/^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                print $out "/", $1, "\n";
                $first = 0;
            }
            else
            {
                print $out " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif (/^\# (.*)$/)
        {
            print $out " *", $1, "\n";
        }
        elsif (/^\#(.*)$/)
        {
            print $out " *", $1, "\n";
        }
        else
        {
            last;
        }
    }
    close($in);
}

my $count_Unihan_txt = 0;
my $count_CNS11643_TXT = 0;
my $count_Uni2CNS = 0;

# Unihan.txt: kCNS1992 entries must never contradict an earlier mapping;
# kIRG_TSource entries that do are reported and ignored.
if (1)
{
    my $in = openInput("Unihan.txt");
    while (<$in>)
    {
        if (/^U\+([0-9A-F]+)\tkCNS1992\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])$/)
        {
            my @record = decodeCnsRecord($1, $2, $3, $4);
            my $status = storeCnsMapping(@record);
            ++$count_Unihan_txt if $status > 0;
            die describeCnsConflict(@record) if $status < 0;
        }
        elsif (/^U\+([0-9A-F]+)\tkIRG_TSource\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])$/)
        {
            my @record = decodeCnsRecord($1, $2, $3, $4);
            my $status = storeCnsMapping(@record);
            ++$count_Unihan_txt if $status > 0;
            print "WARNING! ", describeCnsConflict(@record), "\n"
                if $status < 0;
        }
        elsif (/^U\+([0-9A-F]+)\tkCNS1992\t.*$/)
        {
            die "Bad format";
        }
    }
    close($in);
}

# CNS11643.TXT: every record is validated, but only planes 1 and 2 are
# taken from this file.
if (1)
{
    my $in = openInput("CNS11643.TXT");
    while (<$in>)
    {
        if (/0x([0-9A-F])([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])\t0x([0-9A-F]+)\t\#.*$/)
        {
            my @record = decodeCnsRecord($4, $1, $2, $3);
            if ($record[1] <= 2)
            {
                my $status = storeCnsMapping(@record);
                ++$count_CNS11643_TXT if $status > 0;
                die describeCnsConflict(@record) if $status < 0;
            }
        }
    }
    close($in);
}

# Uni2CNS: disabled.  When enabled, conflicting mappings are silently
# ignored and every plane 1 position found in the file is printed.
if (0)
{
    my $in = openInput("Uni2CNS");
    while (<$in>)
    {
        if (/([0-9A-F]+)\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])\t.*$/)
        {
            my ($utf32, $plane, $row, $column)
                = decodeCnsRecord($1, $2, $3, $4);
            ++$count_Uni2CNS
                if storeCnsMapping($utf32, $plane, $row, $column) > 0;
            if ($plane == 1)
            {
                print printCns116431992($plane, $row, $column), "\n";
            }
        }
    }
    close($in);
}

# Reverse mapping: $uni_map[plane][page][index] holds a packed CNS position
# (see packCns); the *_used arrays mark planes and pages with any mapping.
my @uni_map;
my @uni_plane_used;
my @uni_page_used;

for my $plane (1 .. 16)
{
    next if !defined($cns_plane_used[$plane]);
    for my $row (1 .. 94)
    {
        for my $column (1 .. 94)
        {
            my $utf32 = $cns_map[$plane][$row][$column];
            next if !defined($utf32);
            my $uni_plane = $utf32 >> 16;
            my $uni_page = ($utf32 >> 8) & 0xFF;
            my $uni_index = $utf32 & 0xFF;
            if (!defined($uni_plane_used[$uni_plane])
                || !defined($uni_page_used[$uni_plane][$uni_page])
                || !defined($uni_map[$uni_plane][$uni_page][$uni_index]))
            {
                $uni_map[$uni_plane][$uni_page][$uni_index]
                    = packCns($plane, $row, $column);
                $uni_plane_used[$uni_plane] = 1;
                $uni_page_used[$uni_plane][$uni_page] = 1;
                next;
            }

            # The code point already maps back to another CNS position.
            my @old = unpackCns($uni_map[$uni_plane][$uni_page][$uni_index]);

            # Do not map from Unicode to Fictitious Character Set
            # Extensions (Lunde, p. 131), if possible:
            if (isFictitiousCns($plane, $row, $column))
            {
                print " (",
                      printUtf32($utf32),
                      " to fictious ",
                      printCns116431992($plane, $row, $column),
                      " ignored, favouring ",
                      printCns116431992(@old),
                      ")\n";
            }
            elsif (isFictitiousCns(@old))
            {
                $uni_map[$uni_plane][$uni_page][$uni_index]
                    = packCns($plane, $row, $column);
                print " (",
                      printUtf32($utf32),
                      " to fictious ",
                      printCns116431992(@old),
                      " ignored, favouring ",
                      printCns116431992($plane, $row, $column),
                      ")\n";
            }
            else
            {
                print "WARNING! Mapping ",
                      printUtf32($utf32),
                      " to ",
                      printCns116431992(@old),
                      ", NOT ",
                      printCns116431992($plane, $row, $column),
                      "\n";
            }
        }
    }
}

# No code point in U+0000..007F may map to a CNS character.  The lookup is
# indexed by the loop variable itself (it used to test a stale index left
# over from the loop above, so the check did not inspect each code point).
if (defined($uni_plane_used[0]) && defined($uni_page_used[0][0]))
{
    for my $utf32 (0 .. 0x7F)
    {
        my $cns = $uni_map[0][0][$utf32];
        if (defined($cns))
        {
            die "Mapping "
                . printUtf32($utf32)
                . " to "
                . printCns116431992(unpackCns($cns));
        }
    }
}

my $dat_name = lc($id) . ".dat";
open(my $out, '>', $dat_name)
    or die "Cannot write " . $dat_name . ": " . $!;

writeLicenseHeader($out, lc($id) . ".pl");

print $out "\n",
           "#ifndef _SAL_TYPES_H_\n",
           "#include \"sal/types.h\"\n",
           "#endif\n",
           "\n";

# CNS -> Unicode data: one run of 94 UTF-16 units per used row (0xffff for
# unmapped columns).  A row containing supplementary-plane characters gets
# the high surrogates in that run and a second run of 94 units holding the
# low surrogates (0 elsewhere).
print $out "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n";
my $cns_data_offset = 0;
my @cns_data_offsets;   # [plane][row]: index of the row's run, or -1
my @cns_data_space;     # [plane][row]: bytes occupied by the row's run(s)
my @cns_data_used;      # [plane][row]: bytes of those carrying real data
for my $plane (1 .. 16)
{
    next if !defined($cns_plane_used[$plane]);
    my $rows = 0;
    my $chars = 0;
    for my $row (1 .. 94)
    {
        my $row_used
            = grep { defined($cns_map[$plane][$row][$_]) } 1 .. 94;
        if (!$row_used)
        {
            print $out " /* plane ", $plane, ", row ", $row,
                       ": --- */\n";
            $cns_data_offsets[$plane][$row] = -1;
            next;
        }

        ++$rows;
        print $out " /* plane ", $plane, ", row ", $row,
                   " */\n ";
        my $chars_in_row = 0;
        my $surrogates_in_row = 0;
        for my $column (1 .. 94)
        {
            my $utf32 = $cns_map[$plane][$row][$column];
            if (!defined($utf32))
            {
                print $out "0xffff,";
            }
            else
            {
                ++$chars_in_row;
                if ($utf32 <= 0xFFFF)
                {
                    printf $out "0x%04X,", $utf32;
                }
                else
                {
                    printf $out "0x%04X,",
                                (0xD800 | (($utf32 - 0x10000) >> 10));
                    ++$surrogates_in_row;
                }
            }
            print $out "\n " if $column % 10 == 9;
        }
        print $out "\n";
        $cns_data_offsets[$plane][$row] = $cns_data_offset++;

        if ($surrogates_in_row > 0)
        {
            print $out " ";
            for my $column (1 .. 94)
            {
                my $utf32 = $cns_map[$plane][$row][$column];
                if (!defined($utf32) || $utf32 <= 0xFFFF)
                {
                    print $out " 0,";
                }
                else
                {
                    printf $out "0x%04X,",
                                (0xDC00 | (($utf32 - 0x10000) & 0x3FF));
                }
                print $out "\n " if $column % 10 == 9;
            }
            print $out "\n";
            ++$cns_data_offset;
        }

        $chars += $chars_in_row;
        $cns_data_space[$plane][$row]
            = ($surrogates_in_row == 0 ? 94 : 2 * 94) * 2;
        $cns_data_used[$plane][$row]
            = ($chars_in_row + $surrogates_in_row) * 2;
    }
    print "cns plane ",
          $plane,
          ": rows = ",
          $rows,
          ", chars = ",
          $chars,
          "\n";
}
print $out "};\n\n";

# Per used plane: 94 entries giving each row's offset into the data array
# (as "run index * 94"), or -1 for an unused row.
print $out "static sal_Int32 const aImpl", $id, "ToUnicodeRowOffsets[] = {\n";
my @cns_rowoffsets_used;   # [plane]: bytes of row offsets that are not -1
for my $plane (1 .. 16)
{
    if (!defined($cns_plane_used[$plane]))
    {
        print $out " /* plane ", $plane, ": --- */\n";
        next;
    }
    $cns_rowoffsets_used[$plane] = 0;
    for my $row (1 .. 94)
    {
        if ($cns_data_offsets[$plane][$row] == -1)
        {
            print $out " -1, /* plane ",
                       $plane,
                       ", row ",
                       $row,
                       " */\n";
        }
        else
        {
            print $out " ",
                       $cns_data_offsets[$plane][$row],
                       " * 94, /* plane ",
                       $plane,
                       ", row ",
                       $row,
                       "; ",
                       printStats($cns_data_used[$plane][$row],
                                  $cns_data_space[$plane][$row]),
                       " */\n";
            $cns_rowoffsets_used[$plane] += 4;
        }
    }
}
print $out "};\n\n";

# One entry per plane 1..16: offset of the plane's block of row offsets,
# or -1 for an unused plane.
print $out "static sal_Int32 const aImpl",
           $id,
           "ToUnicodePlaneOffsets[] = {\n";
my $cns_row_offset = 0;
for my $plane (1 .. 16)
{
    if (defined($cns_plane_used[$plane]))
    {
        print $out " ",
                   $cns_row_offset++,
                   " * 94, /* plane ",
                   $plane,
                   "; ",
                   printStats($cns_rowoffsets_used[$plane], 94 * 4),
                   " */\n";
    }
    else
    {
        print $out " -1, /* plane ", $plane, " */\n";
    }
}
print $out "};\n\n";

# Unicode -> CNS data: 256 entries per used page.  Each mapped entry is the
# plane number followed by row and column, each offset by 0xA0; unmapped
# entries are 0.
print $out "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n";
my $uni_data_offset = 0;
my @uni_data_used;      # [plane][page]: bytes carrying real data
my @uni_data_offsets;   # [plane][page]: index of the page's run, or -1
for my $uni_plane (0 .. 16)
{
    if (!defined($uni_plane_used[$uni_plane]))
    {
        print $out " /* plane ", $uni_plane, ": --- */\n";
        next;
    }
    for my $uni_page (0 .. 255)
    {
        if (!defined($uni_page_used[$uni_plane][$uni_page]))
        {
            print $out " /* plane ", $uni_plane, ", page ", $uni_page,
                       ": --- */\n";
            $uni_data_offsets[$uni_plane][$uni_page] = -1;
            next;
        }
        $uni_data_used[$uni_plane][$uni_page] = 0;
        print $out " /* plane ", $uni_plane, ", page ", $uni_page,
                   " */\n ";
        for my $uni_index (0 .. 255)
        {
            my $cns = $uni_map[$uni_plane][$uni_page][$uni_index];
            if (defined($cns))
            {
                printf $out "0x%02X%02X%02X,",
                            $cns >> 16,
                            0xA0 + (($cns >> 8) & 0xFF),
                            0xA0 + ($cns & 0xFF);
                $uni_data_used[$uni_plane][$uni_page] += 4;
            }
            else
            {
                print $out " 0,";
            }
            print $out "\n "
                if $uni_index % 8 == 7 && $uni_index != 255;
        }
        print $out "\n";
        $uni_data_offsets[$uni_plane][$uni_page] = $uni_data_offset++;
    }
}
print $out "};\n\n";

# Per used Unicode plane: 256 entries giving each page's offset into the
# data array (as "run index * 256"), or -1 for an unused page.
print $out "static sal_Int32 const aImplUnicodeTo", $id, "PageOffsets[] = {\n";
my @uni_pageoffsets_used;
my @uni_data_used_sum;
my @uni_data_space_sum;
for my $uni_plane (0 .. 16)
{
    if (!defined($uni_plane_used[$uni_plane]))
    {
        print $out " /* plane ", $uni_plane, ": --- */\n";
        next;
    }
    $uni_pageoffsets_used[$uni_plane] = 0;
    $uni_data_used_sum[$uni_plane] = 0;
    $uni_data_space_sum[$uni_plane] = 0;
    for my $uni_page (0 .. 255)
    {
        my $offset = $uni_data_offsets[$uni_plane][$uni_page];
        if ($offset == -1)
        {
            print $out " -1, /* plane ",
                       $uni_plane,
                       ", page ",
                       $uni_page,
                       " */\n";
        }
        else
        {
            print $out " ",
                       $offset,
                       " * 256, /* plane ",
                       $uni_plane,
                       ", page ",
                       $uni_page,
                       "; ",
                       printStats($uni_data_used[$uni_plane][$uni_page],
                                  256 * 4),
                       " */\n";
            $uni_pageoffsets_used[$uni_plane] += 4;
            $uni_data_used_sum[$uni_plane]
                += $uni_data_used[$uni_plane][$uni_page];
            $uni_data_space_sum[$uni_plane] += 256 * 4;
        }
    }
}
print $out "};\n\n";

# One entry per Unicode plane 0..16: offset of the plane's block of page
# offsets, or -1 for an unused plane; followed by overall statistics.
print $out "static sal_Int32 const aImplUnicodeTo",
           $id,
           "PlaneOffsets[] = {\n";
my $uni_page_offset = 0;
my $uni_planeoffsets_used = 0;
my $uni_pageoffsets_used_sum = 0;
my $uni_pageoffsets_space_sum = 0;
my $uni_data_used_sum2 = 0;
my $uni_data_space_sum2 = 0;
for my $uni_plane (0 .. 16)
{
    if (defined($uni_plane_used[$uni_plane]))
    {
        print $out " ",
                   $uni_page_offset++,
                   " * 256, /* plane ",
                   $uni_plane,
                   "; ",
                   printStats($uni_pageoffsets_used[$uni_plane], 256 * 4),
                   ", ",
                   printStats($uni_data_used_sum[$uni_plane],
                              $uni_data_space_sum[$uni_plane]),
                   " */\n";
        $uni_planeoffsets_used += 4;
        $uni_pageoffsets_used_sum += $uni_pageoffsets_used[$uni_plane];
        $uni_pageoffsets_space_sum += 256 * 4;
        $uni_data_used_sum2 += $uni_data_used_sum[$uni_plane];
        $uni_data_space_sum2 += $uni_data_space_sum[$uni_plane];
    }
    else
    {
        print $out " -1, /* plane ", $uni_plane, " */\n";
    }
}
print $out " /* ",
           printStats($uni_planeoffsets_used, 17 * 4),
           ", ",
           printStats($uni_pageoffsets_used_sum, $uni_pageoffsets_space_sum),
           ", ",
           printStats($uni_data_used_sum2, $uni_data_space_sum2),
           " */\n};\n";

# Buffered write errors only surface when the handle is closed.
close($out) or die "Cannot write " . $dat_name . ": " . $!;

print "Unihan.txt = ", $count_Unihan_txt,
      ", CNS11643.TXT = ", $count_CNS11643_TXT,
      ", Uni2CNS = ", $count_Uni2CNS,
      ", total = ",
      ($count_Unihan_txt + $count_CNS11643_TXT + $count_Uni2CNS),
      "\n";
diff --git a/sal/textenc/generate/gb180302000.pl b/sal/textenc/generate/gb180302000.pl new file mode 100644 index 000000000000..a7d925c61000 --- /dev/null +++ b/sal/textenc/generate/gb180302000.pl @@ -0,0 +1,343 @@
#!/usr/bin/perl
#*************************************************************************
#
# $RCSfile: gb180302000.pl,v $
#
# $Revision: 1.1 $
#
# last change: $Author: sb $ $Date: 2001-10-12 09:44:51 $
#
# The Contents of this file are made available subject to the terms of
# either of the following licenses
#
# - GNU Lesser General Public License Version 2.1
# - Sun Industry Standards Source License Version 1.1
#
# Sun Microsystems Inc., October, 2000
#
# GNU Lesser General Public License Version 2.1
# =============================================
# Copyright 2000 by Sun Microsystems, Inc.
# 901 San Antonio Road, Palo Alto, CA 94303, USA
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 2.1, as published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307 USA
#
#
# Sun Industry Standards Source License Version 1.1
# =================================================
# The contents of this file are subject to the Sun Industry Standards
# Source License Version 1.1 (the "License"); You may not use this file
# except in compliance with the License. You may obtain a copy of the
# License at http://www.openoffice.org/license.html.
#
# Software provided under this License is provided on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
# See the License for the specific provisions governing your rights and
# obligations concerning the Software.
#
# The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
# Copyright: 2000 by Sun Microsystems, Inc.
#
# All Rights Reserved.
#
# Contributor(s): _______________________________________
#
#
#*************************************************************************

# NOTE: the comment block above must stay the first thing in this file and
# must be followed by a non-comment line.  The script reads its own source
# (see writeLicenseHeader) and copies every leading "#" line into the
# generated .dat file as a C comment.

use strict;
use warnings;

# The following files must be available in a ./input subdir:

# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
# "modified version="3" date="2001-02-21""

# Identifier used for the output file name (lower-cased, plus ".dat"), for
# this script's own file name (lower-cased, plus ".pl") and inside the names
# of the generated C arrays.
my $id = "Gb180302000";

# Formats a code point as "U+XXXX" (at least four hex digits).
sub printUtf32
{
    my $utf32 = $_[0];
    return sprintf("U+%04X", $utf32);
}

# Formats a GB 18030 byte sequence as hex digits.  One, two or four bytes
# are accepted; the form is chosen by which arguments are defined.
sub printGb
{
    if (defined($_[2]))
    {
        return sprintf("%02X%02X%02X%02X", $_[0], $_[1], $_[2], $_[3]);
    }
    elsif (defined($_[1]))
    {
        return sprintf("%02X%02X", $_[0], $_[1]);
    }
    else
    {
        return sprintf("%02X", $_[0]);
    }
}

# Linearises a four-byte sequence: bytes 1 and 3 count from 0x81 (126
# values each), bytes 2 and 4 count from 0x30 (10 values each).
sub linearGb4
{
    my ($gb1, $gb2, $gb3, $gb4) = @_;
    return ($gb1 - 0x81) * 12600
           + ($gb2 - 0x30) * 1260
           + ($gb3 - 0x81) * 10
           + ($gb4 - 0x30);
}

# Copies the leading comment block of the given script into $out as a C
# comment: the "#!" line is dropped, the first "#*..." line opens the
# comment, any later "#*..." line closes it, and other "#" lines become
# " *" lines.  Copying stops at the first line that is not a "#" line.
sub writeLicenseHeader
{
    my ($out, $script) = @_;
    open(my $in, '<', $script)
        or die "Cannot read " . $script . ": " . $!;
    my $first = 1;
    while (<$in>)
    {
        if (/^\#!.*$/)
        {
            next;
        }
        elsif (/^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                print $out "/", $1, "\n";
                $first = 0;
            }
            else
            {
                print $out " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif (/^\# (.*)$/)
        {
            print $out " *", $1, "\n";
        }
        elsif (/^\#(.*)$/)
        {
            print $out " *", $1, "\n";
        }
        else
        {
            last;
        }
    }
    close($in);
}

my @gb_map_2;   # linear two-byte code -> code point (four-byte data is
                # appended to it once the input has been read)
my @gb_map_4;   # linear four-byte code -> code point (single mappings only)
my @uni_map;    # code point -> GB byte sequence packed into one integer

# Parallel arrays describing the four-byte ranges given in the input.
my @range_uni_first;
my @range_uni_last;
my @range_linear_first;
my @range_linear_last;

my $gb_map_2_count = 0;
my $gb_map_4_count = 0;
my $gb_map_4_ranges = 0;   # number of four-byte codes covered by ranges
my $gb_map_4_max = 0;      # largest linear four-byte code seen
my $uni_map_count = 0;

my $range_count = 0;

if (1)
{
    my $filename = "gb-18030-2000.xml";
    open(my $in, '<', "input/" . $filename)
        or die "Cannot read " . $filename . ": " . $!;
    while (<$in>)
    {
        if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single bytes must map to the code point of the same value.
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            my $gb2 = hex($3);
            # 190 second bytes per first byte: those up to 0x7E count from
            # 0x40 (0..62), the others count from 0x80 (63 onwards).
            my $gb_code = ($gb1 - 0x81) * 190
                + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            !defined($gb_map_2[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            my $gb2 = hex($3);
            my $gb3 = hex($4);
            my $gb4 = hex($5);
            my $gb_code = linearGb4($gb1, $gb2, $gb3, $gb4);
            !defined($gb_map_4[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        elsif (/<a /)
        {
            die "Bad format";
        }
        elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            my ($utf32_first, $utf32_last, @bytes)
                = map { hex($_) } ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10);
            my $linear_first = linearGb4(@bytes[0 .. 3]);
            my $linear_last = linearGb4(@bytes[4 .. 7]);
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";
            # The range with exactly these linear bounds is not recorded.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is extended to U+DFFF, so the
                # Unicode-side walk below also skips U+D800..DFFF.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif (/<range /)
        {
            die "Bad format";
        }
    }
    close($in);
}

print "gb_map_2_count = ", $gb_map_2_count,
      ", gb_map_4_count = ", $gb_map_4_count,
      ", gb_map_4_ranges = ", $gb_map_4_ranges,
      ", gb_map_4_max = ", $gb_map_4_max,
      ", uni_map_count = ", $uni_map_count, "\n";
($gb_map_2_count == 23940) or die "Bad gb_map_2_count != 23940";
($gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1)
    or die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges";
($uni_map_count + $gb_map_4_ranges == 0x10000 - (0xE000 - 0xD800) - 0x80)
    or die "Bad uni_map_count";

# Append the individually mapped four-byte codes to the two-byte table,
# skipping over the ranges and remembering for each range the table index
# at which the data following it continues.  $gb_map_4_max is itself a
# valid code, so the bound is inclusive: with "<" a final single mapping
# would be dropped.  Nothing changes when the last code ends a range.
my @gb_nonrangedataindex;
my $range_index = 0;
$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
for (my $gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
{
    if (defined($gb_map_4[$gb_code]))
    {
        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
    }
    else
    {
        ($gb_code == $range_linear_first[$range_index]) or die "Bad input";
        $gb_code = $range_linear_last[$range_index];
        ++$range_index;
        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
    }
}
($range_index == $range_count) or die "Bad input";

my $dat_name = lc($id) . ".dat";
open(my $out, '>', $dat_name)
    or die "Cannot write " . $dat_name . ": " . $!;

writeLicenseHeader($out, lc($id) . ".pl");

print $out "\n",
           "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n",
           "#include \"convertgb18030.h\"\n",
           "#endif\n",
           "\n",
           "#ifndef _SAL_TYPES_H_\n",
           "#include \"sal/types.h\"\n",
           "#endif\n",
           "\n";

print $out "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n ";
for my $gb_code (0 .. $gb_map_2_count - 1)
{
    printf $out "0x%04X,", $gb_map_2[$gb_code];
    print $out "\n "
        if $gb_code % 8 == 7 && $gb_code != $gb_map_2_count - 1;
}
print $out "\n};\n\n";

# One entry per range: data index at the range, first linear code, last
# linear code plus one, first code point; terminated by a -1 entry.
print $out "static ImplGb180302000ToUnicodeRange const\n aImpl",
           $id,
           "ToUnicodeRanges[] = {\n";
for my $index (0 .. $range_count - 1)
{
    printf $out " { %d, %d, %d, 0x%04X },\n",
                $gb_nonrangedataindex[$index],
                $range_linear_first[$index],
                $range_linear_last[$index] + 1,
                $range_uni_first[$index];
}
print $out " { -1, 0, 0, 0 }\n};\n\n";

# Unicode -> GB data for U+0080..FFFF, skipping the ranges: every code
# point that has no individual mapping must start the next range.
print $out "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n ";
my @uni_nonrangedataindex;
my $data_index = 0;
$range_index = 0;
$uni_nonrangedataindex[$range_index] = $data_index;
for (my $utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
{
    if (defined($uni_map[$utf32]))
    {
        if ($data_index > 0 && ($data_index - 1) % 6 == 5)
        {
            print $out "\n ";
        }
        my $bytes = $uni_map[$utf32];
        # The format is chosen first so that printf never sees a
        # parenthesised first argument.
        my $format = $bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,";
        printf $out $format, $bytes;
        ++$data_index;
    }
    else
    {
        ($utf32 == $range_uni_first[$range_index]) or die "Bad input";
        $utf32 = $range_uni_last[$range_index];
        ++$range_index;
        $uni_nonrangedataindex[$range_index] = $data_index;
    }
}
($range_index == $range_count) or die "Bad input";
print $out "\n};\n\n";

# One entry per range: data index at the range, first and last code point,
# first linear code.
print $out "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo",
           $id,
           "Ranges[] = {\n";
for my $index (0 .. $range_count - 1)
{
    printf $out " { %d, 0x%04X, 0x%04X, %d },\n",
                $uni_nonrangedataindex[$index],
                $range_uni_first[$index],
                $range_uni_last[$index],
                $range_linear_first[$index];
}
print $out "};\n";

# Buffered write errors only surface when the handle is closed.
close($out) or die "Cannot write " . $dat_name . ": " . $!;