#87140#

author: Stephan Bergmann <sb@openoffice.org> 2001-10-12 08:44:51 +0000
committer: Stephan Bergmann <sb@openoffice.org> 2001-10-12 08:44:51 +0000
commit: e8366e3816b50dbe4e1f9b697abece8813e11b00 (patch)
tree: ed20bc56ce29ba0fbfe57b4a25a646a50401c6b1
parent: ba4f155598069228bd52b0ab2d12744c231d60dc (diff)
2 files changed, 1129 insertions, 0 deletions
diff --git a/sal/textenc/generate/cns116431992.pl b/sal/textenc/generate/cns116431992.pl
new file mode 100644
index 000000000000..4542e08101e4
--- /dev/null
+++ b/sal/textenc/generate/cns116431992.pl
@@ -0,0 +1,786 @@
+#!/usr/bin/perl
+#*************************************************************************
+#
+#   $RCSfile: cns116431992.pl,v $
+#
+#   $Revision: 1.1 $
+#
+#   last change: $Author: sb $ $Date: 2001-10-12 09:44:51 $
+#
+#   The Contents of this file are made available subject to the terms of
+#   either of the following licenses
+#
+#          - GNU Lesser General Public License Version 2.1
+#          - Sun Industry Standards Source License Version 1.1
+#
+#   Sun Microsystems Inc., October, 2000
+#
+#   GNU Lesser General Public License Version 2.1
+#   =============================================
+#   Copyright 2000 by Sun Microsystems, Inc.
+#   901 San Antonio Road, Palo Alto, CA 94303, USA
+#
+#   This library is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU Lesser General Public
+#   License version 2.1, as published by the Free Software Foundation.
+#
+#   This library is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   Lesser General Public License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public
+#   License along with this library; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+#   MA  02111-1307  USA
+#
+#
+#   Sun Industry Standards Source License Version 1.1
+#   =================================================
+#   The contents of this file are subject to the Sun Industry Standards
+#   Source License Version 1.1 (the "License"); You may not use this file
+#   except in compliance with the License. You may obtain a copy of the
+#   License at http://www.openoffice.org/license.html.
+#
+#   Software provided under this License is provided on an "AS IS" basis,
+#   WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+#   WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
+#   MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
+#   See the License for the specific provisions governing your rights and
+#   obligations concerning the Software.
+#
+#   The Initial Developer of the Original Code is: Sun Microsystems, Inc.
+#
+#   Copyright: 2000 by Sun Microsystems, Inc.
+#
+#   All Rights Reserved.
+#
+#   Contributor(s): _______________________________________
+#
+#
+#*************************************************************************
+
+# The following files must be available in a ./input subdir:
+
+# <http://www.unicode.org/Public/UNIDATA/Unihan.txt>:
+#  "Unicode version: 3.1.1    Table version: 1.1    Date: 28 June 2001"
+#  contains descriptions for:
+#   U+3400..4DFF CJK Unified Ideographs Extension A
+#   U+4E00..9FFF CJK Unified Ideographs
+#   U+F900..FAFF CJK Compatibility Ideographs
+#   U+20000..2F7FF CJK Unified Ideographs Extension B
+#   U+2F800..2FFFF CJK Compatibility Ideographs Supplement
+
+# <http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/CNS11643.TXT>:
+#  "Unicode version: 1.1    Table version: 0.0d1    Date: 21 October 1994"
+#  contains mappings for CNS 11643-1986
+
+# <http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/Uni2CNS.Z>:
+#  "Unicode version: 1.1    Table version: 0.49    Date: 26 March 1998"
+#  contains mappings for CNS 11643-1992 that are incompatible with
+#   CNS11643.TXT
+
+$id = "Cns116431992";  # base name: lc($id) names the .dat/.pl files, $id is embedded in the C table names
+
+sub isValidUtf32
+{
+    # In range, not a surrogate, and not a noncharacter (FDD0..FDEF, xxFFFE/F).
+    my ($code) = @_;
+    return !($code < 0 || $code > 0x10FFFF)
+           && ($code < 0xD800 || $code > 0xDFFF)
+           && ($code < 0xFDD0 || $code > 0xFDEF) && ($code & 0xFFFE) != 0xFFFE;
+}
+
+sub printUtf32
+{
+    # Format a code point as "U+" plus at least four upper-case hex digits.
+    return "U+" . sprintf("%04X", shift(@_));
+}
+
+sub isValidCns116431992
+{
+    # A CNS 11643-1992 position is accepted when the plane lies in 1..16
+    # and both the row and the column lie in 1..94.
+    my ($plane, $row, $column) = @_;
+    return !($plane < 1 || $plane > 16)
+           && !($row < 1 || $row > 94)
+           && !($column < 1 || $column > 94);
+}
+
+sub printCns116431992
+{
+    # Render a position as "plane-row/column": the plane as a plain decimal
+    # number, the row and the column zero-padded to two decimal digits.
+    # Only the first three arguments are used.
+    return sprintf("%d-%02d/%02d", @_[0 .. 2]);
+}
+
+sub printStats
+{
+    # Describe how much of a table is occupied, as "USED/SPACE bytes (P%)",
+    # where P is USED as a percentage of SPACE with one decimal place.
+    # SPACE must be non-zero, since it is used as a divisor.
+    my ($used, $space) = @_;
+    my $percent = $used * 100 / $space;
+    return sprintf("%d/%d bytes (%.1f%%)", $used, $space, $percent);
+}
+
+$count_Unihan_txt = 0;     # mappings first defined by Unihan.txt
+$count_CNS11643_TXT = 0;   # mappings first defined by CNS11643.TXT
+$count_Uni2CNS = 0;        # mappings first defined by Uni2CNS
+
+if (1) # Pass 1: read the kCNS1992 and kIRG_TSource fields of Unihan.txt
+{      # into $cns_map[plane][row][column] = UTF-32 code point.
+    $filename = "Unihan.txt";
+    open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+    while (<IN>)
+    {
+        if (/^U\+([0-9A-F]+)\tkCNS1992\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])$/)
+        {
+            $utf32 = oct("0x" . $1);
+            $cns_plane = oct("0x" . $2);
+            $cns_row = oct("0x" . $3) - 0x20;     # byte 0x21..0x7E -> row 1..94
+            $cns_column = oct("0x" . $4) - 0x20;  # byte 0x21..0x7E -> column 1..94
+            isValidUtf32($utf32)
+                or die "Bad UTF32 char " . printUtf32($utf32);  # printUtf32 supplies the "U+"
+            isValidCns116431992($cns_plane, $cns_row, $cns_column)
+                or die "Bad CNS11643-1992 char "
+                           . printCns116431992($cns_plane,
+                                               $cns_row,
+                                               $cns_column);
+            if (!defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+            {
+                $cns_map[$cns_plane][$cns_row][$cns_column] = $utf32;
+                $cns_plane_used[$cns_plane] = 1;
+                ++$count_Unihan_txt;
+            }
+            else
+            {   # already mapped: a different kCNS1992 target is fatal
+                ($cns_map[$cns_plane][$cns_row][$cns_column] == $utf32)
+                    or die "Mapping "
+                               . printCns116431992($cns_plane,
+                                                   $cns_row,
+                                                   $cns_column)
+                               . " to "
+                               . printUtf32($cns_map[$cns_plane]
+                                                    [$cns_row]
+                                                    [$cns_column])
+                               . ", NOT "
+                               . printUtf32($utf32);
+            }
+        }
+        elsif (/^U\+([0-9A-F]+)\tkIRG_TSource\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])$/)
+        {   # kIRG_TSource: same handling, except that a conflict only warns
+            $utf32 = oct("0x" . $1);
+            $cns_plane = oct("0x" . $2);
+            $cns_row = oct("0x" . $3) - 0x20;
+            $cns_column = oct("0x" . $4) - 0x20;
+            isValidUtf32($utf32)
+                or die "Bad UTF32 char " . printUtf32($utf32);  # printUtf32 supplies the "U+"
+            isValidCns116431992($cns_plane, $cns_row, $cns_column)
+                or die "Bad CNS11643-1992 char "
+                           . printCns116431992($cns_plane,
+                                               $cns_row,
+                                               $cns_column);
+            if (!defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+            {
+                $cns_map[$cns_plane][$cns_row][$cns_column] = $utf32;
+                $cns_plane_used[$cns_plane] = 1;
+                ++$count_Unihan_txt;
+            }
+            else
+            {   # the existing mapping is kept; a mismatch is reported on stdout
+                ($cns_map[$cns_plane][$cns_row][$cns_column] == $utf32)
+                    or print "WARNING!  Mapping ",
+                             printCns116431992($cns_plane,
+                                               $cns_row,
+                                               $cns_column),
+                             " to ",
+                             printUtf32($cns_map[$cns_plane]
+                                                [$cns_row]
+                                                [$cns_column]),
+                             ", NOT ",
+                             printUtf32($utf32),
+                             "\n";
+            }
+        }
+        elsif (/^U\+([0-9A-F]+)\tkCNS1992\t.*$/)
+        {   # a kCNS1992 line that did not match the first pattern is malformed
+            die "Bad format";
+        }
+    }
+    close IN;
+}
+
+if (1) # Pass 2: add mappings from CNS11643.TXT, but only for planes 1 and 2.
+{
+    $filename = "CNS11643.TXT";
+    open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+    while (<IN>)
+    {
+        if (/0x([0-9A-F])([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])\t0x([0-9A-F]+)\t\#.*$/)
+        {
+            $utf32 = oct("0x" . $4);
+            $cns_plane = oct("0x" . $1);
+            $cns_row = oct("0x" . $2) - 0x20;     # byte 0x21..0x7E -> row 1..94
+            $cns_column = oct("0x" . $3) - 0x20;  # byte 0x21..0x7E -> column 1..94
+            isValidUtf32($utf32)
+                or die "Bad UTF32 char " . printUtf32($utf32);  # printUtf32 supplies the "U+"
+            isValidCns116431992($cns_plane, $cns_row, $cns_column)
+                or die "Bad CNS11643-1992 char "
+                           . printCns116431992($cns_plane,
+                                               $cns_row,
+                                               $cns_column);
+            if ($cns_plane <= 2)  # entries for higher planes are validated but not stored
+            {
+                if (!defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+                {
+                    $cns_map[$cns_plane][$cns_row][$cns_column] = $utf32;
+                    $cns_plane_used[$cns_plane] = 1;
+                    ++$count_CNS11643_TXT;
+                }
+                else
+                {   # already mapped by an earlier pass: a different target is fatal
+                    ($cns_map[$cns_plane][$cns_row][$cns_column] == $utf32)
+                        or die "Mapping "
+                                   . printCns116431992($cns_plane,
+                                                       $cns_row,
+                                                       $cns_column)
+                                   . " to "
+                                   . printUtf32($cns_map[$cns_plane]
+                                                        [$cns_row]
+                                                        [$cns_column])
+                                   . ", NOT "
+                                   . printUtf32($utf32);
+                }
+            }
+        }
+    }
+    close IN;
+}
+
+if (0) # Pass 3 (disabled by the constant condition): add mappings from Uni2CNS.
+{
+    $filename = "Uni2CNS";
+    open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+    while (<IN>)
+    {
+        if (/([0-9A-F]+)\t([0-9A-F])-([0-9A-F][0-9A-F])([0-9A-F][0-9A-F])\t.*$/)
+        {
+            $utf32 = oct("0x" . $1);
+            $cns_plane = oct("0x" . $2);
+            $cns_row = oct("0x" . $3) - 0x20;     # byte 0x21..0x7E -> row 1..94
+            $cns_column = oct("0x" . $4) - 0x20;  # byte 0x21..0x7E -> column 1..94
+            isValidUtf32($utf32)
+                or die "Bad UTF32 char " . printUtf32($utf32);  # printUtf32 supplies the "U+"
+            isValidCns116431992($cns_plane, $cns_row, $cns_column)
+                or die "Bad CNS11643-1992 char "
+                           . printCns116431992($cns_plane,
+                                               $cns_row,
+                                               $cns_column);
+            if (!defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+            {
+                $cns_map[$cns_plane][$cns_row][$cns_column] = $utf32;
+                $cns_plane_used[$cns_plane] = 1;
+                ++$count_Uni2CNS;
+            }
+            else
+            {   # conflicts with earlier passes are tolerated: the check is commented out
+#               ($cns_map[$cns_plane][$cns_row][$cns_column] == $utf32)
+#                   or die "Mapping "
+#                              . printCns116431992($cns_plane,
+#                                                  $cns_row,
+#                                                  $cns_column)
+#                              . " to "
+#                              . printUtf32($cns_map[$cns_plane]
+#                                                   [$cns_row]
+#                                                   [$cns_column])
+#                              . ", NOT "
+#                              . printUtf32($utf32);
+            }
+            if ($cns_plane == 1)  # list every plane 1 position seen in this file on stdout
+            {
+                print printCns116431992($cns_plane, $cns_row, $cns_column),
+                      "\n";
+            }
+        }
+    }
+    close IN;
+}
+
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane)  # Invert @cns_map into @uni_map (Unicode -> CNS).
+{
+    if (defined($cns_plane_used[$cns_plane]))
+    {
+        for ($cns_row = 1; $cns_row <= 94; ++$cns_row)
+        {
+            for ($cns_column = 1; $cns_column <= 94; ++$cns_column)
+            {
+                if (defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+                {
+                    $utf32 = $cns_map[$cns_plane][$cns_row][$cns_column];
+                    $uni_plane = $utf32 >> 16;          # bits 16 and up
+                    $uni_page = ($utf32 >> 8) & 0xFF;   # bits 8..15
+                    $uni_index = $utf32 & 0xFF;         # bits 0..7
+                    if (!defined($uni_plane_used[$uni_plane])
+                        || !defined($uni_page_used[$uni_plane][$uni_page])
+                        || !defined($uni_map[$uni_plane]
+                                            [$uni_page]
+                                            [$uni_index]))
+                    {   # first CNS position for this code point: store it packed as plane<<16 | row<<8 | column
+                        $uni_map[$uni_plane][$uni_page][$uni_index]
+                            = ($cns_plane << 16)
+                                  | ($cns_row << 8)
+                                  | $cns_column;
+                        $uni_plane_used[$uni_plane] = 1;
+                        $uni_page_used[$uni_plane][$uni_page] = 1;
+                    }
+                    else
+                    {   # code point already has a CNS position: unpack it and decide which one wins
+                        $cns1 = $uni_map[$uni_plane][$uni_page][$uni_index];
+                        $cns1_plane = $cns1 >> 16;
+                        $cns1_row = ($cns1 >> 8) & 0xFF;
+                        $cns1_column = $cns1 & 0xFF;
+
+                        # Do not map from Unicode to Fictitious Character Set
+                        # Extensions (Lunde, p. 131), if possible:
+                        if ($cns_plane == 3
+                            && ($cns_row == 66 && $cns_column > 38
+                                || $cns_row > 66))
+                        {   # the new position is in the plane 3 extension range: keep the stored one
+                            print " (",
+                                  printUtf32($utf32),
+                                  " to fictious ",
+                                  printCns116431992($cns_plane,
+                                                    $cns_row,
+                                                    $cns_column),
+                                  " ignored, favouring ",
+                                  printCns116431992($cns1_plane,
+                                                    $cns1_row,
+                                                    $cns1_column),
+                                  ")\n";
+                        }
+                        elsif ($cns1_plane == 3
+                               && ($cns1_row == 66 && $cns1_column > 38
+                                   || $cns1_row > 66))
+                        {   # the stored position is in the extension range: replace it with the new one
+                            $uni_map[$uni_plane][$uni_page][$uni_index]
+                                = ($cns_plane << 16)
+                                       | ($cns_row << 8)
+                                       | $cns_column;
+                            print " (",
+                                  printUtf32($utf32),
+                                  " to fictious ",
+                                  printCns116431992($cns1_plane,
+                                                    $cns1_row,
+                                                    $cns1_column),
+                                  " ignored, favouring ",
+                                  printCns116431992($cns_plane,
+                                                    $cns_row,
+                                                    $cns_column),
+                                  ")\n";
+                        }
+                        else
+                        {   # neither is in the extension range: keep the stored one and warn on stdout
+                            print "WARNING!  Mapping ",
+                                  printUtf32($utf32),
+                                  " to ",
+                                  printCns116431992($cns1_plane,
+                                                    $cns1_row,
+                                                    $cns1_column),
+                                  ", NOT ",
+                                  printCns116431992($cns_plane,
+                                                    $cns_row,
+                                                    $cns_column),
+                                  "\n";
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+if (defined($uni_plane_used[0]) && defined($uni_page_used[0][0]))
+{   # Sanity check: abort if any CNS position maps to an ASCII code point U+0000..007F.
+    for ($utf32 = 0; $utf32 <= 0x7F; ++$utf32)
+    {
+        if (defined($uni_map[0][0][$utf32]))  # test the slot of the code point being checked
+        {
+            $cns = $uni_map[0][0][$utf32];  # packed as plane<<16 | row<<8 | column
+            die "Mapping "
+                    . printUtf32($utf32)
+                    . " to "
+                    . printCns116431992($cns >> 16,
+                                        ($cns >> 8) & 0xFF,
+                                        $cns & 0xFF);
+        }
+    }
+}
+
+# Open the output file lc($id) . ".dat" for writing; everything printed to
+# OUT below ends up in that file.
+
+$filename = lc($id) . ".dat";
+open OUT, ("> " . $filename) or die "Cannot write " . $filename;
+
+{   # Copy this script's leading "#" comment header into OUT as a C block comment.
+    $filename = lc($id). ".pl";
+    open IN, $filename or die "Cannot read ". $filename;
+    $first = 1;  # 1 until the first "#*..." rule line has been written
+    while (<IN>)
+    {
+        if (/^\#!.*$/)
+        {   # the "#!" interpreter line is not copied
+        }
+        elsif (/^\#(\*.*)$/)
+        {
+            if ($first == 1)
+            {
+                print OUT "/", $1, "\n";  # first rule line: "#" becomes "/", opening the C comment
+                $first = 0;
+            }
+            else
+            {
+                print OUT " ", substr($1, 0, length($1) - 1), "/\n";  # later rule line: last "*" dropped, "/" appended
+            }
+        }
+        elsif (/^\# (.*)$/)
+        {
+            print OUT " *", $1, "\n";  # "# text" becomes " *text"
+        }
+        elsif (/^\#(.*)$/)
+        {
+            print OUT " *", $1, "\n";  # "#text" (including a bare "#") becomes " *text"
+        }
+        else
+        {
+            last;  # the first line that is not a "#" line ends the header
+        }
+    }
+    close IN;
+}
+
+print OUT "\n",
+          "#ifndef _SAL_TYPES_H_\n",
+          "#include \"sal/types.h\"\n",
+          "#endif\n",
+          "\n";
+
+print OUT "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n";
+$cns_data_offset = 0;  # number of 94-entry lines written so far
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane)
+{
+    if (defined($cns_plane_used[$cns_plane]))
+    {
+        $cns_rows = 0;   # rows of this plane with at least one mapping
+        $cns_chars = 0;  # mapped positions in this plane
+        for ($cns_row = 1; $cns_row <= 94; ++$cns_row)
+        {
+            $cns_row_used = 0;
+            for ($cns_column = 1; $cns_column <= 94; ++$cns_column)
+            {
+                if (defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+                {
+                    $cns_row_used = 1;
+                    goto found;  # one mapped column is enough; stop scanning
+                }
+            }
+          found:
+            if ($cns_row_used == 1)
+            {
+                ++$cns_rows;
+                print OUT " /* plane ", $cns_plane, ", row ", $cns_row,
+                          " */\n         ";
+                $chars_in_row = 0;
+                $surrogates_in_row = 0;  # entries above U+FFFF, which need a surrogate pair
+                for ($cns_column = 1; $cns_column <= 94; ++$cns_column)
+                {
+                    if (defined($cns_map[$cns_plane][$cns_row][$cns_column]))
+                    {
+                        $utf32 = $cns_map[$cns_plane][$cns_row][$cns_column];
+                        ++$chars_in_row;
+                        if ($utf32 <= 0xFFFF)
+                        {
+                            printf OUT "0x%04X,", $utf32;
+                        }
+                        else
+                        {   # only the high surrogate goes on this line
+                            printf OUT "0x%04X,",
+                                       (0xD800 | (($utf32 - 0x10000) >> 10));
+                            ++$surrogates_in_row;
+                        }
+                    }
+                    else
+                    {
+                        printf OUT "0xffff,";  # unmapped column
+                    }
+                    if ($cns_column % 10 == 9)  # break the output line after columns 9, 19, ..., 89
+                    {
+                        print OUT "\n  ";
+                    }
+                }
+                print OUT "\n";
+                $cns_data_offsets[$cns_plane][$cns_row] = $cns_data_offset++;
+                if ($surrogates_in_row > 0)
+                {   # second 94-entry line: low surrogates, or 0 where none is needed
+                    print OUT "         ";
+                    for ($cns_column = 1; $cns_column <= 94; ++$cns_column)
+                    {
+                        $utf32 = 0;
+                        if (defined($cns_map[$cns_plane]
+                                            [$cns_row]
+                                            [$cns_column]))
+                        {
+                            $utf32
+                                = $cns_map[$cns_plane][$cns_row][$cns_column];
+                        }
+                        if ($utf32 <= 0xFFFF)
+                        {
+                            printf OUT "     0,";
+                        }
+                        else
+                        {
+                            printf OUT "0x%04X,",
+                                       (0xDC00
+                                            | (($utf32 - 0x10000) & 0x3FF));
+                        }
+                        if ($cns_column % 10 == 9)
+                        {
+                            print OUT "\n  ";
+                        }
+                    }
+                    print OUT "\n";
+                    ++$cns_data_offset;  # the low-surrogate line occupies a slot of its own
+                }
+                $cns_chars += $chars_in_row;
+                $cns_data_space[$cns_plane][$cns_row]
+                    = ($surrogates_in_row == 0 ? 94 : 2 * 94) * 2;  # 2 bytes per entry, one or two lines
+                $cns_data_used[$cns_plane][$cns_row]
+                    = ($chars_in_row + $surrogates_in_row) * 2;
+            }
+            else
+            {
+                print OUT " /* plane ", $cns_plane, ", row ", $cns_row,
+                          ": --- */\n";
+                $cns_data_offsets[$cns_plane][$cns_row] = -1;  # marks a row without data
+            }
+        }
+        print "cns plane ",  # per-plane summary, written to stdout rather than OUT
+              $cns_plane,
+              ": rows = ",
+              $cns_rows,
+              ", chars = ",
+              $cns_chars,
+              "\n";
+    }
+}
+print OUT "};\n\n";
+
+print OUT "static sal_Int32 const aImpl", $id, "ToUnicodeRowOffsets[] = {\n";  # 94 entries per used plane
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane)
+{
+    if (defined ($cns_plane_used[$cns_plane]))
+    {
+        $cns_rowoffsets_used[$cns_plane] = 0;  # bytes taken by this plane's entries that are not -1
+        for ($cns_row = 1; $cns_row <= 94; ++$cns_row)
+        {
+            if ($cns_data_offsets[$cns_plane][$cns_row] == -1)
+            {   # row has no data
+                print OUT "  -1, /* plane ",
+                          $cns_plane,
+                          ", row ",
+                          $cns_row,
+                          " */\n";
+            }
+            else
+            {   # emitted as "<line number> * 94", using the line number recorded in $cns_data_offsets
+                print OUT "  ",
+                          $cns_data_offsets[$cns_plane][$cns_row],
+                          " * 94, /* plane ",
+                          $cns_plane,
+                          ", row ",
+                          $cns_row,
+                          "; ",
+                          printStats($cns_data_used[$cns_plane][$cns_row],
+                                     $cns_data_space[$cns_plane][$cns_row]),
+                          " */\n";
+                $cns_rowoffsets_used[$cns_plane] += 4;
+            }
+        }
+    }
+    else
+    {   # unused plane: only a C comment is written, no array entries
+        print OUT "  /* plane ", $cns_plane, ": --- */\n";
+    }
+}
+print OUT "};\n\n";
+
+print OUT "static sal_Int32 const aImpl",  # one entry for each of the planes 1..16
+          $id,
+          "ToUnicodePlaneOffsets[] = {\n";
+$cns_row_offset = 0;  # counts the used planes seen so far
+for ($cns_plane = 1; $cns_plane <= 16; ++$cns_plane)
+{
+    if (defined ($cns_plane_used[$cns_plane]))
+    {   # emitted as "<used-plane count> * 94"
+        print OUT "  ",
+                  $cns_row_offset++,
+                  " * 94, /* plane ",
+                  $cns_plane,
+                  "; ",
+                  printStats($cns_rowoffsets_used[$cns_plane], 94 * 4),
+                  " */\n";
+    }
+    else
+    {
+        print OUT "  -1, /* plane ", $cns_plane, " */\n";  # unused plane
+    }
+}
+print OUT "};\n\n";
+
+print OUT "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n";  # 256 entries per used Unicode page
+$uni_data_offset = 0;  # number of 256-entry pages written so far
+for ($uni_plane = 0; $uni_plane <= 16; ++$uni_plane)
+{
+    if (defined($uni_plane_used[$uni_plane]))
+    {
+        for ($uni_page = 0; $uni_page <= 255; ++$uni_page)
+        {
+            if (defined($uni_page_used[$uni_plane][$uni_page]))
+            {
+                $uni_data_used[$uni_plane][$uni_page] = 0;  # bytes of this page holding a mapping
+                print OUT " /* plane ", $uni_plane, ", page ", $uni_page,
+                          " */\n  ";
+                for ($uni_index = 0; $uni_index <= 255; ++$uni_index)
+                {
+                    if (defined($uni_map[$uni_plane][$uni_page][$uni_index]))
+                    {   # written as 0xPPRRCC: plane, 0xA0 + row, 0xA0 + column
+                        $cns = $uni_map[$uni_plane][$uni_page][$uni_index];
+                        printf OUT "0x%02X%02X%02X,",
+                                   $cns >> 16,
+                                   0xA0 + ($cns >> 8 & 0xFF),
+                                   0xA0 + ($cns & 0xFF);
+                        $uni_data_used[$uni_plane][$uni_page] += 4;
+                    }
+                    else
+                    {
+                        print OUT "       0,";  # no CNS position for this code point
+                    }
+                    if ($uni_index % 8 == 7 && $uni_index != 255)  # eight entries per output line
+                    {
+                        print OUT "\n  ";
+                    }
+                }
+                print OUT "\n";
+                $uni_data_offsets[$uni_plane][$uni_page] = $uni_data_offset++;
+            }
+            else
+            {
+                print OUT " /* plane ", $uni_plane, ", page ", $uni_page,
+                          ": --- */\n";
+                $uni_data_offsets[$uni_plane][$uni_page] = -1;  # marks a page without data
+            }
+        }
+    }
+    else
+    {   # unused plane: only a C comment is written
+        print OUT " /* plane ", $uni_plane, ": --- */\n";
+    }
+}
+print OUT "};\n\n";
+
+print OUT "static sal_Int32 const aImplUnicodeTo", $id, "PageOffsets[] = {\n";  # 256 entries per used Unicode plane
+for ($uni_plane = 0; $uni_plane <= 16; ++$uni_plane)
+{
+    if (defined($uni_plane_used[$uni_plane]))
+    {
+        $uni_pageoffsets_used[$uni_plane] = 0;  # bytes taken by this plane's entries that are not -1
+        $uni_data_used_sum[$uni_plane] = 0;     # per-plane total of $uni_data_used
+        $uni_data_space_sum[$uni_plane] = 0;    # per-plane total of page space (256 * 4 per used page)
+        for ($uni_page = 0; $uni_page <= 255; ++$uni_page)
+        {
+            $offset = $uni_data_offsets[$uni_plane][$uni_page];
+            if ($offset == -1)
+            {   # page has no data
+                print OUT "  -1, /* plane ",
+                          $uni_plane,
+                          ", page ",
+                          $uni_page,
+                          " */\n";
+            }
+            else
+            {   # emitted as "<page number> * 256", using the number recorded in $uni_data_offsets
+                print OUT "  ",
+                          $offset,
+                          " * 256, /* plane ",
+                          $uni_plane,
+                          ", page ",
+                          $uni_page,
+                          "; ",
+                          printStats($uni_data_used[$uni_plane][$uni_page],
+                                     256 * 4),
+                          " */\n";
+                $uni_pageoffsets_used[$uni_plane] += 4;
+                $uni_data_used_sum[$uni_plane]
+                    += $uni_data_used[$uni_plane][$uni_page];
+                $uni_data_space_sum[$uni_plane] += 256 * 4;
+            }
+        }
+    }
+    else
+    {   # unused plane: only a C comment is written, no array entries
+        print OUT "  /* plane ", $uni_plane, ": --- */\n";
+    }
+}
+print OUT "};\n\n";
+
+print OUT "static sal_Int32 const aImplUnicodeTo",  # one entry for each of the Unicode planes 0..16
+          $id,
+          "PlaneOffsets[] = {\n";
+$uni_page_offset = 0;            # counts the used planes seen so far
+$uni_planeoffsets_used = 0;      # bytes taken by entries of this table that are not -1
+$uni_pageoffsets_used_sum = 0;   # totals over all used planes, for the closing comment
+$uni_pageoffsets_space_sum = 0;
+$uni_data_used_sum2 = 0;
+$uni_data_space_sum2 = 0;
+for ($uni_plane = 0; $uni_plane <= 16; ++$uni_plane)
+{
+    if (defined ($uni_plane_used[$uni_plane]))
+    {   # emitted as "<used-plane count> * 256", followed by this plane's statistics
+        print OUT "  ",
+                  $uni_page_offset++,
+                  " * 256, /* plane ",
+                  $uni_plane,
+                  "; ",
+                  printStats($uni_pageoffsets_used[$uni_plane], 256 * 4),
+                  ", ",
+                  printStats($uni_data_used_sum[$uni_plane],
+                             $uni_data_space_sum[$uni_plane]),
+                  " */\n";
+        $uni_planeoffsets_used += 4;
+        $uni_pageoffsets_used_sum += $uni_pageoffsets_used[$uni_plane];
+        $uni_pageoffsets_space_sum += 256 * 4;
+        $uni_data_used_sum2 += $uni_data_used_sum[$uni_plane];
+        $uni_data_space_sum2 += $uni_data_space_sum[$uni_plane];
+    }
+    else
+    {
+        print OUT "  -1, /* plane ", $uni_plane, " */\n";  # unused plane
+    }
+}
+print OUT " /* ",  # closing comment: overall statistics for the three Unicode-to-CNS tables
+          printStats($uni_planeoffsets_used, 17 * 4),
+          ", ",
+          printStats($uni_pageoffsets_used_sum, $uni_pageoffsets_space_sum),
+          ", ",
+          printStats($uni_data_used_sum2, $uni_data_space_sum2),
+          " */\n};\n";
+
+close OUT or die "Cannot close " . lc($id) . ".dat: " . $!;  # a failed flush is only reported by close
+
+print "Unihan.txt = ", $count_Unihan_txt,  # summary on stdout: mappings contributed by each input file
+      ", CNS11643.TXT = ", $count_CNS11643_TXT,
+      ", Uni2CNS = ", $count_Uni2CNS,
+      ", total = ",
+          ($count_Unihan_txt + $count_CNS11643_TXT + $count_Uni2CNS),
+      "\n";
diff --git a/sal/textenc/generate/gb180302000.pl b/sal/textenc/generate/gb180302000.pl
new file mode 100644
index 000000000000..a7d925c61000
--- /dev/null
+++ b/sal/textenc/generate/gb180302000.pl
@@ -0,0 +1,343 @@
+#!/usr/bin/perl
+#*************************************************************************
+#
+#   $RCSfile: gb180302000.pl,v $
+#
+#   $Revision: 1.1 $
+#
+#   last change: $Author: sb $ $Date: 2001-10-12 09:44:51 $
+#
+#   The Contents of this file are made available subject to the terms of
+#   either of the following licenses
+#
+#          - GNU Lesser General Public License Version 2.1
+#          - Sun Industry Standards Source License Version 1.1
+#
+#   Sun Microsystems Inc., October, 2000
+#
+#   GNU Lesser General Public License Version 2.1
+#   =============================================
+#   Copyright 2000 by Sun Microsystems, Inc.
+#   901 San Antonio Road, Palo Alto, CA 94303, USA
+#
+#   This library is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU Lesser General Public
+#   License version 2.1, as published by the Free Software Foundation.
+#
+#   This library is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   Lesser General Public License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public
+#   License along with this library; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+#   MA  02111-1307  USA
+#
+#
+#   Sun Industry Standards Source License Version 1.1
+#   =================================================
+#   The contents of this file are subject to the Sun Industry Standards
+#   Source License Version 1.1 (the "License"); You may not use this file
+#   except in compliance with the License. You may obtain a copy of the
+#   License at http://www.openoffice.org/license.html.
+#
+#   Software provided under this License is provided on an "AS IS" basis,
+#   WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+#   WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
+#   MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
+#   See the License for the specific provisions governing your rights and
+#   obligations concerning the Software.
+#
+#   The Initial Developer of the Original Code is: Sun Microsystems, Inc.
+#
+#   Copyright: 2000 by Sun Microsystems, Inc.
+#
+#   All Rights Reserved.
+#
+#   Contributor(s): _______________________________________
+#
+#
+#*************************************************************************
+
+# The following files must be available in a ./input subdir:
+
+# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
+# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
+#  "modified version="3" date="2001-02-21""
+
+$id = "Gb180302000";  # infix of the generated C table names; lc($id) gives the .dat/.pl file names
+
+# Returns the code point in $_[0] as "U+" plus at least 4 uppercase hex digits.
+sub printUtf32
+{
+    return sprintf("U+%04X", $_[0]);
+}
+
+# Formats one GB 18030 byte sequence as uppercase hex, two digits per byte.
+# The length is inferred from which arguments are defined: a defined third
+# argument selects the four-byte form, otherwise a defined second argument
+# selects the two-byte form, otherwise only the first byte is printed.
+sub printGb
+{
+    my ($b1, $b2, $b3, $b4) = @_;
+
+    return sprintf("%02X%02X%02X%02X", $b1, $b2, $b3, $b4)
+        if defined($b3);
+
+    return defined($b2)
+               ? sprintf("%02X%02X", $b1, $b2)
+               : sprintf("%02X", $b1);
+}
+
+# Counters filled in while parsing the XML input: number of two-byte and
+# four-byte single mappings, number of four-byte codes covered by recorded
+# ranges, highest four-byte linear code seen, number of Unicode-side single
+# mappings, and number of recorded ranges.
+$gb_map_2_count = $gb_map_4_count = 0;
+$gb_map_4_ranges = $gb_map_4_max = 0;
+$uni_map_count = $range_count = 0;
+
+if (1)  # unconditional block: reads every mapping from input/gb-18030-2000.xml
+{
+    $filename = "gb-18030-2000.xml";
+    open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+    while (<IN>)  # one line at a time; only <a .../> and <range .../> lines are acted on
+    {
+        if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
+        {   # single byte 00..7F: only verified to equal its code point, nothing is stored
+            $utf32 = oct("0x" . $1);
+            $gb1 = oct("0x" . $2);
+            ($utf32 == $gb1)
+                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
+        }
+        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
+        {   # two bytes: the regex accepts lead 80..FF followed by trail 40..FF
+            $utf32 = oct("0x" . $1);
+            $gb1 = oct("0x" . $2);
+            $gb2 = oct("0x" . $3);
+            $gb_code = ($gb1 - 0x81) * 190  # dense index: 190 trail slots per lead byte
+                           + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);  # 40..7E -> 0..62, above -> 63..
+            !defined($gb_map_2[$gb_code])
+                or die "Redefined " . printGb($gb1, $gb2);
+            $gb_map_2[$gb_code] = $utf32;  # GB -> Unicode direction
+            ++$gb_map_2_count;
+            # Unicode -> GB direction: both bytes packed as (lead << 8) | trail.
+            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
+            $uni_map[$utf32] = $gb1 << 8 | $gb2;
+            ++$uni_map_count;
+        }
+        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
+        {   # four bytes: second and fourth byte are restricted to 30..39 by the regex
+            $utf32 = oct("0x" . $1);
+            $gb1 = oct("0x" . $2);
+            $gb2 = oct("0x" . $3);
+            $gb3 = oct("0x" . $4);
+            $gb4 = oct("0x" . $5);
+            $gb_code = ($gb1 - 0x81) * 12600  # linear code: mixed radix, weights 12600/1260/10/1
+                           + ($gb2 - 0x30) * 1260
+                           + ($gb3 - 0x81) * 10
+                           + ($gb4 - 0x30);
+            !defined($gb_map_4[$gb_code])
+                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
+            $gb_map_4[$gb_code] = $utf32;
+            ++$gb_map_4_count;
+            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);  # highest linear code so far
+            # Unicode -> GB direction: the four bytes packed, first byte most significant.
+            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
+            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
+            ++$uni_map_count;
+        }
+        elsif (/<a /)  # any <a ...> line not matched above is rejected
+        {
+            die "Bad format";
+        }
+        elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
+        {   # run of consecutive code points mapped to consecutive four-byte codes
+            $utf32_first = oct("0x" . $1);
+            $utf32_last = oct("0x" . $2);
+            $gb1_first = oct("0x" . $3);
+            $gb2_first = oct("0x" . $4);
+            $gb3_first = oct("0x" . $5);
+            $gb4_first = oct("0x" . $6);
+            $gb1_last = oct("0x" . $7);
+            $gb2_last = oct("0x" . $8);
+            $gb3_last = oct("0x" . $9);
+            $gb4_last = oct("0x" . $10);
+            $linear_first  # same linear code formula as for single four-byte mappings
+                = ($gb1_first - 0x81) * 12600
+                    + ($gb2_first - 0x30) * 1260
+                        + ($gb3_first - 0x81) * 10
+                            + ($gb4_first - 0x30);
+            $linear_last
+                = ($gb1_last - 0x81) * 12600
+                    + ($gb2_last - 0x30) * 1260
+                        + ($gb3_last - 0x81) * 10
+                            + ($gb4_last - 0x30);
+            ($utf32_last - $utf32_first == $linear_last - $linear_first)  # both sides must span the same distance
+                or die "Bad range";
+            if ($linear_first != 189000 || $linear_last != 1237575)  # the range with exactly these ends is not recorded
+            {   # (by the formula above, 189000 is 90 30 81 30 and 1237575 is E3 32 9A 35)
+                $range_uni_first[$range_count] = $utf32_first;
+                $range_uni_last[$range_count]  # a range ending at D7FF is stored as ending at DFFF,
+                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);  # so the later 0x80..0xFFFF scan also jumps D800..DFFF
+                $range_linear_first[$range_count] = $linear_first;
+                $range_linear_last[$range_count] = $linear_last;
+                ++$range_count;
+                $gb_map_4_ranges += $linear_last - $linear_first + 1;  # codes covered by recorded ranges
+                $gb_map_4_max = $linear_last
+                    if ($linear_last > $gb_map_4_max);
+            }
+        }
+        elsif (/<range /)  # any <range ...> line not matched above is rejected
+        {
+            die "Bad format";
+        }
+    }
+    close IN;
+}
+
+# Report the parse counters, then cross-check them before generating output.
+print "gb_map_2_count = ", $gb_map_2_count, ", gb_map_4_count = ", $gb_map_4_count,
+      ", gb_map_4_ranges = ", $gb_map_4_ranges, ", gb_map_4_max = ", $gb_map_4_max,
+      ", uni_map_count = ", $uni_map_count, "\n";
+die "Bad gb_map_2_count != 23940" if ($gb_map_2_count != 23940);  # 126 * 190
+# Single four-byte mappings plus range-covered codes must add up to max + 1.
+die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges"
+    if ($gb_map_4_max != $gb_map_4_count + $gb_map_4_ranges - 1);
+die "Bad uni_map_count"  # expected: 0x80..0xFFFF without 0xD800..0xDFFF
+    if ($uni_map_count + $gb_map_4_ranges != 0x10000 - (0xE000 - 0xD800) - 0x80);
+
+# Append the four-byte single mappings to @gb_map_2 in linear-code order;
+# $gb_nonrangedataindex[$i] is the @gb_map_2 length reached at range $i.
+# The bound is "<=" because $gb_map_4_max is itself a code to be visited.
+$range_index = 0;
+$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
+for ($gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
+{
+    if (defined($gb_map_4[$gb_code]))
+    {
+        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
+        next;
+    }
+    ($gb_code == $range_linear_first[$range_index]) or die "Bad input";
+    $gb_code = $range_linear_last[$range_index];  # jump to the range's end
+    $gb_nonrangedataindex[++$range_index] = $gb_map_2_count;
+}
+($range_index == $range_count) or die "Bad input";
+
+# Create the generated table file (lc($id) . ".dat") in the current directory.
+$filename = lc($id) . ".dat";
+open OUT, ("> " . $filename) or die "Cannot write " . $filename;
+
+# Copy the leading "#" comment block of the generator script (lc($id) . ".pl",
+# read from the current directory) to OUT as a C comment: "#!" lines are
+# dropped, the first "#*..." line opens the comment, later "#*..." lines get
+# "/" in place of their last character, other "#" lines become " *" + text.
+# The first line not starting with "#" ends the copy.
+{
+    $filename = lc($id). ".pl";
+    open IN, $filename or die "Cannot read ". $filename;
+    $first = 1;
+    while (<IN>)
+    {
+        next if (/^\#!.*$/);
+        if (/^\#(\*.*)$/)
+        {
+            if ($first == 1)
+            {
+                print OUT "/", $1, "\n";
+                $first = 0;
+            }
+            else
+            {
+                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
+            }
+        }
+        elsif (/^\# (.*)$/ || /^\#(.*)$/)
+        {
+            print OUT " *", $1, "\n";
+        }
+        else
+        {
+            last;
+        }
+    }
+    close IN;
+}
+
+# Emit the #include prelude of the generated file, each header in its guard.
+print OUT "\n" . "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n"
+          . "#include \"convertgb18030.h\"\n"
+          . "#endif\n"
+          . "\n";
+print OUT "#ifndef _SAL_TYPES_H_\n"
+          . "#include \"sal/types.h\"\n"
+          . "#endif\n"
+          . "\n";
+
+# Emit @gb_map_2 as the GB -> Unicode data table, eight entries per line
+# (no extra line break after the very last entry).
+print OUT "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n  ";
+$gb_code = 0;
+while ($gb_code < $gb_map_2_count)
+{
+    printf OUT "0x%04X,", $gb_map_2[$gb_code];
+    ++$gb_code;
+    print OUT "\n  " if ($gb_code % 8 == 0 && $gb_code != $gb_map_2_count);
+}
+print OUT "\n};\n\n";
+
+# Emit the GB -> Unicode range table: one initializer per recorded range,
+# followed by a terminating entry whose first field is -1.
+print OUT "static ImplGb180302000ToUnicodeRange const\n    aImpl", $id,
+          "ToUnicodeRanges[] = {\n";
+for ($range_index = 0; $range_index < $range_count; ++$range_index)
+{
+    # Fields: data index, first linear code, last linear code + 1, first code point.
+    printf OUT "  { %d, %d, %d, 0x%04X },\n",
+               $gb_nonrangedataindex[$range_index], $range_linear_first[$range_index],
+               $range_linear_last[$range_index] + 1, $range_uni_first[$range_index];
+}
+print OUT "  { -1, 0, 0, 0 }\n};\n\n";
+
+# Emit the Unicode -> GB data table for the code points 0x80..0xFFFF that
+# have a single mapping, six entries per line; values up to 0xFFFF are
+# indented so that they are as wide as the eight-digit ones.
+# $uni_nonrangedataindex[$i] records how many entries precede range $i.
+print OUT "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n  ";
+$index = $range_index = 0;
+$uni_nonrangedataindex[$range_index] = $index;
+for ($utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
+{
+    if (!defined($uni_map[$utf32]))
+    {
+        # An unmapped code point must open the next range; jump to its end.
+        die "Bad input" unless ($utf32 == $range_uni_first[$range_index]);
+        $utf32 = $range_uni_last[$range_index];
+        $uni_nonrangedataindex[++$range_index] = $index;
+        next;
+    }
+
+    print OUT "\n  " if ($index > 0 && $index % 6 == 0);
+    $bytes = $uni_map[$utf32];
+    if ($bytes <= 0xFFFF) { printf OUT "    0x%04X,", $bytes; }
+    else                  { printf OUT "0x%08X,", $bytes; }
+    ++$index;
+}
+die "Bad input" if ($range_index != $range_count);
+print OUT "\n};\n\n";
+
+# Emit the Unicode -> GB range table, one initializer per recorded range:
+# data index, first code point, last code point, first linear code.
+print OUT "static ImplUnicodeToGb180302000Range const\n    aImplUnicodeTo", $id,
+          "Ranges[] = {\n";
+for ($range_index = 0; $range_index < $range_count; ++$range_index)
+{
+    printf OUT "  { %d, 0x%04X, 0x%04X, %d },\n",
+               $uni_nonrangedataindex[$range_index], $range_uni_first[$range_index],
+               $range_uni_last[$range_index], $range_linear_first[$range_index];
+}
+print OUT "};\n";
+
+# Buffered write errors only show up at close, so a failed close is fatal.
+close(OUT) or die "Cannot write " . lc($id) . ".dat: $!";
author	Stephan Bergmann <sb@openoffice.org>	2001-10-12 08:44:51 +0000
committer	Stephan Bergmann <sb@openoffice.org>	2001-10-12 08:44:51 +0000
commit	e8366e3816b50dbe4e1f9b697abece8813e11b00 (patch)
tree	ed20bc56ce29ba0fbfe57b4a25a646a50401c6b1
parent	ba4f155598069228bd52b0ab2d12744c231d60dc (diff)