From 4b592ce754e578a347490341caecc1bc45f67242 Mon Sep 17 00:00:00 2001 From: David Tardon Date: Sun, 15 Jul 2012 11:38:07 +0200 Subject: move ulfconv to l10ntools This allows us to drop dependency on setup_native everywhere. Change-Id: Ib033f8d5953682379c6c2ab53d5cf221e9d8cfec --- l10ntools/Executable_ulfconv.mk | 21 ++ l10ntools/Module_l10ntools.mk | 2 + l10ntools/Package_ulfconv.mk | 14 + l10ntools/source/ulfconv/msi-encodinglist.txt | 152 +++++++++++ l10ntools/source/ulfconv/ulfconv.cxx | 361 ++++++++++++++++++++++++++ 5 files changed, 550 insertions(+) create mode 100644 l10ntools/Executable_ulfconv.mk create mode 100644 l10ntools/Package_ulfconv.mk create mode 100644 l10ntools/source/ulfconv/msi-encodinglist.txt create mode 100644 l10ntools/source/ulfconv/ulfconv.cxx (limited to 'l10ntools') diff --git a/l10ntools/Executable_ulfconv.mk b/l10ntools/Executable_ulfconv.mk new file mode 100644 index 000000000000..8ed5bb0e5f2c --- /dev/null +++ b/l10ntools/Executable_ulfconv.mk @@ -0,0 +1,21 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_Executable_Executable,ulfconv)) + +$(eval $(call gb_Executable_use_libraries,ulfconv,\ + sal \ + $(gb_STDLIBS) \ +)) + +$(eval $(call gb_Executable_add_exception_objects,ulfconv,\ + l10ntools/source/ulfconv/ulfconv \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/l10ntools/Module_l10ntools.mk b/l10ntools/Module_l10ntools.mk index c149db078c77..e3c11344a0c9 100644 --- a/l10ntools/Module_l10ntools.mk +++ b/l10ntools/Module_l10ntools.mk @@ -29,6 +29,7 @@ $(eval $(call gb_Module_Module,l10ntools)) $(eval $(call gb_Module_add_targets,l10ntools,\ Executable_helpex \ Executable_idxdict \ + Executable_ulfconv \ Executable_ulfex \ Executable_gsicheck \ Executable_cfgex \ @@ -41,6 +42,7 @@ $(eval $(call gb_Module_add_targets,l10ntools,\ Library_helplinker \ Package_inc \ Package_scripts \ + Package_ulfconv \ )) ifneq ($(SOLAR_JAVA),) diff --git a/l10ntools/Package_ulfconv.mk b/l10ntools/Package_ulfconv.mk new file mode 100644 index 000000000000..41337b26cb6e --- /dev/null +++ b/l10ntools/Package_ulfconv.mk @@ -0,0 +1,14 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_Package_Package,l10ntools_ulfconv,$(SRCDIR)/l10ntools/source/ulfconv)) + +$(eval $(call gb_Package_add_file,l10ntools_ulfconv,bin/msi-encodinglist.txt,msi-encodinglist.txt)) + +# vim: set noet sw=4 ts=4: diff --git a/l10ntools/source/ulfconv/msi-encodinglist.txt b/l10ntools/source/ulfconv/msi-encodinglist.txt new file mode 100644 index 000000000000..1fd3cb96e70d --- /dev/null +++ b/l10ntools/source/ulfconv/msi-encodinglist.txt @@ -0,0 +1,152 @@ +# Syntax: language ANSI-Codepage LCID +# comment lines begin with hash +af 1252 1078 # Afrikaans +am 0 1118 # Amharic +ar 1256 1025 +ar-SA 1256 1025 +as 0 1101 # Assamese +as-IN 0 1101 # Assamese +ast 1252 1610 +be 1251 1059 # Belarusian +be-BY 1251 1059 +bg 1251 1026 # Bulgarian +bn 0 2117 # Bengali +bn-BD 0 2117 # Bengali Bangladesh +bn-IN 0 1093 # Bengali India +bo 0 2121 +br 1252 1150 # Breton +brx 0 1603 # Bodo (India) +bs 0 5146 # bosnian +ca 1252 1027 # Catalan +ca-XV 1252 32771 # Catalan Valencian +cs 1250 1029 # Czech +cy 1252 1106 # Welsh +da 1252 1030 +de 1252 1031 +dgo 0 1604 # Dogri (India) +dz 0 2129 # Dzongkha (same ID as tibetan bhutan (s. i40713)) +el 1253 1032 +en-GB 1252 2057 +en-US 1252 1033 +en-ZA 1252 7177 +eo 0 1553 # Esperanto +es 1252 1034 +et 1257 1061 +eu 1252 1069 # Basque +fa 0 1065 # Farsi +fi 1252 1035 +fo 1252 1080 # Faroese +fr 1252 1036 +fr-CA 1252 3084 +fur 0 1585 +ga 0 2108 # Irish +gd 0 1084 # Gaelic (Scotland) +gl 1252 1110 # Galician +gu 0 1095 # Gujarati +gu-IN 0 1095 # Gujarati +he 1255 1037 +hi 0 1081 +hr 1250 1050 # Croatian +ht 1252 1626 # Haitian +hu 1250 1038 +hy 0 1067 # Armenian +id 1252 1057 # Indonesian +is 1252 1039 # Icelandic +it 1252 1040 +ja 932 1041 +jbo 0 1624 +ka 0 1079 # Georgian +kab 0 1625 +kk 0 1087 +km 0 1107 # Khmer +kn 0 1099 # Kannada +kn-IN 0 1099 # Kannada +ko 949 1042 +kok 0 1111 # Konkani +ks 0 1120 # Kashmiri +ku 0 1574 +ky 0 1088 # Kyrgyz +ky-CN 0 1640 # Kyrgyz (China) +lb 1252 1134 +lo 0 1108 # Lao +lt 1257 1063 # Lithuanian +lv 1257 1062 # Latvian +mai 0 1605 # Maithili (India) +mk 1251 1071 # Macedonian +ml 0 1100 +ml-IN 0 1100 +mn 0 1104 # Mongolian +mni 0 1112 # Manipuri +mn-TR 0 2128 # Mongolian Classical/traditional +mr 0 1102 # Marathi +mr-IN 0 1102 +ms 0 1086 # Malay (Malaysian) +mt 0 1082 # Maltese +my 0 1109 # Burmese +nb 1252 1044 +ne 0 1121 # Nepali +nl 1252 1043 +nn 1252 2068 +no 1252 1044 +nr 0 1580 # Ndebele South +nso 0 1132 +ny 0 1598 +oc 1252 1154 # Occitan-lengadocian +om 0 2162 +or 0 1096 # Oriya +or-IN 0 1096 +pa-IN 0 1094 # Punjabi +pap 0 2171 +pl 1250 1045 +ps 0 2171 +pt 1252 2070 +pt-BR 1252 1046 +pt-PT 1252 2070 +qtz 1252 1033 # key id pseudo language +rm 0 1047 # Raeto-Romance +ro 1250 1048 # Romanian +ru 1251 1049 +rw 0 1569 # Kinyarwanda +sa-IN 0 1103 # Sanskrit +sat 0 1606 # Santali +sb 0 1070 # Sorbian +sc 0 3047 +sd 0 1113 # Sindhi +sh 1250 2074 # Serbian Latin +si 0 2133 +sk 1250 1051 # Slovak +sl 1250 1060 # Slovenian +sq 1250 1052 # Albanian +sr 1251 3098 # Serbian Cyrillic +sr-SP 1251 3098 # Serbian Cyrillic +ss 0 1579 # Swazi +st 0 1072 # Southern Sotho, Sutu +sv 1252 1053 +sw 1252 1089 # Swahili +sw-TZ 1252 1089 # Swahili +so 0 1143 +ta 0 1097 # Tamil +ta-IN 0 1097 # Tamil +te 0 1098 +te-IN 0 1098 +tg 0 1064 # Tajik +th 874 1054 +ti 0 1139 # Tigrinya +ti-ER 0 1139 # Tigrinya +tn 0 1074 # Setsuana +tr 1254 1055 # Turkish +ts 0 1073 # Tsonga +tk 0 1090 +tt 1251 1092 # Tatar +ug 0 1152 +uk 1251 1058 # Ukrainian +ur 1256 1056 # Urdu +ur-IN 0 2080 +uz 0 1091 # Uzbek (Latin) +ve 0 1075 # Venda +vi 1258 1066 # Vietnamese +xh 0 1076 # Xhosa +yi 0 1085 # Yiddish +zh-CN 936 2052 +zh-TW 950 1028 +zu 0 1077 # Zulu diff --git a/l10ntools/source/ulfconv/ulfconv.cxx b/l10ntools/source/ulfconv/ulfconv.cxx new file mode 100644 index 000000000000..1643b330d776 --- /dev/null +++ b/l10ntools/source/ulfconv/ulfconv.cxx @@ -0,0 +1,361 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * + * for a copy of the LGPLv3 License. + * + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +/***************************************************************************** + * typedefs + *****************************************************************************/ + +typedef std::map< const std::string, rtl_TextEncoding > EncodingMap; + +struct _pair { + const char *key; + rtl_TextEncoding value; +}; + +static int _pair_compare (const char *key, const _pair *pair); +static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member ); + + +const _pair _ms_encoding_list[] = { + { "0", RTL_TEXTENCODING_UTF8 }, + { "1250", RTL_TEXTENCODING_MS_1250 }, + { "1251", RTL_TEXTENCODING_MS_1251 }, + { "1252", RTL_TEXTENCODING_MS_1252 }, + { "1253", RTL_TEXTENCODING_MS_1253 }, + { "1254", RTL_TEXTENCODING_MS_1254 }, + { "1255", RTL_TEXTENCODING_MS_1255 }, + { "1256", RTL_TEXTENCODING_MS_1256 }, + { "1257", RTL_TEXTENCODING_MS_1257 }, + { "1258", RTL_TEXTENCODING_MS_1258 }, + { "874", RTL_TEXTENCODING_MS_874 }, + { "932", RTL_TEXTENCODING_MS_932 }, + { "936", RTL_TEXTENCODING_MS_936 }, + { "949", RTL_TEXTENCODING_MS_949 }, + { "950", RTL_TEXTENCODING_MS_950 } +}; + + +/***************************************************************************** + * fgets that work with unix line ends on Windows + *****************************************************************************/ + +char * my_fgets(char *s, int n, FILE *fp) +{ + int i; + for( i=0; i < n-1; i++ ) + { + int c = getc(fp); + + if( c == EOF ) + break; + + s[i] = (char) c; + + if( s[i] == '\n' ) + { + i++; + break; + } + } + + if( i>0 ) + { + s[i] = '\0'; + return s; + } + else + { + return NULL; + } +} + +/***************************************************************************** + * compare function for binary search + *****************************************************************************/ + +static int +_pair_compare (const char *key, const _pair *pair) +{ + int result = rtl_str_compareIgnoreAsciiCase( key, pair->key ); + return result; +} + +/***************************************************************************** + * binary search on encoding tables + *****************************************************************************/ + +static const _pair* +_pair_search (const char *key, const _pair *base, unsigned int member ) +{ + unsigned int lower = 0; + unsigned int upper = member; + unsigned int current; + int comparison; + + /* check for validity of input */ + if ( (key == NULL) || (base == NULL) || (member == 0) ) + return NULL; + + /* binary search */ + while ( lower < upper ) + { + current = (lower + upper) / 2; + comparison = _pair_compare( key, base + current ); + if (comparison < 0) + upper = current; + else + if (comparison > 0) + lower = current + 1; + else + return base + current; + } + + return NULL; +} + + +/************************************************************************ + * read_encoding_table + ************************************************************************/ + +void read_encoding_table(char * file, EncodingMap& aEncodingMap) +{ + FILE * fp = fopen(file, "r"); + if ( ! fp ) { + fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno)); + exit(2); + } + + char buffer[512]; + while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) { + + // strip comment lines + if ( buffer[0] == '#' ) + continue; + + // find end of language string + char * cp; + for ( cp = buffer; ! isspace(*cp); cp++ ) + ; + *cp = '\0'; + + // find start of codepage string + for ( ++cp; isspace(*cp); ++cp ) + ; + char * codepage = cp; + + // find end of codepage string + for ( ++cp; ! isspace(*cp); ++cp ) + ; + *cp = '\0'; + + // find the correct mapping for codepage + const unsigned int members = SAL_N_ELEMENTS( _ms_encoding_list ); + const _pair *encoding = _pair_search( codepage, _ms_encoding_list, members ); + + if ( encoding != NULL ) { + const std::string language(buffer); + aEncodingMap.insert( EncodingMap::value_type(language, encoding->value) ); + } + } + + fclose(fp); +} + +/************************************************************************ + * print_legacy_mixed + ************************************************************************/ + +void print_legacy_mixed( + FILE * ostream, + const rtl::OUString& aString, + const std::string& language, + EncodingMap& aEncodingMap) +{ + EncodingMap::iterator iter = aEncodingMap.find(language); + + if ( iter != aEncodingMap.end() ) { + fputs(OUStringToOString(aString, iter->second).getStr(), ostream); + } else { + fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str()); + } +} + +/************************************************************************ + * print_java_style + ************************************************************************/ + +void print_java_style(FILE * ostream, const rtl::OUString& aString) +{ + int imax = aString.getLength(); + for (int i = 0; i < imax; i++) { + sal_Unicode uc = aString[i]; + if ( uc < 128 ) { + fprintf(ostream, "%c", (char) uc); + } else { + fprintf(ostream, "\\u%2.2x%2.2x", uc >> 8, uc & 0xFF ); + } + } +} + +/************************************************************************ + * main + ************************************************************************/ + +int main( int argc, char * const argv[] ) +{ + EncodingMap aEncodingMap; + + FILE *istream = stdin; + FILE *ostream = stdout; + + char *outfile = NULL; + + int errflg = 0; + int argi; + + for( argi=1; argi < argc; argi++ ) + { + if( argv[argi][0] == '-' && argv[argi][2] == '\0' ) + { + switch(argv[argi][1]) { + case 'o': + if (argi+1 >= argc || argv[argi+1][0] == '-') + { + fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]); + errflg++; + break; + } + + ++argi; + outfile = argv[argi]; + break; + case 't': + if (argi+1 >= argc || argv[argi+1][0] == '-') + { + fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]); + errflg++; + break; + } + + read_encoding_table(argv[++argi], aEncodingMap); + break; + default: + fprintf(stderr, "Unrecognized option: -%c\n", argv[argi][1]); + errflg++; + } + } + else + { + break; + } + } + + if (errflg) { + fprintf(stderr, "Usage: ulfconv [-o ] [-t ] []\n"); + exit(2); + } + + /* assign input file to stdin */ + if ( argi < argc ) + { + istream = fopen(argv[argi], "r"); + if ( istream == NULL ) { + fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno)); + exit(2); + } + } + + /* open output file if any */ + if ( outfile ) + { + ostream = fopen(outfile, "w"); + if ( ostream == NULL ) { + fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno)); + fclose(istream); + exit(2); + } + } + + /* read line by line from stdin */ + char buffer[65536]; + while ( NULL != fgets(buffer, sizeof(buffer), istream) ) { + + /* only handle lines containing " = " */ + char * cp = strstr(buffer, " = \""); + if ( cp ) { + rtl::OUString aString; + + /* find end of lang string */ + int n; + for ( n=0; ! isspace(buffer[n]); n++ ) + ; + + std::string line = buffer; + std::string lang(line, 0, n); + + cp += 4; + rtl_string2UString( &aString.pData, cp, strrchr(cp, '\"') - cp, + RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS ); + + fprintf(ostream, "%s = \"", lang.c_str()); + + if ( aEncodingMap.empty() ) { + print_java_style(ostream, aString); + } else { + print_legacy_mixed(ostream, aString, lang, aEncodingMap); + } + + fprintf(ostream, "\"\n"); + + + } else { + fputs(buffer, ostream); + } + } + + fclose(ostream); + fclose(istream); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ -- cgit