summaryrefslogtreecommitdiff
path: root/unotools/source/misc/wincodepage.cxx
blob: 493476a5b80b22eaf46e2deefbcbea859ffe5eb0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#include <unotools/wincodepage.hxx>
#include <rtl/string.h>
#include <rtl/textenc.h>

namespace{

// See https://msdn.microsoft.com/en-us/library/windows/desktop/dd317756
rtl_TextEncoding impl_getWinTextEncodingFromLangStrANSI(const char* pLanguage)
{
    auto nLangLen = rtl_str_getLength(pLanguage);

    struct LangEncodingDef
    {
        const char* const        mpLangStr;
        decltype(nLangLen) const mnLangStrLen;
        rtl_TextEncoding const   meTextEncoding;
    };
    static LangEncodingDef const aLanguageTab[] =
    {
        { "en",    2, RTL_TEXTENCODING_MS_1252 }, // Most used -> first in list
        { "th",    2, RTL_TEXTENCODING_MS_874 },
        { "ja",    2, RTL_TEXTENCODING_MS_932 },
        { "zh-cn", 5, RTL_TEXTENCODING_MS_936 },  // Chinese (simplified) - must go before "zh"
        { "ko",    2, RTL_TEXTENCODING_MS_949 },
        { "zh",    2, RTL_TEXTENCODING_MS_950 },  // Chinese (traditional)
        { "bs",    2, RTL_TEXTENCODING_MS_1250 },
        { "cs",    2, RTL_TEXTENCODING_MS_1250 },
        { "hr",    2, RTL_TEXTENCODING_MS_1250 },
        { "hu",    2, RTL_TEXTENCODING_MS_1250 },
        { "pl",    2, RTL_TEXTENCODING_MS_1250 },
        { "ro",    2, RTL_TEXTENCODING_MS_1250 },
        { "sk",    2, RTL_TEXTENCODING_MS_1250 },
        { "sl",    2, RTL_TEXTENCODING_MS_1250 },
//        { "sr",    2, RTL_TEXTENCODING_MS_1250 },
        { "sq",    2, RTL_TEXTENCODING_MS_1250 },
        { "be",    2, RTL_TEXTENCODING_MS_1251 },
        { "bg",    2, RTL_TEXTENCODING_MS_1251 },
        { "mk",    2, RTL_TEXTENCODING_MS_1251 },
        { "ru",    2, RTL_TEXTENCODING_MS_1251 },
        { "sr",    2, RTL_TEXTENCODING_MS_1251 },
        { "uk",    2, RTL_TEXTENCODING_MS_1251 },
        { "es",    2, RTL_TEXTENCODING_MS_1252 },
        { "el",    2, RTL_TEXTENCODING_MS_1253 },
        { "tr",    2, RTL_TEXTENCODING_MS_1254 },
        { "he",    2, RTL_TEXTENCODING_MS_1255 },
        { "ar",    2, RTL_TEXTENCODING_MS_1256 },
        { "et",    2, RTL_TEXTENCODING_MS_1257 },
        { "lt",    2, RTL_TEXTENCODING_MS_1257 },
        { "lv",    2, RTL_TEXTENCODING_MS_1257 },
        { "vi",    2, RTL_TEXTENCODING_MS_1258 },
    };

    for (auto& def : aLanguageTab)
    {
        if (rtl_str_shortenedCompareIgnoreAsciiCase_WithLength(pLanguage, nLangLen,
                                                               def.mpLangStr, def.mnLangStrLen,
                                                               def.mnLangStrLen) == 0)
        {
            return def.meTextEncoding;
        }
    }

    return RTL_TEXTENCODING_MS_1252;
}

/* ----------------------------------------------------------------------- */

// See https://msdn.microsoft.com/en-us/library/windows/desktop/dd317756
// See http://shapelib.maptools.org/codepage.html
rtl_TextEncoding impl_getWinTextEncodingFromLangStrOEM(const char* pLanguage)
{
    auto nLangLen = rtl_str_getLength(pLanguage);

    struct LangEncodingDef
    {
        const char* const        mpLangStr;
        decltype(nLangLen) const mnLangStrLen;
        rtl_TextEncoding const   meTextEncoding;
    };
    static LangEncodingDef const aLanguageTab[] =
    {
        { "de",    2, RTL_TEXTENCODING_IBM_437 }, // OEM United States
        { "en-us", 5, RTL_TEXTENCODING_IBM_437 }, // OEM United States
        { "fi",    2, RTL_TEXTENCODING_IBM_437 }, // OEM United States
        { "fr-ca", 5, RTL_TEXTENCODING_IBM_863 }, // OEM French Canadian; French Canadian (DOS)
        { "fr",    2, RTL_TEXTENCODING_IBM_437 }, // OEM United States
        { "it",    2, RTL_TEXTENCODING_IBM_437 }, // OEM United States
        { "nl",    2, RTL_TEXTENCODING_IBM_437 }, // OEM United States
        { "sv",    2, RTL_TEXTENCODING_IBM_437 }, // OEM United States
        { "el",    2, RTL_TEXTENCODING_IBM_737 }, // OEM Greek (formerly 437G); Greek (DOS)
        { "et",    2, RTL_TEXTENCODING_IBM_775 }, // OEM Baltic; Baltic (DOS)
        { "lt",    2, RTL_TEXTENCODING_IBM_775 }, // OEM Baltic; Baltic (DOS)
        { "lv",    2, RTL_TEXTENCODING_IBM_775 }, // OEM Baltic; Baltic (DOS)
        { "en",    2, RTL_TEXTENCODING_IBM_850 }, // OEM Multilingual Latin 1; Western European (DOS)
        { "bs",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "cs",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "hr",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "hu",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "pl",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "ro",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "sk",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "sl",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
//        { "sr",    2, RTL_TEXTENCODING_IBM_852 }, // OEM Latin 2; Central European (DOS)
        { "bg",    2, RTL_TEXTENCODING_IBM_855 }, // OEM Cyrillic (primarily Russian)
        { "mk",    2, RTL_TEXTENCODING_IBM_855 }, // OEM Cyrillic (primarily Russian)
        { "sr",    2, RTL_TEXTENCODING_IBM_855 }, // OEM Cyrillic (primarily Russian)
        { "tr",    2, RTL_TEXTENCODING_IBM_857 }, // OEM Turkish; Turkish (DOS)
        { "pt",    2, RTL_TEXTENCODING_IBM_860 }, // OEM Portuguese; Portuguese (DOS)
        { "is",    2, RTL_TEXTENCODING_IBM_861 }, // OEM Icelandic; Icelandic (DOS)
        { "he",    2, RTL_TEXTENCODING_IBM_862 }, // OEM Hebrew; Hebrew (DOS)
        { "ar",    2, RTL_TEXTENCODING_IBM_864 }, // OEM Arabic; Arabic (864)
        { "da",    2, RTL_TEXTENCODING_IBM_865 }, // OEM Nordic; Nordic (DOS)
        { "nn",    2, RTL_TEXTENCODING_IBM_865 }, // OEM Nordic; Nordic (DOS)
        { "be",    2, RTL_TEXTENCODING_IBM_866 }, // OEM Russian; Cyrillic (DOS)
        { "ru",    2, RTL_TEXTENCODING_IBM_866 }, // OEM Russian; Cyrillic (DOS)
        { "uk",    2, RTL_TEXTENCODING_IBM_866 }, // OEM Russian; Cyrillic (DOS)
        { "th",    2, RTL_TEXTENCODING_MS_874 },  // ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
        { "ja",    2, RTL_TEXTENCODING_MS_932 },  // ANSI/OEM Japanese; Japanese (Shift-JIS)
        { "zh-cn", 5, RTL_TEXTENCODING_MS_936 },  // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
        { "ko",    2, RTL_TEXTENCODING_MS_949 },  // ANSI/OEM Korean (Unified Hangul Code)
        { "zh",    2, RTL_TEXTENCODING_MS_950 },  // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
        { "vi",    2, RTL_TEXTENCODING_MS_1258 }, // ANSI/OEM Vietnamese; Vietnamese (Windows)
    };

    for (auto& def : aLanguageTab)
    {
        if (rtl_str_shortenedCompareIgnoreAsciiCase_WithLength(pLanguage, nLangLen,
                                                               def.mpLangStr, def.mnLangStrLen,
                                                               def.mnLangStrLen) == 0)
        {
            return def.meTextEncoding;
        }
    }

    return RTL_TEXTENCODING_IBM_850;
}

} // namespace

// Public entry point: resolves the Windows text encoding for the language
// string pLanguage. With bOEM set the OEM lookup is used, otherwise the ANSI
// lookup; pLanguage is forwarded unchanged.
rtl_TextEncoding utl_getWinTextEncodingFromLangStr(const char* pLanguage, bool bOEM)
{
    if (bOEM)
    {
        return impl_getWinTextEncodingFromLangStrOEM(pLanguage);
    }

    return impl_getWinTextEncodingFromLangStrANSI(pLanguage);
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */