/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 . */ #include #include #include #include #include #include #include #include #include "unicode_data.h" #include #include #include // Workaround for glibc braindamage: // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL" // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL #undef CURRENCY_SYMBOL using namespace ::com::sun::star::i18n; template static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) { sal_Int16 i = 0; css::i18n::UnicodeScript type = typeList[0].to; while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast(type)][UnicodeScriptTypeTo]) { type = typeList[++i].to; } return (type < UnicodeScript_kScriptCount && ch >= UnicodeScriptType[static_cast(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ? typeList[i].value : unknownType; } sal_Int16 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) { return getScriptType(ch, typeList, unknownType); } sal_Unicode unicode::getUnicodeScriptStart( UnicodeScript type) { return UnicodeScriptType[static_cast(type)][UnicodeScriptTypeFrom]; } sal_Unicode unicode::getUnicodeScriptEnd( UnicodeScript type) { return UnicodeScriptType[static_cast(type)][UnicodeScriptTypeTo]; } sal_Int16 unicode::getUnicodeType(const sal_uInt32 ch) { static sal_uInt32 c = 0x00; static sal_uInt32 r = 0x00; if (ch == c) return r; else c = ch; switch (u_charType(ch)) { case U_UNASSIGNED: r = css::i18n::UnicodeType::UNASSIGNED; break; case U_UPPERCASE_LETTER: r = css::i18n::UnicodeType::UPPERCASE_LETTER; break; case U_LOWERCASE_LETTER: r = css::i18n::UnicodeType::LOWERCASE_LETTER; break; case U_TITLECASE_LETTER: r = css::i18n::UnicodeType::TITLECASE_LETTER; break; case U_MODIFIER_LETTER: r = css::i18n::UnicodeType::MODIFIER_LETTER; break; case U_OTHER_LETTER: r = css::i18n::UnicodeType::OTHER_LETTER; break; case U_NON_SPACING_MARK: r = css::i18n::UnicodeType::NON_SPACING_MARK; break; case U_ENCLOSING_MARK: r = css::i18n::UnicodeType::ENCLOSING_MARK; break; case U_COMBINING_SPACING_MARK: r = css::i18n::UnicodeType::COMBINING_SPACING_MARK; break; case U_DECIMAL_DIGIT_NUMBER: r = css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER; break; case U_LETTER_NUMBER: r = css::i18n::UnicodeType::LETTER_NUMBER; break; case U_OTHER_NUMBER: r = css::i18n::UnicodeType::OTHER_NUMBER; break; case U_SPACE_SEPARATOR: r = css::i18n::UnicodeType::SPACE_SEPARATOR; break; case U_LINE_SEPARATOR: r = css::i18n::UnicodeType::LINE_SEPARATOR; break; case U_PARAGRAPH_SEPARATOR: r = css::i18n::UnicodeType::PARAGRAPH_SEPARATOR; break; case U_CONTROL_CHAR: r = css::i18n::UnicodeType::CONTROL; break; case U_FORMAT_CHAR: r = css::i18n::UnicodeType::FORMAT; break; case U_PRIVATE_USE_CHAR: r = css::i18n::UnicodeType::PRIVATE_USE; break; case U_SURROGATE: r = css::i18n::UnicodeType::SURROGATE; break; case U_DASH_PUNCTUATION: r = css::i18n::UnicodeType::DASH_PUNCTUATION; break; case U_INITIAL_PUNCTUATION: r = css::i18n::UnicodeType::INITIAL_PUNCTUATION; break; case U_FINAL_PUNCTUATION: r = css::i18n::UnicodeType::FINAL_PUNCTUATION; break; case U_CONNECTOR_PUNCTUATION: r = css::i18n::UnicodeType::CONNECTOR_PUNCTUATION; break; case U_OTHER_PUNCTUATION: r = css::i18n::UnicodeType::OTHER_PUNCTUATION; break; case U_MATH_SYMBOL: r = css::i18n::UnicodeType::MATH_SYMBOL; break; case U_CURRENCY_SYMBOL: r = css::i18n::UnicodeType::CURRENCY_SYMBOL; break; case U_MODIFIER_SYMBOL: r = css::i18n::UnicodeType::MODIFIER_SYMBOL; break; case U_OTHER_SYMBOL: r = css::i18n::UnicodeType::OTHER_SYMBOL; break; case U_START_PUNCTUATION: r = css::i18n::UnicodeType::START_PUNCTUATION; break; case U_END_PUNCTUATION: r = css::i18n::UnicodeType::END_PUNCTUATION; break; } return r; } sal_uInt8 unicode::getUnicodeDirection( const sal_Unicode ch ) { static sal_Unicode c = 0x00; static sal_uInt8 r = 0x00; if (ch == c) return r; else c = ch; sal_Int16 address = UnicodeDirectionIndex[ch >> 8]; r = (address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]; return r; } sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) { nChar = u_charMirror(nChar); return nChar; } #define bit(name) (1U << name) #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER) #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER) #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER) #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\ bit(UnicodeType::MODIFIER_LETTER)|\ bit(UnicodeType::OTHER_LETTER) #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\ bit(UnicodeType::LINE_SEPARATOR)|\ bit(UnicodeType::PARAGRAPH_SEPARATOR) #define CONTROLMASK bit(UnicodeType::CONTROL)|\ bit(UnicodeType::FORMAT)|\ bit(UnicodeType::LINE_SEPARATOR)|\ bit(UnicodeType::PARAGRAPH_SEPARATOR) #define IsType(func, mask) \ bool func( const sal_uInt32 ch) {\ return (bit(getUnicodeType(ch)) & (mask)) != 0;\ } IsType(unicode::isControl, CONTROLMASK) IsType(unicode::isAlpha, ALPHAMASK) IsType(unicode::isSpace, SPACEMASK) #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\ bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f) bool unicode::isWhiteSpace(const sal_uInt32 ch) { return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE))); } sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript) { //See unicode/uscript.h sal_Int16 nRet; switch (eScript) { case USCRIPT_INVALID_CODE: case USCRIPT_COMMON: case USCRIPT_INHERITED: case USCRIPT_UNWRITTEN_LANGUAGES: case USCRIPT_UNKNOWN: case USCRIPT_MATHEMATICAL_NOTATION: case USCRIPT_SYMBOLS: case USCRIPT_CODE_LIMIT: nRet = ScriptType::WEAK; break; case USCRIPT_ARMENIAN: case USCRIPT_CHEROKEE: case USCRIPT_COPTIC: case USCRIPT_CYRILLIC: case USCRIPT_GEORGIAN: case USCRIPT_GOTHIC: case USCRIPT_GREEK: case USCRIPT_LATIN: case USCRIPT_OGHAM: case USCRIPT_OLD_ITALIC: case USCRIPT_RUNIC: case USCRIPT_CANADIAN_ABORIGINAL: case USCRIPT_BRAILLE: case USCRIPT_CYPRIOT: case USCRIPT_OSMANYA: case USCRIPT_SHAVIAN: case USCRIPT_KATAKANA_OR_HIRAGANA: case USCRIPT_GLAGOLITIC: case USCRIPT_CIRTH: case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC: case USCRIPT_OLD_HUNGARIAN: case USCRIPT_LATIN_FRAKTUR: case USCRIPT_LATIN_GAELIC: nRet = ScriptType::LATIN; break; case USCRIPT_BOPOMOFO: case USCRIPT_HAN: case USCRIPT_HANGUL: case USCRIPT_HIRAGANA: case USCRIPT_KATAKANA: case USCRIPT_YI: case USCRIPT_SIMPLIFIED_HAN: case USCRIPT_TRADITIONAL_HAN: case USCRIPT_JAPANESE: case USCRIPT_KOREAN: case USCRIPT_TANGUT: case USCRIPT_KHITAN_SMALL_SCRIPT: nRet = ScriptType::ASIAN; break; case USCRIPT_ARABIC: case USCRIPT_BENGALI: case USCRIPT_DESERET: case USCRIPT_DEVANAGARI: case USCRIPT_ETHIOPIC: case USCRIPT_GUJARATI: case USCRIPT_GURMUKHI: case USCRIPT_HEBREW: case USCRIPT_KANNADA: case USCRIPT_KHMER: case USCRIPT_LAO: case USCRIPT_MALAYALAM: case USCRIPT_MONGOLIAN: case USCRIPT_MYANMAR: case USCRIPT_ORIYA: case USCRIPT_SINHALA: case USCRIPT_SYRIAC: case USCRIPT_TAMIL: case USCRIPT_TELUGU: case USCRIPT_THAANA: case USCRIPT_THAI: case USCRIPT_TIBETAN: case USCRIPT_TAGALOG: case USCRIPT_HANUNOO: case USCRIPT_BUHID: case USCRIPT_TAGBANWA: case USCRIPT_LIMBU: case USCRIPT_LINEAR_B: case USCRIPT_TAI_LE: case USCRIPT_UGARITIC: case USCRIPT_BUGINESE: case USCRIPT_KHAROSHTHI: case USCRIPT_SYLOTI_NAGRI: case USCRIPT_NEW_TAI_LUE: case USCRIPT_TIFINAGH: case USCRIPT_OLD_PERSIAN: case USCRIPT_BALINESE: case USCRIPT_BATAK: case USCRIPT_BLISSYMBOLS: case USCRIPT_BRAHMI: case USCRIPT_CHAM: case USCRIPT_DEMOTIC_EGYPTIAN: case USCRIPT_HIERATIC_EGYPTIAN: case USCRIPT_EGYPTIAN_HIEROGLYPHS: case USCRIPT_KHUTSURI: case USCRIPT_PAHAWH_HMONG: case USCRIPT_HARAPPAN_INDUS: case USCRIPT_JAVANESE: case USCRIPT_KAYAH_LI: case USCRIPT_LEPCHA: case USCRIPT_LINEAR_A: case USCRIPT_MANDAEAN: case USCRIPT_MAYAN_HIEROGLYPHS: case USCRIPT_MEROITIC: case USCRIPT_NKO: case USCRIPT_ORKHON: case USCRIPT_OLD_PERMIC: case USCRIPT_PHAGS_PA: case USCRIPT_PHOENICIAN: case USCRIPT_PHONETIC_POLLARD: case USCRIPT_RONGORONGO: case USCRIPT_SARATI: case USCRIPT_ESTRANGELO_SYRIAC: case USCRIPT_WESTERN_SYRIAC: case USCRIPT_EASTERN_SYRIAC: case USCRIPT_TENGWAR: case USCRIPT_VAI: case USCRIPT_VISIBLE_SPEECH: case USCRIPT_CUNEIFORM: case USCRIPT_CARIAN: case USCRIPT_LANNA: case USCRIPT_LYCIAN: case USCRIPT_LYDIAN: case USCRIPT_OL_CHIKI: case USCRIPT_REJANG: case USCRIPT_SAURASHTRA: case USCRIPT_SIGN_WRITING: case USCRIPT_SUNDANESE: case USCRIPT_MOON: case USCRIPT_MEITEI_MAYEK: case USCRIPT_IMPERIAL_ARAMAIC: case USCRIPT_AVESTAN: case USCRIPT_CHAKMA: case USCRIPT_KAITHI: case USCRIPT_MANICHAEAN: case USCRIPT_INSCRIPTIONAL_PAHLAVI: case USCRIPT_PSALTER_PAHLAVI: case USCRIPT_BOOK_PAHLAVI: case USCRIPT_INSCRIPTIONAL_PARTHIAN: case USCRIPT_SAMARITAN: case USCRIPT_TAI_VIET: case USCRIPT_BAMUM: case USCRIPT_LISU: case USCRIPT_NAKHI_GEBA: case USCRIPT_OLD_SOUTH_ARABIAN: case USCRIPT_BASSA_VAH: case USCRIPT_DUPLOYAN_SHORTAND: case USCRIPT_ELBASAN: case USCRIPT_GRANTHA: case USCRIPT_KPELLE: case USCRIPT_LOMA: case USCRIPT_MENDE: case USCRIPT_MEROITIC_CURSIVE: case USCRIPT_OLD_NORTH_ARABIAN: case USCRIPT_NABATAEAN: case USCRIPT_PALMYRENE: case USCRIPT_SINDHI: case USCRIPT_WARANG_CITI: default: // anything new is going to be pretty wild nRet = ScriptType::COMPLEX; break; } return nRet; } sal_Int16 unicode::getScriptClassFromLanguageTag( const LanguageTag& rLanguageTag ) { constexpr int32_t nBuf = 42; UScriptCode aBuf[nBuf]; if (rLanguageTag.hasScript()) { aBuf[0] = static_cast(u_getPropertyValueEnum( UCHAR_SCRIPT, OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US).getStr())); } else { OUString aName; if (rLanguageTag.getCountry().isEmpty()) aName = rLanguageTag.getLanguage(); else aName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry(); UErrorCode status = U_ZERO_ERROR; const int32_t nScripts = uscript_getCode( OUStringToOString( aName, RTL_TEXTENCODING_ASCII_US).getStr(), aBuf, nBuf, &status); // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer // and required capacity returned, but really.. if (nScripts == 0 || !U_SUCCESS(status)) return css::i18n::ScriptType::LATIN; } return getScriptClassFromUScriptCode( aBuf[0]); } OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript) { OString sRet; switch (eScript) { case USCRIPT_CODE_LIMIT: case USCRIPT_INVALID_CODE: case USCRIPT_MATHEMATICAL_NOTATION: case USCRIPT_SYMBOLS: sRet = "zxx"_ostr; break; case USCRIPT_COMMON: case USCRIPT_INHERITED: case USCRIPT_UNWRITTEN_LANGUAGES: case USCRIPT_UNKNOWN: sRet = "und"_ostr; break; case USCRIPT_ARABIC: sRet = "ar"_ostr; break; case USCRIPT_ARMENIAN: sRet = "hy"_ostr; break; case USCRIPT_BENGALI: sRet = "bn"_ostr; break; case USCRIPT_BOPOMOFO: sRet = "zh"_ostr; break; case USCRIPT_CHEROKEE: sRet = "chr"_ostr; break; case USCRIPT_COPTIC: sRet = "cop"_ostr; break; case USCRIPT_CYRILLIC: sRet = "ru"_ostr; break; case USCRIPT_DESERET: sRet = "en"_ostr; break; case USCRIPT_DEVANAGARI: sRet = "hi"_ostr; break; case USCRIPT_ETHIOPIC: sRet = "am"_ostr; break; case USCRIPT_GEORGIAN: case USCRIPT_KHUTSURI: sRet = "ka"_ostr; break; case USCRIPT_GOTHIC: sRet = "got"_ostr; break; case USCRIPT_GREEK: sRet = "el"_ostr; break; case USCRIPT_GUJARATI: case USCRIPT_KHOJKI: sRet = "gu"_ostr; break; case USCRIPT_GURMUKHI: sRet = "pa"_ostr; break; case USCRIPT_HAN: sRet = "zh"_ostr; break; case USCRIPT_HANGUL: case USCRIPT_KOREAN: case USCRIPT_JAMO: sRet = "ko"_ostr; // Jamo - elements of Hangul Syllables break; case USCRIPT_HEBREW: sRet = "hr"_ostr; break; case USCRIPT_HIRAGANA: sRet = "ja"_ostr; break; case USCRIPT_KANNADA: sRet = "kn"_ostr; break; case USCRIPT_KATAKANA: sRet = "ja"_ostr; break; case USCRIPT_KHMER: sRet = "km"_ostr; break; case USCRIPT_LAO: sRet = "lo"_ostr; break; case USCRIPT_LATIN: sRet = "en"_ostr; break; case USCRIPT_MALAYALAM: sRet = "ml"_ostr; break; case USCRIPT_MONGOLIAN: sRet = "mn"_ostr; break; case USCRIPT_MYANMAR: sRet = "my"_ostr; break; case USCRIPT_OGHAM: sRet = "pgl"_ostr; break; case USCRIPT_OLD_ITALIC: sRet = "osc"_ostr; break; case USCRIPT_ORIYA: sRet = "or"_ostr; break; case USCRIPT_RUNIC: sRet = "ang"_ostr; break; case USCRIPT_SINHALA: sRet = "si"_ostr; break; case USCRIPT_SYRIAC: case USCRIPT_ESTRANGELO_SYRIAC: sRet = "syr"_ostr; break; case USCRIPT_TAMIL: case USCRIPT_GRANTHA: sRet = "ta"_ostr; break; case USCRIPT_TELUGU: sRet = "te"_ostr; break; case USCRIPT_THAANA: sRet = "dv"_ostr; break; case USCRIPT_THAI: sRet = "th"_ostr; break; case USCRIPT_TIBETAN: sRet = "bo"_ostr; break; case USCRIPT_CANADIAN_ABORIGINAL: sRet = "iu"_ostr; break; case USCRIPT_YI: sRet = "ii"_ostr; break; case USCRIPT_TAGALOG: sRet = "tl"_ostr; break; case USCRIPT_HANUNOO: sRet = "hnn"_ostr; break; case USCRIPT_BUHID: sRet = "bku"_ostr; break; case USCRIPT_TAGBANWA: sRet = "tbw"_ostr; break; case USCRIPT_BRAILLE: sRet = "en"_ostr; break; case USCRIPT_CYPRIOT: sRet = "ecy"_ostr; break; case USCRIPT_LIMBU: sRet = "lif"_ostr; break; case USCRIPT_LINEAR_B: sRet = "gmy"_ostr; break; case USCRIPT_OSMANYA: sRet = "so"_ostr; break; case USCRIPT_SHAVIAN: sRet = "en"_ostr; break; case USCRIPT_TAI_LE: sRet = "tdd"_ostr; break; case USCRIPT_UGARITIC: sRet = "uga"_ostr; break; case USCRIPT_KATAKANA_OR_HIRAGANA: sRet = "ja"_ostr; break; case USCRIPT_BUGINESE: sRet = "bug"_ostr; break; case USCRIPT_GLAGOLITIC: sRet = "ch"_ostr; break; case USCRIPT_KHAROSHTHI: case USCRIPT_BRAHMI: sRet = "pra"_ostr; break; case USCRIPT_SYLOTI_NAGRI: sRet = "syl"_ostr; break; case USCRIPT_NEW_TAI_LUE: sRet = "khb"_ostr; break; case USCRIPT_TIFINAGH: sRet = "tmh"_ostr; break; case USCRIPT_OLD_PERSIAN: sRet = "peo"_ostr; break; case USCRIPT_BALINESE: sRet = "ban"_ostr; break; case USCRIPT_BATAK: sRet = "btk"_ostr; break; case USCRIPT_BLISSYMBOLS: sRet = "en"_ostr; break; case USCRIPT_CHAM: sRet = "cja"_ostr; break; case USCRIPT_CIRTH: case USCRIPT_TENGWAR: sRet = "sjn"_ostr; break; case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC: sRet = "cu"_ostr; break; case USCRIPT_DEMOTIC_EGYPTIAN: case USCRIPT_HIERATIC_EGYPTIAN: case USCRIPT_EGYPTIAN_HIEROGLYPHS: sRet = "egy"_ostr; break; case USCRIPT_SIMPLIFIED_HAN: sRet = "zh"_ostr; break; case USCRIPT_TRADITIONAL_HAN: sRet = "zh"_ostr; break; case USCRIPT_PAHAWH_HMONG: sRet = "blu"_ostr; break; case USCRIPT_OLD_HUNGARIAN: sRet = "ohu"_ostr; break; case USCRIPT_HARAPPAN_INDUS: sRet = "xiv"_ostr; break; case USCRIPT_JAVANESE: sRet = "kaw"_ostr; break; case USCRIPT_KAYAH_LI: sRet = "eky"_ostr; break; case USCRIPT_LATIN_FRAKTUR: sRet = "de"_ostr; break; case USCRIPT_LATIN_GAELIC: sRet = "ga"_ostr; break; case USCRIPT_LEPCHA: sRet = "lep"_ostr; break; case USCRIPT_LINEAR_A: sRet = "ecr"_ostr; break; case USCRIPT_MAYAN_HIEROGLYPHS: sRet = "myn"_ostr; break; case USCRIPT_MEROITIC_CURSIVE: case USCRIPT_MEROITIC: sRet = "xmr"_ostr; break; case USCRIPT_NKO: sRet = "nqo"_ostr; break; case USCRIPT_ORKHON: sRet = "otk"_ostr; break; case USCRIPT_OLD_PERMIC: sRet = "kv"_ostr; break; case USCRIPT_PHAGS_PA: sRet = "xng"_ostr; break; case USCRIPT_PHOENICIAN: sRet = "phn"_ostr; break; case USCRIPT_PHONETIC_POLLARD: sRet = "hmd"_ostr; break; case USCRIPT_RONGORONGO: sRet = "rap"_ostr; break; case USCRIPT_SARATI: sRet = "qya"_ostr; break; case USCRIPT_WESTERN_SYRIAC: sRet = "tru"_ostr; break; case USCRIPT_EASTERN_SYRIAC: sRet = "aii"_ostr; break; case USCRIPT_VAI: sRet = "vai"_ostr; break; case USCRIPT_VISIBLE_SPEECH: sRet = "en"_ostr; break; case USCRIPT_CUNEIFORM: sRet = "akk"_ostr; break; case USCRIPT_CARIAN: sRet = "xcr"_ostr; break; case USCRIPT_JAPANESE: sRet = "ja"_ostr; break; case USCRIPT_LANNA: sRet = "nod"_ostr; break; case USCRIPT_LYCIAN: sRet = "xlc"_ostr; break; case USCRIPT_LYDIAN: sRet = "xld"_ostr; break; case USCRIPT_OL_CHIKI: sRet = "sat"_ostr; break; case USCRIPT_REJANG: sRet = "rej"_ostr; break; case USCRIPT_SAURASHTRA: sRet = "saz"_ostr; break; case USCRIPT_SIGN_WRITING: sRet = "en"_ostr; break; case USCRIPT_SUNDANESE: sRet = "su"_ostr; break; case USCRIPT_MOON: sRet = "en"_ostr; break; case USCRIPT_MEITEI_MAYEK: sRet = "mni"_ostr; break; case USCRIPT_IMPERIAL_ARAMAIC: sRet = "arc"_ostr; break; case USCRIPT_AVESTAN: sRet = "ae"_ostr; break; case USCRIPT_CHAKMA: sRet = "ccp"_ostr; break; case USCRIPT_KAITHI: sRet = "awa"_ostr; break; case USCRIPT_MANICHAEAN: sRet = "xmn"_ostr; break; case USCRIPT_INSCRIPTIONAL_PAHLAVI: case USCRIPT_PSALTER_PAHLAVI: case USCRIPT_BOOK_PAHLAVI: case USCRIPT_INSCRIPTIONAL_PARTHIAN: sRet = "xpr"_ostr; break; case USCRIPT_SAMARITAN: sRet = "heb"_ostr; break; case USCRIPT_TAI_VIET: sRet = "blt"_ostr; break; case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */ sRet = "mic"_ostr; break; case USCRIPT_NABATAEAN: sRet = "mis-Nbat"_ostr; // Uncoded with script break; case USCRIPT_PALMYRENE: sRet = "mis-Palm"_ostr; // Uncoded with script break; case USCRIPT_BAMUM: sRet = "bax"_ostr; break; case USCRIPT_LISU: sRet = "lis"_ostr; break; case USCRIPT_NAKHI_GEBA: sRet = "nxq"_ostr; break; case USCRIPT_OLD_SOUTH_ARABIAN: sRet = "xsa"_ostr; break; case USCRIPT_BASSA_VAH: sRet = "bsq"_ostr; break; case USCRIPT_DUPLOYAN_SHORTAND: sRet = "fr"_ostr; break; case USCRIPT_ELBASAN: sRet = "sq"_ostr; break; case USCRIPT_KPELLE: sRet = "kpe"_ostr; break; case USCRIPT_LOMA: sRet = "lom"_ostr; break; case USCRIPT_MENDE: sRet = "men"_ostr; break; case USCRIPT_OLD_NORTH_ARABIAN: sRet = "xna"_ostr; break; case USCRIPT_SINDHI: sRet = "sd"_ostr; break; case USCRIPT_WARANG_CITI: sRet = "hoc"_ostr; break; case USCRIPT_AFAKA: sRet = "djk"_ostr; break; case USCRIPT_JURCHEN: sRet = "juc"_ostr; break; case USCRIPT_MRO: sRet = "cmr"_ostr; break; case USCRIPT_NUSHU: sRet = "mis-Nshu"_ostr; // Uncoded with script break; case USCRIPT_SHARADA: sRet = "sa"_ostr; break; case USCRIPT_SORA_SOMPENG: sRet = "srb"_ostr; break; case USCRIPT_TAKRI: sRet = "doi"_ostr; break; case USCRIPT_TANGUT: sRet = "txg"_ostr; break; case USCRIPT_WOLEAI: sRet = "woe"_ostr; break; case USCRIPT_ANATOLIAN_HIEROGLYPHS: sRet = "hlu"_ostr; break; case USCRIPT_TIRHUTA: sRet = "mai"_ostr; break; case USCRIPT_CAUCASIAN_ALBANIAN: sRet = "xag"_ostr; break; case USCRIPT_MAHAJANI: sRet = "mwr"_ostr; break; case USCRIPT_AHOM: sRet = "aho"_ostr; break; case USCRIPT_HATRAN: sRet = "qly-Hatr"_ostr; break; case USCRIPT_MODI: sRet = "mr-Modi"_ostr; break; case USCRIPT_MULTANI: sRet = "skr-Mutl"_ostr; break; case USCRIPT_PAU_CIN_HAU: sRet = "ctd-Pauc"_ostr; break; case USCRIPT_SIDDHAM: sRet = "sa-Sidd"_ostr; break; case USCRIPT_ADLAM: sRet = "mis-Adlm"_ostr; // Adlam for Fulani, no language code break; case USCRIPT_BHAIKSUKI: sRet = "mis-Bhks"_ostr; // Bhaiksuki for some Buddhist texts, no language code break; case USCRIPT_MARCHEN: sRet = "bo-Marc"_ostr; break; case USCRIPT_NEWA: sRet = "new-Newa"_ostr; break; case USCRIPT_OSAGE: sRet = "osa-Osge"_ostr; break; case USCRIPT_HAN_WITH_BOPOMOFO: sRet = "mis-Hanb"_ostr; // Han with Bopomofo, zh-Hanb ? break; case USCRIPT_SYMBOLS_EMOJI: sRet = "mis-Zsye"_ostr; // Emoji variant break; case USCRIPT_MASARAM_GONDI: sRet = "gon-Gonm"_ostr; // macro language code, could be wsg,esg,gno break; case USCRIPT_SOYOMBO: sRet = "mn-Soyo"_ostr; // abugida to write Mongolian, also Tibetan and Sanskrit break; case USCRIPT_ZANABAZAR_SQUARE: sRet = "mn-Zanb"_ostr; // abugida to write Mongolian break; case USCRIPT_DOGRA: sRet = "dgo"_ostr; // Dogri proper break; case USCRIPT_GUNJALA_GONDI: sRet = "wsg"_ostr; // Adilabad Gondi break; case USCRIPT_MAKASAR: sRet = "mak"_ostr; break; case USCRIPT_MEDEFAIDRIN: sRet = "dmf-Medf"_ostr; break; case USCRIPT_HANIFI_ROHINGYA: sRet = "rhg"_ostr; break; case USCRIPT_SOGDIAN: case USCRIPT_OLD_SOGDIAN: sRet = "sog"_ostr; break; case USCRIPT_ELYMAIC: sRet = "arc-Elym"_ostr; break; case USCRIPT_NYIAKENG_PUACHUE_HMONG: sRet = "hmn-Hmnp"_ostr; // macrolanguage code break; case USCRIPT_NANDINAGARI: sRet = "sa-Nand"_ostr; break; case USCRIPT_WANCHO: sRet = "nnp-Wcho"_ostr; break; case USCRIPT_CHORASMIAN: sRet = "xco-Chrs"_ostr; break; case USCRIPT_DIVES_AKURU: sRet = "dv-Diak"_ostr; break; case USCRIPT_KHITAN_SMALL_SCRIPT: sRet = "zkt-Kits"_ostr; break; case USCRIPT_YEZIDI: sRet = "kmr-Yezi"_ostr; break; #if (U_ICU_VERSION_MAJOR_NUM >= 70) case USCRIPT_CYPRO_MINOAN: sRet = "mis-Cpmn"_ostr; // Uncoded with script break; case USCRIPT_OLD_UYGHUR: sRet = "oui-Ougr"_ostr; break; case USCRIPT_TANGSA: sRet = "nst-Tnsa"_ostr; break; case USCRIPT_TOTO: sRet = "txo-Toto"_ostr; break; case USCRIPT_VITHKUQI: sRet = "sq-Vith"_ostr; // macrolanguage code break; #endif #if (U_ICU_VERSION_MAJOR_NUM >= 72) case USCRIPT_KAWI: sRet = "mis-Kawi"_ostr; // Uncoded with script break; case USCRIPT_NAG_MUNDARI: sRet = "unr-Nagm"_ostr; break; #endif #if (U_ICU_VERSION_MAJOR_NUM >= 75) case USCRIPT_ARABIC_NASTALIQ: sRet = "fa-Aran"_ostr; break; #endif } return sRet; } //Format a number as a percentage according to the rules of the given //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE OUString unicode::formatPercent(double dNumber, const LanguageTag &rLangTag) { // get a currency formatter for this locale ID UErrorCode errorCode=U_ZERO_ERROR; LanguageTag aLangTag(rLangTag); // As of CLDR Version 24 these languages were not listed as using spacing // between number and % but are reported as such by our l10n groups // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html // so format using French which has the desired rules if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl") aLangTag.reset(u"fr-FR"_ustr); icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag); std::unique_ptr xF( icu::NumberFormat::createPercentInstance(aLocale, errorCode)); if(U_FAILURE(errorCode)) { SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed"); return OUString::number(dNumber) + "%"; } icu::UnicodeString output; xF->format(dNumber/100, output); OUString aRet(reinterpret_cast(output.getBuffer()), output.length()); if (rLangTag.getLanguage() == "de") { //narrow no-break space instead of (normal) no-break space return aRet.replace(0x00A0, 0x202F); } return aRet; } bool ToggleUnicodeCodepoint::AllowMoreInput(sal_uInt32 uChar) { //arbitrarily chosen maximum length allowed - normal max usage would be around 30. if( maInput.getLength() > 255 ) mbAllowMoreChars = false; if( !mbAllowMoreChars ) return false; bool bPreventNonHex = false; if( maInput.indexOf("U+") != -1 ) bPreventNonHex = true; switch ( unicode::getUnicodeType(uChar) ) { case css::i18n::UnicodeType::SURROGATE: if( bPreventNonHex ) { mbAllowMoreChars = false; return false; } if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() ) { maUtf16.append(sal_Unicode(uChar)); return true; } if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() ) maUtf16.insert(0, sal_Unicode(uChar)); if (maUtf16.getLength() == 2) { assert(rtl::isHighSurrogate(maUtf16[0]) && rtl::isLowSurrogate(maUtf16[1])); // The resulting codepoint may itself be combining, so may allow more sal_uInt32 nUCS4 = rtl::combineSurrogates(maUtf16[0], maUtf16[1]); maUtf16.setLength(0); return AllowMoreInput(nUCS4); } // unexpected order of high/low, so don't accept more if( !maUtf16.isEmpty() ) maInput.append(maUtf16); if( !maCombining.isEmpty() ) maInput.append(maCombining); mbAllowMoreChars = false; break; case css::i18n::UnicodeType::NON_SPACING_MARK: case css::i18n::UnicodeType::COMBINING_SPACING_MARK: if( bPreventNonHex ) { mbAllowMoreChars = false; return false; } //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark. if( !maUtf16.isEmpty() ) { maInput = maUtf16; if( !maCombining.isEmpty() ) maInput.append(maCombining); mbAllowMoreChars = false; return false; } maCombining.insertUtf32(0, uChar); break; default: //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character. if( !maUtf16.isEmpty() ) { maInput = maUtf16; if( !maCombining.isEmpty() ) maInput.append(maCombining); mbAllowMoreChars = false; return false; } if( !maCombining.isEmpty() ) { maCombining.insertUtf32(0, uChar); maInput = maCombining; mbAllowMoreChars = false; return false; } // 0 - 1f are control characters. Do not process those. if( uChar < 0x20 ) { mbAllowMoreChars = false; return false; } switch( uChar ) { case 'u': case 'U': // U+ notation found. Continue looking for another one. if( mbRequiresU ) { mbRequiresU = false; maInput.insert(0,"U+"); } // treat as a normal character else { mbAllowMoreChars = false; if( !bPreventNonHex ) maInput.insertUtf32(0, uChar); } break; case '+': // + already found: skip when not U, or edge case of +U+xxxx if( mbRequiresU || (maInput.indexOf("U+") == 0) ) mbAllowMoreChars = false; // hex chars followed by '+' - now require a 'U' else if ( !maInput.isEmpty() ) mbRequiresU = true; // treat as a normal character else { mbAllowMoreChars = false; if( !bPreventNonHex ) maInput.insertUtf32(0, uChar); } break; default: // + already found. Since not U, cancel further input if( mbRequiresU ) mbAllowMoreChars = false; // maximum digits per notation is 8: only one notation else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 ) mbAllowMoreChars = false; // maximum digits per notation is 8: previous notation found else if( maInput.indexOf("U+") == 8 ) mbAllowMoreChars = false; // a hex character. Add to string. else if( rtl::isAsciiHexDigit(uChar) ) { mbIsHexString = true; maInput.insertUtf32(0, uChar); } // not a hex character: stop input. keep if it is the first input provided else { mbAllowMoreChars = false; if( maInput.isEmpty() ) maInput.insertUtf32(0, uChar); } } } return mbAllowMoreChars; } OUString ToggleUnicodeCodepoint::StringToReplace() { if( maInput.isEmpty() ) { //edge case - input finished with incomplete low surrogate or combining characters without a base if( mbAllowMoreChars ) { if( !maUtf16.isEmpty() ) maInput = maUtf16; if( !maCombining.isEmpty() ) maInput.append(maCombining); } return maInput.toString(); } if( !mbIsHexString ) return maInput.toString(); //this function potentially modifies the input string. Prevent addition of further characters mbAllowMoreChars = false; //validate unicode notation. OUString sIn; sal_uInt32 nUnicode = 0; sal_Int32 nUPlus = maInput.indexOf("U+"); //if U+ notation used, strip off all extra chars added not in U+ notation if( nUPlus != -1 ) { maInput.remove(0, nUPlus); sIn = maInput.copy(2).makeStringAndClear(); nUPlus = sIn.indexOf("U+"); } else sIn = maInput.toString(); while( nUPlus != -1 ) { nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16); //prevent creating control characters or invalid Unicode values if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 ) maInput = sIn.subView(nUPlus); sIn = sIn.copy(nUPlus+2); nUPlus = sIn.indexOf("U+"); } nUnicode = sIn.toUInt32(16); if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 ) maInput.truncate().append( sIn[sIn.getLength()-1] ); return maInput.toString(); } OUString ToggleUnicodeCodepoint::ReplacementString() { OUString sIn = StringToReplace(); OUStringBuffer output = ""; sal_Int32 nUPlus = sIn.indexOf("U+"); // convert from hex notation to glyph if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) ) { sal_uInt32 nUnicode = 0; if( nUPlus == 0) { sIn = sIn.copy(2); nUPlus = sIn.indexOf("U+"); } while( nUPlus > 0 ) { nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16); output.appendUtf32( nUnicode ); sIn = sIn.copy(nUPlus+2); nUPlus = sIn.indexOf("U+"); } nUnicode = sIn.toUInt32(16); output.appendUtf32( nUnicode ); } // convert from glyph to hex notation else { sal_Int32 nPos = 0; while( nPos < sIn.getLength() ) { OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16); //pad with zeros - minimum length of 4. for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i ) aTmp.insert( 0,"0" ); output.append( "U+" + aTmp ); } } return output.makeStringAndClear(); } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */