diff options
Diffstat (limited to 'i18nutil')
-rw-r--r-- | i18nutil/source/utility/unicode.cxx | 251 |
1 files changed, 251 insertions, 0 deletions
diff --git a/i18nutil/source/utility/unicode.cxx b/i18nutil/source/utility/unicode.cxx index 458a417d6496..cecc7d989df6 100644 --- a/i18nutil/source/utility/unicode.cxx +++ b/i18nutil/source/utility/unicode.cxx @@ -27,6 +27,8 @@ #include <sal/log.hxx> #include <unicode/numfmt.h> #include "unicode_data.h" +#include <com/sun/star/i18n/UnicodeType.hpp> +#include <rtl/character.hxx> // Workaround for glibc braindamage: // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL" @@ -998,4 +1000,253 @@ OUString SAL_CALL unicode::formatPercent(double dNumber, return aRet; } +ToggleUnicodeCodepoint::ToggleUnicodeCodepoint () +{ + maInput = OUStringBuffer(); + maOutput = OUStringBuffer(); + maUtf16 = OUStringBuffer(); + maCombining = OUStringBuffer(); +} + +bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar) +{ + //arbitrarily chosen maximum length allowed - normal max usage would be around 30. + if( maInput.getLength() > 255 ) + mbAllowMoreChars = false; + + if( !mbAllowMoreChars ) + return false; + + bool bPreventNonHex = false; + if( maInput.indexOf("U+") != -1 ) + bPreventNonHex = true; + + switch ( unicode::getUnicodeType(uChar) ) + { + case ::com::sun::star::i18n::UnicodeType::SURROGATE: + if( bPreventNonHex ) + { + mbAllowMoreChars = false; + return false; + } + + if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() ) + { + maUtf16.append(uChar); + return true; + } + if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() ) + maUtf16.insert(0, uChar ); + //end of hex strings, or unexpected order of high/low, so don't accept more + if( !maUtf16.isEmpty() ) + maInput.append(maUtf16); + if( !maCombining.isEmpty() ) + maInput.append(maCombining); + mbAllowMoreChars = false; + break; + + case ::com::sun::star::i18n::UnicodeType::NON_SPACING_MARK: + case ::com::sun::star::i18n::UnicodeType::COMBINING_SPACING_MARK: + if( bPreventNonHex ) + { + mbAllowMoreChars = false; + return false; + } + + //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark. + if( !maUtf16.isEmpty() ) + { + maInput = maUtf16; + if( !maCombining.isEmpty() ) + maInput.append(maCombining); + mbAllowMoreChars = false; + return false; + } + maCombining.insert(0, uChar); + break; + + default: + //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character. + if( !maUtf16.isEmpty() ) + { + maInput = maUtf16; + if( !maCombining.isEmpty() ) + maInput.append(maCombining); + mbAllowMoreChars = false; + return false; + } + + if( !maCombining.isEmpty() ) + { + maCombining.insert(0, uChar); + maInput = maCombining; + mbAllowMoreChars = false; + return false; + } + + switch( uChar ) + { + case 'u': + case 'U': + // U+ notation found. Continue looking for another one. + if( mbRequiresU ) + { + mbRequiresU = false; + maInput.insert(0,"U+"); + } + // treat as a normal character + else + { + mbAllowMoreChars = false; + if( !bPreventNonHex ) + maInput.insertUtf32(0, uChar); + } + break; + case '+': + // + already found: skip when not U, or edge case of +U+xxxx + if( mbRequiresU || (maInput.indexOf("U+") == 0) ) + mbAllowMoreChars = false; + // hex chars followed by '+' - now require a 'U' + else if ( !maInput.isEmpty() ) + mbRequiresU = true; + // treat as a normal character + else + { + mbAllowMoreChars = false; + if( !bPreventNonHex ) + maInput.insertUtf32(0, uChar); + } + break; + case 0: + mbAllowMoreChars = false; + break; + default: + // + already found. Since not U, cancel further input + if( mbRequiresU ) + mbAllowMoreChars = false; + // maximum digits per notation is 8: only one notation + else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 ) + mbAllowMoreChars = false; + // maximum digits per notation is 8: previous notation found + else if( maInput.indexOf("U+") == 8 ) + mbAllowMoreChars = false; + // a hex character. Add to string. + else if( isxdigit(uChar) ) + { + mbIsHexString = true; + maInput.insertUtf32(0, uChar); + } + // not a hex character: stop input. keep if it is the first input provided + else + { + mbAllowMoreChars = false; + if( maInput.isEmpty() ) + maInput.insertUtf32(0, uChar); + } + } + } + return mbAllowMoreChars; +} + +OUString ToggleUnicodeCodepoint::StringToReplace() +{ + if( maInput.isEmpty() ) + { + //edge case - input finished with incomplete low surrogate or combining characters without a base + if( mbAllowMoreChars ) + { + if( !maUtf16.isEmpty() ) + maInput = maUtf16; + if( !maCombining.isEmpty() ) + maInput.append(maCombining); + } + return maInput.toString(); + } + + if( !mbIsHexString ) + return maInput.toString(); + + //this function potentially modifies the input string. Prevent addition of further characters + mbAllowMoreChars = false; + + //validate unicode notation. + OUStringBuffer sIn; + sal_uInt32 nUnicode = 0; + sal_Int32 nUPlus = maInput.indexOf("U+"); + //if U+ notation used, strip off all extra chars added not in U+ notation + if( nUPlus != -1 ) + { + maInput = maInput.copy(nUPlus); + sIn = maInput.copy(2); + nUPlus = sIn.indexOf("U+"); + } + else + sIn = maInput; + while( nUPlus != -1 ) + { + nUnicode = sIn.copy(0, nUPlus).toString().toUInt32(16); + //strip out all null or invalid Unicode values + if( !nUnicode || nUnicode > 0x10ffff ) + maInput = sIn.copy(nUPlus); + sIn = sIn.copy(nUPlus+2); + nUPlus = sIn.indexOf("U+"); + } + + nUnicode = sIn.toString().toUInt32(16); + if( !nUnicode || nUnicode > 0x10ffff ) + maInput.truncate(0).append( sIn[sIn.getLength()-1] ); + return maInput.toString(); +} + +sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete() +{ + OUString sIn = StringToReplace(); + sal_Int32 nPos = 0; + sal_uInt32 counter = 0; + while( nPos < sIn.getLength() ) + { + sIn.iterateCodePoints(&nPos,1); + ++counter; + } + return counter; +} + +OUString ToggleUnicodeCodepoint::ReplacementString() +{ + OUString sIn = StringToReplace(); + maOutput = ""; + sal_Int32 nUPlus = sIn.indexOf("U+"); + // convert from hex notation to glyph + if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) ) + { + sal_uInt32 nUnicode = 0; + if( nUPlus == 0) + { + sIn = sIn.copy(2); + nUPlus = sIn.indexOf("U+"); + } + while( nUPlus > 0 ) + { + nUnicode = sIn.copy(0, nUPlus).toUInt32(16); + maOutput.appendUtf32( nUnicode ); + + sIn = sIn.copy(nUPlus+2); + nUPlus = sIn.indexOf("U+"); + } + nUnicode = sIn.toUInt32(16); + maOutput.appendUtf32( nUnicode ); + } + // convert from glyph to hex notation + else + { + sal_Int32 nPos = 0; + while( nPos < sIn.getLength() ) + { + maOutput.append( "U+" ); + maOutput.append( OUString::number(sIn.iterateCodePoints(&nPos,1),16) ); + } + } + return maOutput.toString(); +} + /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |