summaryrefslogtreecommitdiff
path: root/i18nutil
diff options
context:
space:
mode:
Diffstat (limited to 'i18nutil')
-rw-r--r--i18nutil/source/utility/unicode.cxx251
1 files changed, 251 insertions, 0 deletions
diff --git a/i18nutil/source/utility/unicode.cxx b/i18nutil/source/utility/unicode.cxx
index 458a417d6496..cecc7d989df6 100644
--- a/i18nutil/source/utility/unicode.cxx
+++ b/i18nutil/source/utility/unicode.cxx
@@ -27,6 +27,8 @@
#include <sal/log.hxx>
#include <unicode/numfmt.h>
#include "unicode_data.h"
+#include <com/sun/star/i18n/UnicodeType.hpp>
+#include <rtl/character.hxx>
// Workaround for glibc braindamage:
// glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
@@ -998,4 +1000,253 @@ OUString SAL_CALL unicode::formatPercent(double dNumber,
return aRet;
}
+/** Starts a new collector with all four working buffers empty. */
+ToggleUnicodeCodepoint::ToggleUnicodeCodepoint ()
+{
+    // a single empty buffer serves as the reset value for every working buffer
+    const OUStringBuffer aEmpty;
+    maInput = aEmpty;
+    maOutput = aEmpty;
+    maUtf16 = aEmpty;
+    maCombining = aEmpty;
+}
+
+/** Offers one more character to the collector and reports whether further input is wanted.
+ *
+ * Every accepted character is inserted at the front of the collected text, so
+ * the caller is expected to supply the characters in reverse text order
+ * (presumably walking backwards from the cursor - confirm against the caller).
+ *
+ * @param uChar the next UTF-16 code unit to examine.
+ * @return true if another character may be offered, false once collection has stopped.
+ */
+bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
+{
+    //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
+    if( maInput.getLength() > 255 )
+        mbAllowMoreChars = false;
+
+    if( !mbAllowMoreChars )
+        return false;
+
+    // maInput is not modified before any use of nUPlus below, so a single lookup serves them all.
+    const sal_Int32 nUPlus = maInput.indexOf("U+");
+    // once a U+ notation has been collected, only further hex notation may be placed in front of it
+    const bool bPreventNonHex = nUPlus != -1;
+
+    switch ( unicode::getUnicodeType(uChar) )
+    {
+        case ::com::sun::star::i18n::UnicodeType::SURROGATE:
+            if( bPreventNonHex )
+            {
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            // a low surrogate arrives first because input is fed in reverse: keep it and wait for its high half
+            if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
+            {
+                maUtf16.append(uChar);
+                return true;
+            }
+            if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
+                maUtf16.insert(0, uChar );
+            //end of hex strings, or unexpected order of high/low, so don't accept more
+            if( !maUtf16.isEmpty() )
+                maInput.append(maUtf16);
+            if( !maCombining.isEmpty() )
+                maInput.append(maCombining);
+            mbAllowMoreChars = false;
+            break;
+
+        case ::com::sun::star::i18n::UnicodeType::NON_SPACING_MARK:
+        case ::com::sun::star::i18n::UnicodeType::COMBINING_SPACING_MARK:
+            if( bPreventNonHex )
+            {
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
+            if( !maUtf16.isEmpty() )
+            {
+                maInput = maUtf16;
+                if( !maCombining.isEmpty() )
+                    maInput.append(maCombining);
+                mbAllowMoreChars = false;
+                return false;
+            }
+            // keep collecting combining marks until their base character shows up
+            maCombining.insert(0, uChar);
+            break;
+
+        default:
+            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
+            if( !maUtf16.isEmpty() )
+            {
+                maInput = maUtf16;
+                if( !maCombining.isEmpty() )
+                    maInput.append(maCombining);
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            // this character is the base of the combining marks collected so far: the sequence is complete
+            if( !maCombining.isEmpty() )
+            {
+                maCombining.insert(0, uChar);
+                maInput = maCombining;
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            switch( uChar )
+            {
+                case 'u':
+                case 'U':
+                    // U+ notation found.  Continue looking for another one.
+                    if( mbRequiresU )
+                    {
+                        mbRequiresU = false;
+                        maInput.insert(0,"U+");
+                    }
+                    // treat as a normal character
+                    else
+                    {
+                        mbAllowMoreChars = false;
+                        if( !bPreventNonHex )
+                            maInput.insertUtf32(0, uChar);
+                    }
+                    break;
+                case '+':
+                    // + already found: skip when not U, or edge case of +U+xxxx
+                    if( mbRequiresU || nUPlus == 0 )
+                        mbAllowMoreChars = false;
+                    // hex chars followed by '+' - now require a 'U'
+                    else if ( !maInput.isEmpty() )
+                        mbRequiresU = true;
+                    // treat as a normal character
+                    else
+                    {
+                        mbAllowMoreChars = false;
+                        if( !bPreventNonHex )
+                            maInput.insertUtf32(0, uChar);
+                    }
+                    break;
+                case 0:
+                    mbAllowMoreChars = false;
+                    break;
+                default:
+                    // + already found. Since not U, cancel further input
+                    if( mbRequiresU )
+                        mbAllowMoreChars = false;
+                    // maximum digits per notation is 8: only one notation
+                    else if( nUPlus == -1 && maInput.getLength() == 8 )
+                        mbAllowMoreChars = false;
+                    // maximum digits per notation is 8: previous notation found
+                    else if( nUPlus == 8 )
+                        mbAllowMoreChars = false;
+                    // a hex character. Add to string.
+                    // rtl::isAsciiHexDigit is used instead of isxdigit: isxdigit is undefined for
+                    // values that do not fit in an unsigned char, which sal_Unicode routinely exceeds.
+                    else if( rtl::isAsciiHexDigit(uChar) )
+                    {
+                        mbIsHexString = true;
+                        maInput.insertUtf32(0, uChar);
+                    }
+                    // not a hex character: stop input. keep if it is the first input provided
+                    else
+                    {
+                        mbAllowMoreChars = false;
+                        if( maInput.isEmpty() )
+                            maInput.insertUtf32(0, uChar);
+                    }
+            }
+    }
+    return mbAllowMoreChars;
+}
+
+/** Returns the collected text that is to be replaced.
+ *
+ * For hex input, U+ segments whose value is zero or above 0x10ffff are
+ * discarded together with everything in front of them.  If the final segment
+ * is invalid, only the last collected character is kept.
+ * Validating hex input also stops any further collection.
+ *
+ * @return the (possibly trimmed) content of maInput.
+ */
+OUString ToggleUnicodeCodepoint::StringToReplace()
+{
+    const bool bNothingCollected = maInput.isEmpty();
+
+    //edge case - input finished with incomplete low surrogate or combining characters without a base
+    if( bNothingCollected && mbAllowMoreChars )
+    {
+        if( !maUtf16.isEmpty() )
+            maInput = maUtf16;
+        if( !maCombining.isEmpty() )
+            maInput.append(maCombining);
+    }
+
+    // nothing to validate unless hex digits were collected
+    if( bNothingCollected || !mbIsHexString )
+        return maInput.toString();
+
+    //this function potentially modifies the input string.  Prevent addition of further characters
+    mbAllowMoreChars = false;
+
+    //validate unicode notation.
+    OUStringBuffer aRemaining;
+    sal_Int32 nMarker = maInput.indexOf("U+");
+    if( nMarker == -1 )
+        aRemaining = maInput;
+    else
+    {
+        //U+ notation used: strip off all extra chars added not in U+ notation
+        maInput = maInput.copy(nMarker);
+        aRemaining = maInput.copy(2);
+        nMarker = aRemaining.indexOf("U+");
+    }
+
+    // walk every segment that is followed by another U+ marker
+    for( ; nMarker != -1; nMarker = aRemaining.indexOf("U+") )
+    {
+        const sal_uInt32 nSegment = aRemaining.copy(0, nMarker).toString().toUInt32(16);
+        //strip out all null or invalid Unicode values
+        if( nSegment == 0 || nSegment > 0x10ffff )
+            maInput = aRemaining.copy(nMarker);
+        aRemaining = aRemaining.copy(nMarker+2);
+    }
+
+    // the final segment: if invalid, fall back to the last collected character only
+    const sal_uInt32 nLast = aRemaining.toString().toUInt32(16);
+    if( nLast == 0 || nLast > 0x10ffff )
+    {
+        maInput.truncate(0);
+        maInput.append( aRemaining[aRemaining.getLength()-1] );
+    }
+    return maInput.toString();
+}
+
+/** Counts the Unicode code points in the string returned by StringToReplace().
+ *
+ * @return the number of code points (a surrogate pair counts once).
+ */
+sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
+{
+    const OUString aTarget = StringToReplace();
+    sal_uInt32 nCodePoints = 0;
+    // iterateCodePoints advances nIndex by one code point per call
+    for( sal_Int32 nIndex = 0; nIndex < aTarget.getLength(); ++nCodePoints )
+        aTarget.iterateCodePoints(&nIndex, 1);
+    return nCodePoints;
+}
+
+/** Builds the text that replaces the string returned by StringToReplace().
+ *
+ * Hex notation (with or without "U+" markers) is turned into the code points
+ * it names; any other text is turned into a run of "U+<hex>" notations, one
+ * per code point.
+ *
+ * @return the content of maOutput after conversion.
+ */
+OUString ToggleUnicodeCodepoint::ReplacementString()
+{
+    OUString aSource = StringToReplace();
+    maOutput = "";
+    sal_Int32 nMarker = aSource.indexOf("U+");
+    const bool bIsNotation = nMarker != -1 || (mbIsHexString && aSource.getLength() > 1);
+
+    // convert from glyph to hex notation
+    if( !bIsNotation )
+    {
+        for( sal_Int32 nIndex = 0; nIndex < aSource.getLength(); )
+        {
+            const sal_uInt32 nCodePoint = aSource.iterateCodePoints(&nIndex, 1);
+            maOutput.append( "U+" );
+            maOutput.append( OUString::number(nCodePoint, 16) );
+        }
+        return maOutput.toString();
+    }
+
+    // convert from hex notation to glyph
+    if( nMarker == 0 )
+    {
+        // drop the leading marker so that each segment starts with its digits
+        aSource = aSource.copy(2);
+        nMarker = aSource.indexOf("U+");
+    }
+    for( ; nMarker > 0; nMarker = aSource.indexOf("U+") )
+    {
+        maOutput.appendUtf32( aSource.copy(0, nMarker).toUInt32(16) );
+        aSource = aSource.copy(nMarker+2);
+    }
+    // whatever is left is the last (or only) hex value
+    maOutput.appendUtf32( aSource.toUInt32(16) );
+    return maOutput.toString();
+}
+
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */