1 files changed, 251 insertions, 0 deletions
diff --git a/i18nutil/source/utility/unicode.cxx b/i18nutil/source/utility/unicode.cxx
index 458a417d6496..cecc7d989df6 100644
--- a/i18nutil/source/utility/unicode.cxx
+++ b/i18nutil/source/utility/unicode.cxx
@@ -27,6 +27,8 @@
 #include <sal/log.hxx>
 #include <unicode/numfmt.h>
 #include "unicode_data.h"
+#include <com/sun/star/i18n/UnicodeType.hpp>
+#include <rtl/character.hxx>
 
 // Workaround for glibc braindamage:
 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
@@ -998,4 +1000,253 @@ OUString SAL_CALL unicode::formatPercent(double dNumber,
     return aRet;
 }
 
+ToggleUnicodeCodepoint::ToggleUnicodeCodepoint ()
+{
+    maInput = OUStringBuffer();
+    maOutput = OUStringBuffer();
+    maUtf16 = OUStringBuffer();
+    maCombining = OUStringBuffer();
+}
+
+bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
+{
+    //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
+    if( maInput.getLength() > 255 )
+        mbAllowMoreChars = false;
+
+    if( !mbAllowMoreChars )
+        return false;
+
+    bool bPreventNonHex = false;
+    if( maInput.indexOf("U+") != -1 )
+        bPreventNonHex = true;
+
+    switch ( unicode::getUnicodeType(uChar) )
+    {
+        case ::com::sun::star::i18n::UnicodeType::SURROGATE:
+            if( bPreventNonHex )
+            {
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty()  )
+            {
+                maUtf16.append(uChar);
+                return true;
+            }
+            if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
+                maUtf16.insert(0, uChar );
+            //end of hex strings, or unexpected order of high/low, so don't accept more
+            if( !maUtf16.isEmpty() )
+                maInput.append(maUtf16);
+            if( !maCombining.isEmpty() )
+                maInput.append(maCombining);
+            mbAllowMoreChars = false;
+            break;
+
+        case ::com::sun::star::i18n::UnicodeType::NON_SPACING_MARK:
+        case ::com::sun::star::i18n::UnicodeType::COMBINING_SPACING_MARK:
+            if( bPreventNonHex )
+            {
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
+            if( !maUtf16.isEmpty() )
+            {
+                maInput = maUtf16;
+                if( !maCombining.isEmpty() )
+                    maInput.append(maCombining);
+                mbAllowMoreChars = false;
+                return false;
+            }
+            maCombining.insert(0, uChar);
+            break;
+
+        default:
+            //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
+            if( !maUtf16.isEmpty() )
+            {
+                maInput = maUtf16;
+                if( !maCombining.isEmpty() )
+                    maInput.append(maCombining);
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            if( !maCombining.isEmpty() )
+            {
+                maCombining.insert(0, uChar);
+                maInput = maCombining;
+                mbAllowMoreChars = false;
+                return false;
+            }
+
+            switch( uChar )
+            {
+                case 'u':
+                case 'U':
+                    // U+ notation found.  Continue looking for another one.
+                    if( mbRequiresU )
+                    {
+                        mbRequiresU = false;
+                        maInput.insert(0,"U+");
+                    }
+                    // treat as a normal character
+                    else
+                    {
+                        mbAllowMoreChars = false;
+                        if( !bPreventNonHex )
+                            maInput.insertUtf32(0, uChar);
+                    }
+                    break;
+                case '+':
+                    // + already found: skip when not U, or edge case of +U+xxxx
+                    if( mbRequiresU || (maInput.indexOf("U+") == 0) )
+                        mbAllowMoreChars = false;
+                    // hex chars followed by '+' - now require a 'U'
+                    else if ( !maInput.isEmpty() )
+                        mbRequiresU = true;
+                    // treat as a normal character
+                    else
+                    {
+                        mbAllowMoreChars = false;
+                        if( !bPreventNonHex )
+                            maInput.insertUtf32(0, uChar);
+                    }
+                    break;
+                case 0:
+                    mbAllowMoreChars = false;
+                    break;
+                default:
+                    // + already found. Since not U, cancel further input
+                    if( mbRequiresU )
+                        mbAllowMoreChars = false;
+                    // maximum digits per notation is 8: only one notation
+                    else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
+                        mbAllowMoreChars = false;
+                    // maximum digits per notation is 8: previous notation found
+                    else if( maInput.indexOf("U+") == 8 )
+                        mbAllowMoreChars = false;
+                    // a hex character. Add to string.
+                    else if( isxdigit(uChar) )
+                    {
+                        mbIsHexString = true;
+                        maInput.insertUtf32(0, uChar);
+                    }
+                    // not a hex character: stop input. keep if it is the first input provided
+                    else
+                    {
+                        mbAllowMoreChars = false;
+                        if( maInput.isEmpty() )
+                            maInput.insertUtf32(0, uChar);
+                    }
+            }
+    }
+    return mbAllowMoreChars;
+}
+
+OUString ToggleUnicodeCodepoint::StringToReplace()
+{
+    if( maInput.isEmpty() )
+    {
+        //edge case - input finished with incomplete low surrogate or combining characters without a base
+        if( mbAllowMoreChars )
+        {
+            if( !maUtf16.isEmpty() )
+                maInput = maUtf16;
+            if( !maCombining.isEmpty() )
+                maInput.append(maCombining);
+        }
+        return maInput.toString();
+    }
+
+    if( !mbIsHexString )
+        return maInput.toString();
+
+    //this function potentially modifies the input string.  Prevent addition of further characters
+    mbAllowMoreChars = false;
+
+    //validate unicode notation.
+    OUStringBuffer sIn;
+    sal_uInt32 nUnicode = 0;
+    sal_Int32 nUPlus = maInput.indexOf("U+");
+    //if U+ notation used, strip off all extra chars added not in U+ notation
+    if( nUPlus != -1 )
+    {
+        maInput = maInput.copy(nUPlus);
+        sIn = maInput.copy(2);
+        nUPlus = sIn.indexOf("U+");
+    }
+    else
+        sIn = maInput;
+    while( nUPlus != -1 )
+    {
+        nUnicode = sIn.copy(0, nUPlus).toString().toUInt32(16);
+        //strip out all null or invalid Unicode values
+        if( !nUnicode || nUnicode > 0x10ffff )
+            maInput = sIn.copy(nUPlus);
+        sIn = sIn.copy(nUPlus+2);
+        nUPlus =  sIn.indexOf("U+");
+    }
+
+    nUnicode = sIn.toString().toUInt32(16);
+    if( !nUnicode || nUnicode > 0x10ffff )
+       maInput.truncate(0).append( sIn[sIn.getLength()-1] );
+    return maInput.toString();
+}
+
+sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
+{
+    OUString sIn = StringToReplace();
+    sal_Int32 nPos = 0;
+    sal_uInt32 counter = 0;
+    while( nPos < sIn.getLength() )
+    {
+        sIn.iterateCodePoints(&nPos,1);
+        ++counter;
+    }
+    return counter;
+}
+
+OUString ToggleUnicodeCodepoint::ReplacementString()
+{
+    OUString sIn = StringToReplace();
+    maOutput = "";
+    sal_Int32 nUPlus = sIn.indexOf("U+");
+    // convert from hex notation to glyph
+    if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
+    {
+        sal_uInt32 nUnicode = 0;
+        if( nUPlus == 0)
+        {
+            sIn = sIn.copy(2);
+            nUPlus = sIn.indexOf("U+");
+        }
+        while( nUPlus > 0 )
+        {
+            nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
+            maOutput.appendUtf32( nUnicode );
+
+            sIn = sIn.copy(nUPlus+2);
+            nUPlus = sIn.indexOf("U+");
+        }
+        nUnicode = sIn.toUInt32(16);
+        maOutput.appendUtf32( nUnicode );
+    }
+    // convert from glyph to hex notation
+    else
+    {
+        sal_Int32 nPos = 0;
+        while( nPos < sIn.getLength() )
+        {
+            maOutput.append( "U+" );
+            maOutput.append( OUString::number(sIn.iterateCodePoints(&nPos,1),16) );
+        }
+    }
+    return maOutput.toString();
+}
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */