diff options
author | Michael Stahl <mstahl@redhat.com> | 2016-01-21 18:11:12 +0100 |
---|---|---|
committer | Eike Rathke <erack@redhat.com> | 2016-01-25 15:54:14 +0000 |
commit | 6e193148df69eff428fec23560e9d3ad4f503597 (patch) | |
tree | 0de7eb36b0b70d09ea03e242e648bc881d10f3f8 /i18npool/source | |
parent | cb989a58ec456a340a8a1f26683847f9b2a87caa (diff) |
i18npool: handle non-BMP Unicode in cclass_Unicode::parseText()
The UTF-16 code unit limitation was mangling starmath import.
Change-Id: I087e5c5b7954799fdb73e7ee1a8d3d02669f8831
(cherry picked from commit fefd1221be844a033e409a18e05e8c6e98f6d1a7)
Reviewed-on: https://gerrit.libreoffice.org/21784
Reviewed-by: Eike Rathke <erack@redhat.com>
Tested-by: Eike Rathke <erack@redhat.com>
Diffstat (limited to 'i18npool/source')
-rw-r--r-- | i18npool/source/characterclassification/cclass_unicode_parser.cxx | 137 |
1 files changed, 74 insertions, 63 deletions
diff --git a/i18npool/source/characterclassification/cclass_unicode_parser.cxx b/i18npool/source/characterclassification/cclass_unicode_parser.cxx index 7321a7e5f19b..c00395990b63 100644 --- a/i18npool/source/characterclassification/cclass_unicode_parser.cxx +++ b/i18npool/source/characterclassification/cclass_unicode_parser.cxx @@ -351,16 +351,15 @@ const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_Unicode } -sal_Int32 cclass_Unicode::getParseTokensType( const sal_Unicode* aStr, sal_Int32 nPos ) +sal_Int32 cclass_Unicode::getParseTokensType(sal_uInt32 const c, bool const isFirst) { - sal_Unicode c = aStr[nPos]; if ( c < nDefCnt ) return pParseTokensType[ sal_uInt8(c) ]; else { //! all KParseTokens::UNI_... must be matched - switch ( u_charType( (sal_uInt32) c ) ) + switch (u_charType(c)) { case U_UPPERCASE_LETTER : return KParseTokens::UNI_UPALPHA; @@ -372,7 +371,7 @@ sal_Int32 cclass_Unicode::getParseTokensType( const sal_Unicode* aStr, sal_Int32 return KParseTokens::UNI_MODIFIER_LETTER; case U_OTHER_LETTER : // Non_Spacing_Mark could not be as leading character - if (nPos == 0) break; + if (isFirst) break; // fall through, treat it as Other_Letter. case U_NON_SPACING_MARK : return KParseTokens::UNI_OTHER_LETTER; @@ -571,14 +570,13 @@ void cclass_Unicode::destroyParserTable() } -UPT_FLAG_TYPE cclass_Unicode::getFlags( const sal_Unicode* aStr, sal_Int32 nPos ) +UPT_FLAG_TYPE cclass_Unicode::getFlags(sal_uInt32 const c) { UPT_FLAG_TYPE nMask; - sal_Unicode c = aStr[nPos]; if ( c < nDefCnt ) nMask = pTable[ sal_uInt8(c) ]; else - nMask = getFlagsExtended( aStr, nPos ); + nMask = getFlagsExtended(c); switch ( eState ) { case ssGetChar : @@ -608,9 +606,8 @@ UPT_FLAG_TYPE cclass_Unicode::getFlags( const sal_Unicode* aStr, sal_Int32 nPos } -UPT_FLAG_TYPE cclass_Unicode::getFlagsExtended( const sal_Unicode* aStr, sal_Int32 nPos ) +UPT_FLAG_TYPE cclass_Unicode::getFlagsExtended(sal_uInt32 const c) { - sal_Unicode c = aStr[nPos]; if ( c == cGroupSep ) return TOKEN_VALUE; else if ( c == cDecimalSep ) @@ -621,7 +618,7 @@ UPT_FLAG_TYPE cclass_Unicode::getFlagsExtended( const sal_Unicode* aStr, sal_Int sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes); //! all KParseTokens::UNI_... must be matched - switch ( u_charType( (sal_uInt32) c ) ) + switch (u_charType(c)) { case U_UPPERCASE_LETTER : return (nTypes & KParseTokens::UNI_UPALPHA) ? @@ -712,25 +709,29 @@ UPT_FLAG_TYPE cclass_Unicode::getContCharsFlags( sal_Unicode c ) void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType ) { + assert(r.LeadingWhiteSpace == 0); using namespace i18n; - const sal_Unicode* const pTextStart = rText.getStr() + nPos; eState = ssGetChar; //! All the variables below (plus ParseResult) have to be resetted on ssRewindFromValue! - const sal_Unicode* pSym = pTextStart; - const sal_Unicode* pSrc = pSym; OUString aSymbol; - sal_Unicode c = *pSrc; - sal_Unicode cLast = 0; + bool isFirst(true); + sal_Int32 index(nPos); // index of next code point after current + sal_Int32 postSymbolIndex(index); // index of code point following last quote + sal_uInt32 current((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0); + sal_uInt32 cLast = 0; + sal_Int32 nCodePoints(0); int nDecSeps = 0; bool bQuote = false; bool bMightBeWord = true; bool bMightBeWordLast = true; //! All the variables above (plus ParseResult) have to be resetted on ssRewindFromValue! + sal_Int32 nextCharIndex(0); // == index of nextChar - while ( (c != 0) && (eState != ssStop) ) + while ((current != 0) && (eState != ssStop)) { - UPT_FLAG_TYPE nMask = getFlags( pTextStart, pSrc - pTextStart ); + ++nCodePoints; + UPT_FLAG_TYPE nMask = getFlags(current); if ( nMask & TOKEN_EXCLUDED ) eState = ssBounce; if ( bMightBeWord ) @@ -741,8 +742,11 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 else bMightBeWord = ((nMask & TOKEN_WORD) != 0); } - sal_Int32 nParseTokensType = getParseTokensType( pTextStart, pSrc - pTextStart ); - pSrc++; + sal_Int32 nParseTokensType = getParseTokensType(current, isFirst); + isFirst = false; + sal_Int32 const nextIndex(nextCharIndex); // == index of char following current + nextCharIndex = index; // == index of nextChar + sal_uInt32 nextChar((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0); switch (eState) { case ssGetChar : @@ -755,14 +759,14 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 eState = ssGetValue; if ( nMask & TOKEN_VALUE_DIGIT ) { - if ( 128 <= c ) + if (128 <= current) r.TokenType = KParseType::UNI_NUMBER; else r.TokenType = KParseType::ASC_NUMBER; } - else if ( c == cDecimalSep ) + else if (current == cDecimalSep) { - if ( *pSrc ) + if (nextChar) ++nDecSeps; else eState = ssRewindFromValue; @@ -778,14 +782,14 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 { eState = ssGetWordFirstChar; bQuote = true; - pSym++; + postSymbolIndex = nextCharIndex; nParseTokensType = 0; // will be taken of first real character r.TokenType = KParseType::SINGLE_QUOTE_NAME; } else if ( nMask & TOKEN_CHAR_STRING ) { eState = ssGetString; - pSym++; + postSymbolIndex = nextCharIndex; nParseTokensType = 0; // will be taken of first real character r.TokenType = KParseType::DOUBLE_QUOTE_STRING; } @@ -795,8 +799,9 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 { if (eState == ssRewindFromValue) eState = ssIgnoreLeadingInRewind; - r.LeadingWhiteSpace++; - pSym++; + r.LeadingWhiteSpace = nextCharIndex - nPos; + nCodePoints--; // exclude leading whitespace + postSymbolIndex = nextCharIndex; nParseTokensType = 0; // wait until real character bMightBeWord = true; } @@ -821,16 +826,16 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 { if ( nMask & TOKEN_VALUE_DIGIT ) { - if ( 128 <= c ) + if (128 <= current) r.TokenType = KParseType::UNI_NUMBER; else if ( r.TokenType != KParseType::UNI_NUMBER ) r.TokenType = KParseType::ASC_NUMBER; } if ( nMask & TOKEN_VALUE ) { - if ( c == cDecimalSep && ++nDecSeps > 1 ) + if (current == cDecimalSep && ++nDecSeps > 1) { - if ( pSrc - pTextStart == 2 ) + if (nCodePoints == 2) eState = ssRewindFromValue; // consecutive separators else @@ -838,12 +843,12 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 } // else keep it going } - else if ( c == 'E' || c == 'e' ) + else if (current == 'E' || current == 'e') { - UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart ); + UPT_FLAG_TYPE nNext = getFlags(nextChar); if ( nNext & TOKEN_VALUE_EXP ) ; // keep it going - else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) ) + else if (bMightBeWord && ((nNext & TOKEN_WORD) || !nextChar)) { // might be a numerical name (1.2efg) eState = ssGetWord; r.TokenType = KParseType::IDENTNAME; @@ -855,10 +860,10 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 { if ( (cLast == 'E') || (cLast == 'e') ) { - UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart ); + UPT_FLAG_TYPE nNext = getFlags(nextChar); if ( nNext & TOKEN_VALUE_EXP_VALUE ) ; // keep it going - else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) ) + else if (bMightBeWord && ((nNext & TOKEN_WORD) || !nextChar)) { // might be a numerical name (1.2e+fg) eState = ssGetWord; r.TokenType = KParseType::IDENTNAME; @@ -896,15 +901,15 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 { if ( cLast == '\\' ) { // escaped - aSymbol += OUString( pSym, pSrc - pSym - 2 ); - aSymbol += OUString( &c, 1); + aSymbol += rText.copy(postSymbolIndex, nextCharIndex - postSymbolIndex - 2); + aSymbol += OUString(¤t, 1); } else { eState = ssStop; - aSymbol += OUString( pSym, pSrc - pSym - 1 ); + aSymbol += rText.copy(postSymbolIndex, nextCharIndex - postSymbolIndex - 1); } - pSym = pSrc; + postSymbolIndex = nextCharIndex; } else eState = ssStopBack; @@ -921,21 +926,23 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 { if ( cLast == '\\' ) { // escaped - aSymbol += OUString( pSym, pSrc - pSym - 2 ); - aSymbol += OUString( &c, 1); + aSymbol += rText.copy(postSymbolIndex, nextCharIndex - postSymbolIndex - 2); + aSymbol += OUString(¤t, 1); } - else if ( c == *pSrc && + else if (current == nextChar && !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) ) { // "" => literal " escaped - aSymbol += OUString( pSym, pSrc - pSym ); - pSrc++; + aSymbol += rText.copy(postSymbolIndex, nextCharIndex - postSymbolIndex); + nextCharIndex = index; + if (index < rText.getLength()) { ++nCodePoints; } + nextChar = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0; } else { eState = ssStop; - aSymbol += OUString( pSym, pSrc - pSym - 1 ); + aSymbol += rText.copy(postSymbolIndex, nextCharIndex - postSymbolIndex - 1); } - pSym = pSrc; + postSymbolIndex = nextCharIndex; } } break; @@ -956,10 +963,13 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 if ( eState == ssRewindFromValue ) { r = ParseResult(); - pSym = pTextStart; - pSrc = pSym; + index = nPos; + postSymbolIndex = nPos; + nextCharIndex = 0; aSymbol.clear(); - c = *pSrc; + current = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0; + nCodePoints = (nPos < rText.getLength()) ? 1 : 0; + isFirst = true; cLast = 0; nDecSeps = 0; bQuote = false; @@ -973,7 +983,7 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER)) && (nTokenType & KParseType::IDENTNAME) && bMightBeWord ) ; // keep a number that might be a word - else if ( r.LeadingWhiteSpace == (pSrc - pTextStart) ) + else if (r.LeadingWhiteSpace == (nextCharIndex - nPos)) ; // keep ignored white space else if ( !r.TokenType && eState == ssGetValue && (nMask & TOKEN_VALUE_SEP) ) ; // keep uncertain value @@ -987,7 +997,9 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 } if ( eState == ssStopBack ) { // put back - pSrc--; + nextChar = rText.iterateCodePoints(&index, -1); + nextCharIndex = nextIndex; + --nCodePoints; bMightBeWord = bMightBeWordLast; eState = ssStop; } @@ -999,19 +1011,18 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 r.ContFlags |= nParseTokensType; } bMightBeWordLast = bMightBeWord; - cLast = c; - c = *pSrc; + cLast = current; + current = nextChar; } } - // r.CharLen is the length in characters (not code points) of the parsed - // token not including any leading white space, change this calculation if - // multi-code-point Unicode characters are to be supported. - r.CharLen = pSrc - pTextStart - r.LeadingWhiteSpace; - r.EndPos = nPos + (pSrc - pTextStart); + // r.CharLen is the length in characters (not code units) of the parsed + // token not including any leading white space. + r.CharLen = nCodePoints; + r.EndPos = nextCharIndex; if ( r.TokenType & KParseType::ASC_NUMBER ) { - r.Value = rtl_math_uStringToDouble( pTextStart + r.LeadingWhiteSpace, - pTextStart + r.EndPos, cDecimalSep, cGroupSep, nullptr, nullptr ); + r.Value = rtl_math_uStringToDouble(rText.getStr() + nPos + r.LeadingWhiteSpace, + rText.getStr() + r.EndPos, cDecimalSep, cGroupSep, nullptr, nullptr); if ( bMightBeWord ) r.TokenType |= KParseType::IDENTNAME; } @@ -1024,8 +1035,8 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 xNatNumSup = NativeNumberSupplier::create( m_xContext ); } } - OUString aTmp( pTextStart + r.LeadingWhiteSpace, r.EndPos - nPos + - r.LeadingWhiteSpace ); + OUString aTmp(rText.getStr() + nPos + r.LeadingWhiteSpace, + r.EndPos - nPos - r.LeadingWhiteSpace); // transliterate to ASCII aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale, NativeNumberMode::NATNUM0 ); @@ -1035,9 +1046,9 @@ void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 } else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) ) { - if ( pSym < pSrc ) + if (postSymbolIndex < nextCharIndex) { //! open quote - aSymbol += OUString( pSym, pSrc - pSym ); + aSymbol += rText.copy(postSymbolIndex, nextCharIndex - postSymbolIndex - 1); r.TokenType |= KParseType::MISSING_QUOTE; } r.DequotedNameOrString = aSymbol; |