diff options
Diffstat (limited to 'i18npool/source/transliteration/transliteration_body.cxx')
-rwxr-xr-x | i18npool/source/transliteration/transliteration_body.cxx | 473 |
1 files changed, 473 insertions, 0 deletions
diff --git a/i18npool/source/transliteration/transliteration_body.cxx b/i18npool/source/transliteration/transliteration_body.cxx new file mode 100755 index 000000000000..43222b7a41eb --- /dev/null +++ b/i18npool/source/transliteration/transliteration_body.cxx @@ -0,0 +1,473 @@ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * <http://www.openoffice.org/license.html> + * for a copy of the LGPLv3 License. + * + ************************************************************************/ + +// MARKER(update_precomp.py): autogen include statement, do not remove +#include "precompiled_i18npool.hxx" + +#include <rtl/ustrbuf.hxx> +#include <i18nutil/casefolding.hxx> +#include <i18nutil/unicode.hxx> + +#include <comphelper/processfactory.hxx> +#include <osl/diagnose.h> + +#include <string.h> + +#include "characterclassificationImpl.hxx" +#include "breakiteratorImpl.hxx" + +#define TRANSLITERATION_ALL +#include "transliteration_body.hxx" + +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::lang; +using namespace ::rtl; + +#define A2OU(x) OUString::createFromAscii(x) + +namespace com { namespace sun { namespace star { namespace i18n { + + +Transliteration_body::Transliteration_body() +{ + nMappingType = 0; + transliterationName = "Transliteration_body"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body"; +} + +sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException) +{ + return TransliterationType::ONE_TO_ONE; +} + +sal_Bool SAL_CALL Transliteration_body::equals( + const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, + const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) + throw(RuntimeException) +{ + throw RuntimeException(); +} + +Sequence< OUString > SAL_CALL +Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 ) + throw( RuntimeException) +{ + Sequence< OUString > ostr(2); + ostr[0] = str1; + ostr[1] = str2; + return ostr; +} + + +static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar ) +{ + sal_uInt8 nRes = nMappingType; + + // take care of TOGGLE_CASE transliteration: + // nMappingType should not be a combination of flags, thuse we decide now + // which one to use. + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + { + const sal_Int16 nType = unicode::getUnicodeType( cChar ); + if (nType & 0x02 /* lower case*/) + nRes = MappingTypeLowerToUpper; + else + { + // should also work properly for non-upper characters like white spacs, numbers, ... + nRes = MappingTypeUpperToLower; + } + } + + return nRes; +} + + +OUString SAL_CALL +Transliteration_body::transliterate( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >& offset) + throw(RuntimeException) +{ +#if 0 +/* Performance optimization: + * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method! + * getValue() needs about 15%, so there is equal balance if we trade the second + * (shrinking) realloc() for a getValue(). But if the caller initializes the + * sequence to nCount elements there isn't any change in size necessary in most + * cases (one-to-one mapping) and we gain 33%. + * + * Of that constellation the getValue() method takes 20% upon each call, so 40% + * for both. By remembering the first calls' results we could gain some extra + * percentage again, but unfortunately getValue() may return a reference to a + * static buffer, so we can't store the pointer directly but would have to + * copy-construct an array, which doesn't give us any advantage. + * + * Much more is accomplished by working directly on the sequence buffer + * returned by getArray() instead of using operator[] for each and every + * access. + * + * And while we're at it: now that we know the size in advance we don't need to + * copy the buffer anymore, just create the real string buffer and let the + * return value take ownership. + * + * All together these changes result in the new implementation needing only 62% + * of the time of the old implementation (in other words: that one was 1.61 + * times slower ...) + */ + + // Allocate the max possible buffer. Try to use stack instead of heap which + // would have to be reallocated most times anyway. + const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX; + sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL; + + const sal_Unicode *in = inStr.getStr() + startPos; + + if (nCount > 512) + out = aHeapBuf = (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode)); + + if (useOffset) + offset.realloc(nCount * NMAPPINGMAX); + sal_Int32 j = 0; + for (sal_Int32 i = 0; i < nCount; i++) { + Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); + for (sal_Int32 k = 0; k < map.nmap; k++) { + if (useOffset) + offset[j] = i + startPos; + out[j++] = map.map[k]; + } + } + if (useOffset) + offset.realloc(j); + + OUString r(out, j); + + if (aHeapBuf) + free(aHeapBuf); + + return r; +#else + const sal_Unicode *in = inStr.getStr() + startPos; + + // Two different blocks to eliminate the if(useOffset) condition inside the + // inner k loop. Yes, on massive use even such small things do count. + if ( useOffset ) + { + sal_Int32 nOffCount = 0, i; + for (i = 0; i < nCount; i++) + { + // take care of TOGGLE_CASE transliteration: + sal_uInt8 nTmpMappingType = nMappingType; + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); + nOffCount += map.nmap; + } + rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount, 1 ); // our x_rtl_ustring.h + sal_Unicode* out = pStr->buffer; + + if ( nOffCount != offset.getLength() ) + offset.realloc( nOffCount ); + + sal_Int32 j = 0; + sal_Int32 * pArr = offset.getArray(); + for (i = 0; i < nCount; i++) + { + // take care of TOGGLE_CASE transliteration: + sal_uInt8 nTmpMappingType = nMappingType; + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); + for (sal_Int32 k = 0; k < map.nmap; k++) + { + pArr[j] = i + startPos; + out[j++] = map.map[k]; + } + } + out[j] = 0; + + return OUString( pStr, SAL_NO_ACQUIRE ); + } + else + { + // In the simple case of no offset sequence used we can eliminate the + // first getValue() loop. We could also assume that most calls result + // in identical string lengths, thus using a preallocated + // OUStringBuffer could be an easy way to assemble the return string + // without too much hassle. However, for single characters the + // OUStringBuffer::append() method is quite expensive compared to a + // simple array operation, so it pays here to copy the final result + // instead. + + // Allocate the max possible buffer. Try to use stack instead of heap, + // which would have to be reallocated most times anyways. + const sal_Int32 nLocalBuf = 2048; + sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL; + if ( nCount > nLocalBuf ) + out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ]; + + sal_Int32 j = 0; + for ( sal_Int32 i = 0; i < nCount; i++) + { + // take care of TOGGLE_CASE transliteration: + sal_uInt8 nTmpMappingType = nMappingType; + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); + for (sal_Int32 k = 0; k < map.nmap; k++) + { + out[j++] = map.map[k]; + } + } + + OUString aRet( out, j ); + if ( pHeapBuf ) + delete [] pHeapBuf; + return aRet; + } +#endif +} + +OUString SAL_CALL +Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException) +{ + const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); + rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap, 1 ); // our x_rtl_ustring.h + sal_Unicode* out = pStr->buffer; + sal_Int32 i; + + for (i = 0; i < map.nmap; i++) + out[i] = map.map[i]; + out[i] = 0; + + return OUString( pStr, SAL_NO_ACQUIRE ); +} + +sal_Unicode SAL_CALL +Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException) +{ + const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); + if (map.nmap > 1) + throw MultipleCharsOutputException(); + return map.map[0]; +} + +OUString SAL_CALL +Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >& offset) throw(RuntimeException) +{ + return this->transliterate(inStr, startPos, nCount, offset); +} + +Transliteration_casemapping::Transliteration_casemapping() +{ + nMappingType = 0; + transliterationName = "casemapping(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping"; +} + +void SAL_CALL +Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale ) +{ + nMappingType = rMappingType; + aLocale = rLocale; +} + +Transliteration_u2l::Transliteration_u2l() +{ + nMappingType = MappingTypeUpperToLower; + transliterationName = "upper_to_lower(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l"; +} + +Transliteration_l2u::Transliteration_l2u() +{ + nMappingType = MappingTypeLowerToUpper; + transliterationName = "lower_to_upper(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u"; +} + +Transliteration_togglecase::Transliteration_togglecase() +{ + // usually nMappingType must NOT be a combiantion of different flages here, + // but we take care of that problem in Transliteration_body::transliterate above + // before that value is used. There we will decide which of both is to be used on + // a per character basis. + nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower; + transliterationName = "toggle(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; +} + +Transliteration_titlecase::Transliteration_titlecase() +{ + nMappingType = MappingTypeToTitle; + transliterationName = "title(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase"; +} + +#if 0 +struct LigatureData +{ + sal_uInt32 cChar; + sal_Char * pUtf8Text; +}; + +// available Unicode ligatures: +// http://www.unicode.org/charts +// http://www.unicode.org/charts/PDF/UFB00.pdf +static LigatureData aLigatures[] = +{ + { 0x0FB00, "ff" }, + { 0x0FB01, "fi" }, + { 0x0FB02, "fl" }, + { 0x0FB03, "ffi" }, + { 0x0FB04, "ffl" }, + { 0x0FB05, "ft" }, + { 0x0FB06, "st" }, + + { 0x0FB13, "\xD5\xB4\xD5\xB6" }, // Armenian small men now + { 0x0FB14, "\xD5\xB4\xD5\xA5" }, // Armenian small men ech + { 0x0FB15, "\xD5\xB4\xD5\xAB" }, // Armenian small men ini + { 0x0FB16, "\xD5\xBE\xD5\xB6" }, // Armenian small vew now + { 0x0FB17, "\xD5\xB4\xD5\xAD" }, // Armenian small men xeh + { 0x00000, "" } +}; + +static inline bool lcl_IsLigature( sal_uInt32 cChar ) +{ + return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17); +} + +static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar ) +{ + rtl::OUString aRes; + if (lcl_IsLigature( cChar )) + { + LigatureData *pFound = NULL; + LigatureData *pData = aLigatures; + while (!pFound && pData->cChar != 0) + { + if (pData->cChar == cChar) + pFound = pData; + ++pData; + } + if (pFound) + aRes = rtl::OUString( pFound->pUtf8Text, strlen( pFound->pUtf8Text ), RTL_TEXTENCODING_UTF8 ); + } + else + aRes = rtl::OUString( &cChar, 1 ); + return aRes; +} +#endif // if 0 + +static rtl::OUString transliterate_titlecase_Impl( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + const Locale &rLocale, + Sequence< sal_Int32 >& offset ) + throw(RuntimeException) +{ + const OUString aText( inStr.copy( startPos, nCount ) ); + + OUString aRes; + if (aText.getLength() > 0) + { + Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory(); + CharacterClassificationImpl aCharClassImpl( xMSF ); + + // because aCharClassImpl.toTitle does not handle ligatures or but will raise + // an exception we need to handle the first chara manually... + + // we don't want to change surrogates by accident, thuse we use proper code point iteration + sal_Int32 nPos = 0; + sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos ); + OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) ); + // toUpper can be used to properly resolve ligatures and characters like + aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); + // since toTitle will leave all-uppercase text unchanged we first need to + // use toLower to bring possible 2nd and following charas in lowercase + aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); + sal_Int32 nResolvedLen = aResolvedLigature.getLength(); + + // now we can properly use toTitle to get the expected result for the resolved string. + // The rest of the text should just become lowercase. + aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale ); + aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale ); + offset.realloc( aRes.getLength() ); + + sal_Int32 *pOffset = offset.getArray(); + sal_Int32 nLen = offset.getLength(); + for (sal_Int32 i = 0; i < nLen; ++i) + { + sal_Int32 nIdx = 0; + if (i >= nResolvedLen) + nIdx = i - nResolvedLen + 1; + pOffset[i] = nIdx; + } + } +#if OSL_DEBUG_LEVEL > 1 + const sal_Int32 *pCOffset = offset.getConstArray(); + (void) pCOffset; +#endif + + return aRes; +} + + +// this function expects to be called on a word-by-word basis, +// namely that startPos points to the first char of the word +rtl::OUString SAL_CALL Transliteration_titlecase::transliterate( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >& offset ) + throw(RuntimeException) +{ + return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); +} + + +Transliteration_sentencecase::Transliteration_sentencecase() +{ + nMappingType = MappingTypeToTitle; // though only to be applied to the first word... + transliterationName = "sentence(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase"; +} + + +// this function expects to be called on a sentence-by-sentence basis, +// namely that startPos points to the first word (NOT first char!) in the sentence +rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >& offset ) + throw(RuntimeException) +{ + return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); +} + + +} } } } + |