/************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * Copyright 2000, 2010 Oracle and/or its affiliates. * * OpenOffice.org - a multi-platform office productivity suite * * This file is part of OpenOffice.org. * * OpenOffice.org is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License version 3 * only, as published by the Free Software Foundation. * * OpenOffice.org is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License version 3 for more details * (a copy is included in the LICENSE file that accompanied this code). * * You should have received a copy of the GNU Lesser General Public License * version 3 along with OpenOffice.org. If not, see * * for a copy of the LGPLv3 License. * ************************************************************************/ // MARKER(update_precomp.py): autogen include statement, do not remove #include "precompiled_i18npool.hxx" #include #include #include #include #include #include "characterclassificationImpl.hxx" #include "breakiteratorImpl.hxx" #define TRANSLITERATION_ALL #include "transliteration_body.hxx" using namespace ::com::sun::star::uno; using namespace ::com::sun::star::lang; using namespace ::rtl; #define A2OU(x) OUString::createFromAscii(x) namespace com { namespace sun { namespace star { namespace i18n { Transliteration_body::Transliteration_body() { nMappingType = 0; transliterationName = "Transliteration_body"; implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body"; } sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException) { return TransliterationType::ONE_TO_ONE; } sal_Bool SAL_CALL Transliteration_body::equals( const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) throw(RuntimeException) { throw RuntimeException(); } Sequence< OUString > SAL_CALL Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 ) throw( RuntimeException) { Sequence< OUString > ostr(2); ostr[0] = str1; ostr[1] = str2; return ostr; } static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar ) { sal_uInt8 nRes = nMappingType; // take care of TOGGLE_CASE transliteration: // nMappingType should not be a combination of flags, thuse we decide now // which one to use. if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) { const sal_Int16 nType = unicode::getUnicodeType( cChar ); if (nType & 0x02 /* lower case*/) nRes = MappingTypeLowerToUpper; else { OSL_ENSURE( nType & 0x01 /* upper case */, "uppercase character expected! 'Toggle case' failed?" ); nRes = MappingTypeUpperToLower; } } return nRes; } OUString SAL_CALL Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& offset) throw(RuntimeException) { #if 0 /* Performance optimization: * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method! * getValue() needs about 15%, so there is equal balance if we trade the second * (shrinking) realloc() for a getValue(). But if the caller initializes the * sequence to nCount elements there isn't any change in size necessary in most * cases (one-to-one mapping) and we gain 33%. * * Of that constellation the getValue() method takes 20% upon each call, so 40% * for both. By remembering the first calls' results we could gain some extra * percentage again, but unfortunately getValue() may return a reference to a * static buffer, so we can't store the pointer directly but would have to * copy-construct an array, which doesn't give us any advantage. * * Much more is accomplished by working directly on the sequence buffer * returned by getArray() instead of using operator[] for each and every * access. * * And while we're at it: now that we know the size in advance we don't need to * copy the buffer anymore, just create the real string buffer and let the * return value take ownership. * * All together these changes result in the new implementation needing only 62% * of the time of the old implementation (in other words: that one was 1.61 * times slower ...) */ // Allocate the max possible buffer. Try to use stack instead of heap which // would have to be reallocated most times anyway. const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX; sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL; const sal_Unicode *in = inStr.getStr() + startPos; if (nCount > 512) out = aHeapBuf = (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode)); if (useOffset) offset.realloc(nCount * NMAPPINGMAX); sal_Int32 j = 0; for (sal_Int32 i = 0; i < nCount; i++) { Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); for (sal_Int32 k = 0; k < map.nmap; k++) { if (useOffset) offset[j] = i + startPos; out[j++] = map.map[k]; } } if (useOffset) offset.realloc(j); OUString r(out, j); if (aHeapBuf) free(aHeapBuf); return r; #else const sal_Unicode *in = inStr.getStr() + startPos; // Two different blocks to eliminate the if(useOffset) condition inside the // inner k loop. Yes, on massive use even such small things do count. if ( useOffset ) { sal_Int32 nOffCount = 0, i; for (i = 0; i < nCount; i++) { // take care of TOGGLE_CASE transliteration: sal_uInt8 nTmpMappingType = nMappingType; if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); nOffCount += map.nmap; } rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount, 1 ); // our x_rtl_ustring.h sal_Unicode* out = pStr->buffer; if ( nOffCount != offset.getLength() ) offset.realloc( nOffCount ); sal_Int32 j = 0; sal_Int32 * pArr = offset.getArray(); for (i = 0; i < nCount; i++) { // take care of TOGGLE_CASE transliteration: sal_uInt8 nTmpMappingType = nMappingType; if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); for (sal_Int32 k = 0; k < map.nmap; k++) { pArr[j] = i + startPos; out[j++] = map.map[k]; } } out[j] = 0; return OUString( pStr, SAL_NO_ACQUIRE ); } else { // In the simple case of no offset sequence used we can eliminate the // first getValue() loop. We could also assume that most calls result // in identical string lengths, thus using a preallocated // OUStringBuffer could be an easy way to assemble the return string // without too much hassle. However, for single characters the // OUStringBuffer::append() method is quite expensive compared to a // simple array operation, so it pays here to copy the final result // instead. // Allocate the max possible buffer. Try to use stack instead of heap, // which would have to be reallocated most times anyways. const sal_Int32 nLocalBuf = 2048; sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL; if ( nCount > nLocalBuf ) out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ]; sal_Int32 j = 0; for ( sal_Int32 i = 0; i < nCount; i++) { // take care of TOGGLE_CASE transliteration: sal_uInt8 nTmpMappingType = nMappingType; if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); for (sal_Int32 k = 0; k < map.nmap; k++) { out[j++] = map.map[k]; } } OUString aRet( out, j ); if ( pHeapBuf ) delete [] pHeapBuf; return aRet; } #endif } OUString SAL_CALL Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException) { const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap, 1 ); // our x_rtl_ustring.h sal_Unicode* out = pStr->buffer; sal_Int32 i; for (i = 0; i < map.nmap; i++) out[i] = map.map[i]; out[i] = 0; return OUString( pStr, SAL_NO_ACQUIRE ); } sal_Unicode SAL_CALL Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException) { const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); if (map.nmap > 1) throw MultipleCharsOutputException(); return map.map[0]; } OUString SAL_CALL Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& offset) throw(RuntimeException) { return this->transliterate(inStr, startPos, nCount, offset); } Transliteration_casemapping::Transliteration_casemapping() { nMappingType = 0; transliterationName = "casemapping(generic)"; implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping"; } void SAL_CALL Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale ) { nMappingType = rMappingType; aLocale = rLocale; } Transliteration_u2l::Transliteration_u2l() { nMappingType = MappingTypeUpperToLower; transliterationName = "upper_to_lower(generic)"; implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l"; } Transliteration_l2u::Transliteration_l2u() { nMappingType = MappingTypeLowerToUpper; transliterationName = "lower_to_upper(generic)"; implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u"; } Transliteration_togglecase::Transliteration_togglecase() { // usually nMappingType must NOT be a combiantion of different flages here, // but we take care of that problem in Transliteration_body::transliterate above // before that value is used. There we will decide which of both is to be used on // a per character basis. nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower; transliterationName = "toggle(generic)"; implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; } Transliteration_titlecase::Transliteration_titlecase() { nMappingType = MappingTypeToTitle; transliterationName = "title(generic)"; implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase"; } rtl::OUString SAL_CALL Transliteration_titlecase::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& /*offset*/ ) throw(RuntimeException) { Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory(); CharacterClassificationImpl aCharClassImpl( xMSF ); // possible problem: the locale is not exactly specific for each word in the text... OUString aRes( aCharClassImpl.toTitle( inStr, startPos, nCount, aLocale ) ); return aRes; } Transliteration_sentencecase::Transliteration_sentencecase() { nMappingType = MappingTypeToTitle; // though only to be applied to the first word... transliterationName = "sentence(generic)"; implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase"; } rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& offset ) throw(RuntimeException) { // inspired from Transliteration_body::transliterate sal_Int32 nOffCount = 0, i; bool bPoint = true; if (useOffset) { for( i = 0; i < nCount; ++i ) { sal_Unicode c = inStr.getStr()[ i + startPos ]; if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) { bPoint = true; nOffCount++; } else if( unicode::isAlpha( c ) || unicode::isDigit( c ) ) { const Mapping* map = 0; if( bPoint && unicode::isLower( c )) { map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper); bPoint = false; } else if (!bPoint && unicode::isUpper( c )) { map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower); } if(map == 0) { nOffCount++; } else { nOffCount += map->nmap; } } else { nOffCount++; } } } bPoint = true; rtl::OUStringBuffer result; if (useOffset) { result.ensureCapacity(nOffCount); if ( nOffCount != offset.getLength() ) offset.realloc( nOffCount ); } sal_Int32 j = 0; sal_Int32 * pArr = offset.getArray(); for( i = 0; i < nCount; ++i ) { sal_Unicode c = inStr.getStr()[ i + startPos ]; if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) { bPoint = true; result.append(c); pArr[j++] = i + startPos; } else if( unicode::isAlpha( c ) || unicode::isDigit( c ) ) { const Mapping* map = 0; if( bPoint && unicode::isLower( c )) { map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper); } else if (!bPoint && unicode::isUpper( c )) { map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower); } if(map == 0) { result.append( c ); pArr[j++] = i + startPos; } else { for (sal_Int32 k = 0; k < map->nmap; k++) { result.append( map->map[k] ); pArr[j++] = i + startPos; } } bPoint = false; } else { result.append( c ); pArr[j++] = i + startPos; } } return result.makeStringAndClear(); } #if 0 // TL: alternative implemntation try. But breakiterator has its problem too since // beginOfSentence does not work as expected with '.'. See comment below. // For the time being I will leave this code here as a from-scratch sample if the // breakiterator works better at some point... rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate( const OUString& inStr, sal_Int32 nStartPos, sal_Int32 nCount, Sequence< sal_Int32 >& /*offset*/ ) throw(RuntimeException) { OUString aRes( inStr.copy( nStartPos, nCount ) ); if (nStartPos >= 0 && nStartPos < inStr.getLength() && nCount > 0) { Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory(); BreakIteratorImpl brk( xMSF ); sal_Int32 nSentenceStart = -1, nOldSentenceStart = -1; sal_Int32 nPos = nStartPos + nCount - 1; while (nPos >= nStartPos && nPos != -1) { // possible problem: the locale is not exactly specific for each sentence in the text, // but it is the only one we have... nOldSentenceStart = nSentenceStart; nSentenceStart = brk.beginOfSentence( inStr, nPos, aLocale ); // since the breakiterator completely ignores '.' characvters as end-of-sentence when // the next word is lower case we need to take care of that ourself. The drawback: // la mid-sentence abbreviation like e.g. will now be identified as end-of-sentence. :-( // Well, at least the other product does it in the same way... sal_Int32 nFullStopPos = inStr.lastIndexOf( (sal_Unicode)'.', nPos ); nPos = nSentenceStart; if (nFullStopPos > 0 && nFullStopPos > nSentenceStart) { Boundary aBd2 = brk.nextWord( inStr, nFullStopPos, aLocale, WordType::DICTIONARY_WORD ); nSentenceStart = aBd2.startPos; nPos = nFullStopPos; } if (nSentenceStart < nOldSentenceStart || nOldSentenceStart == -1) { // the sentence start might be a quotation mark or some kind of bracket, thus // we need the first dictionary word starting or following this position // Boundary aBd1 = brk.nextWord( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD ); Boundary aBd2 = brk.getWordBoundary( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD, true ); // OUString aWord1( inStr.copy( aBd1.startPos, aBd1.endPos - aBd1.startPos + 1 ) ); OUString aWord2( inStr.copy( aBd2.startPos, aBd2.endPos - aBd2.startPos + 1 ) ); } else break; // prevent endless loop // continue with previous sentence if (nPos != -1) --nPos; } } return aRes; } #endif } } } }