diff options
author | Mike Kaganski <mike.kaganski@collabora.com> | 2023-03-15 16:30:26 +0300 |
---|---|---|
committer | Mike Kaganski <mike.kaganski@collabora.com> | 2023-03-22 04:31:07 +0000 |
commit | 458f2b8f69fbf0a220f18fbf250e4369fd0491cd (patch) | |
tree | bbe4d94245e88fec598e652628c1dc0d7760967d /basic | |
parent | 1cfeb4bd8ce7f7727a81136bd3e2d6ebea976895 (diff) |
Refactor and fix VBA StrConv
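This properly handles embedded null bytes, which are expected
when converting between byte strings and Unicode (UTF-16):
buffers are now passed with an explicit length instead of being
treated as null-terminated C strings.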
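It properly handles TransliterationFlags, which are
not a bitset: the requested transliterations are now collected
and applied one after another instead of being OR-ed into a
single flags value.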
In vbProperCase, it uses the correct method to
lowercase the string (CharClass::lowercase instead of
toAsciiLowerCase), working not only with ASCII.
Change-Id: I04e8cdca66ef9863a6516b15205a2a543ed97680
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/149224
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
Diffstat (limited to 'basic')
-rw-r--r-- | basic/qa/vba_tests/strconv.vb | 21 | ||||
-rw-r--r-- | basic/source/runtime/methods.cxx | 210 |
2 files changed, 124 insertions, 107 deletions
diff --git a/basic/qa/vba_tests/strconv.vb b/basic/qa/vba_tests/strconv.vb index b0295df428b2..9b7dfaf21866 100644 --- a/basic/qa/vba_tests/strconv.vb +++ b/basic/qa/vba_tests/strconv.vb @@ -18,25 +18,30 @@ End Function Sub verify_testStrConv() On Error GoTo errorHandler - TestUtil.AssertEqual(StrConv("abc EFG hij", vbUpperCase), "ABC EFG HIJ", "StrConv(""abc EFG hij"", vbUpperCase)") - TestUtil.AssertEqual(StrConv("abc EFG hij", vbLowerCase), "abc efg hij", "StrConv(""abc EFG hij"", vbLowerCase)") - TestUtil.AssertEqual(StrConv("abc EFG hij", vbProperCase), "Abc Efg Hij", "StrConv(""abc EFG hij"", vbProperCase)") + TestUtil.AssertEqual(StrConv("abc EFG hij αβγ ΔΕΖ ηθι", vbUpperCase), "ABC EFG HIJ ΑΒΓ ΔΕΖ ΗΘΙ", "StrConv(""abc EFG hij αβγ ΔΕΖ ηθι"", vbUpperCase)") + TestUtil.AssertEqual(StrConv("abc EFG hij αβγ ΔΕΖ ηθι", vbLowerCase), "abc efg hij αβγ δεζ ηθι", "StrConv(""abc EFG hij αβγ ΔΕΖ ηθι"", vbLowerCase)") + TestUtil.AssertEqual(StrConv("abc EFG hij αβγ ΔΕΖ ηθι", vbProperCase), "Abc Efg Hij Αβγ Δεζ Ηθι", "StrConv(""abc EFG hij αβγ ΔΕΖ ηθι"", vbProperCase)") ' Converts narrow (single-byte) characters in string to wide TestUtil.AssertEqual(StrConv("ABCDEVB¥ì¥¹¥¥å©", vbWide), "ABCDEVB¥ì¥¹¥¥å©", "StrConv(""ABCDEVB¥ì¥¹¥¥å©"", vbWide)") + TestUtil.AssertEqual(StrConv("ABCDEVB¥ì¥¹¥¥å©", vbWide + vbLowerCase), "abcdevb¥ì¥¹¥¥å©", "StrConv(""ABCDEVB¥ì¥¹¥¥å©"", vbWide + vbLowerCase)") ' Converts wide (double-byte) characters in string to narrow (single-byte) characters TestUtil.AssertEqual(StrConv("ABCD@$%23'?EG", vbNarrow), "ABCD@$%23'?EG", "StrConv(""ABCD@$%23'?EG"", vbNarrow)") + TestUtil.AssertEqual(StrConv("ABCD@$%23'?EG", vbNarrow + vbLowerCase), "abcd@$%23'?eg", "StrConv(""ABCD@$%23'?EG"", vbNarrow + vbLowerCase)") ' Converts Hiragana characters in string to Katakana characters TestUtil.AssertEqual(StrConv("かたかな", vbKatakana), "カタカナ", "StrConv(""かたかな"", vbKatakana)") + TestUtil.AssertEqual(StrConv("かたかな abc", vbKatakana + vbUpperCase + vbWide), "カタカナ ABC", 
"StrConv(""かたかな abc"", vbKatakana + vbUpperCase + vbWide)") ' Converts Katakana characters in string to Hiragana characters TestUtil.AssertEqual(StrConv("カタカナ", vbHiragana), "かたかな", "StrConv(""カタカナ"", vbHiragana)") + TestUtil.AssertEqual(StrConv("カタカナ ABC", vbLowerCase + vbNarrow), "カタカナ abc", "StrConv(""カタカナ ABC"", vbLowerCase + vbNarrow)") - ' Assumes CP-1252 encoding associated with en-US locale used in unit tests. Dim x() As Byte - x = StrConv("ÉϺ£ÊÐABC", vbFromUnicode) + Const Cp1252TestString = "ÉϺ£ÊÐABC" + + x = StrConv(Cp1252TestString, vbFromUnicode, &h0409)' CP-1252 encoding associated with en-US locale TestUtil.AssertEqual(UBound(x), 8, "UBound(x)") TestUtil.AssertEqual(x(0), 201, "x(0)") TestUtil.AssertEqual(x(1), 207, "x(1)") @@ -47,7 +52,11 @@ Sub verify_testStrConv() TestUtil.AssertEqual(x(6), 65, "x(6)") TestUtil.AssertEqual(x(7), 66, "x(7)") TestUtil.AssertEqual(x(8), 67, "x(8)") - TestUtil.AssertEqual(StrConv(x, vbUnicode), "ÉϺ£ÊÐABC", "StrConv(x, vbUnicode)") + TestUtil.AssertEqual(StrConv(x, vbUnicode, &h0409), Cp1252TestString, "StrConv(x, vbUnicode, &h0409)") + + x = StrConv(Cp1252TestString, vbUnicode, &h0409) + TestUtil.AssertEqual(UBound(x), 35, "UBound(x)") + TestUtil.AssertEqual(StrConv(x, vbFromUnicode, &h0409), Cp1252TestString, "StrConv(x, vbFromUnicode, &h0409)") Exit Sub errorHandler: diff --git a/basic/source/runtime/methods.cxx b/basic/source/runtime/methods.cxx index b6d9383d5b37..0b4f89d40115 100644 --- a/basic/source/runtime/methods.cxx +++ b/basic/source/runtime/methods.cxx @@ -59,6 +59,7 @@ #include <com/sun/star/lang/XServiceInfo.hpp> #include <com/sun/star/ucb/SimpleFileAccess.hpp> #include <com/sun/star/script/XErrorQuery.hpp> +#include <ooo/vba/VbStrConv.hpp> #include <ooo/vba/VbTriState.hpp> #include <com/sun/star/bridge/oleautomation/XAutomationObject.hpp> #include <memory> @@ -4084,6 +4085,40 @@ void SbRtl_QBColor(StarBASIC *, SbxArray & rPar, bool) rPar.Get(0)->PutLong(nRGB); } +static std::vector<sal_uInt8> 
byteArray2Vec(SbxArray* pArr) +{ + std::vector<sal_uInt8> result; + if (pArr) + { + const sal_uInt32 nCount = pArr->Count(); + result.reserve(nCount + 1); // to avoid reallocation when padding in vbFromUnicode + for (sal_uInt32 i = 0; i < nCount; i++) + result.push_back(pArr->Get(i)->GetByte()); + } + return result; +} + +// Makes sure to get the byte array if passed, or the string converted to the bytes using +// StringToByteArray in basic/source/sbx/sbxstr.cxx +static std::vector<sal_uInt8> getByteArray(SbxValue& val) +{ + if (val.GetFullType() == SbxOBJECT) + if (auto pObj = val.GetObject()) + if (pObj->GetType() == (SbxARRAY | SbxBYTE)) + if (auto pArr = dynamic_cast<SbxArray*>(pObj)) + return byteArray2Vec(pArr); + + // Convert to string + tools::SvRef<SbxValue> pStringValue(new SbxValue(SbxSTRING)); + *pStringValue = val; + + // Convert string to byte array + tools::SvRef<SbxValue> pValue(new SbxValue(SbxOBJECT)); + pValue->PutObject(new SbxArray(SbxBYTE)); + *pValue = *pStringValue; // Does the magic of conversion of strings to byte arrays + return byteArray2Vec(dynamic_cast<SbxArray*>(pValue->GetObject())); +} + // StrConv(string, conversion, LCID) void SbRtl_StrConv(StarBASIC *, SbxArray & rPar, bool) { @@ -4094,7 +4129,6 @@ void SbRtl_StrConv(StarBASIC *, SbxArray & rPar, bool) return; } - OUString aOldStr = rPar.Get(1)->GetOUString(); sal_Int32 nConversion = rPar.Get(2)->GetLong(); LanguageType nLanguage = LANGUAGE_SYSTEM; if (nArgCount == 3) @@ -4102,127 +4136,101 @@ void SbRtl_StrConv(StarBASIC *, SbxArray & rPar, bool) sal_Int32 lcid = rPar.Get(3)->GetLong(); nLanguage = LanguageType(lcid); } - OUString sLanguage = LanguageTag(nLanguage).getLanguage(); - rtl_TextEncoding encodingVal = utl_getWinTextEncodingFromLangStr(sLanguage); - sal_Int32 nOldLen = aOldStr.getLength(); - if( nOldLen == 0 ) + if (nConversion == ooo::vba::VbStrConv::vbUnicode) // This mode does not combine { - // null string,return - rPar.Get(0)->PutString(aOldStr); + // Assume that 
the passed byte array is encoded in the defined encoding, convert to + // UTF-16 and store as string. Passed strings are converted to byte array first. + auto inArray = getByteArray(*rPar.Get(1)); + std::string_view s(reinterpret_cast<char*>(inArray.data()), inArray.size() / sizeof(char)); + const auto encoding = utl_getWinTextEncodingFromLangStr(LanguageTag(nLanguage).getBcp47()); + OUString aOUStr = OStringToOUString(s, encoding); + rPar.Get(0)->PutString(aOUStr); return; } - TransliterationFlags nType = TransliterationFlags::NONE; - if ( (nConversion & 0x03) == 3 ) // vbProperCase - { - const CharClass& rCharClass = GetCharClass(); - aOldStr = rCharClass.titlecase( aOldStr.toAsciiLowerCase(), 0, nOldLen ); - } - else if ( (nConversion & 0x01) == 1 ) // vbUpperCase - { - nType |= TransliterationFlags::LOWERCASE_UPPERCASE; - } - else if ( (nConversion & 0x02) == 2 ) // vbLowerCase - { - nType |= TransliterationFlags::UPPERCASE_LOWERCASE; - } - if ( (nConversion & 0x04) == 4 ) // vbWide - { - nType |= TransliterationFlags::HALFWIDTH_FULLWIDTH; - } - else if ( (nConversion & 0x08) == 8 ) // vbNarrow - { - nType |= TransliterationFlags::FULLWIDTH_HALFWIDTH; - } - if ( (nConversion & 0x10) == 16) // vbKatakana + if (nConversion == ooo::vba::VbStrConv::vbFromUnicode) // This mode does not combine { - nType |= TransliterationFlags::HIRAGANA_KATAKANA; - } - else if ( (nConversion & 0x20) == 32 ) // vbHiragana - { - nType |= TransliterationFlags::KATAKANA_HIRAGANA; - } - OUString aNewStr( aOldStr ); - if( nType != TransliterationFlags::NONE ) - { - uno::Reference< uno::XComponentContext > xContext = getProcessComponentContext(); - ::utl::TransliterationWrapper aTransliterationWrapper( xContext, nType ); - uno::Sequence<sal_Int32> aOffsets; - aTransliterationWrapper.loadModuleIfNeeded( nLanguage ); - aNewStr = aTransliterationWrapper.transliterate( aOldStr, nLanguage, 0, nOldLen, &aOffsets ); - } + // Assume that the passed byte array is UTF-16-encoded (system-endian), 
convert to specified + // encoding and store as byte array. Passed strings are converted to byte array first. + auto inArray = getByteArray(*rPar.Get(1)); + while (inArray.size() % sizeof(sal_Unicode)) + inArray.push_back('\0'); + std::u16string_view s(reinterpret_cast<sal_Unicode*>(inArray.data()), + inArray.size() / sizeof(sal_Unicode)); + const auto encoding = utl_getWinTextEncodingFromLangStr(LanguageTag(nLanguage).getBcp47()); + OString aOStr = OUStringToOString(s, encoding); + const sal_Int32 lb = IsBaseIndexOne() ? 1 : 0; + const sal_Int32 ub = lb + aOStr.getLength() - 1; + SbxDimArray* pArray = new SbxDimArray(SbxBYTE); + pArray->unoAddDim(lb, ub); - if ( (nConversion & 0x40) == 64 ) // vbUnicode - { - // convert the string to byte string, preserving unicode (2 bytes per character) - sal_Int32 nSize = aNewStr.getLength()*2; - const sal_Unicode* pSrc = aNewStr.getStr(); - std::unique_ptr<char[]> pChar(new char[nSize+1]); - for( sal_Int32 i=0; i < nSize; i++ ) + for (sal_Int32 i = 0; i < aOStr.getLength(); ++i) { - pChar[i] = static_cast< char >( (i%2) ? ((*pSrc) >> 8) & 0xff : (*pSrc) & 0xff ); - if( i%2 ) - { - pSrc++; - } + SbxVariable* pNew = new SbxVariable(SbxBYTE); + pNew->PutByte(aOStr[i]); + pArray->Put(pNew, i); } - pChar[nSize] = '\0'; - std::string_view aOStr(pChar.get()); - // there is no concept about default codepage in unix. so it is incorrectly in unix - OUString aOUStr = OStringToOUString(aOStr, encodingVal); - rPar.Get(0)->PutString(aOUStr); + SbxVariable* retVar = rPar.Get(0); + SbxFlagBits nFlags = retVar->GetFlags(); + retVar->ResetFlag(SbxFlagBits::Fixed); + retVar->PutObject(pArray); + retVar->SetFlags(nFlags); + retVar->SetParameters(nullptr); return; } - else if ( (nConversion & 0x80) == 128 ) // vbFromUnicode + + std::vector<TransliterationFlags> aTranslitSet; + auto check = [&nConversion, &aTranslitSet](sal_Int32 conv, TransliterationFlags flag) { - // there is no concept about default codepage in unix. 
so it is incorrectly in unix - OString aOStr = OUStringToOString(aNewStr, encodingVal); - const char* pChar = aOStr.getStr(); - sal_Int32 nArraySize = aOStr.getLength(); - SbxDimArray* pArray = new SbxDimArray(SbxBYTE); - bool bIncIndex = IsBaseIndexOne(); - if(nArraySize) + if ((nConversion & conv) != conv) + return false; + + aTranslitSet.push_back(flag); + nConversion &= ~conv; + return true; + }; + + // Check mutually exclusive bits together + + if (!check(ooo::vba::VbStrConv::vbProperCase, TransliterationFlags::TITLE_CASE)) + if (!check(ooo::vba::VbStrConv::vbUpperCase, TransliterationFlags::LOWERCASE_UPPERCASE)) + check(ooo::vba::VbStrConv::vbLowerCase, TransliterationFlags::UPPERCASE_LOWERCASE); + + if (!check(ooo::vba::VbStrConv::vbWide, TransliterationFlags::HALFWIDTH_FULLWIDTH)) + check(ooo::vba::VbStrConv::vbNarrow, TransliterationFlags::FULLWIDTH_HALFWIDTH); + + if (!check(ooo::vba::VbStrConv::vbKatakana, TransliterationFlags::HIRAGANA_KATAKANA)) + check(ooo::vba::VbStrConv::vbHiragana, TransliterationFlags::KATAKANA_HIRAGANA); + + if (nConversion) // unknown / incorrectly combined bits + return StarBASIC::Error(ERRCODE_BASIC_BAD_ARGUMENT); + + OUString aStr = rPar.Get(1)->GetOUString(); + if (!aStr.isEmpty() && !aTranslitSet.empty()) + { + uno::Reference< uno::XComponentContext > xContext = getProcessComponentContext(); + + for (auto transliterationFlag : aTranslitSet) { - if( bIncIndex ) + if (transliterationFlag == TransliterationFlags::TITLE_CASE) { - pArray->AddDim(1, nArraySize); + // TransliterationWrapper only handles the first character of the passed string + // when handling TITLE_CASE; see Transliteration_titlecase::transliterateImpl in + // i18npool/source/transliteration/transliteration_body.cxx + CharClass aCharClass{ xContext, LanguageTag(nLanguage) }; + aStr = aCharClass.titlecase(aCharClass.lowercase(aStr)); } else { - pArray->AddDim(0, nArraySize - 1); - } - } - else - { - pArray->unoAddDim(0, -1); - } - - for( sal_Int32 i=0; i< 
nArraySize; i++) - { - SbxVariable* pNew = new SbxVariable( SbxBYTE ); - pNew->PutByte(*pChar); - pChar++; - pNew->SetFlag( SbxFlagBits::Write ); - sal_Int32 aIdx[1]; - aIdx[0] = i; - if( bIncIndex ) - { - ++aIdx[0]; + utl::TransliterationWrapper aWrapper(xContext, transliterationFlag); + aStr = aWrapper.transliterate(aStr, nLanguage, 0, aStr.getLength(), nullptr); } - pArray->Put(pNew, aIdx); } - - SbxVariableRef refVar = rPar.Get(0); - SbxFlagBits nFlags = refVar->GetFlags(); - refVar->ResetFlag( SbxFlagBits::Fixed ); - refVar->PutObject( pArray ); - refVar->SetFlags( nFlags ); - refVar->SetParameters( nullptr ); - return; } - rPar.Get(0)->PutString(aNewStr); + + rPar.Get(0)->PutString(aStr); } |