diff options
author | tobias <tobias.schulz@hotmail.com> | 2021-06-06 15:47:06 +0200 |
---|---|---|
committer | Noel Grandin <noel.grandin@collabora.co.uk> | 2021-06-06 18:52:52 +0200 |
commit | 162f5a20095c6937030d23ee03fb8f72c51eefa1 (patch) | |
tree | 5113e0775353231d359b8cd0d19a6425c7da3d9c /sw | |
parent | 89aaa17a0a4413f07da2bc5084b0164f15dc01ac (diff) |
tdf#142669 Consider BOM on text encoding detection
Return a flag indicating whether the auto-detected text has a BOM.
Save the flag in SwAsciiOptions so that the BOM gets set correctly when
the file is written.
Change-Id: I358c3ba243bc326a552c2dc24773c94f8319c700
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/116759
Tested-by: Jenkins
Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
Diffstat (limited to 'sw')
-rw-r--r-- | sw/inc/iodetect.hxx | 2 | ||||
-rw-r--r-- | sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt | bin | 0 -> 18 bytes | |||
-rw-r--r-- | sw/qa/extras/txtexport/data/UTF16LECRLF.txt | bin | 18 -> 16 bytes | |||
-rw-r--r-- | sw/qa/extras/txtexport/data/UTF8CRLF.txt | 2 | ||||
-rw-r--r-- | sw/qa/extras/txtexport/txtexport.cxx | 49 | ||||
-rw-r--r-- | sw/source/filter/ascii/parasc.cxx | 5 | ||||
-rw-r--r-- | sw/source/filter/basflt/iodetect.cxx | 8 |
7 files changed, 46 insertions, 20 deletions
diff --git a/sw/inc/iodetect.hxx b/sw/inc/iodetect.hxx index 534b3c1f2bb5..1d5713aaccab 100644 --- a/sw/inc/iodetect.hxx +++ b/sw/inc/iodetect.hxx @@ -104,7 +104,7 @@ public: static bool IsValidStgFilter( const css::uno::Reference < css::embed::XStorage >& rStg, const SfxFilter& rFilter); static bool IsDetectableText( const char* pBuf, sal_uLong &rLen, - rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd ); + rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom); static OUString GetSubStorageName( const SfxFilter& rFltr ); }; diff --git a/sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt b/sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt Binary files differnew file mode 100644 index 000000000000..be232521eafc --- /dev/null +++ b/sw/qa/extras/txtexport/data/UTF16LEBOMCRLF.txt diff --git a/sw/qa/extras/txtexport/data/UTF16LECRLF.txt b/sw/qa/extras/txtexport/data/UTF16LECRLF.txt Binary files differindex be232521eafc..b74e964113de 100644 --- a/sw/qa/extras/txtexport/data/UTF16LECRLF.txt +++ b/sw/qa/extras/txtexport/data/UTF16LECRLF.txt diff --git a/sw/qa/extras/txtexport/data/UTF8CRLF.txt b/sw/qa/extras/txtexport/data/UTF8CRLF.txt new file mode 100644 index 000000000000..62d4d44677b6 --- /dev/null +++ b/sw/qa/extras/txtexport/data/UTF8CRLF.txt @@ -0,0 +1,2 @@ +フー
+バー
diff --git a/sw/qa/extras/txtexport/txtexport.cxx b/sw/qa/extras/txtexport/txtexport.cxx index a5f989cb6689..0e52f51a4e34 100644 --- a/sw/qa/extras/txtexport/txtexport.cxx +++ b/sw/qa/extras/txtexport/txtexport.cxx @@ -20,19 +20,25 @@ public: } protected: - OString readExportedFile() + template <class T> std::vector<T> readMemoryStream() { SvMemoryStream aMemoryStream; SvFileStream aStream(maTempFile.GetURL(), StreamMode::READ); aStream.ReadStream(aMemoryStream); - const char* pData = static_cast<const char*>(aMemoryStream.GetData()); + const T* pData = static_cast<const T*>(aMemoryStream.GetData()); + return std::vector<T>(pData, pData + aMemoryStream.GetSize()); + } + + OString readExportedFile() + { + std::vector<char> aMemStream = readMemoryStream<char>(); int offset = 0; - if (aMemoryStream.GetSize() > 2 && pData[0] == '\xEF' && pData[1] == '\xBB' - && pData[2] == '\xBF') + if (aMemStream.size() > 2 && aMemStream[0] == '\xEF' && aMemStream[1] == '\xBB' + && aMemStream[2] == '\xBF') offset = 3; - return OString(pData + offset, aMemoryStream.GetSize() - offset); + return OString(aMemStream.data() + offset, aMemStream.size() - offset); } }; @@ -64,25 +70,34 @@ DECLARE_TXTEXPORT_TEST(testBullets, "bullets.odt") CPPUNIT_ASSERT_EQUAL(aExpected, aData); } -DECLARE_TXTEXPORT_TEST(testTdf120574_utf8, "UTF8BOMCRLF.txt") +DECLARE_TXTEXPORT_TEST(testTdf120574_utf8bom, "UTF8BOMCRLF.txt") { - SvMemoryStream aMemoryStream; - SvFileStream aStream(maTempFile.GetURL(), StreamMode::READ); - aStream.ReadStream(aMemoryStream); - const char* pData = static_cast<const char*>(aMemoryStream.GetData()); - OString aData(std::string_view(pData, aMemoryStream.GetSize())); + std::vector<char> aMemStream = readMemoryStream<char>(); + OString aData(std::string_view(aMemStream.data(), aMemStream.size())); CPPUNIT_ASSERT_EQUAL(OString(u8"\uFEFFフー\r\nバー\r\n"), aData); } -DECLARE_TXTEXPORT_TEST(testTdf120574_utf16le, "UTF16LECRLF.txt") +DECLARE_TXTEXPORT_TEST(testTdf120574_utf16lebom, 
"UTF16LEBOMCRLF.txt") { - SvMemoryStream aMemoryStream; - SvFileStream aStream(maTempFile.GetURL(), StreamMode::READ); - aStream.ReadStream(aMemoryStream); - const sal_Unicode* pData = static_cast<const sal_Unicode*>(aMemoryStream.GetData()); - OUString aData(pData, aMemoryStream.GetSize() / sizeof(sal_Unicode)); + std::vector<sal_Unicode> aMemStream = readMemoryStream<sal_Unicode>(); + OUString aData(aMemStream.data(), aMemStream.size() / sizeof(sal_Unicode)); CPPUNIT_ASSERT_EQUAL(OUString(u"\uFEFFフー\r\nバー\r\n"), aData); } + +DECLARE_TXTEXPORT_TEST(testTdf142669_utf8, "UTF8CRLF.txt") +{ + std::vector<char> aMemStream = readMemoryStream<char>(); + OString aData(std::string_view(aMemStream.data(), aMemStream.size())); + CPPUNIT_ASSERT_EQUAL(OString(u8"フー\r\nバー\r\n"), aData); +} + +DECLARE_TXTEXPORT_TEST(testTdf142669_utf16le, "UTF16LECRLF.txt") +{ + std::vector<sal_Unicode> aMemStream = readMemoryStream<sal_Unicode>(); + OUString aData(aMemStream.data(), aMemStream.size() / sizeof(sal_Unicode)); + CPPUNIT_ASSERT_EQUAL(OUString(u"フー\r\nバー\r\n"), aData); +} + CPPUNIT_PLUGIN_IMPLEMENT(); /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sw/source/filter/ascii/parasc.cxx b/sw/source/filter/ascii/parasc.cxx index 8cdfa91ba6b3..871c8315a575 100644 --- a/sw/source/filter/ascii/parasc.cxx +++ b/sw/source/filter/ascii/parasc.cxx @@ -275,8 +275,10 @@ ErrCode SwASCIIParser::ReadChars() nOrig = nLen = m_rInput.ReadBytes(m_pArr.get(), ASC_BUFFLEN); rtl_TextEncoding eCharSet; LineEnd eLineEnd; + bool bHasBom; const bool bRet - = SwIoSystem::IsDetectableText(m_pArr.get(), nLen, &eCharSet, &bSwapUnicode, &eLineEnd); + = SwIoSystem::IsDetectableText(m_pArr.get(), nLen, &eCharSet, + &bSwapUnicode, &eLineEnd, &bHasBom); if (!bRet) return ERRCODE_IO_BROKENPACKAGE; @@ -285,6 +287,7 @@ ErrCode SwASCIIParser::ReadChars() { aEmpty.SetCharSet(eCharSet); aEmpty.SetParaFlags(eLineEnd); + aEmpty.SetIncludeBOM(bHasBom); m_rInput.SeekRel(-(tools::Long(nLen))); } else diff --git 
a/sw/source/filter/basflt/iodetect.cxx b/sw/source/filter/basflt/iodetect.cxx index 2f49b2b199d1..e4d214391f2c 100644 --- a/sw/source/filter/basflt/iodetect.cxx +++ b/sw/source/filter/basflt/iodetect.cxx @@ -239,11 +239,12 @@ std::shared_ptr<const SfxFilter> SwIoSystem::GetFileFilter(const OUString& rFile } bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen, - rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd) + rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom) { bool bSwap = false; rtl_TextEncoding eCharSet = RTL_TEXTENCODING_DONTKNOW; bool bLE = true; + bool bBom = false; /*See if it's a known unicode type*/ if (rLen >= 2) { @@ -253,17 +254,20 @@ bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen, { eCharSet = RTL_TEXTENCODING_UTF8; nHead = 3; + bBom = true; } else if (sal_uInt8(pBuf[0]) == 0xFE && sal_uInt8(pBuf[1]) == 0xFF) { eCharSet = RTL_TEXTENCODING_UCS2; bLE = false; nHead = 2; + bBom = true; } else if (sal_uInt8(pBuf[1]) == 0xFE && sal_uInt8(pBuf[0]) == 0xFF) { eCharSet = RTL_TEXTENCODING_UCS2; nHead = 2; + bBom = true; } pBuf+=nHead; rLen-=nHead; @@ -400,6 +404,8 @@ bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen, *pSwap = bSwap; if (pLineEnd) *pLineEnd = eLineEnd; + if (pBom) + *pBom = bBom; return !bIsBareUnicode; } |