diff options
author | Vasily Melenchuk <vasily.melenchuk@cib.de> | 2022-04-07 20:59:08 +0300 |
---|---|---|
committer | Xisco Fauli <xiscofauli@libreoffice.org> | 2022-04-11 11:53:51 +0200 |
commit | 8daac72b7a0b7cdf6eb520273829c0c0c15ddef5 (patch) | |
tree | d47b5e7be734d551452d9f22f0021b69f88080db | |
parent | 8e467c88d0ba6e70159382676af55b8ef8d65d54 (diff) |
tdf#95706: RTF import: tolerant font table parsing
While a font name in the font table should end with a semicolon
({\fonttbl{\f42 Arial;}}), this is not always the case, and
MS Word is tolerant of it: it is still able to parse this
correctly. It seems LO also should not require strict spec
conformance.
So the approach to font parsing is changed: instead of inserting
a font on a semicolon, it is inserted on the next \fN or at the
destination end. All text collected up to that moment is the font name.
Change-Id: I6b41951217442a71fd2ebbfc58a3fc79f6f913db
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132686
Tested-by: Jenkins
Reviewed-by: Miklos Vajna <vmiklos@collabora.com>
(cherry picked from commit 844be7358f1eec00094a55fa1fb4fadadb8cd1bf)
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132699
Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
-rw-r--r-- | sw/qa/extras/rtfexport/data/tdf95706_2.rtf | 17 | ||||
-rw-r--r-- | sw/qa/extras/rtfexport/rtfexport4.cxx | 12 | ||||
-rw-r--r-- | writerfilter/source/rtftok/rtfdispatchvalue.cxx | 4 | ||||
-rw-r--r-- | writerfilter/source/rtftok/rtfdocumentimpl.cxx | 162 | ||||
-rw-r--r-- | writerfilter/source/rtftok/rtfdocumentimpl.hxx | 1 |
5 files changed, 122 insertions, 74 deletions
diff --git a/sw/qa/extras/rtfexport/data/tdf95706_2.rtf b/sw/qa/extras/rtfexport/data/tdf95706_2.rtf new file mode 100644 index 000000000000..d36d2ccd2396 --- /dev/null +++ b/sw/qa/extras/rtfexport/data/tdf95706_2.rtf @@ -0,0 +1,17 @@ +{\rtf\ansi
+{\fonttbl
+{\f1 Arial}
+\f2 Impact
+\f3 T\'69mes New Roman
+\f4 T
+a
+h
+o
+m
+a
+}
+\pard\f1\fs26 Arial\par
+\pard\f2\fs26 Impact\par
+\pard\f3\fs26 Times New Roman\par
+\pard\f4\fs26 Tahoma\par
+}
diff --git a/sw/qa/extras/rtfexport/rtfexport4.cxx b/sw/qa/extras/rtfexport/rtfexport4.cxx index 1f1434054085..33a2a246a181 100644 --- a/sw/qa/extras/rtfexport/rtfexport4.cxx +++ b/sw/qa/extras/rtfexport/rtfexport4.cxx @@ -522,6 +522,18 @@ DECLARE_RTFEXPORT_TEST(testTdf95706, "tdf95706.rtf") CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun16, "CharFontName")); } +DECLARE_RTFEXPORT_TEST(testTdf95706_2, "tdf95706_2.rtf") +{ + CPPUNIT_ASSERT_EQUAL(OUString("Arial"), + getProperty<OUString>(getRun(getParagraph(1), 1), "CharFontName")); + CPPUNIT_ASSERT_EQUAL(OUString("Impact"), + getProperty<OUString>(getRun(getParagraph(2), 1), "CharFontName")); + CPPUNIT_ASSERT_EQUAL(OUString("Times New Roman"), + getProperty<OUString>(getRun(getParagraph(3), 1), "CharFontName")); + CPPUNIT_ASSERT_EQUAL(OUString("Tahoma"), + getProperty<OUString>(getRun(getParagraph(4), 1), "CharFontName")); +} + DECLARE_RTFEXPORT_TEST(testTdf111851, "tdf111851.rtf") { uno::Reference<text::XTextTable> xTable(getParagraphOrTable(1), uno::UNO_QUERY); diff --git a/writerfilter/source/rtftok/rtfdispatchvalue.cxx b/writerfilter/source/rtftok/rtfdispatchvalue.cxx index d78f087d76e3..35d3e0128c84 100644 --- a/writerfilter/source/rtftok/rtfdispatchvalue.cxx +++ b/writerfilter/source/rtftok/rtfdispatchvalue.cxx @@ -762,6 +762,10 @@ RTFError RTFDocumentImpl::dispatchValue(RTFKeyword nKeyword, int nParam) if (m_aStates.top().getDestination() == Destination::FONTTABLE || m_aStates.top().getDestination() == Destination::FONTENTRY) { + // Some text in buffer? It is font name. 
So previous font definition is complete + if (m_aStates.top().getCurrentDestinationText()->getLength()) + handleFontTableEntry(); + m_aFontIndexes.push_back(nParam); m_nCurrentFontIndex = getFontIndex(nParam); } diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.cxx b/writerfilter/source/rtftok/rtfdocumentimpl.cxx index 8ea858abada5..9e77cbf4602e 100644 --- a/writerfilter/source/rtftok/rtfdocumentimpl.cxx +++ b/writerfilter/source/rtftok/rtfdocumentimpl.cxx @@ -1332,6 +1332,74 @@ void RTFDocumentImpl::singleChar(sal_uInt8 nValue, bool bRunProps) } } +void RTFDocumentImpl::handleFontTableEntry() +{ + OUString aName = m_aStates.top().getCurrentDestinationText()->makeStringAndClear(); + + if (aName.isEmpty()) + return; + + if (aName.endsWith(";")) + { + aName = aName.copy(0, aName.getLength() - 1); + } + + // Old documents can contain no encoding information in fontinfo, + // but there can be font name suffixes: Arial CE is not a special + // font, it is ordinal Arial, but with used cp 1250 encoding. + // Moreover these suffixes have priority over \cpgN and \fcharsetN + // in MS Word. 
+ OUString aFontSuffix; + OUString aNameNoSuffix(aName); + sal_Int32 nLastSpace = aName.lastIndexOf(' '); + if (nLastSpace >= 0) + { + aFontSuffix = aName.copy(nLastSpace + 1); + aNameNoSuffix = aName.copy(0, nLastSpace); + sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW; + for (int i = 0; aRTFFontNameSuffixes[i].codepage != RTL_TEXTENCODING_DONTKNOW; i++) + { + if (aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix)) + { + nEncoding = aRTFFontNameSuffixes[i].codepage; + break; + } + } + if (nEncoding > RTL_TEXTENCODING_DONTKNOW) + { + m_nCurrentEncoding = nEncoding; + m_aStates.top().setCurrentEncoding(m_nCurrentEncoding); + } + else + { + // Unknown suffix: looks like it is just a part of font name, restore it + aNameNoSuffix = aName; + } + } + + m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix; + if (m_nCurrentEncoding >= 0) + { + m_aFontEncodings[m_nCurrentFontIndex] = m_nCurrentEncoding; + m_nCurrentEncoding = -1; + } + m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name, + new RTFValue(aNameNoSuffix)); + + writerfilter::Reference<Properties>::Pointer_t const pProp(new RTFReferenceProperties( + m_aStates.top().getTableAttributes(), m_aStates.top().getTableSprms())); + + //See fdo#47347 initial invalid font entry properties are inserted first, + //so when we attempt to insert the correct ones, there's already an + //entry in the map for them, so the new ones aren't inserted. 
+ auto lb = m_aFontTableEntries.lower_bound(m_nCurrentFontIndex); + if (lb != m_aFontTableEntries.end() + && !(m_aFontTableEntries.key_comp()(m_nCurrentFontIndex, lb->first))) + lb->second = pProp; + else + m_aFontTableEntries.insert(lb, std::make_pair(m_nCurrentFontIndex, pProp)); +} + void RTFDocumentImpl::text(OUString& rString) { if (rString.getLength() == 1 && m_aStates.top().getDestination() != Destination::DOCCOMM) @@ -1345,10 +1413,7 @@ void RTFDocumentImpl::text(OUString& rString) bool bRet = true; switch (m_aStates.top().getDestination()) { - // Note: in fonttbl there may or may not be groups; in stylesheet - // and revtbl groups are mandatory - case Destination::FONTTABLE: - case Destination::FONTENTRY: + // Note: in stylesheet and revtbl groups are mandatory case Destination::STYLEENTRY: case Destination::LISTNAME: case Destination::REVISIONENTRY: @@ -1368,68 +1433,6 @@ void RTFDocumentImpl::text(OUString& rString) = m_aStates.top().getCurrentDestinationText()->makeStringAndClear(); switch (m_aStates.top().getDestination()) { - case Destination::FONTTABLE: - case Destination::FONTENTRY: - { - // Old documents can contain no encoding information in fontinfo, - // but there can be font name suffixes: Arial CE is not a special - // font, it is ordinal Arial, but with used cp 1250 encoding. - // Moreover these suffixes have priority over \cpgN and \fcharsetN - // in MS Word. 
- OUString aFontSuffix; - OUString aNameNoSuffix(aName); - sal_Int32 nLastSpace = aName.lastIndexOf(' '); - if (nLastSpace >= 0) - { - aFontSuffix = aName.copy(nLastSpace + 1); - aNameNoSuffix = aName.copy(0, nLastSpace); - sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW; - for (int i = 0; - aRTFFontNameSuffixes[i].codepage != RTL_TEXTENCODING_DONTKNOW; i++) - { - if (aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix)) - { - nEncoding = aRTFFontNameSuffixes[i].codepage; - break; - } - } - if (nEncoding > RTL_TEXTENCODING_DONTKNOW) - { - m_nCurrentEncoding = nEncoding; - m_aStates.top().setCurrentEncoding(m_nCurrentEncoding); - } - else - { - // Unknown suffix: looks like it is just a part of font name, restore it - aNameNoSuffix = aName; - } - } - - m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix; - if (m_nCurrentEncoding >= 0) - { - m_aFontEncodings[m_nCurrentFontIndex] = m_nCurrentEncoding; - m_nCurrentEncoding = -1; - } - m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name, - new RTFValue(aNameNoSuffix)); - - writerfilter::Reference<Properties>::Pointer_t const pProp( - new RTFReferenceProperties(m_aStates.top().getTableAttributes(), - m_aStates.top().getTableSprms())); - - //See fdo#47347 initial invalid font entry properties are inserted first, - //so when we attempt to insert the correct ones, there's already an - //entry in the map for them, so the new ones aren't inserted. 
- auto lb = m_aFontTableEntries.lower_bound(m_nCurrentFontIndex); - if (lb != m_aFontTableEntries.end() - && !(m_aFontTableEntries.key_comp()(m_nCurrentFontIndex, lb->first))) - lb->second = pProp; - else - m_aFontTableEntries.insert(lb, - std::make_pair(m_nCurrentFontIndex, pProp)); - } - break; case Destination::STYLEENTRY: { RTFValue::Pointer_t pType @@ -1467,6 +1470,8 @@ void RTFDocumentImpl::text(OUString& rString) } } break; + case Destination::FONTTABLE: + case Destination::FONTENTRY: case Destination::LEVELTEXT: case Destination::SHAPEPROPERTYNAME: case Destination::SHAPEPROPERTYVALUE: @@ -2216,17 +2221,26 @@ RTFError RTFDocumentImpl::beforePopState(RTFParserState& rState) { switch (rState.getDestination()) { + //Note: in fonttbl there may or may not be groups, so process it as no groups case Destination::FONTTABLE: + case Destination::FONTENTRY: { - writerfilter::Reference<Table>::Pointer_t const pTable( - new RTFReferenceTable(m_aFontTableEntries)); - Mapper().table(NS_ooxml::LN_FONTTABLE, pTable); - if (m_nDefaultFontIndex >= 0) + // Some text unhandled? 
Seems it is last font name + if (m_aStates.top().getCurrentDestinationText()->getLength()) + handleFontTableEntry(); + + if (rState.getDestination() == Destination::FONTTABLE) { - auto pValue = new RTFValue(m_aFontNames[getFontIndex(m_nDefaultFontIndex)]); - putNestedAttribute(m_aDefaultState.getCharacterSprms(), - NS_ooxml::LN_EG_RPrBase_rFonts, NS_ooxml::LN_CT_Fonts_ascii, - pValue); + writerfilter::Reference<Table>::Pointer_t const pTable( + new RTFReferenceTable(m_aFontTableEntries)); + Mapper().table(NS_ooxml::LN_FONTTABLE, pTable); + if (m_nDefaultFontIndex >= 0) + { + auto pValue = new RTFValue(m_aFontNames[getFontIndex(m_nDefaultFontIndex)]); + putNestedAttribute(m_aDefaultState.getCharacterSprms(), + NS_ooxml::LN_EG_RPrBase_rFonts, NS_ooxml::LN_CT_Fonts_ascii, + pValue); + } } } break; diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.hxx b/writerfilter/source/rtftok/rtfdocumentimpl.hxx index 66e27a509be5..14ffc2f630a4 100644 --- a/writerfilter/source/rtftok/rtfdocumentimpl.hxx +++ b/writerfilter/source/rtftok/rtfdocumentimpl.hxx @@ -777,6 +777,7 @@ private: writerfilter::Reference<Properties>::Pointer_t getProperties(const RTFSprms& rAttributes, RTFSprms const& rSprms, Id nStyleType); void checkNeedPap(); + void handleFontTableEntry(); void sectBreak(bool bFinal = false); void prepareProperties(RTFParserState& rState, writerfilter::Reference<Properties>::Pointer_t& o_rpParagraphProperties, |