summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Vasily Melenchuk <vasily.melenchuk@cib.de> 2022-04-07 20:59:08 +0300
committer Xisco Fauli <xiscofauli@libreoffice.org> 2022-04-11 11:53:51 +0200
commit 8daac72b7a0b7cdf6eb520273829c0c0c15ddef5 (patch)
tree d47b5e7be734d551452d9f22f0021b69f88080db
parent 8e467c88d0ba6e70159382676af55b8ef8d65d54 (diff)
tdf#95706: RTF import: tolerant font table parsing
While a font name in the font table should end with a semicolon ({\fonttbl{\f42 Arial;}}), this is not always the case, and MS Word is tolerant of it: it is still able to parse such a table correctly. It seems LO also should not require strict spec conformance. So the approach to font parsing is changed: instead of inserting the font on the semicolon, it is inserted on the next \fN or at the destination end. All text collected up to that moment is the font name. Change-Id: I6b41951217442a71fd2ebbfc58a3fc79f6f913db Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132686 Tested-by: Jenkins Reviewed-by: Miklos Vajna <vmiklos@collabora.com> (cherry picked from commit 844be7358f1eec00094a55fa1fb4fadadb8cd1bf) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132699 Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
-rw-r--r-- sw/qa/extras/rtfexport/data/tdf95706_2.rtf 17
-rw-r--r-- sw/qa/extras/rtfexport/rtfexport4.cxx 12
-rw-r--r-- writerfilter/source/rtftok/rtfdispatchvalue.cxx 4
-rw-r--r-- writerfilter/source/rtftok/rtfdocumentimpl.cxx 162
-rw-r--r-- writerfilter/source/rtftok/rtfdocumentimpl.hxx 1
5 files changed, 122 insertions, 74 deletions
diff --git a/sw/qa/extras/rtfexport/data/tdf95706_2.rtf b/sw/qa/extras/rtfexport/data/tdf95706_2.rtf
new file mode 100644
index 000000000000..d36d2ccd2396
--- /dev/null
+++ b/sw/qa/extras/rtfexport/data/tdf95706_2.rtf
@@ -0,0 +1,17 @@
+{\rtf\ansi
+{\fonttbl
+{\f1 Arial}
+\f2 Impact
+\f3 T\'69mes New Roman
+\f4 T
+a
+h
+o
+m
+a
+}
+\pard\f1\fs26 Arial\par
+\pard\f2\fs26 Impact\par
+\pard\f3\fs26 Times New Roman\par
+\pard\f4\fs26 Tahoma\par
+}
diff --git a/sw/qa/extras/rtfexport/rtfexport4.cxx b/sw/qa/extras/rtfexport/rtfexport4.cxx
index 1f1434054085..33a2a246a181 100644
--- a/sw/qa/extras/rtfexport/rtfexport4.cxx
+++ b/sw/qa/extras/rtfexport/rtfexport4.cxx
@@ -522,6 +522,18 @@ DECLARE_RTFEXPORT_TEST(testTdf95706, "tdf95706.rtf")
CPPUNIT_ASSERT_EQUAL(OUString("Arial"), getProperty<OUString>(xRun16, "CharFontName"));
}
+DECLARE_RTFEXPORT_TEST(testTdf95706_2, "tdf95706_2.rtf")
+{
+ CPPUNIT_ASSERT_EQUAL(OUString("Arial"),
+ getProperty<OUString>(getRun(getParagraph(1), 1), "CharFontName"));
+ CPPUNIT_ASSERT_EQUAL(OUString("Impact"),
+ getProperty<OUString>(getRun(getParagraph(2), 1), "CharFontName"));
+ CPPUNIT_ASSERT_EQUAL(OUString("Times New Roman"),
+ getProperty<OUString>(getRun(getParagraph(3), 1), "CharFontName"));
+ CPPUNIT_ASSERT_EQUAL(OUString("Tahoma"),
+ getProperty<OUString>(getRun(getParagraph(4), 1), "CharFontName"));
+}
+
DECLARE_RTFEXPORT_TEST(testTdf111851, "tdf111851.rtf")
{
uno::Reference<text::XTextTable> xTable(getParagraphOrTable(1), uno::UNO_QUERY);
diff --git a/writerfilter/source/rtftok/rtfdispatchvalue.cxx b/writerfilter/source/rtftok/rtfdispatchvalue.cxx
index d78f087d76e3..35d3e0128c84 100644
--- a/writerfilter/source/rtftok/rtfdispatchvalue.cxx
+++ b/writerfilter/source/rtftok/rtfdispatchvalue.cxx
@@ -762,6 +762,10 @@ RTFError RTFDocumentImpl::dispatchValue(RTFKeyword nKeyword, int nParam)
if (m_aStates.top().getDestination() == Destination::FONTTABLE
|| m_aStates.top().getDestination() == Destination::FONTENTRY)
{
+ // Some text in buffer? It is font name. So previous font definition is complete
+ if (m_aStates.top().getCurrentDestinationText()->getLength())
+ handleFontTableEntry();
+
m_aFontIndexes.push_back(nParam);
m_nCurrentFontIndex = getFontIndex(nParam);
}
diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.cxx b/writerfilter/source/rtftok/rtfdocumentimpl.cxx
index 8ea858abada5..9e77cbf4602e 100644
--- a/writerfilter/source/rtftok/rtfdocumentimpl.cxx
+++ b/writerfilter/source/rtftok/rtfdocumentimpl.cxx
@@ -1332,6 +1332,74 @@ void RTFDocumentImpl::singleChar(sal_uInt8 nValue, bool bRunProps)
}
}
+void RTFDocumentImpl::handleFontTableEntry()
+{
+ OUString aName = m_aStates.top().getCurrentDestinationText()->makeStringAndClear();
+
+ if (aName.isEmpty())
+ return;
+
+ if (aName.endsWith(";"))
+ {
+ aName = aName.copy(0, aName.getLength() - 1);
+ }
+
+ // Old documents can contain no encoding information in fontinfo,
+ // but there can be font name suffixes: Arial CE is not a special
+ // font, it is ordinal Arial, but with used cp 1250 encoding.
+ // Moreover these suffixes have priority over \cpgN and \fcharsetN
+ // in MS Word.
+ OUString aFontSuffix;
+ OUString aNameNoSuffix(aName);
+ sal_Int32 nLastSpace = aName.lastIndexOf(' ');
+ if (nLastSpace >= 0)
+ {
+ aFontSuffix = aName.copy(nLastSpace + 1);
+ aNameNoSuffix = aName.copy(0, nLastSpace);
+ sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW;
+ for (int i = 0; aRTFFontNameSuffixes[i].codepage != RTL_TEXTENCODING_DONTKNOW; i++)
+ {
+ if (aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix))
+ {
+ nEncoding = aRTFFontNameSuffixes[i].codepage;
+ break;
+ }
+ }
+ if (nEncoding > RTL_TEXTENCODING_DONTKNOW)
+ {
+ m_nCurrentEncoding = nEncoding;
+ m_aStates.top().setCurrentEncoding(m_nCurrentEncoding);
+ }
+ else
+ {
+ // Unknown suffix: looks like it is just a part of font name, restore it
+ aNameNoSuffix = aName;
+ }
+ }
+
+ m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix;
+ if (m_nCurrentEncoding >= 0)
+ {
+ m_aFontEncodings[m_nCurrentFontIndex] = m_nCurrentEncoding;
+ m_nCurrentEncoding = -1;
+ }
+ m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name,
+ new RTFValue(aNameNoSuffix));
+
+ writerfilter::Reference<Properties>::Pointer_t const pProp(new RTFReferenceProperties(
+ m_aStates.top().getTableAttributes(), m_aStates.top().getTableSprms()));
+
+ //See fdo#47347 initial invalid font entry properties are inserted first,
+ //so when we attempt to insert the correct ones, there's already an
+ //entry in the map for them, so the new ones aren't inserted.
+ auto lb = m_aFontTableEntries.lower_bound(m_nCurrentFontIndex);
+ if (lb != m_aFontTableEntries.end()
+ && !(m_aFontTableEntries.key_comp()(m_nCurrentFontIndex, lb->first)))
+ lb->second = pProp;
+ else
+ m_aFontTableEntries.insert(lb, std::make_pair(m_nCurrentFontIndex, pProp));
+}
+
void RTFDocumentImpl::text(OUString& rString)
{
if (rString.getLength() == 1 && m_aStates.top().getDestination() != Destination::DOCCOMM)
@@ -1345,10 +1413,7 @@ void RTFDocumentImpl::text(OUString& rString)
bool bRet = true;
switch (m_aStates.top().getDestination())
{
- // Note: in fonttbl there may or may not be groups; in stylesheet
- // and revtbl groups are mandatory
- case Destination::FONTTABLE:
- case Destination::FONTENTRY:
+ // Note: in stylesheet and revtbl groups are mandatory
case Destination::STYLEENTRY:
case Destination::LISTNAME:
case Destination::REVISIONENTRY:
@@ -1368,68 +1433,6 @@ void RTFDocumentImpl::text(OUString& rString)
= m_aStates.top().getCurrentDestinationText()->makeStringAndClear();
switch (m_aStates.top().getDestination())
{
- case Destination::FONTTABLE:
- case Destination::FONTENTRY:
- {
- // Old documents can contain no encoding information in fontinfo,
- // but there can be font name suffixes: Arial CE is not a special
- // font, it is ordinal Arial, but with used cp 1250 encoding.
- // Moreover these suffixes have priority over \cpgN and \fcharsetN
- // in MS Word.
- OUString aFontSuffix;
- OUString aNameNoSuffix(aName);
- sal_Int32 nLastSpace = aName.lastIndexOf(' ');
- if (nLastSpace >= 0)
- {
- aFontSuffix = aName.copy(nLastSpace + 1);
- aNameNoSuffix = aName.copy(0, nLastSpace);
- sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW;
- for (int i = 0;
- aRTFFontNameSuffixes[i].codepage != RTL_TEXTENCODING_DONTKNOW; i++)
- {
- if (aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix))
- {
- nEncoding = aRTFFontNameSuffixes[i].codepage;
- break;
- }
- }
- if (nEncoding > RTL_TEXTENCODING_DONTKNOW)
- {
- m_nCurrentEncoding = nEncoding;
- m_aStates.top().setCurrentEncoding(m_nCurrentEncoding);
- }
- else
- {
- // Unknown suffix: looks like it is just a part of font name, restore it
- aNameNoSuffix = aName;
- }
- }
-
- m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix;
- if (m_nCurrentEncoding >= 0)
- {
- m_aFontEncodings[m_nCurrentFontIndex] = m_nCurrentEncoding;
- m_nCurrentEncoding = -1;
- }
- m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name,
- new RTFValue(aNameNoSuffix));
-
- writerfilter::Reference<Properties>::Pointer_t const pProp(
- new RTFReferenceProperties(m_aStates.top().getTableAttributes(),
- m_aStates.top().getTableSprms()));
-
- //See fdo#47347 initial invalid font entry properties are inserted first,
- //so when we attempt to insert the correct ones, there's already an
- //entry in the map for them, so the new ones aren't inserted.
- auto lb = m_aFontTableEntries.lower_bound(m_nCurrentFontIndex);
- if (lb != m_aFontTableEntries.end()
- && !(m_aFontTableEntries.key_comp()(m_nCurrentFontIndex, lb->first)))
- lb->second = pProp;
- else
- m_aFontTableEntries.insert(lb,
- std::make_pair(m_nCurrentFontIndex, pProp));
- }
- break;
case Destination::STYLEENTRY:
{
RTFValue::Pointer_t pType
@@ -1467,6 +1470,8 @@ void RTFDocumentImpl::text(OUString& rString)
}
}
break;
+ case Destination::FONTTABLE:
+ case Destination::FONTENTRY:
case Destination::LEVELTEXT:
case Destination::SHAPEPROPERTYNAME:
case Destination::SHAPEPROPERTYVALUE:
@@ -2216,17 +2221,26 @@ RTFError RTFDocumentImpl::beforePopState(RTFParserState& rState)
{
switch (rState.getDestination())
{
+ //Note: in fonttbl there may or may not be groups, so process it as no groups
case Destination::FONTTABLE:
+ case Destination::FONTENTRY:
{
- writerfilter::Reference<Table>::Pointer_t const pTable(
- new RTFReferenceTable(m_aFontTableEntries));
- Mapper().table(NS_ooxml::LN_FONTTABLE, pTable);
- if (m_nDefaultFontIndex >= 0)
+ // Some text unhandled? Seems it is last font name
+ if (m_aStates.top().getCurrentDestinationText()->getLength())
+ handleFontTableEntry();
+
+ if (rState.getDestination() == Destination::FONTTABLE)
{
- auto pValue = new RTFValue(m_aFontNames[getFontIndex(m_nDefaultFontIndex)]);
- putNestedAttribute(m_aDefaultState.getCharacterSprms(),
- NS_ooxml::LN_EG_RPrBase_rFonts, NS_ooxml::LN_CT_Fonts_ascii,
- pValue);
+ writerfilter::Reference<Table>::Pointer_t const pTable(
+ new RTFReferenceTable(m_aFontTableEntries));
+ Mapper().table(NS_ooxml::LN_FONTTABLE, pTable);
+ if (m_nDefaultFontIndex >= 0)
+ {
+ auto pValue = new RTFValue(m_aFontNames[getFontIndex(m_nDefaultFontIndex)]);
+ putNestedAttribute(m_aDefaultState.getCharacterSprms(),
+ NS_ooxml::LN_EG_RPrBase_rFonts, NS_ooxml::LN_CT_Fonts_ascii,
+ pValue);
+ }
}
}
break;
diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.hxx b/writerfilter/source/rtftok/rtfdocumentimpl.hxx
index 66e27a509be5..14ffc2f630a4 100644
--- a/writerfilter/source/rtftok/rtfdocumentimpl.hxx
+++ b/writerfilter/source/rtftok/rtfdocumentimpl.hxx
@@ -777,6 +777,7 @@ private:
writerfilter::Reference<Properties>::Pointer_t
getProperties(const RTFSprms& rAttributes, RTFSprms const& rSprms, Id nStyleType);
void checkNeedPap();
+ void handleFontTableEntry();
void sectBreak(bool bFinal = false);
void prepareProperties(RTFParserState& rState,
writerfilter::Reference<Properties>::Pointer_t& o_rpParagraphProperties,