diff options
author | Kohei Yoshida <kohei.yoshida@suse.com> | 2011-07-28 00:46:55 -0400 |
---|---|---|
committer | Kohei Yoshida <kohei.yoshida@suse.com> | 2011-08-03 21:27:26 -0400 |
commit | 24d8e4eaf4543c5b39b9e816d8514525b098827d (patch) | |
tree | a2e48bdc99e07c27d22685558e88fc8e4cb23b50 /sc | |
parent | f2ea23963d92d53f29124f8dd411dbf40450c868 (diff) |
Parse CSS in the <style> content and set number formats to cells.
Pick up number formats specified in the CSS content of Excel-generated
HTML documents. This makes use of a template-based CSS
parser from the orcus project.
Diffstat (limited to 'sc')
-rw-r--r-- | sc/inc/orcus/README | 5 | ||||
-rw-r--r-- | sc/inc/orcus/css_parser.hpp | 513 | ||||
-rw-r--r-- | sc/source/filter/html/htmlimp.cxx | 5 | ||||
-rw-r--r-- | sc/source/filter/html/htmlpars.cxx | 328 | ||||
-rw-r--r-- | sc/source/filter/inc/htmlimp.hxx | 2 | ||||
-rw-r--r-- | sc/source/filter/inc/htmlpars.hxx | 45 | ||||
-rw-r--r-- | sc/source/filter/rtf/eeimpars.cxx | 24 |
7 files changed, 907 insertions, 15 deletions
diff --git a/sc/inc/orcus/README b/sc/inc/orcus/README new file mode 100644 index 000000000000..3ada1c3c0dae --- /dev/null +++ b/sc/inc/orcus/README @@ -0,0 +1,5 @@ +The headers in this directory are directly copied from the orcus project[1]. +When modifying any of these files, please ping me so that the changes can be +upstreamed. + +[1] http://gitorious.org/orcus diff --git a/sc/inc/orcus/css_parser.hpp b/sc/inc/orcus/css_parser.hpp new file mode 100644 index 000000000000..7a1b3e51241f --- /dev/null +++ b/sc/inc/orcus/css_parser.hpp @@ -0,0 +1,513 @@ +/************************************************************************* + * + * Copyright (c) 2011 Kohei Yoshida + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + ************************************************************************/ + +#ifndef __ORCUS_CSS_PARSER_HPP__ +#define __ORCUS_CSS_PARSER_HPP__ + +#define ORCUS_DEBUG_CSS 0 + +#include <cstdlib> +#include <cstring> +#include <exception> +#include <string> +#include <cassert> +#include <sstream> + +#if ORCUS_DEBUG_CSS +#include <iostream> +#endif + +namespace orcus { + +class css_parse_error : public std::exception +{ + std::string m_msg; +public: + css_parse_error(const std::string& msg) : m_msg(msg) {} + virtual ~css_parse_error() throw() {} + virtual const char* what() const throw() { return m_msg.c_str(); } +}; + +template<typename _Handler> +class css_parser +{ +public: + typedef _Handler handler_type; + + css_parser(const char* p, size_t n, handler_type& hdl); + void parse(); + +private: + // Handlers - at the time a handler is called the current position is + // expected to point to the first unprocessed non-blank character, and + // each handler must set the current position to the next unprocessed + // non-blank character when it finishes. 
+ void rule(); + void at_rule_name(); + void selector_name(); + void property_name(); + void property(); + void quoted_value(); + void value(); + void name_sep(); + void property_sep(); + void block(); + + void identifier(const char*& p, size_t& len); + + void skip_blanks(); + void skip_blanks_reverse(); + void shrink_stream(); + void next(); + char cur_char() const; + + size_t remaining_size() const { return m_length - m_pos - 1; } + bool has_char() const { return m_pos < m_length; } + + static bool is_blank(char c) + { + return c == ' ' || c == '\t' || c == '\n'; + } + + static bool is_alpha(char c) + { + if ('a' <= c && c <= 'z') + return true; + if ('A' <= c && c <= 'Z') + return true; + return false; + } + + static bool is_name_char(char c) + { + switch (c) + { + case '-': + return true; + } + + return false; + } + + static bool is_numeric(char c) + { + if ('0' <= c && c <= '9') + return true; + return false; + } + + handler_type& m_handler; + const char* mp_char; + size_t m_pos; + size_t m_length; +}; + +template<typename _Handler> +css_parser<_Handler>::css_parser(const char* p, size_t n, handler_type& hdl) : + m_handler(hdl), mp_char(p), m_pos(0), m_length(n) {} + +template<typename _Handler> +void css_parser<_Handler>::parse() +{ + shrink_stream(); + +#if ORCUS_DEBUG_CSS + std::cout << "compressed: '"; + const char* p = mp_char; + for (size_t i = m_pos; i < m_length; ++i, ++p) + std::cout << *p; + std::cout << "'" << std::endl; +#endif + m_handler.begin_parse(); + for (; has_char(); next()) + rule(); + m_handler.end_parse(); +} + +template<typename _Handler> +void css_parser<_Handler>::rule() +{ + // <name> , ... , <name> { <properties> } + while (has_char()) + { + char c = cur_char(); + if (is_alpha(c) || c == '.' 
|| c == '@') + { + selector_name(); + } + else if (c == ',') + { + name_sep(); + } + else if (c == '{') + { + block(); + } + else + { + std::ostringstream os; + os << "failed to parse '" << c << "'"; + throw css_parse_error(os.str()); + } + } +} + +template<typename _Handler> +void css_parser<_Handler>::at_rule_name() +{ + assert(has_char()); + assert(cur_char() == '@'); + next(); + char c = cur_char(); + if (!is_alpha(c)) + throw css_parse_error("first character of an at-rule name must be an alphabet."); + + const char* p; + size_t len; + identifier(p, len); + skip_blanks(); + + m_handler.at_rule_name(p, len); +#if ORCUS_DEBUG_CSS + std::string foo(p, len); + std::cout << "at-rule name: " << foo.c_str() << std::endl; +#endif +} + +template<typename _Handler> +void css_parser<_Handler>::selector_name() +{ + // <element name> '.' <class name> + + assert(has_char()); + char c = cur_char(); + if (c == '@') + { + // This is the name of an at-rule. + at_rule_name(); + return; + } + + if (!is_alpha(c) && c != '.') + throw css_parse_error("first character of a name must be an alphabet or a dot."); + + const char* p_elem = NULL; + const char* p_class = NULL; + size_t len_elem = 0; + size_t len_class = 0; + if (c != '.') + identifier(p_elem, len_elem); + + if (cur_char() == '.') + { + next(); + identifier(p_class, len_class); + } + skip_blanks(); + + m_handler.selector_name(p_elem, len_elem, p_class, len_class); +#if ORCUS_DEBUG_CSS + std::string elem_name(p_elem, len_elem), class_name(p_class, len_class); + std::cout << "selector name: (element)'" << elem_name.c_str() << "' (class)'" << class_name.c_str() << "'" << std::endl; +#endif +} + +template<typename _Handler> +void css_parser<_Handler>::property_name() +{ + assert(has_char()); + char c = cur_char(); + if (!is_alpha(c) && c != '.') + throw css_parse_error("first character of a name must be an alphabet or a dot."); + + const char* p; + size_t len; + identifier(p, len); + skip_blanks(); + + m_handler.property_name(p, 
len); +#if ORCUS_DEBUG_CSS + std::string foo(p, len); + std::cout << "property name: " << foo.c_str() << std::endl; +#endif +} + +template<typename _Handler> +void css_parser<_Handler>::property() +{ + // <name> : <value> , ... , <value> + m_handler.begin_property(); + property_name(); + if (cur_char() != ':') + throw css_parse_error("':' expected."); + next(); + skip_blanks(); + while (has_char()) + { + value(); + char c = cur_char(); + if (c == ',') + { + // separated by commas. + next(); + skip_blanks(); + } + else if (c == ';') + break; + } + skip_blanks(); + m_handler.end_property(); +} + +template<typename _Handler> +void css_parser<_Handler>::quoted_value() +{ + assert(cur_char() == '"'); + next(); + const char* p = mp_char; + size_t len = 1; + for (next(); has_char(); next()) + { + if (cur_char() == '"') + { + // End quote reached. + break; + } + ++len; + } + + if (cur_char() != '"') + throw css_parse_error("end quote has never been reached."); + + next(); + skip_blanks(); + + m_handler.value(p, len); +#if ORCUS_DEBUG_CSS + std::string foo(p, len); + std::cout << "quoted value: " << foo.c_str() << std::endl; +#endif +} + +template<typename _Handler> +void css_parser<_Handler>::value() +{ + assert(has_char()); + char c = cur_char(); + if (c == '"') + { + quoted_value(); + return; + } + + if (!is_alpha(c) && !is_numeric(c) && c != '-' && c != '+' && c != '.') + { + std::ostringstream os; + os << "illegal first character of a value '" << c << "'"; + throw css_parse_error(os.str()); + } + + const char* p = mp_char; + size_t len = 1; + for (next(); has_char(); next()) + { + c = cur_char(); + if (!is_alpha(c) && !is_name_char(c) && !is_numeric(c) && c != '.') + break; + ++len; + } + skip_blanks(); + + m_handler.value(p, len); +#if ORCUS_DEBUG_CSS + std::string foo(p, len); + std::cout << "value: " << foo.c_str() << std::endl; +#endif +} + +template<typename _Handler> +void css_parser<_Handler>::name_sep() +{ + assert(cur_char() == ','); +#if ORCUS_DEBUG_CSS + 
std::cout << "," << std::endl; +#endif + next(); + skip_blanks(); +} + +template<typename _Handler> +void css_parser<_Handler>::property_sep() +{ +#if ORCUS_DEBUG_CSS + std::cout << ";" << std::endl; +#endif + next(); + skip_blanks(); +} + +template<typename _Handler> +void css_parser<_Handler>::block() +{ + // '{' <property> ';' ... ';' <property> '}' + + assert(cur_char() == '{'); +#if ORCUS_DEBUG_CSS + std::cout << "{" << std::endl; +#endif + m_handler.begin_block(); + + next(); + skip_blanks(); + + // parse properties. + while (has_char()) + { + property(); + if (cur_char() != ';') + break; + property_sep(); + if (cur_char() == '}') + // ';' after the last property. This is optional but allowed. + break; + } + + if (cur_char() != '}') + throw css_parse_error("} expected."); + + m_handler.end_block(); + + next(); + skip_blanks(); + +#if ORCUS_DEBUG_CSS + std::cout << "}" << std::endl; +#endif +} + +template<typename _Handler> +void css_parser<_Handler>::identifier(const char*& p, size_t& len) +{ + p = mp_char; + len = 1; + for (next(); has_char(); next()) + { + char c = cur_char(); + if (!is_alpha(c) && !is_name_char(c) && !is_numeric(c)) + break; + ++len; + } +} + +template<typename _Handler> +void css_parser<_Handler>::skip_blanks() +{ + for (; has_char(); next()) + { + if (!is_blank(*mp_char)) + break; + } +} + +template<typename _Handler> +void css_parser<_Handler>::skip_blanks_reverse() +{ + const char* p = mp_char + remaining_size(); + for (; p != mp_char; --p, --m_length) + { + if (!is_blank(*p)) + break; + } +} + +template<typename _Handler> +void css_parser<_Handler>::shrink_stream() +{ + // Skip any leading blanks. + skip_blanks(); + + if (!remaining_size()) + return; + + // Skip any trailing blanks. + skip_blanks_reverse(); + + // Skip leading <!-- if present. + + const char* com_open = "<!--"; + size_t com_open_len = std::strlen(com_open); + if (remaining_size() < com_open_len) + // Not enough stream left. Bail out. 
+ return; + + const char* p = mp_char; + for (size_t i = 0; i < com_open_len; ++i, ++p) + { + if (*p != com_open[i]) + return; + next(); + } + mp_char = p; + + // Skip leading blanks once again. + skip_blanks(); + + // Skip trailing --> if present. + const char* com_close = "-->"; + size_t com_close_len = std::strlen(com_close); + size_t n = remaining_size(); + if (n < com_close_len) + // Not enough stream left. Bail out. + return; + + p = mp_char + n; // move to the last char. + for (size_t i = com_close_len; i > 0; --i, --p) + { + if (*p != com_close[i-1]) + return; + } + m_length -= com_close_len; + + skip_blanks_reverse(); +} + +template<typename _Handler> +void css_parser<_Handler>::next() +{ + ++m_pos; + ++mp_char; +} + +template<typename _Handler> +char css_parser<_Handler>::cur_char() const +{ + return *mp_char; +} + +} + +#endif diff --git a/sc/source/filter/html/htmlimp.cxx b/sc/source/filter/html/htmlimp.cxx index dd21daf6516a..2e79fddc2c13 100644 --- a/sc/source/filter/html/htmlimp.cxx +++ b/sc/source/filter/html/htmlimp.cxx @@ -77,7 +77,7 @@ ScEEAbsImport *ScFormatFilterPluginImpl::CreateHTMLImport( ScDocument* pDocP, co return new ScHTMLImport( pDocP, rBaseURL, rRange, bCalcWidthHeight ); } -ScHTMLImport::ScHTMLImport( ScDocument* pDocP, const String& rBaseURL, const ScRange& rRange, sal_Bool bCalcWidthHeight ) : +ScHTMLImport::ScHTMLImport( ScDocument* pDocP, const String& rBaseURL, const ScRange& rRange, bool bCalcWidthHeight ) : ScEEImport( pDocP, rRange ) { Size aPageSize; @@ -150,8 +150,7 @@ void ScHTMLImport::WriteToDocument( pGlobTable->ApplyCellBorders( mpDoc, maRange.aStart ); // correct cell borders for merged cells - size_t ListSize = pParser->ListSize(); - for ( size_t i = 0; i < ListSize; ++i ) + for ( size_t i = 0, n = pParser->ListSize(); i < n; ++i ) { const ScEEParseEntry* pEntry = pParser->ListEntry( i ); if( (pEntry->nColOverlap > 1) || (pEntry->nRowOverlap > 1) ) diff --git a/sc/source/filter/html/htmlpars.cxx 
b/sc/source/filter/html/htmlpars.cxx index 4105782419f2..18bf94ad5143 100644 --- a/sc/source/filter/html/htmlpars.cxx +++ b/sc/source/filter/html/htmlpars.cxx @@ -49,6 +49,7 @@ #include <editeng/justifyitem.hxx> #include <sfx2/objsh.hxx> #include <svl/eitem.hxx> +#include <svl/intitem.hxx> #include <svtools/filter.hxx> #include <svtools/parhtml.hxx> #include <svtools/htmlkywd.hxx> @@ -64,12 +65,125 @@ #include "document.hxx" #include "rangelst.hxx" +#include <orcus/css_parser.hpp> + #include <com/sun/star/document/XDocumentProperties.hpp> #include <com/sun/star/document/XDocumentPropertiesSupplier.hpp> using ::editeng::SvxBorderLine; using namespace ::com::sun::star; +void ScHTMLStyles::add(const char* pElemName, size_t nElemName, const char* pClassName, size_t nClassName, + const rtl::OUString& aProp, const rtl::OUString& aValue) +{ + if (pElemName) + { + rtl::OUString aElem(pElemName, nElemName, RTL_TEXTENCODING_UTF8); + aElem = aElem.toAsciiLowerCase(); + if (pClassName) + { + // Both element and class names given. + + ElemsType::iterator itrElem = maElemProps.find(aElem); + if (itrElem == maElemProps.end()) + { + // new element + std::auto_ptr<NamePropsType> p(new NamePropsType); + std::pair<ElemsType::iterator, bool> r = maElemProps.insert(aElem, p); + if (!r.second) + // insertion failed. + return; + itrElem = r.first; + } + + NamePropsType* pClsProps = itrElem->second; + rtl::OUString aClass(pClassName, nClassName, RTL_TEXTENCODING_UTF8); + aClass = aClass.toAsciiLowerCase(); + insertProp(*pClsProps, aClass, aProp, aValue); + } + else + { + // Element name only. Add it to the element global. + insertProp(maElemGlobalProps, aElem, aProp, aValue); + } + } + else + { + if (pClassName) + { + // Class name only. Add it to the global. 
+ rtl::OUString aClass(pClassName, nClassName, RTL_TEXTENCODING_UTF8); + aClass = aClass.toAsciiLowerCase(); + insertProp(maGlobalProps, aClass, aProp, aValue); + } + } +} + +const rtl::OUString& ScHTMLStyles::getPropertyValue( + const rtl::OUString& rElem, const rtl::OUString& rClass, const rtl::OUString& rPropName) const +{ + // First, look into the element-class storage. + { + ElemsType::const_iterator itr = maElemProps.find(rElem); + if (itr != maElemProps.end()) + { + const NamePropsType* pClasses = itr->second; + NamePropsType::const_iterator itr2 = pClasses->find(rClass); + if (itr2 != pClasses->end()) + { + const PropsType* pProps = itr2->second; + PropsType::const_iterator itr3 = pProps->find(rPropName); + if (itr3 != pProps->end()) + return itr3->second; + } + } + } + // Next, look into the class global storage. + { + NamePropsType::const_iterator itr = maGlobalProps.find(rClass); + if (itr != maGlobalProps.end()) + { + const PropsType* pProps = itr->second; + PropsType::const_iterator itr2 = pProps->find(rPropName); + if (itr2 != pProps->end()) + return itr2->second; + } + } + // As the last resort, look into the element global storage. + { + NamePropsType::const_iterator itr = maElemGlobalProps.find(rClass); + if (itr != maElemGlobalProps.end()) + { + const PropsType* pProps = itr->second; + PropsType::const_iterator itr2 = pProps->find(rPropName); + if (itr2 != pProps->end()) + return itr2->second; + } + } + + return maEmpty; // nothing found. +} + +void ScHTMLStyles::insertProp( + NamePropsType& rStore, const rtl::OUString& aName, + const rtl::OUString& aProp, const rtl::OUString& aValue) +{ + NamePropsType::iterator itr = rStore.find(aName); + if (itr == rStore.end()) + { + // new element + std::auto_ptr<PropsType> p(new PropsType); + std::pair<NamePropsType::iterator, bool> r = rStore.insert(aName, p); + if (!r.second) + // insertion failed. 
+ return; + + itr = r.first; + } + + PropsType* pProps = itr->second; + pProps->insert(PropsType::value_type(aProp, aValue)); +} SV_IMPL_VARARR_SORT( ScHTMLColOffset, sal_uLong ); @@ -91,10 +205,21 @@ ScHTMLParser::~ScHTMLParser() { } +ScHTMLStyles& ScHTMLParser::GetStyles() +{ + return maStyles; +} + +ScDocument& ScHTMLParser::GetDoc() +{ + return *mpDoc; +} // ============================================================================ -ScHTMLLayoutParser::ScHTMLLayoutParser( EditEngine* pEditP, const String& rBaseURL, const Size& aPageSizeP, ScDocument* pDocP ) : +ScHTMLLayoutParser::ScHTMLLayoutParser( + EditEngine* pEditP, const String& rBaseURL, const Size& aPageSizeP, + ScDocument* pDocP ) : ScHTMLParser( pEditP, pDocP ), aPageSize( aPageSizeP ), aBaseURL( rBaseURL ), @@ -1867,6 +1992,7 @@ ScHTMLTable::ScHTMLTable( ScHTMLTable& rParentTable, const ImportInfo& rInfo, bo mrEEParseList( rParentTable.mrEEParseList ), mpCurrEntryList( 0 ), maSize( 1, 1 ), + mpParser(rParentTable.mpParser), mbBorderOn( false ), mbPreFormText( bPreFormText ), mbRowOn( false ), @@ -1902,7 +2028,7 @@ ScHTMLTable::ScHTMLTable( SfxItemPool& rPool, EditEngine& rEditEngine, ::std::vector< ScEEParseEntry* >& rEEParseList, - ScHTMLTableId& rnUnusedId + ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser ) : mpParentTable( 0 ), maTableId( rnUnusedId ), @@ -1911,6 +2037,7 @@ ScHTMLTable::ScHTMLTable( mrEEParseList( rEEParseList ), mpCurrEntryList( 0 ), maSize( 1, 1 ), + mpParser(pParser), mbBorderOn( false ), mbPreFormText( false ), mbRowOn( false ), @@ -2044,6 +2171,52 @@ void ScHTMLTable::RowOff( const ImportInfo& rInfo ) CreateNewEntry( rInfo ); } +namespace { + +/** + * Decode a number format string stored in Excel-generated HTML's CSS + * region. 
+ */ +rtl::OUString decodeNumberFormat(const rtl::OUString& rFmt) +{ + rtl::OUStringBuffer aBuf; + const sal_Unicode* p = rFmt.getStr(); + sal_Int32 n = rFmt.getLength(); + for (sal_Int32 i = 0; i < n; ++i, ++p) + { + if (*p == '\\') + { + // Skip '\'. + ++i; + ++p; + + // Parse all subsequent digits until first non-digit is found. + sal_Int32 nDigitCount = 0; + const sal_Unicode* p1 = p; + for (; i < n; ++i, ++p, ++nDigitCount) + { + if (*p < '0' || '9' < *p) + { + --i; + --p; + break; + } + + } + if (nDigitCount) + { + sal_Int32 nVal = rtl::OUString(p1, nDigitCount).toInt32(16); + aBuf.append(static_cast<sal_Unicode>(nVal)); + } + } + else + aBuf.append(*p); + } + return aBuf.makeStringAndClear(); +} + +} + void ScHTMLTable::DataOn( const ImportInfo& rInfo ) { PushEntry( rInfo, true ); @@ -2072,6 +2245,38 @@ void ScHTMLTable::DataOn( const ImportInfo& rInfo ) } ImplDataOn( aSpanSize ); + + const HTMLOptions& rOptions = static_cast<HTMLParser*>(rInfo.pParser)->GetOptions(); + HTMLOptions::const_iterator itr = rOptions.begin(), itrEnd = rOptions.end(); + for (; itr != itrEnd; ++itr) + { + if (itr->GetToken() == HTML_O_CLASS) + { + // This <td> has class property. Pick up the number format + // associated with this class (if any). 
+ rtl::OUString aElem(RTL_CONSTASCII_USTRINGPARAM("td")); + rtl::OUString aClass = itr->GetString(); + rtl::OUString aProp(RTL_CONSTASCII_USTRINGPARAM("mso-number-format")); + const ScHTMLStyles& rStyles = mpParser->GetStyles(); + const rtl::OUString& rVal = rStyles.getPropertyValue(aElem, aClass, aProp); + rtl::OUString aNumFmt = decodeNumberFormat(rVal); + + sal_uInt32 nNumberFormat = GetFormatTable()->GetEntryKey(aNumFmt); + bool bValidFmt = false; + if ( nNumberFormat == NUMBERFORMAT_ENTRY_NOT_FOUND ) + { + xub_StrLen nErrPos = 0; + short nDummy; + bValidFmt = GetFormatTable()->PutEntry(aNumFmt, nErrPos, nDummy, nNumberFormat); + } + else + bValidFmt = true; + + if (bValidFmt) + mxDataItemSet->Put( SfxUInt32Item(ATTR_VALUE_FORMAT, nNumberFormat) ); + } + } + ProcessFormatOptions( *mxDataItemSet, rInfo ); CreateNewEntry( rInfo ); mxCurrEntry->pValStr = pValStr.release(); @@ -2224,6 +2429,11 @@ void ScHTMLTable::ApplyCellBorders( ScDocument* pDoc, const ScAddress& rFirstPos aIter->ApplyCellBorders( pDoc, rFirstPos ); } +SvNumberFormatter* ScHTMLTable::GetFormatTable() +{ + return mpParser->GetDoc().GetFormatTable(); +} + // ---------------------------------------------------------------------------- bool ScHTMLTable::IsEmptyCell() const @@ -2690,9 +2900,10 @@ ScHTMLGlobalTable::ScHTMLGlobalTable( SfxItemPool& rPool, EditEngine& rEditEngine, ::std::vector< ScEEParseEntry* >& rEEParseList, - ScHTMLTableId& rnUnusedId + ScHTMLTableId& rnUnusedId, + ScHTMLParser* pParser ) : - ScHTMLTable( rPool, rEditEngine, rEEParseList, rnUnusedId ) + ScHTMLTable( rPool, rEditEngine, rEEParseList, rnUnusedId, pParser ) { } @@ -2717,7 +2928,8 @@ ScHTMLQueryParser::ScHTMLQueryParser( EditEngine* pEditEngine, ScDocument* pDoc mnUnusedId( SC_HTML_GLOBAL_TABLE ), mbTitleOn( false ) { - mxGlobTable.reset( new ScHTMLGlobalTable( *pPool, *pEdit, maList, mnUnusedId ) ); + mxGlobTable.reset( + new ScHTMLGlobalTable(*pPool, *pEdit, maList, mnUnusedId, this)); mpCurrTable = mxGlobTable.get(); 
} @@ -2779,6 +2991,9 @@ void ScHTMLQueryParser::ProcessToken( const ImportInfo& rInfo ) case HTML_TITLE_ON: TitleOn( rInfo ); break; // <title> case HTML_TITLE_OFF: TitleOff( rInfo ); break; // </title> + case HTML_STYLE_ON: break; + case HTML_STYLE_OFF: ParseStyle(rInfo.aText); break; + // --- body handling --- case HTML_BODY_ON: mpCurrTable->BodyOn( rInfo ); break; // <body> case HTML_BODY_OFF: mpCurrTable->BodyOff( rInfo ); break; // </body> @@ -2956,6 +3171,109 @@ void ScHTMLQueryParser::CloseTable( const ImportInfo& rInfo ) mpCurrTable = mpCurrTable->CloseTable( rInfo ); } +namespace { + +/** + * Handler class for the CSS parser. + */ +class CSSHandler +{ + struct MemStr + { + const char* mp; + size_t mn; + + MemStr() : mp(NULL), mn(0) {} + MemStr(const char* p, size_t n) : mp(p), mn(n) {} + MemStr(const MemStr& r) : mp(r.mp), mn(r.mn) {} + MemStr& operator=(const MemStr& r) + { + mp = r.mp; + mn = r.mn; + return *this; + } + }; + + typedef std::pair<MemStr, MemStr> SelectorName; // element : class + typedef std::vector<SelectorName> SelectorNames; + SelectorNames maSelectorNames; /// current selector names. + MemStr maPropName; /// current property name. + MemStr maPropValue; /// current property value. + + ScHTMLStyles& mrStyles; +public: + CSSHandler(ScHTMLStyles& rStyles) : mrStyles(rStyles) {} + + void at_rule_name(const char* /*p*/, size_t /*n*/) + { + // For now, we ignore at-rule properties. 
+ } + + void selector_name(const char* p_elem, size_t n_elem, const char* p_class, size_t n_class) + { + MemStr aElem(p_elem, n_elem), aClass(p_class, n_class); + SelectorName aName(aElem, aClass); + maSelectorNames.push_back(aName); + } + + void property_name(const char* p, size_t n) + { + maPropName = MemStr(p, n); + } + + void value(const char* p, size_t n) + { + maPropValue = MemStr(p, n); + } + + void begin_parse() {} + + void end_parse() {} + + void begin_block() {} + + void end_block() + { + maSelectorNames.clear(); + } + + void begin_property() {} + + void end_property() + { + SelectorNames::const_iterator itr = maSelectorNames.begin(), itrEnd = maSelectorNames.end(); + for (; itr != itrEnd; ++itr) + { + // Add this property to the collection for each selector. + const SelectorName& rSelName = *itr; + const MemStr& rElem = rSelName.first; + const MemStr& rClass = rSelName.second; + rtl::OUString aName(maPropName.mp, maPropName.mn, RTL_TEXTENCODING_UTF8); + rtl::OUString aValue(maPropValue.mp, maPropValue.mn, RTL_TEXTENCODING_UTF8); + mrStyles.add(rElem.mp, rElem.mn, rClass.mp, rClass.mn, aName, aValue); + } + maPropName = MemStr(); + maPropValue = MemStr(); + } +}; + +} + +void ScHTMLQueryParser::ParseStyle(const rtl::OUString& rStrm) +{ + rtl::OString aStr = rtl::OUStringToOString(rStrm, RTL_TEXTENCODING_UTF8); + CSSHandler aHdl(GetStyles()); + orcus::css_parser<CSSHandler> aParser(aStr.getStr(), aStr.getLength(), aHdl); + try + { + aParser.parse(); + } + catch (const orcus::css_parse_error&) + { + // Parsing of CSS failed. Do nothing for now. 
+ } +} + // ---------------------------------------------------------------------------- IMPL_LINK( ScHTMLQueryParser, HTMLImportHdl, const ImportInfo*, pInfo ) diff --git a/sc/source/filter/inc/htmlimp.hxx b/sc/source/filter/inc/htmlimp.hxx index c79bf6d4f1e2..9ad09a5135e9 100644 --- a/sc/source/filter/inc/htmlimp.hxx +++ b/sc/source/filter/inc/htmlimp.hxx @@ -39,7 +39,7 @@ private: static void InsertRangeName( ScDocument* pDoc, const String& rName, const ScRange& rRange ); public: - ScHTMLImport( ScDocument* pDoc, const String& rBaseURL, const ScRange& rRange, sal_Bool bCalcWidthHeight = sal_True ); + ScHTMLImport( ScDocument* pDoc, const String& rBaseURL, const ScRange& rRange, bool bCalcWidthHeight ); virtual ~ScHTMLImport(); const ScHTMLParser* GetParser() const { return (ScHTMLParser*)mpParser; } diff --git a/sc/source/filter/inc/htmlpars.hxx b/sc/source/filter/inc/htmlpars.hxx index 6e8753a78548..9ba3d1063b42 100644 --- a/sc/source/filter/inc/htmlpars.hxx +++ b/sc/source/filter/inc/htmlpars.hxx @@ -35,6 +35,8 @@ #include <vector> #include <list> #include <map> +#include <boost/ptr_container/ptr_map.hpp> +#include <boost/unordered_map.hpp> #include "rangelst.hxx" #include "eeparser.hxx" @@ -51,9 +53,40 @@ const sal_uInt16 SC_HTML_OFFSET_TOLERANCE_LARGE = 10; // nested class ScHTMLTable; +/** + * Collection of HTML style data parsed from the content of <style> + * elements. + */ +class ScHTMLStyles +{ + typedef ::boost::unordered_map<rtl::OUString, rtl::OUString, rtl::OUStringHash> PropsType; + typedef ::boost::ptr_map<rtl::OUString, PropsType> NamePropsType; + typedef ::boost::ptr_map<rtl::OUString, NamePropsType> ElemsType; + + NamePropsType maGlobalProps; /// global properties (for a given class for all elements) + NamePropsType maElemGlobalProps; /// element global properties (no class specified) + ElemsType maElemProps; /// element to class to properties (both element and class are given) + const rtl::OUString maEmpty; /// just a persistent empty string. 
+public: + void add(const char* pElemName, size_t nElemName, const char* pClassName, size_t nClassName, + const rtl::OUString& aProp, const rtl::OUString& aValue); + + /** + * Find best-matching property value for given element and class names. + */ + const rtl::OUString& getPropertyValue( + const rtl::OUString& rElem, const rtl::OUString& rClass, const rtl::OUString& rPropName) const; + +private: + static void insertProp( + NamePropsType& rProps, const rtl::OUString& aName, + const rtl::OUString& aProp, const rtl::OUString& aValue); +}; + /** Base class for HTML parser classes. */ class ScHTMLParser : public ScEEParser { + ScHTMLStyles maStyles; protected: sal_uInt32 maFontHeights[ SC_HTML_FONTSIZES ]; ScDocument* mpDoc; /// The destination document. @@ -64,6 +97,9 @@ public: virtual sal_uLong Read( SvStream& rStrm, const String& rBaseURL ) = 0; + ScHTMLStyles& GetStyles(); + ScDocument& GetDoc(); + /** Returns the "global table" which contains the entire HTML document. */ virtual const ScHTMLTable* GetGlobalTable() const = 0; }; @@ -436,6 +472,8 @@ public: /** Applies border formatting to the passed document. */ void ApplyCellBorders( ScDocument* pDoc, const ScAddress& rFirstPos ) const; + SvNumberFormatter* GetFormatTable(); + protected: /** Creates a new HTML table without parent. @descr This constructor is used to create the "global table". */ @@ -443,7 +481,7 @@ protected: SfxItemPool& rPool, EditEngine& rEditEngine, ::std::vector< ScEEParseEntry* >& rEEParseList, - ScHTMLTableId& rnUnusedId ); + ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser ); /** Fills all empty cells in this and nested tables with dummy parse entries. */ void FillEmptyCells(); @@ -550,6 +588,7 @@ private: ScHTMLSize maSize; /// Size of the table. ScHTMLPos maCurrCell; /// Address of current cell to fill. ScHTMLPos maDocBasePos; /// Resulting base address in a Calc document. + ScHTMLParser* mpParser; bool mbBorderOn; /// true = Table borders on. 
bool mbPreFormText; /// true = Table from preformatted text (<pre> tag). bool mbRowOn; /// true = Inside of <tr> </tr>. @@ -567,7 +606,7 @@ public: SfxItemPool& rPool, EditEngine& rEditEngine, ::std::vector< ScEEParseEntry* >& rEEParseList, - ScHTMLTableId& rnUnusedId ); + ScHTMLTableId& rnUnusedId, ScHTMLParser* pParser ); virtual ~ScHTMLGlobalTable(); @@ -620,6 +659,8 @@ private: /** Closes the current table, regardless on opening tag. */ void CloseTable( const ImportInfo& rInfo ); + void ParseStyle(const rtl::OUString& rStrm); + DECL_LINK( HTMLImportHdl, const ImportInfo* ); private: diff --git a/sc/source/filter/rtf/eeimpars.cxx b/sc/source/filter/rtf/eeimpars.cxx index 75c2489a2eb0..66fdf511eeb1 100644 --- a/sc/source/filter/rtf/eeimpars.cxx +++ b/sc/source/filter/rtf/eeimpars.cxx @@ -160,7 +160,7 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor, } ScDocumentPool* pDocPool = mpDoc->GetPool(); ScRangeName* pRangeNames = mpDoc->GetRangeName(); - for ( size_t i = 0, nListSize = mpParser->ListSize(); i < nListSize; ++i ) + for ( size_t i = 0, n = mpParser->ListSize(); i < n; ++i ) { pE = mpParser->ListEntry( i ); SCROW nRow = nStartRow + pE->nRow; @@ -274,6 +274,10 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor, const SfxPoolItem* pPosture; if ( rESet.GetItemState( ATTR_FONT_POSTURE, false, &pPosture) != SFX_ITEM_SET ) pPosture = 0; + // Number format + const SfxPoolItem* pNumFmt = NULL; + if ( rESet.GetItemState(ATTR_VALUE_FORMAT, false, &pNumFmt) == SFX_ITEM_SET ) + rSet.Put(*pNumFmt); if ( pFont || pHeight || pWeight || pPosture ) { String aStr( mpEngine->GetText( pE->aSel ) ); @@ -358,10 +362,21 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor, aStr.EraseLeadingAndTrailingChars(); } + bool bTextFormat = false; + + const SfxPoolItem* pNumFmt = NULL; + if (rSet.GetItemState(ATTR_VALUE_FORMAT, false, &pNumFmt) == SFX_ITEM_SET) + { + sal_uInt32 nNumFmt = 
static_cast<const SfxUInt32Item*>(pNumFmt)->GetValue(); + sal_uInt16 nType = pFormatter->GetType(nNumFmt); + if (nType == NUMBERFORMAT_TEXT) + // Format is set to Text. + bTextFormat = true; + } + // TODO: RTF import should follow the language tag, // currently this follows the HTML options for both, HTML // and RTF. - bool bEnUsRecognized = false; if (bNumbersEnglishUS) { pFormatter->ChangeIntl( LANGUAGE_ENGLISH_US); @@ -369,13 +384,14 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor, double fEnVal = 0.0; if (pFormatter->IsNumberFormat( aStr, nIndex, fEnVal)) { - bEnUsRecognized = true; sal_uInt32 nNewIndex = pFormatter->GetFormatForLanguageIfBuiltIn( nIndex, LANGUAGE_SYSTEM); OSL_ENSURE( nNewIndex != nIndex, "ScEEImport::WriteToDocument: NumbersEnglishUS not a built-in format?"); pFormatter->GetInputLineString( fEnVal, nNewIndex, aStr); } + else + bTextFormat = true; pFormatter->ChangeIntl( LANGUAGE_SYSTEM); } @@ -384,7 +400,7 @@ void ScEEImport::WriteToDocument( sal_Bool bSizeColsRows, double nOutputFactor, aStr.SearchAndReplaceAll( (sal_Unicode)'\t', (sal_Unicode)' ' ); aStr.SearchAndReplaceAll( (sal_Unicode)'\n', (sal_Unicode)' ' ); - if (bNumbersEnglishUS && !bEnUsRecognized) + if (bTextFormat) mpDoc->PutCell( nCol, nRow, nTab, new ScStringCell( aStr)); else { |