diff options
author | Kohei Yoshida <kohei.yoshida@suse.com> | 2011-07-28 00:46:55 -0400 |
---|---|---|
committer | Kohei Yoshida <kohei.yoshida@suse.com> | 2011-08-03 21:27:26 -0400 |
commit | 24d8e4eaf4543c5b39b9e816d8514525b098827d (patch) | |
tree | a2e48bdc99e07c27d22685558e88fc8e4cb23b50 /sc/inc | |
parent | f2ea23963d92d53f29124f8dd411dbf40450c868 (diff) |
Parse CSS in the <style> content and set number formats to cells.
Pick up number formats specified in the CSS content of Excel
generated HTML documents. This makes use of a template-based CSS
parser from the orcus project.
Diffstat (limited to 'sc/inc')
-rw-r--r-- | sc/inc/orcus/README | 5 | ||||
-rw-r--r-- | sc/inc/orcus/css_parser.hpp | 513 |
2 files changed, 518 insertions, 0 deletions
// Provenance (taken from the enclosing commit diff):
//
//   sc/inc/orcus/README (new file):
//     The headers in this directory are directly copied from the orcus project[1].
//     When modifying any of these files, please ping me so that the changes can be
//     upstreamed.
//
//     [1] http://gitorious.org/orcus
//
//   sc/inc/orcus/css_parser.hpp (new file): the header below.

/*************************************************************************
 *
 * Copyright (c) 2011 Kohei Yoshida
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 ************************************************************************/

#ifndef __ORCUS_CSS_PARSER_HPP__
#define __ORCUS_CSS_PARSER_HPP__

#define ORCUS_DEBUG_CSS 0

#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <string>
#include <cassert>
#include <sstream>

#if ORCUS_DEBUG_CSS
#include <iostream>
#endif

namespace orcus {

/**
 * Exception thrown by css_parser when the input cannot be parsed.  The
 * message passed to the constructor is returned verbatim by what().
 */
class css_parse_error : public std::exception
{
    std::string m_msg;
public:
    css_parse_error(const std::string& msg) : m_msg(msg) {}
    virtual ~css_parse_error() throw() {}
    virtual const char* what() const throw() { return m_msg.c_str(); }
};

/**
 * Event-driven parser for a (small) subset of CSS.
 *
 * The parser walks the character range [p, p+n) given to the constructor
 * and reports what it finds through the handler object.  It never copies
 * the input; every (pointer, length) pair handed to the handler points
 * into the caller's buffer, which therefore has to outlive the call to
 * parse().  The buffer does not need to be NUL-terminated.
 *
 * The handler type must provide the following member functions, which are
 * the only ones this class calls:
 *
 *   void begin_parse();
 *   void end_parse();
 *   void at_rule_name(const char* p, size_t n);
 *   void selector_name(const char* p_elem, size_t n_elem,
 *                      const char* p_class, size_t n_class);
 *   void begin_block();
 *   void end_block();
 *   void begin_property();
 *   void end_property();
 *   void property_name(const char* p, size_t n);
 *   void value(const char* p, size_t n);
 *
 * For selector_name(), a missing element or class part is reported as a
 * NULL pointer with length 0.
 *
 * All syntax errors are reported by throwing css_parse_error.
 */
template<typename Handler>
class css_parser
{
public:
    typedef Handler handler_type;

    /**
     * @param p   first character of the CSS content.
     * @param n   number of characters in the content.
     * @param hdl handler receiving the parse events; stored by reference.
     */
    css_parser(const char* p, size_t n, handler_type& hdl);

    /**
     * Parse the whole stream, emitting begin_parse() first and end_parse()
     * last.  Throws css_parse_error on malformed input.
     */
    void parse();

private:
    // Handlers - at the time a handler is called the current position is
    // expected to point to the first unprocessed non-blank character, and
    // each handler must set the current position to the next unprocessed
    // non-blank character when it finishes.

    void rule();
    void at_rule_name();
    void selector_name();
    void property_name();
    void property();
    void quoted_value();
    void value();
    void name_sep();
    void property_sep();
    void block();

    void identifier(const char*& p, size_t& len);

    void skip_blanks();
    void skip_blanks_reverse();
    void shrink_stream();
    void next();
    char cur_char() const;

    // Number of unprocessed characters, including the current one.  Never
    // underflows: it is 0 once the stream is exhausted.
    size_t remaining_size() const { return has_char() ? m_length - m_pos : 0; }
    bool has_char() const { return m_pos < m_length; }

    // White space as defined by CSS: space, tab, LF, CR and FF.
    static bool is_blank(char c)
    {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
    }

    static bool is_alpha(char c)
    {
        if ('a' <= c && c <= 'z')
            return true;
        if ('A' <= c && c <= 'Z')
            return true;
        return false;
    }

    // Non-alphanumeric characters that may appear inside a name.
    static bool is_name_char(char c)
    {
        switch (c)
        {
            case '-':
            case '_':
                return true;
        }

        return false;
    }

    static bool is_numeric(char c)
    {
        if ('0' <= c && c <= '9')
            return true;
        return false;
    }

    // Any character that may appear inside an identifier.
    static bool is_ident_char(char c)
    {
        return is_alpha(c) || is_name_char(c) || is_numeric(c);
    }

    handler_type& m_handler;
    const char* mp_char; // current character; equals <buffer start> + m_pos.
    size_t m_pos;        // offset of the current character.
    size_t m_length;     // offset one past the last character to parse.
};

template<typename Handler>
css_parser<Handler>::css_parser(const char* p, size_t n, handler_type& hdl) :
    m_handler(hdl), mp_char(p), m_pos(0), m_length(n) {}

template<typename Handler>
void css_parser<Handler>::parse()
{
    shrink_stream();

#if ORCUS_DEBUG_CSS
    std::cout << "compressed: '";
    const char* p = mp_char;
    for (size_t i = m_pos; i < m_length; ++i, ++p)
        std::cout << *p;
    std::cout << "'" << std::endl;
#endif
    m_handler.begin_parse();
    // rule() only returns once the stream is exhausted (or throws), so this
    // loop never advances past the end of the buffer.
    while (has_char())
        rule();
    m_handler.end_parse();
}

/**
 * Parse selectors and blocks until the stream ends.
 */
template<typename Handler>
void css_parser<Handler>::rule()
{
    // <name> , ... , <name> { <properties> }
    while (has_char())
    {
        char c = cur_char();
        if (is_alpha(c) || c == '.' || c == '@')
        {
            selector_name();
        }
        else if (c == ',')
        {
            name_sep();
        }
        else if (c == '{')
        {
            block();
        }
        else
        {
            std::ostringstream os;
            os << "failed to parse '" << c << "'";
            throw css_parse_error(os.str());
        }
    }
}

/**
 * Parse '@' <identifier> and report the identifier (without the '@').
 */
template<typename Handler>
void css_parser<Handler>::at_rule_name()
{
    assert(has_char());
    assert(cur_char() == '@');
    next();
    // cur_char() yields '\0' at the end of the stream, which is rejected here.
    char c = cur_char();
    if (!is_alpha(c))
        throw css_parse_error("first character of an at-rule name must be an alphabet.");

    const char* p;
    size_t len;
    identifier(p, len);
    skip_blanks();

    m_handler.at_rule_name(p, len);
#if ORCUS_DEBUG_CSS
    std::string foo(p, len);
    std::cout << "at-rule name: " << foo.c_str() << std::endl;
#endif
}

/**
 * Parse a simple selector of the form [<element name>]['.' <class name>],
 * or delegate to at_rule_name() when the name starts with '@'.
 */
template<typename Handler>
void css_parser<Handler>::selector_name()
{
    // <element name> '.' <class name>

    assert(has_char());
    char c = cur_char();
    if (c == '@')
    {
        // This is the name of an at-rule.
        at_rule_name();
        return;
    }

    if (!is_alpha(c) && c != '.')
        throw css_parse_error("first character of a name must be an alphabet or a dot.");

    const char* p_elem = NULL;
    const char* p_class = NULL;
    size_t len_elem = 0;
    size_t len_class = 0;
    if (c != '.')
        identifier(p_elem, len_elem);

    if (cur_char() == '.')
    {
        next();
        // identifier() takes the current character unconditionally, so make
        // sure there really is a name after the dot.
        if (!is_ident_char(cur_char()))
            throw css_parse_error("class name expected after a dot.");
        identifier(p_class, len_class);
    }
    skip_blanks();

    m_handler.selector_name(p_elem, len_elem, p_class, len_class);
#if ORCUS_DEBUG_CSS
    std::string elem_name, class_name;
    if (p_elem)
        elem_name.assign(p_elem, len_elem);
    if (p_class)
        class_name.assign(p_class, len_class);
    std::cout << "selector name: (element)'" << elem_name.c_str() << "' (class)'" << class_name.c_str() << "'" << std::endl;
#endif
}

template<typename Handler>
void css_parser<Handler>::property_name()
{
    assert(has_char());
    char c = cur_char();
    if (!is_alpha(c) && c != '.')
        throw css_parse_error("first character of a name must be an alphabet or a dot.");

    const char* p;
    size_t len;
    identifier(p, len);
    skip_blanks();

    m_handler.property_name(p, len);
#if ORCUS_DEBUG_CSS
    std::string foo(p, len);
    std::cout << "property name: " << foo.c_str() << std::endl;
#endif
}

/**
 * Parse one property.  Values may be separated by commas or by blanks; the
 * property ends at ';', at the closing '}' of the block, or at the end of
 * the stream.  The terminating character itself is left unconsumed.
 */
template<typename Handler>
void css_parser<Handler>::property()
{
    // <name> : <value> , ... , <value>
    m_handler.begin_property();
    property_name();
    if (cur_char() != ':')
        throw css_parse_error("':' expected.");
    next();
    skip_blanks();
    while (has_char())
    {
        value();
        char c = cur_char();
        if (c == ',')
        {
            // separated by commas.
            next();
            skip_blanks();
        }
        else if (c == ';' || c == '}')
            // '}' ends the last property of a block when the optional
            // trailing ';' is omitted.
            break;
    }
    skip_blanks();
    m_handler.end_property();
}

/**
 * Parse a double-quoted value and report its content without the quotes.
 * An empty string ("") is reported with length 0.
 */
template<typename Handler>
void css_parser<Handler>::quoted_value()
{
    assert(cur_char() == '"');
    next(); // skip the opening quote.
    const char* p = mp_char;
    size_t len = 0;
    for (; has_char() && cur_char() != '"'; next())
        ++len;

    if (!has_char())
        throw css_parse_error("end quote has never been reached.");

    next(); // skip the closing quote.
    skip_blanks();

    m_handler.value(p, len);
#if ORCUS_DEBUG_CSS
    std::string foo(p, len);
    std::cout << "quoted value: " << foo.c_str() << std::endl;
#endif
}

/**
 * Parse a single value: either a quoted string or a run of alphanumeric
 * characters, name characters and dots.
 */
template<typename Handler>
void css_parser<Handler>::value()
{
    assert(has_char());
    char c = cur_char();
    if (c == '"')
    {
        quoted_value();
        return;
    }

    if (!is_alpha(c) && !is_numeric(c) && c != '-' && c != '+' && c != '.')
    {
        std::ostringstream os;
        os << "illegal first character of a value '" << c << "'";
        throw css_parse_error(os.str());
    }

    const char* p = mp_char;
    size_t len = 1;
    for (next(); has_char(); next())
    {
        c = cur_char();
        if (!is_ident_char(c) && c != '.')
            break;
        ++len;
    }
    skip_blanks();

    m_handler.value(p, len);
#if ORCUS_DEBUG_CSS
    std::string foo(p, len);
    std::cout << "value: " << foo.c_str() << std::endl;
#endif
}

template<typename Handler>
void css_parser<Handler>::name_sep()
{
    assert(cur_char() == ',');
#if ORCUS_DEBUG_CSS
    std::cout << "," << std::endl;
#endif
    next();
    skip_blanks();
}

template<typename Handler>
void css_parser<Handler>::property_sep()
{
    assert(cur_char() == ';');
#if ORCUS_DEBUG_CSS
    std::cout << ";" << std::endl;
#endif
    next();
    skip_blanks();
}

/**
 * Parse a declaration block, emitting begin_block() / end_block() around
 * the properties it contains.
 */
template<typename Handler>
void css_parser<Handler>::block()
{
    // '{' <property> ';' ... ';' <property> '}'

    assert(cur_char() == '{');
#if ORCUS_DEBUG_CSS
    std::cout << "{" << std::endl;
#endif
    m_handler.begin_block();

    next();
    skip_blanks();

    // parse properties.
    while (has_char())
    {
        property();
        if (cur_char() != ';')
            break;
        property_sep();
        if (cur_char() == '}')
            // ';' after the last property.  This is optional but allowed.
            break;
    }

    // Also reached when the stream ended before the block was closed.
    if (cur_char() != '}')
        throw css_parse_error("} expected.");

    m_handler.end_block();

    next();
    skip_blanks();

#if ORCUS_DEBUG_CSS
    std::cout << "}" << std::endl;
#endif
}

/**
 * Consume an identifier starting at the current character.  The first
 * character is taken as-is (callers validate it); the identifier extends
 * over all following identifier characters.
 *
 * @param p   receives the position of the first character.
 * @param len receives the number of characters in the identifier.
 */
template<typename Handler>
void css_parser<Handler>::identifier(const char*& p, size_t& len)
{
    assert(has_char());
    p = mp_char;
    len = 1;
    for (next(); has_char(); next())
    {
        if (!is_ident_char(cur_char()))
            break;
        ++len;
    }
}

template<typename Handler>
void css_parser<Handler>::skip_blanks()
{
    for (; has_char(); next())
    {
        if (!is_blank(*mp_char))
            break;
    }
}

/**
 * Drop trailing blanks by shortening the stream from its end.
 */
template<typename Handler>
void css_parser<Handler>::skip_blanks_reverse()
{
    while (has_char() && is_blank(mp_char[remaining_size() - 1]))
        --m_length;
}

/**
 * Trim the stream before parsing: strip surrounding blanks and, when the
 * content starts with "<!--", that marker plus a matching trailing "-->".
 */
template<typename Handler>
void css_parser<Handler>::shrink_stream()
{
    // Skip any leading blanks.
    skip_blanks();

    if (!has_char())
        return;

    // Skip any trailing blanks.
    skip_blanks_reverse();

    // Skip leading <!-- if present.  The marker is only consumed when it
    // matches in full, so a partial match leaves the stream untouched.
    static const char com_open[] = "<!--";
    const size_t com_open_len = sizeof(com_open) - 1;
    if (remaining_size() < com_open_len)
        // Not enough stream left.  Bail out.
        return;

    if (std::memcmp(mp_char, com_open, com_open_len) != 0)
        return;

    mp_char += com_open_len;
    m_pos += com_open_len;

    // Skip leading blanks once again.
    skip_blanks();

    // Skip trailing --> if present.
    static const char com_close[] = "-->";
    const size_t com_close_len = sizeof(com_close) - 1;
    const size_t n = remaining_size();
    if (n < com_close_len)
        // Not enough stream left.  Bail out.
        return;

    if (std::memcmp(mp_char + n - com_close_len, com_close, com_close_len) != 0)
        return;

    m_length -= com_close_len;

    skip_blanks_reverse();
}

template<typename Handler>
void css_parser<Handler>::next()
{
    assert(has_char());
    ++m_pos;
    ++mp_char;
}

/**
 * @return the current character, or '\0' once the stream is exhausted so
 *         that callers never read past the end of the buffer.
 */
template<typename Handler>
char css_parser<Handler>::cur_char() const
{
    return has_char() ? *mp_char : '\0';
}

}

#endif