summary | refs | log | tree | commit | diff
path: root/sc/inc
diff options
context:
space:
mode:
author: Kohei Yoshida <kohei.yoshida@suse.com> 2011-07-28 00:46:55 -0400
committer: Kohei Yoshida <kohei.yoshida@suse.com> 2011-08-03 21:27:26 -0400
commit: 24d8e4eaf4543c5b39b9e816d8514525b098827d (patch)
tree: a2e48bdc99e07c27d22685558e88fc8e4cb23b50 /sc/inc
parent: f2ea23963d92d53f29124f8dd411dbf40450c868 (diff)
Parse CSS in the <style> content and set number formats to cells.
Pick up number formats specified in the CSS content of Excel generated HTML documents. This makes use of a template-based CSS parser from the orcus project.
Diffstat (limited to 'sc/inc')
-rw-r--r-- sc/inc/orcus/README 5
-rw-r--r-- sc/inc/orcus/css_parser.hpp 513
2 files changed, 518 insertions, 0 deletions
diff --git a/sc/inc/orcus/README b/sc/inc/orcus/README
new file mode 100644
index 000000000000..3ada1c3c0dae
--- /dev/null
+++ b/sc/inc/orcus/README
@@ -0,0 +1,5 @@
+The headers in this directory are directly copied from the orcus project[1].
+When modifying any of these files, please ping me so that the changes can be
+upstreamed.
+
+[1] http://gitorious.org/orcus
diff --git a/sc/inc/orcus/css_parser.hpp b/sc/inc/orcus/css_parser.hpp
new file mode 100644
index 000000000000..7a1b3e51241f
--- /dev/null
+++ b/sc/inc/orcus/css_parser.hpp
@@ -0,0 +1,513 @@
+/*************************************************************************
+ *
+ * Copyright (c) 2011 Kohei Yoshida
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ ************************************************************************/
+
+#ifndef __ORCUS_CSS_PARSER_HPP__
+#define __ORCUS_CSS_PARSER_HPP__
+
+#define ORCUS_DEBUG_CSS 0
+
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <string>
+#include <cassert>
+#include <sstream>
+
+#if ORCUS_DEBUG_CSS
+#include <iostream>
+#endif
+
+namespace orcus {
+
+/**
+ * Exception thrown by css_parser when the CSS stream cannot be parsed.
+ *
+ * The message given at construction is copied and returned unchanged by
+ * what().
+ */
+class css_parse_error : public std::exception
+{
+    std::string m_msg;
+public:
+    // explicit: a plain std::string must not silently convert into an
+    // exception object.
+    explicit css_parse_error(const std::string& msg) : m_msg(msg) {}
+    virtual ~css_parse_error() throw() {}
+
+    /** @return the stored message; the pointer is owned by this object. */
+    virtual const char* what() const throw() { return m_msg.c_str(); }
+};
+
+/**
+ * Callback-driven parser for a CSS text stream.
+ *
+ * The parser never copies the stream; every name and value is reported to
+ * the handler as a (pointer, length) pair that points into the buffer given
+ * to the constructor.  Parse failures are reported by throwing
+ * css_parse_error.
+ *
+ * The handler type must provide the members the parser calls:
+ * begin_parse(), end_parse(), at_rule_name(p, n),
+ * selector_name(p_elem, n_elem, p_class, n_class), begin_block(),
+ * end_block(), begin_property(), end_property(), property_name(p, n) and
+ * value(p, n).
+ */
+template<typename _Handler>
+class css_parser
+{
+public:
+    typedef _Handler handler_type;
+
+    /**
+     * @param p   first character of the CSS stream.
+     * @param n   number of characters in the stream.
+     * @param hdl handler that receives the parse events; held by reference.
+     */
+    css_parser(const char* p, size_t n, handler_type& hdl);
+
+    /** Parse the whole stream, reporting its content to the handler. */
+    void parse();
+
+private:
+    // Handlers - at the time a handler is called the current position is
+    // expected to point to the first unprocessed non-blank character, and
+    // each handler must set the current position to the next unprocessed
+    // non-blank character when it finishes.
+    void rule();
+    void at_rule_name();
+    void selector_name();
+    void property_name();
+    void property();
+    void quoted_value();
+    void value();
+    void name_sep();
+    void property_sep();
+    void block();
+
+    void identifier(const char*& p, size_t& len);
+
+    void skip_blanks();
+    void skip_blanks_reverse();
+    void shrink_stream();
+    void next();
+    char cur_char() const;
+
+    /**
+     * Number of characters that follow the current one.  Returns 0 instead
+     * of wrapping around when the stream is empty or already exhausted.
+     */
+    size_t remaining_size() const
+    {
+        return has_char() ? m_length - m_pos - 1 : 0;
+    }
+
+    /** True while the current position is inside the stream. */
+    bool has_char() const { return m_pos < m_length; }
+
+    /**
+     * CSS white space: space, tab, line feed, carriage return and form
+     * feed.  Carriage return matters for stylesheets with CRLF line ends.
+     */
+    static bool is_blank(char c)
+    {
+        switch (c)
+        {
+            case ' ':
+            case '\t':
+            case '\n':
+            case '\r':
+            case '\f':
+                return true;
+        }
+
+        return false;
+    }
+
+    /** ASCII letters only. */
+    static bool is_alpha(char c)
+    {
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
+    }
+
+    /** Non-alphanumeric characters allowed inside a name. */
+    static bool is_name_char(char c)
+    {
+        return c == '-';
+    }
+
+    /** ASCII digits only. */
+    static bool is_numeric(char c)
+    {
+        return '0' <= c && c <= '9';
+    }
+
+    handler_type& m_handler;
+    const char* mp_char;    // current character
+    size_t m_pos;           // offset of the current character
+    size_t m_length;        // logical length of the stream
+};
+
+/**
+ * Remember the handler, the start of the stream and its length; the read
+ * position starts at offset 0.  Nothing is parsed here.
+ */
+template<typename _Handler>
+css_parser<_Handler>::css_parser(const char* p, size_t n, handler_type& hdl) :
+    m_handler(hdl),
+    mp_char(p),
+    m_pos(0),
+    m_length(n)
+{
+}
+
+/**
+ * Entry point: trim the stream with shrink_stream(), then feed it to rule()
+ * until no character is left.  The handler's begin_parse() and end_parse()
+ * bracket the rule processing.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::parse()
+{
+    shrink_stream();
+
+#if ORCUS_DEBUG_CSS
+    // Dump the part of the stream that is left after trimming.
+    std::cout << "compressed: '";
+    const char* p_dump = mp_char;
+    size_t pos = m_pos;
+    while (pos < m_length)
+    {
+        std::cout << *p_dump;
+        ++pos;
+        ++p_dump;
+    }
+    std::cout << "'" << std::endl;
+#endif
+    m_handler.begin_parse();
+    while (has_char())
+    {
+        rule();
+        next();
+    }
+    m_handler.end_parse();
+}
+
+/**
+ * Dispatch on the current character until the stream is exhausted:
+ * '{' opens a block, ',' separates selector names, and a letter, '.' or '@'
+ * starts a selector (or at-rule) name.
+ *
+ * @throws css_parse_error for any other character.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::rule()
+{
+    // <name> , ... , <name> { <properties> }
+    while (has_char())
+    {
+        const char c = cur_char();
+
+        if (c == '{')
+        {
+            block();
+            continue;
+        }
+
+        if (c == ',')
+        {
+            name_sep();
+            continue;
+        }
+
+        if (c == '.' || c == '@' || is_alpha(c))
+        {
+            selector_name();
+            continue;
+        }
+
+        // Nothing in the grammar starts with this character.
+        std::ostringstream os;
+        os << "failed to parse '" << c << "'";
+        throw css_parse_error(os.str());
+    }
+}
+
+/**
+ * Parse the name of an at-rule.  The current character must be '@'; the
+ * name that follows it is reported through the handler's at_rule_name() and
+ * the position is left on the next non-blank character.
+ *
+ * @throws css_parse_error if '@' is the last character of the stream or is
+ *         not followed by a letter.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::at_rule_name()
+{
+    assert(has_char());
+    assert(cur_char() == '@');
+    next();
+
+    // Check has_char() first: '@' may have been the last character, in
+    // which case cur_char() would read beyond the stream.
+    if (!has_char() || !is_alpha(cur_char()))
+        throw css_parse_error("first character of an at-rule name must be an alphabet.");
+
+    const char* p;
+    size_t len;
+    identifier(p, len);
+    skip_blanks();
+
+    m_handler.at_rule_name(p, len);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p, len);
+    std::cout << "at-rule name: " << foo.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Parse one selector name of the form
+ *
+ *   <element name> '.' <class name>
+ *
+ * where either part may be missing, and report it through the handler's
+ * selector_name().  A missing part is reported as a NULL pointer with
+ * length 0.  A name starting with '@' is forwarded to at_rule_name().
+ *
+ * @throws css_parse_error if the name starts with neither a letter nor a
+ *         dot, or if the stream ends right after the dot.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::selector_name()
+{
+    assert(has_char());
+    char c = cur_char();
+    if (c == '@')
+    {
+        // This is the name of an at-rule.
+        at_rule_name();
+        return;
+    }
+
+    if (!is_alpha(c) && c != '.')
+        throw css_parse_error("first character of a name must be an alphabet or a dot.");
+
+    const char* p_elem = NULL;
+    const char* p_class = NULL;
+    size_t len_elem = 0;
+    size_t len_class = 0;
+    if (c != '.')
+        identifier(p_elem, len_elem);
+
+    // identifier() may have consumed the rest of the stream, so make sure a
+    // character is left before looking at it.
+    if (has_char() && cur_char() == '.')
+    {
+        next();
+        if (!has_char())
+            // identifier() takes its first character unconditionally; do
+            // not let it take one from beyond the stream.
+            throw css_parse_error("class name expected after '.'.");
+        identifier(p_class, len_class);
+    }
+    skip_blanks();
+
+    m_handler.selector_name(p_elem, len_elem, p_class, len_class);
+#if ORCUS_DEBUG_CSS
+    // Either pointer may still be NULL; never build a string from it.
+    std::string elem_name, class_name;
+    if (p_elem)
+        elem_name.assign(p_elem, len_elem);
+    if (p_class)
+        class_name.assign(p_class, len_class);
+    std::cout << "selector name: (element)'" << elem_name.c_str() << "' (class)'" << class_name.c_str() << "'" << std::endl;
+#endif
+}
+
+/**
+ * Parse the name of a property, report it through the handler's
+ * property_name() and leave the position on the next non-blank character.
+ *
+ * @throws css_parse_error if the current character is neither a letter nor
+ *         a dot.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::property_name()
+{
+    assert(has_char());
+
+    const char first = cur_char();
+    const bool legal_start = is_alpha(first) || first == '.';
+    if (!legal_start)
+        throw css_parse_error("first character of a name must be an alphabet or a dot.");
+
+    const char* p_name;
+    size_t n_name;
+    identifier(p_name, n_name);
+    skip_blanks();
+
+    m_handler.property_name(p_name, n_name);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p_name, n_name);
+    std::cout << "property name: " << foo.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Parse one property:
+ *
+ *   <name> : <value> , ... , <value>
+ *
+ * The handler's begin_property() and end_property() bracket the name and
+ * the values.  Values may be separated by commas or by blanks.  Parsing
+ * stops, without consuming it, at the ';' that ends the property or at the
+ * '}' that ends the enclosing block, so that the last property of a block
+ * does not need a trailing ';'.
+ *
+ * @throws css_parse_error if the name is not followed by ':' (including the
+ *         case where the stream ends after the name).
+ */
+template<typename _Handler>
+void css_parser<_Handler>::property()
+{
+    m_handler.begin_property();
+    property_name();
+    if (!has_char() || cur_char() != ':')
+        throw css_parse_error("':' expected.");
+    next();
+    skip_blanks();
+    while (has_char())
+    {
+        value();
+        if (!has_char())
+            // The value ran up to the end of the stream.
+            break;
+
+        char c = cur_char();
+        if (c == ',')
+        {
+            // separated by commas.
+            next();
+            skip_blanks();
+        }
+        else if (c == ';' || c == '}')
+            // End of this property; the caller handles the terminator.
+            break;
+        // Anything else is taken as the start of another value.
+    }
+    skip_blanks();
+    m_handler.end_property();
+}
+
+/**
+ * Parse a double-quoted value.  The current character must be the opening
+ * quote.  The text between the quotes (possibly empty, quotes excluded) is
+ * reported through the handler's value(), and the position is left on the
+ * first non-blank character after the closing quote.
+ *
+ * @throws css_parse_error if the stream ends before the closing quote.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::quoted_value()
+{
+    assert(cur_char() == '"');
+    next(); // skip the opening quote.
+
+    const char* p = mp_char;
+    size_t len = 0;
+
+    // Start counting at zero and test the very first character too, so that
+    // an empty string ("") ends at its own closing quote.
+    for (; has_char() && cur_char() != '"'; next())
+        ++len;
+
+    if (!has_char())
+        throw css_parse_error("end quote has never been reached.");
+
+    next(); // skip the closing quote.
+    skip_blanks();
+
+    m_handler.value(p, len);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p, len);
+    std::cout << "quoted value: " << foo.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Parse a single property value and report it through the handler's
+ * value().  A value starting with '"' is delegated to quoted_value().
+ * Otherwise the value must start with a letter, a digit, '-', '+' or '.',
+ * and extends over the following letters, digits, name characters and
+ * dots.  The position is left on the next non-blank character.
+ *
+ * @throws css_parse_error if the first character cannot start a value.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::value()
+{
+    assert(has_char());
+
+    const char first = cur_char();
+    if (first == '"')
+    {
+        quoted_value();
+        return;
+    }
+
+    const bool legal_start =
+        is_alpha(first) || is_numeric(first) ||
+        first == '-' || first == '+' || first == '.';
+
+    if (!legal_start)
+    {
+        std::ostringstream os;
+        os << "illegal first character of a value '" << first << "'";
+        throw css_parse_error(os.str());
+    }
+
+    // The first character is already accepted; collect the rest.
+    const char* p_value = mp_char;
+    size_t n_value = 1;
+    next();
+    while (has_char())
+    {
+        const char c = cur_char();
+        const bool part_of_value =
+            is_alpha(c) || is_name_char(c) || is_numeric(c) || c == '.';
+        if (!part_of_value)
+            break;
+
+        ++n_value;
+        next();
+    }
+    skip_blanks();
+
+    m_handler.value(p_value, n_value);
+#if ORCUS_DEBUG_CSS
+    std::string foo(p_value, n_value);
+    std::cout << "value: " << foo.c_str() << std::endl;
+#endif
+}
+
+/**
+ * Consume the ',' between two selector names together with any blanks that
+ * follow it.  The current character must be the comma.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::name_sep()
+{
+    assert(cur_char() == ',');
+
+#if ORCUS_DEBUG_CSS
+    std::cout << "," << std::endl;
+#endif
+
+    next();         // the comma itself
+    skip_blanks();  // and whatever blanks come after it
+}
+
+/**
+ * Consume the ';' between two properties together with any blanks that
+ * follow it.  The current character must be the semicolon.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::property_sep()
+{
+    // Same precondition check as name_sep() has for ','.
+    assert(has_char());
+    assert(cur_char() == ';');
+#if ORCUS_DEBUG_CSS
+    std::cout << ";" << std::endl;
+#endif
+    next();
+    skip_blanks();
+}
+
+/**
+ * Parse a declaration block:
+ *
+ *   '{' <property> ';' ... ';' <property> '}'
+ *
+ * The block may be empty, and the ';' after the last property is optional.
+ * The handler's begin_block() and end_block() bracket the properties.  The
+ * current character must be '{'; on return the position is on the first
+ * non-blank character after '}'.
+ *
+ * @throws css_parse_error if the block is not closed by '}' (including the
+ *         case where the stream ends inside the block).
+ */
+template<typename _Handler>
+void css_parser<_Handler>::block()
+{
+    assert(cur_char() == '{');
+#if ORCUS_DEBUG_CSS
+    std::cout << "{" << std::endl;
+#endif
+    m_handler.begin_block();
+
+    next();
+    skip_blanks();
+
+    // parse properties.  Stopping at '}' covers both an empty block and a
+    // ';' after the last property.
+    while (has_char() && cur_char() != '}')
+    {
+        property();
+        if (!has_char() || cur_char() != ';')
+            break;
+        property_sep();
+    }
+
+    // Check has_char() first: the stream may have ended inside the block.
+    if (!has_char() || cur_char() != '}')
+        throw css_parse_error("} expected.");
+
+    m_handler.end_block();
+
+    next();
+    skip_blanks();
+
+#if ORCUS_DEBUG_CSS
+    std::cout << "}" << std::endl;
+#endif
+}
+
+/**
+ * Read an identifier starting at the current character.  The first
+ * character is accepted without being checked; the identifier then extends
+ * over the following letters, digits and name characters.
+ *
+ * @param p   receives a pointer to the first character of the identifier.
+ * @param len receives the number of characters in the identifier.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::identifier(const char*& p, size_t& len)
+{
+    p = mp_char;
+    len = 1;
+    next();
+
+    while (has_char())
+    {
+        const char ch = cur_char();
+        const bool part_of_name = is_alpha(ch) || is_name_char(ch) || is_numeric(ch);
+        if (!part_of_name)
+            return;
+
+        ++len;
+        next();
+    }
+}
+
+/**
+ * Advance the current position past any run of blank characters.  Stops on
+ * the first non-blank character or at the end of the stream.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::skip_blanks()
+{
+    while (has_char() && is_blank(*mp_char))
+        next();
+}
+
+/**
+ * Drop trailing blank characters by shortening the logical stream length.
+ * The scan runs backwards from the last character and stops at the first
+ * non-blank one; the current character itself is never dropped.  Does
+ * nothing when no character is left.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::skip_blanks_reverse()
+{
+    if (!has_char())
+        // Nothing to trim.  Without this check the size computation below
+        // would wrap around and the scan would leave the buffer.
+        return;
+
+    const char* p = mp_char + remaining_size(); // last character.
+    for (; p != mp_char; --p, --m_length)
+    {
+        if (!is_blank(*p))
+            break;
+    }
+}
+
+/**
+ * Trim the stream before parsing: drop leading and trailing blanks and, if
+ * the content starts with an HTML comment opener, drop that opener and a
+ * matching closer at the very end.
+ *
+ * The closer is only looked for when the opener was found.  An incomplete
+ * opener (for instance "<!-x") leaves the stream untouched.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::shrink_stream()
+{
+    // Skip any leading blanks.
+    skip_blanks();
+
+    // Nothing to trim when no character, or only one, is left.  has_char()
+    // is tested first because the size computation is unsigned.
+    if (!has_char() || !remaining_size())
+        return;
+
+    // Skip any trailing blanks.
+    skip_blanks_reverse();
+
+    // Skip leading <!-- if present.
+
+    const char* com_open = "<!--";
+    const size_t com_open_len = std::strlen(com_open);
+
+    // Number of characters from the current one to the end, inclusive.
+    size_t avail = m_length - m_pos;
+    if (avail < com_open_len)
+        // Not enough stream left. Bail out.
+        return;
+
+    // Compare first and advance afterwards, so that a partial match does
+    // not consume anything.
+    if (std::memcmp(mp_char, com_open, com_open_len) != 0)
+        return;
+
+    for (size_t i = 0; i < com_open_len; ++i)
+        next();
+
+    // Skip leading blanks once again.
+    skip_blanks();
+
+    // Skip trailing --> if present.
+    const char* com_close = "-->";
+    const size_t com_close_len = std::strlen(com_close);
+
+    avail = m_length - m_pos; // 0 when the stream is exhausted.
+    if (avail < com_close_len)
+        // Not enough stream left. Bail out.
+        return;
+
+    if (std::memcmp(mp_char + (avail - com_close_len), com_close, com_close_len) != 0)
+        return;
+
+    m_length -= com_close_len;
+
+    // The closer may have been all that was left.
+    if (has_char())
+        skip_blanks_reverse();
+}
+
+/**
+ * Move the current position one character forward.  The character pointer
+ * and the offset are advanced together; no bounds check is made here.
+ */
+template<typename _Handler>
+void css_parser<_Handler>::next()
+{
+    ++mp_char;
+    ++m_pos;
+}
+
+/**
+ * @return the character at the current position.  No bounds check is made;
+ *         callers are expected to test has_char() first.
+ */
+template<typename _Handler>
+char css_parser<_Handler>::cur_char() const
+{
+    return mp_char[0];
+}
+
+}
+
+#endif