diff options
author | Markus Mohrhard <markus.mohrhard@googlemail.com> | 2017-08-12 11:30:38 +0200 |
---|---|---|
committer | Markus Mohrhard <markus.mohrhard@googlemail.com> | 2017-08-12 18:35:46 +0200 |
commit | 1a4dd1fa2a851f678d728ed342a59d48f8cc74ea (patch) | |
tree | f9b775f44dac9050c586fd07f99a39738de7a92a /sc/source | |
parent | 5059000f219c2709616bf1f919ca8e99a56e3054 (diff) |
external data: add html data provider
Change-Id: I4ae266707f5cf3b5231f726082950f90df3ca1eb
Reviewed-on: https://gerrit.libreoffice.org/41083
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Markus Mohrhard <markus.mohrhard@googlemail.com>
Diffstat (limited to 'sc/source')
-rw-r--r-- | sc/source/ui/dataprovider/dataprovider.cxx | 7 | ||||
-rw-r--r-- | sc/source/ui/dataprovider/htmldataprovider.cxx | 209 | ||||
-rw-r--r-- | sc/source/ui/dataprovider/htmldataprovider.hxx | 48 |
3 files changed, 263 insertions, 1 deletions
diff --git a/sc/source/ui/dataprovider/dataprovider.cxx b/sc/source/ui/dataprovider/dataprovider.cxx
index 4c23420a18a4..119dbdd18e9b 100644
--- a/sc/source/ui/dataprovider/dataprovider.cxx
+++ b/sc/source/ui/dataprovider/dataprovider.cxx
@@ -14,6 +14,8 @@
 #include "officecfg/Office/Calc.hxx"
 #include <rtl/strbuf.hxx>
 
+#include "htmldataprovider.hxx"
+
 using namespace com::sun::star;
 
 namespace sc {
@@ -203,13 +205,16 @@ bool DataProviderFactory::isInternalDataProvider(const OUString& rProvider)
     return rProvider.startsWith("org.libreoffice.calc");
 }
 
-std::shared_ptr<DataProvider> DataProviderFactory::getDataProvider(ScDocument* pDoc, const OUString& rProvider, const OUString& rURL, const OUString& /*rID*/, ScDBDataManager* pManager)
+std::shared_ptr<DataProvider> DataProviderFactory::getDataProvider(ScDocument* pDoc, const OUString& rProvider,
+        const OUString& rURL, const OUString& rID, ScDBDataManager* pManager)
 {
     bool bInternal = DataProviderFactory::isInternalDataProvider(rProvider);
     if (bInternal)
     {
         if (rProvider == "org.libreoffice.calc.csv")
             return std::shared_ptr<DataProvider>(new CSVDataProvider(pDoc, rURL, pManager));
+        else if (rProvider == "org.libreoffice.calc.html")
+            return std::shared_ptr<DataProvider>(new HTMLDataProvider(pDoc, rURL, pManager, rID));
     }
     else
     {
diff --git a/sc/source/ui/dataprovider/htmldataprovider.cxx b/sc/source/ui/dataprovider/htmldataprovider.cxx
new file mode 100644
index 000000000000..c73efee0260c
--- /dev/null
+++ b/sc/source/ui/dataprovider/htmldataprovider.cxx
@@ -0,0 +1,293 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "htmldataprovider.hxx"
+#include <salhelper/thread.hxx>
+
+#include <libxml/HTMLparser.h>
+#include <libxml/HTMLtree.h>
+
+#include <libxml/xpath.h>
+#include <libxml/xpathInternals.h>
+
+#include <comphelper/string.hxx>
+
+#include <memory>
+
+namespace sc {
+
+/**
+ * Worker thread that fetches the document at a URL, parses it as HTML,
+ * evaluates an XPath expression on it and copies the cell texts of the first
+ * matching node into sheet 0 of the ScDocument it was constructed with.
+ */
+class HTMLFetchThread : public salhelper::Thread
+{
+    ScDocument& mrDocument;
+    OUString maURL;
+    /// XPath expression selecting the table node to import.
+    OUString maID;
+
+    /// Started, with the SolarMutex held, after a table has been imported.
+    Idle* mpIdle;
+
+    void handleTable(xmlNodePtr pTable);
+    void handleRow(xmlNodePtr pRow, SCROW nRow);
+    void skipHeadBody(xmlNodePtr pSkip, SCROW& rRow);
+    void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol);
+
+public:
+    HTMLFetchThread(ScDocument& rDoc, const OUString&, const OUString& rID, Idle* pIdle);
+
+    virtual void execute() override;
+};
+
+HTMLFetchThread::HTMLFetchThread(ScDocument& rDoc, const OUString& rURL, const OUString& rID, Idle* pIdle):
+    salhelper::Thread("HTML Fetch Thread"),
+    mrDocument(rDoc),
+    maURL(rURL),
+    maID(rID),
+    mpIdle(pIdle)
+{
+}
+
+namespace {
+
+/// Copies the bytes of a NUL-terminated libxml2 string into an OString.
+OString toString(const xmlChar* pStr)
+{
+    return OString(reinterpret_cast<const char*>(pStr), xmlStrlen(pStr));
+}
+
+// Deleters that let std::unique_ptr own the libxml2 objects created in
+// HTMLFetchThread::execute(), so that every exit path releases them.
+struct XmlDocDeleter
+{
+    void operator()(xmlDocPtr pDoc) const { xmlFreeDoc(pDoc); }
+};
+
+struct XPathContextDeleter
+{
+    void operator()(xmlXPathContextPtr pCtx) const { xmlXPathFreeContext(pCtx); }
+};
+
+struct XPathObjectDeleter
+{
+    void operator()(xmlXPathObjectPtr pObj) const { xmlXPathFreeObject(pObj); }
+};
+
+}
+
+/**
+ * Imports the text of one table cell into (nCol, nRow) of sheet 0.
+ *
+ * Only text nodes that are direct children of pCellNode are used. Each one
+ * is stripped of surrounding spaces, tabs and line breaks and then handed to
+ * ScDocument::SetString(), one call per text node.
+ */
+void HTMLFetchThread::handleCell(xmlNodePtr pCellNode, SCROW nRow, SCCOL nCol)
+{
+    for (xmlNodePtr cur_node = pCellNode->children; cur_node; cur_node = cur_node->next)
+    {
+        if (cur_node->type == XML_TEXT_NODE)
+        {
+            OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8);
+            OUString aOldString;
+            // Every strip() call removes a single kind of character, so
+            // repeat the passes until one leaves the string unchanged.
+            do
+            {
+                aOldString = aString;
+                aString = comphelper::string::strip(aString, ' ');
+                aString = comphelper::string::strip(aString, '\n');
+                aString = comphelper::string::strip(aString, '\r');
+                aString = comphelper::string::strip(aString, '\t');
+            }
+            while (aOldString != aString);
+
+            mrDocument.SetString(nCol, nRow, 0, aString);
+        }
+    }
+}
+
+/**
+ * Imports the <td> and <th> element children of pRowNode into row nRow,
+ * using consecutive columns that start at 0.
+ */
+void HTMLFetchThread::handleRow(xmlNodePtr pRowNode, SCROW nRow)
+{
+    // Declared as SCCOL, the parameter type of handleCell(), so the call
+    // below involves no implicit integer conversion.
+    SCCOL nCol = 0;
+    for (xmlNodePtr cur_node = pRowNode->children; cur_node; cur_node = cur_node->next)
+    {
+        if (cur_node->type == XML_ELEMENT_NODE)
+        {
+            OString aNodeName = toString(cur_node->name);
+            if (aNodeName == "td" || aNodeName == "th")
+            {
+                handleCell(cur_node, nRow, nCol);
+                ++nCol;
+            }
+        }
+    }
+}
+
+/**
+ * Imports the <tr> element children of a <thead> or <tbody> element.
+ *
+ * @param rRow row to write next; incremented once per imported <tr>.
+ */
+void HTMLFetchThread::skipHeadBody(xmlNodePtr pSkipElement, SCROW& rRow)
+{
+    for (xmlNodePtr cur_node = pSkipElement->children; cur_node; cur_node = cur_node->next)
+    {
+        if (cur_node->type == XML_ELEMENT_NODE)
+        {
+            OString aNodeName = toString(cur_node->name);
+            if (aNodeName == "tr")
+            {
+                handleRow(cur_node, rRow);
+                ++rRow;
+            }
+        }
+    }
+}
+
+/**
+ * Imports a table node: its <tr> element children as well as the rows inside
+ * its <thead> and <tbody> element children. Rows are numbered from 0 in the
+ * order in which they are encountered.
+ */
+void HTMLFetchThread::handleTable(xmlNodePtr pTable)
+{
+    SCROW nRow = 0;
+    for (xmlNodePtr cur_node = pTable->children; cur_node; cur_node = cur_node->next)
+    {
+        if (cur_node->type == XML_ELEMENT_NODE)
+        {
+            OString aNodeName = toString(cur_node->name);
+            if (aNodeName == "tr")
+            {
+                handleRow(cur_node, nRow);
+                ++nRow;
+            }
+            else if (aNodeName == "thead" || aNodeName == "tbody")
+            {
+                skipHeadBody(cur_node, nRow);
+            }
+        }
+    }
+}
+
+/**
+ * Thread body: fetches maURL, parses the data as HTML, evaluates maID as an
+ * XPath expression and imports the first node of the result as a table.
+ * The idle is started only after a table has been imported.
+ */
+void HTMLFetchThread::execute()
+{
+    OStringBuffer aBuffer(64000);
+    std::unique_ptr<SvStream> pStream = DataProvider::FetchStreamFromURL(maURL, aBuffer);
+
+    {
+        // The libxml2 objects are confined to this scope, so they are
+        // released on every return below and before the SolarMutex is taken.
+        // NOTE(review): on the early returns the idle is never started, so
+        // HTMLDataProvider::ImportFinishedHdl does not run and mpDoc stays
+        // set, which makes every later HTMLDataProvider::Import() return
+        // early - consider reporting the failure to the provider.
+        std::unique_ptr<xmlDoc, XmlDocDeleter> pHtmlDoc(
+            htmlParseDoc(reinterpret_cast<xmlChar*>(const_cast<char*>(aBuffer.getStr())), nullptr));
+        if (!pHtmlDoc)
+            return;
+
+        std::unique_ptr<xmlXPathContext, XPathContextDeleter> pXPathCtx(xmlXPathNewContext(pHtmlDoc.get()));
+        if (!pXPathCtx)
+            return;
+
+        OString aID = OUStringToOString(maID, RTL_TEXTENCODING_UTF8);
+        std::unique_ptr<xmlXPathObject, XPathObjectDeleter> pXPathObj(
+            xmlXPathEvalExpression(BAD_CAST(aID.getStr()), pXPathCtx.get()));
+
+        // The result is null when the expression cannot be evaluated; the
+        // macro also covers a missing or empty node set.
+        if (!pXPathObj || xmlXPathNodeSetIsEmpty(pXPathObj->nodesetval))
+            return;
+
+        handleTable(pXPathObj->nodesetval->nodeTab[0]);
+    }
+
+    SolarMutexGuard aGuard;
+    mpIdle->Start();
+}
+
+/**
+ * @param pDoc       document passed to ScDocument::ResetClip() in Import()
+ * @param rURL       location of the HTML document to fetch
+ * @param pDBManager receives the imported data through WriteToDoc()
+ * @param rID        XPath expression selecting the table to import
+ */
+HTMLDataProvider::HTMLDataProvider(ScDocument* pDoc, const OUString& rURL, ScDBDataManager* pDBManager,
+        const OUString& rID):
+    maID(rID),
+    maURL(rURL),
+    mpDocument(pDoc),
+    mpDBDataManager(pDBManager),
+    maIdle("HTMLDataProvider CopyHandler")
+{
+    maIdle.SetInvokeHandler(LINK(this, HTMLDataProvider, ImportFinishedHdl));
+}
+
+HTMLDataProvider::~HTMLDataProvider()
+{
+    // The fetch thread uses *mpDoc and maIdle, so wait for it to finish
+    // before the members are destroyed.
+    if (mxHTMLFetchThread.is())
+    {
+        mxHTMLFetchThread->join();
+    }
+}
+
+/**
+ * Starts an import: creates a clipboard document and launches a
+ * HTMLFetchThread that fills it. Does nothing while mpDoc is still set.
+ */
+void HTMLDataProvider::Import()
+{
+    // already importing data
+    if (mpDoc)
+        return;
+
+    mpDoc.reset(new ScDocument(SCDOCMODE_CLIP));
+    mpDoc->ResetClip(mpDocument, static_cast<SCTAB>(0));
+    mxHTMLFetchThread = new HTMLFetchThread(*mpDoc, maURL, maID, &maIdle);
+    mxHTMLFetchThread->launch();
+}
+
+/**
+ * Handler of maIdle, which the fetch thread starts after importing a table:
+ * passes the filled document to the data manager, then drops the thread and
+ * the document so that Import() can run again.
+ */
+IMPL_LINK_NOARG(HTMLDataProvider, ImportFinishedHdl, Timer*, void)
+{
+    mpDBDataManager->WriteToDoc(*mpDoc);
+    mxHTMLFetchThread.clear();
+    mpDoc.reset();
+}
+
+const OUString& HTMLDataProvider::GetURL() const
+{
+    return maURL;
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/ui/dataprovider/htmldataprovider.hxx b/sc/source/ui/dataprovider/htmldataprovider.hxx
new file mode 100644
index 000000000000..adbcf37baf5e
--- /dev/null
+++ b/sc/source/ui/dataprovider/htmldataprovider.hxx
@@ -0,0 +1,57 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef INCLUDED_SC_SOURCE_UI_DATAPROVIDER_HTMLDATAPROVIDER_HXX
+#define INCLUDED_SC_SOURCE_UI_DATAPROVIDER_HTMLDATAPROVIDER_HXX
+
+#include "dataprovider.hxx"
+
+namespace sc {
+
+class HTMLFetchThread;
+
+/**
+ * Data provider that imports a table from an HTML document.
+ *
+ * Import() launches a HTMLFetchThread that fills a clipboard document; once
+ * the thread starts the idle, ImportFinishedHdl passes that document to the
+ * ScDBDataManager.
+ */
+class HTMLDataProvider : public DataProvider
+{
+private:
+
+    /// XPath expression selecting the table to import.
+    OUString maID;
+    OUString maURL;
+    ScDocument* mpDocument;
+    ScDBDataManager* mpDBDataManager;
+    rtl::Reference<HTMLFetchThread> mxHTMLFetchThread;
+
+    /// Clipboard document of the pending import; empty when none is pending.
+    std::unique_ptr<ScDocument> mpDoc;
+    Idle maIdle;
+
+public:
+
+    HTMLDataProvider(ScDocument* pDoc, const OUString& rURL, ScDBDataManager* pDBManager, const OUString& rID);
+    virtual ~HTMLDataProvider() override;
+
+    virtual void Import() override;
+
+    virtual const OUString& GetURL() const override;
+
+    DECL_LINK( ImportFinishedHdl, Timer*, void );
+};
+
+}
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */