diff options
author | Maxim Monastirsky <momonasmon@gmail.com> | 2014-01-20 10:17:05 +0200 |
---|---|---|
committer | Kohei Yoshida <libreoffice@kohei.us> | 2014-01-23 14:49:22 +0000 |
commit | cc2893834d8ac699dbb38b152f21f17f3debb06b (patch) | |
tree | f6872c1bb50bff0ada758ba68f2a6867f6b9c053 | |
parent | 6063555744ed89d8a757b667cddcdd4357839466 (diff) |
related: fdo#73682 Introduce HTML detection service
Change-Id: I66bb579019ce8411b821c623955a454fd81cf811
Reviewed-on: https://gerrit.libreoffice.org/7600
Reviewed-by: Kohei Yoshida <libreoffice@kohei.us>
Tested-by: Kohei Yoshida <libreoffice@kohei.us>
-rw-r--r-- | Repository.mk | 1 | ||||
-rw-r--r-- | filter/Library_htmlfd.mk | 36 | ||||
-rw-r--r-- | filter/Module_filter.mk | 1 | ||||
-rw-r--r-- | filter/source/config/fragments/types/generic_HTML.xcu | 2 | ||||
-rw-r--r-- | filter/source/htmlfilterdetect/fdcomp.cxx | 36 | ||||
-rw-r--r-- | filter/source/htmlfilterdetect/filterdetect.cxx | 232 | ||||
-rw-r--r-- | filter/source/htmlfilterdetect/filterdetect.hxx | 64 | ||||
-rw-r--r-- | filter/source/htmlfilterdetect/htmlfd.component | 15 | ||||
-rwxr-xr-x | postprocess/Rdb_services.mk | 1 | ||||
-rw-r--r-- | solenv/gbuild/extensions/pre_MergedLibsList.mk | 1 |
10 files changed, 388 insertions, 1 deletions
diff --git a/Repository.mk b/Repository.mk index 6c4d488a0d64..7066001679bc 100644 --- a/Repository.mk +++ b/Repository.mk @@ -270,6 +270,7 @@ $(eval $(call gb_Helper_register_libraries_for_install,OOOLIBS,ooo, \ $(if $(ENABLE_DIRECTX),gdipluscanvas) \ guesslang \ $(if $(filter DESKTOP,$(BUILD_TYPE)),helplinker) \ + htmlfd \ i18npool \ i18nsearch \ hyphen \ diff --git a/filter/Library_htmlfd.mk b/filter/Library_htmlfd.mk new file mode 100644 index 000000000000..a147509e899e --- /dev/null +++ b/filter/Library_htmlfd.mk @@ -0,0 +1,36 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +#************************************************************************* +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +#************************************************************************* + +$(eval $(call gb_Library_Library,htmlfd)) + +$(eval $(call gb_Library_set_componentfile,htmlfd,filter/source/htmlfilterdetect/htmlfd)) + +$(eval $(call gb_Library_use_external,htmlfd,boost_headers)) + +$(eval $(call gb_Library_use_sdk_api,htmlfd)) + +$(eval $(call gb_Library_use_libraries,htmlfd,\ + ucbhelper \ + cppuhelper \ + cppu \ + sal \ + tl \ + utl \ + svt \ + $(gb_UWINAPI) \ +)) + +$(eval $(call gb_Library_add_exception_objects,htmlfd,\ + filter/source/htmlfilterdetect/fdcomp \ + filter/source/htmlfilterdetect/filterdetect \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/filter/Module_filter.mk b/filter/Module_filter.mk index 403184a93feb..58307b42a7e9 100644 --- a/filter/Module_filter.mk +++ b/filter/Module_filter.mk @@ -34,6 +34,7 @@ $(eval $(call gb_Module_add_targets,filter,\ Library_exp) \ Library_filterconfig \ Library_flash \ + Library_htmlfd \ Library_icd \ Library_icg \ Library_idx \ diff --git 
a/filter/source/config/fragments/types/generic_HTML.xcu b/filter/source/config/fragments/types/generic_HTML.xcu index ede6d2b8fefb..58ffedc85f1e 100644 --- a/filter/source/config/fragments/types/generic_HTML.xcu +++ b/filter/source/config/fragments/types/generic_HTML.xcu @@ -16,7 +16,7 @@ * the License at http://www.apache.org/licenses/LICENSE-2.0 . --> <node oor:name="generic_HTML" oor:op="replace" > - <prop oor:name="DetectService"><value>com.sun.star.text.FormatDetector</value></prop> + <prop oor:name="DetectService"><value>com.sun.star.comp.filters.HtmlFilterDetect</value></prop> <prop oor:name="URLPattern"><value>private:factory/swriter/web*</value></prop> <prop oor:name="Extensions"><value>html htm</value></prop> <prop oor:name="MediaType"><value>text/html</value></prop> diff --git a/filter/source/htmlfilterdetect/fdcomp.cxx b/filter/source/htmlfilterdetect/fdcomp.cxx new file mode 100644 index 000000000000..40360e923c33 --- /dev/null +++ b/filter/source/htmlfilterdetect/fdcomp.cxx @@ -0,0 +1,36 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include <sal/config.h> + +#include <cppuhelper/factory.hxx> +#include <cppuhelper/implementationentry.hxx> +#include <sal/types.h> + +#include "filterdetect.hxx" + +namespace { + +static cppu::ImplementationEntry const services[] = { + { &HtmlFilterDetect_createInstance, &HtmlFilterDetect_getImplementationName, + &HtmlFilterDetect_getSupportedServiceNames, + &cppu::createSingleComponentFactory, 0, 0 }, + { 0, 0, 0, 0, 0, 0 } +}; + +} + +extern "C" SAL_DLLPUBLIC_EXPORT void * SAL_CALL htmlfd_component_getFactory( + char const * pImplName, void * pServiceManager, void * pRegistryKey) +{ + return cppu::component_getFactoryHelper( + pImplName, pServiceManager, pRegistryKey, services); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/filterdetect.cxx b/filter/source/htmlfilterdetect/filterdetect.cxx new file mode 100644 index 000000000000..140912d37379 --- /dev/null +++ b/filter/source/htmlfilterdetect/filterdetect.cxx @@ -0,0 +1,232 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include "filterdetect.hxx" + +#include <svtools/htmltokn.h> +#include <tools/urlobj.hxx> +#include <ucbhelper/content.hxx> +#include <unotools/mediadescriptor.hxx> +#include <unotools/ucbstreamhelper.hxx> + +#include <com/sun/star/io/XInputStream.hpp> +#include <cppuhelper/supportsservice.hxx> + +#include <boost/scoped_ptr.hpp> + +using com::sun::star::io::XInputStream; +using com::sun::star::uno::Sequence; +using com::sun::star::uno::Reference; +using com::sun::star::uno::Any; +using com::sun::star::uno::XComponentContext; +using com::sun::star::uno::XInterface; +using com::sun::star::uno::Exception; +using com::sun::star::uno::RuntimeException; +using com::sun::star::ucb::XCommandEnvironment; + +using namespace com::sun::star; +using namespace com::sun::star::beans; + +namespace { + +enum DetectPhase { + BeforeTag, + TagOpened, + InTagName +}; + +bool isHTMLStream(const OString& aStreamHeader) +{ + const char* pHeader = aStreamHeader.getStr(); + const int nLength = aStreamHeader.getLength(); + int nStartOfTagIndex = 0; + int i = 0; + + DetectPhase dp = BeforeTag; + + for ( i = 0; i < nLength; ++i, ++pHeader ) + { + char c = *pHeader; + if ( c == ' ' || c == '\n' || c == '\t' ) + { + if ( dp == TagOpened ) + return false; // Invalid: Should start with a tag name + else if ( dp == InTagName ) + break; // End of tag name reached + } + else if ( c == '<' ) + { + if ( dp == BeforeTag ) + dp = TagOpened; + else + return false; // Invalid: Nested '<' + } + else if ( c == '>' ) + { + if ( dp == InTagName ) + break; // End of tag name reached + else + return false; // Invalid: Empty tag or before '<' + } + else if ( c == '!' ) + { + if ( i == 1 && dp == TagOpened ) + return true; // "<!" at the very beginning of the file + else + return false; // Invalid: '!' 
before '<' or inside tag name + } + else + { + if ( dp == BeforeTag ) + return false; // Invalid: Should start with a tag + else if ( dp == TagOpened ) + { + nStartOfTagIndex = i; + dp = InTagName; + } + } + } + + // The string following '<' has to be a known HTML token. + if ( GetHTMLToken( OStringToOUString( aStreamHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex ), + RTL_TEXTENCODING_ASCII_US ) ) != 0 ) + return true; + + return false; +} + +} + +OUString SAL_CALL HtmlFilterDetect::detect(Sequence<PropertyValue>& lDescriptor) + throw (RuntimeException) +{ + OUString sUrl; + OUString sDocService; + OString resultString; + Reference<XInputStream> xInStream; + + const PropertyValue *pValue = lDescriptor.getConstArray(); + sal_Int32 nLength = lDescriptor.getLength(); + sal_Int32 location = nLength; + + for ( sal_Int32 i = 0; i < nLength; ++i ) + { + if ( pValue[i].Name == utl::MediaDescriptor::PROP_URL() ) + pValue[i].Value >>= sUrl; + else if ( pValue[i].Name == utl::MediaDescriptor::PROP_INPUTSTREAM() ) + pValue[i].Value >>= xInStream; + else if ( pValue[i].Name == utl::MediaDescriptor::PROP_DOCUMENTSERVICE() ) + { + location = i; + pValue[i].Value >>= sDocService; + } + } + + try + { + if ( !xInStream.is() ) + { + ucbhelper::Content aContent( sUrl, Reference<XCommandEnvironment>(), mxCtx ); + xInStream = aContent.openStream(); + if ( !xInStream.is() ) + return OUString(); + } + + boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) ); + if ( !pInStream || pInStream->GetError() ) + return OUString(); + + pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); + sal_Size nUniPos = pInStream->Tell(); + + const sal_uInt16 nSize = 4096; + + if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode + resultString = read_uInt8s_ToOString( *pInStream, nSize ); + else // UTF-16 + resultString = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US ); + + if ( isHTMLStream( 
resultString.toAsciiLowerCase() ) ) + { + // Some Apps/Web services use ".xls" extension to indicate that + // the given file should be opened by a spreadsheet software + if ( sDocService.isEmpty() ) + { + INetURLObject aParser( sUrl ); + OUString aExt = aParser.getExtension( INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET ); + aExt = aExt.toAsciiLowerCase(); + + if ( aExt == "xls" ) + { + if ( location == lDescriptor.getLength() ) + { + lDescriptor.realloc( location + 1 ); + lDescriptor[location].Name = utl::MediaDescriptor::PROP_DOCUMENTSERVICE(); + } + lDescriptor[location].Value <<= OUString( "com.sun.star.sheet.SpreadsheetDocument" ); + } + } + return OUString( "generic_HTML" ); + } + } + catch (const Exception &) + { + OSL_FAIL( "An Exception occurred while opening File stream" ); + } + + return OUString(); // Failed +} + +// XInitialization + +void SAL_CALL HtmlFilterDetect::initialize(const Sequence<Any>& /*aArguments*/) + throw (Exception, RuntimeException) +{ +} + +OUString HtmlFilterDetect_getImplementationName() +{ + return OUString( "com.sun.star.comp.filters.HtmlFilterDetect" ); +} + +Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames() +{ + Sequence<OUString> aRet(2); + OUString* pArray = aRet.getArray(); + pArray[0] = "com.sun.star.document.ExtendedTypeDetection"; + pArray[1] = "com.sun.star.comp.filters.HtmlFilterDetect"; + return aRet; +} + +Reference<XInterface> HtmlFilterDetect_createInstance(const Reference<XComponentContext>& rCtx) +{ + return (cppu::OWeakObject*) new HtmlFilterDetect( rCtx ); +} + +// XServiceInfo + +OUString SAL_CALL HtmlFilterDetect::getImplementationName() + throw (RuntimeException) +{ + return HtmlFilterDetect_getImplementationName(); +} + +sal_Bool SAL_CALL HtmlFilterDetect::supportsService(const OUString& rServiceName) + throw (RuntimeException) +{ + return cppu::supportsService( this, rServiceName ); +} + +Sequence<OUString> SAL_CALL HtmlFilterDetect::getSupportedServiceNames() + throw 
(RuntimeException) +{ + return HtmlFilterDetect_getSupportedServiceNames(); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/filterdetect.hxx b/filter/source/htmlfilterdetect/filterdetect.hxx new file mode 100644 index 000000000000..631d4d3715e5 --- /dev/null +++ b/filter/source/htmlfilterdetect/filterdetect.hxx @@ -0,0 +1,64 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX +#define INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX + +#include <com/sun/star/document/XExtendedFilterDetection.hpp> +#include <com/sun/star/lang/XInitialization.hpp> +#include <com/sun/star/lang/XServiceInfo.hpp> +#include <com/sun/star/uno/XComponentContext.hpp> + +#include <cppuhelper/implbase3.hxx> + +class HtmlFilterDetect : public cppu::WeakImplHelper3< + com::sun::star::document::XExtendedFilterDetection, + com::sun::star::lang::XInitialization, + com::sun::star::lang::XServiceInfo> +{ + com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext> mxCtx; + +public: + + HtmlFilterDetect(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& xCtx) : + mxCtx(xCtx) {} + virtual ~HtmlFilterDetect() {} + + // XExtendedFilterDetection + + virtual OUString SAL_CALL detect(com::sun::star::uno::Sequence<com::sun::star::beans::PropertyValue>& lDescriptor) + throw (com::sun::star::uno::RuntimeException); + + // XInitialization + + virtual void SAL_CALL initialize(const ::com::sun::star::uno::Sequence<com::sun::star::uno::Any>& aArguments) + throw (com::sun::star::uno::Exception, com::sun::star::uno::RuntimeException); + + // XServiceInfo 
+ + virtual OUString SAL_CALL getImplementationName() + throw (com::sun::star::uno::RuntimeException); + + virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName) + throw (com::sun::star::uno::RuntimeException); + + virtual com::sun::star::uno::Sequence<OUString> SAL_CALL getSupportedServiceNames() + throw (com::sun::star::uno::RuntimeException); +}; + +OUString HtmlFilterDetect_getImplementationName(); + +com::sun::star::uno::Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames(); + +com::sun::star::uno::Reference<com::sun::star::uno::XInterface> +HtmlFilterDetect_createInstance(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& rCtx); + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/htmlfd.component b/filter/source/htmlfilterdetect/htmlfd.component new file mode 100644 index 000000000000..32c41b8bef26 --- /dev/null +++ b/filter/source/htmlfilterdetect/htmlfd.component @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ --> + +<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@" + prefix="htmlfd" xmlns="http://openoffice.org/2010/uno-components"> + <implementation name="com.sun.star.comp.filters.HtmlFilterDetect"> + <service name="com.sun.star.document.ExtendedTypeDetection"/> + </implementation> +</component> diff --git a/postprocess/Rdb_services.mk b/postprocess/Rdb_services.mk index cd8e3c92bae4..b0c8a10d29af 100755 --- a/postprocess/Rdb_services.mk +++ b/postprocess/Rdb_services.mk @@ -29,6 +29,7 @@ $(eval $(call gb_Rdb_add_components,services,\ filter/source/config/cache/filterconfig1 \ filter/source/flash/flash \ filter/source/graphic/graphicfilter \ + filter/source/htmlfilterdetect/htmlfd \ filter/source/msfilter/msfilter \ filter/source/odfflatxml/odfflatxml \ filter/source/pdf/pdffilter \ diff --git a/solenv/gbuild/extensions/pre_MergedLibsList.mk b/solenv/gbuild/extensions/pre_MergedLibsList.mk index 9cc207915e11..ba7ad86aeaff 100644 --- a/solenv/gbuild/extensions/pre_MergedLibsList.mk +++ b/solenv/gbuild/extensions/pre_MergedLibsList.mk @@ -46,6 +46,7 @@ gb_EXTRAMERGEDLIBS := \ graphicfilter \ guesslang \ $(if $(ENABLE_JAVA),hsqldb) \ + htmlfd \ hyphen \ icd \ icg \ |