From d5890e87ab5e298e9a74ed0d552b01a98e59b1fa Mon Sep 17 00:00:00 2001 From: Maxim Monastirsky Date: Sat, 1 Mar 2014 20:03:41 +0200 Subject: Merge HTML detection to text detection service Setting the filter name explicitly is not really required, because TypeDetection::impl_checkResultsAndAddBestFilter is able to select the correct filter anyway. But it seems that other detection services also do it, so I followed that way. Change-Id: I6e73fa79c6867d82f98d63e8d2b7865446f088ef Reviewed-on: https://gerrit.libreoffice.org/8213 Reviewed-by: Kohei Yoshida Tested-by: Kohei Yoshida --- filter/Library_htmlfd.mk | 36 ---- filter/Library_textfd.mk | 5 + filter/Module_filter.mk | 1 - .../source/config/fragments/types/generic_HTML.xcu | 2 +- filter/source/htmlfilterdetect/fdcomp.cxx | 36 ---- filter/source/htmlfilterdetect/filterdetect.cxx | 232 --------------------- filter/source/htmlfilterdetect/filterdetect.hxx | 64 ------ filter/source/htmlfilterdetect/htmlfd.component | 15 -- filter/source/textfilterdetect/filterdetect.cxx | 178 +++++++++++----- 9 files changed, 127 insertions(+), 442 deletions(-) delete mode 100644 filter/Library_htmlfd.mk delete mode 100644 filter/source/htmlfilterdetect/fdcomp.cxx delete mode 100644 filter/source/htmlfilterdetect/filterdetect.cxx delete mode 100644 filter/source/htmlfilterdetect/filterdetect.hxx delete mode 100644 filter/source/htmlfilterdetect/htmlfd.component (limited to 'filter') diff --git a/filter/Library_htmlfd.mk b/filter/Library_htmlfd.mk deleted file mode 100644 index cfb708b4fcc3..000000000000 --- a/filter/Library_htmlfd.mk +++ /dev/null @@ -1,36 +0,0 @@ -# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- -#************************************************************************* -# -# This file is part of the LibreOffice project. -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# -#************************************************************************* - -$(eval $(call gb_Library_Library,htmlfd)) - -$(eval $(call gb_Library_set_componentfile,htmlfd,filter/source/htmlfilterdetect/htmlfd)) - -$(eval $(call gb_Library_use_external,htmlfd,boost_headers)) - -$(eval $(call gb_Library_use_sdk_api,htmlfd)) - -$(eval $(call gb_Library_use_libraries,htmlfd,\ - ucbhelper \ - cppuhelper \ - cppu \ - sal \ - tl \ - utl \ - svt \ - $(gb_UWINAPI) \ -)) - -$(eval $(call gb_Library_add_exception_objects,htmlfd,\ - filter/source/htmlfilterdetect/fdcomp \ - filter/source/htmlfilterdetect/filterdetect \ -)) - -# vim: set noet sw=4 ts=4: diff --git a/filter/Library_textfd.mk b/filter/Library_textfd.mk index ac1cda3a1fcf..ef470b7e9d51 100644 --- a/filter/Library_textfd.mk +++ b/filter/Library_textfd.mk @@ -13,14 +13,19 @@ $(eval $(call gb_Library_Library,textfd)) $(eval $(call gb_Library_set_componentfile,textfd,filter/source/textfilterdetect/textfd)) +$(eval $(call gb_Library_use_external,textfd,boost_headers)) + $(eval $(call gb_Library_use_sdk_api,textfd)) $(eval $(call gb_Library_use_libraries,textfd,\ + comphelper \ ucbhelper \ cppuhelper \ cppu \ sal \ tl \ + utl \ + svt \ $(gb_UWINAPI) \ )) diff --git a/filter/Module_filter.mk b/filter/Module_filter.mk index 58307b42a7e9..403184a93feb 100644 --- a/filter/Module_filter.mk +++ b/filter/Module_filter.mk @@ -34,7 +34,6 @@ $(eval $(call gb_Module_add_targets,filter,\ Library_exp) \ Library_filterconfig \ Library_flash \ - Library_htmlfd \ Library_icd \ Library_icg \ Library_idx \ diff --git a/filter/source/config/fragments/types/generic_HTML.xcu b/filter/source/config/fragments/types/generic_HTML.xcu index 58ffedc85f1e..b00b048d3842 100644 --- a/filter/source/config/fragments/types/generic_HTML.xcu +++ b/filter/source/config/fragments/types/generic_HTML.xcu @@ -16,7 +16,7 @@ * the License at http://www.apache.org/licenses/LICENSE-2.0 . --> - com.sun.star.comp.filters.HtmlFilterDetect + com.sun.star.comp.filters.PlainTextFilterDetect private:factory/swriter/web* html htm text/html diff --git a/filter/source/htmlfilterdetect/fdcomp.cxx b/filter/source/htmlfilterdetect/fdcomp.cxx deleted file mode 100644 index 40360e923c33..000000000000 --- a/filter/source/htmlfilterdetect/fdcomp.cxx +++ /dev/null @@ -1,36 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ - -#include - -#include -#include -#include - -#include "filterdetect.hxx" - -namespace { - -static cppu::ImplementationEntry const services[] = { - { &HtmlFilterDetect_createInstance, &HtmlFilterDetect_getImplementationName, - &HtmlFilterDetect_getSupportedServiceNames, - &cppu::createSingleComponentFactory, 0, 0 }, - { 0, 0, 0, 0, 0, 0 } -}; - -} - -extern "C" SAL_DLLPUBLIC_EXPORT void * SAL_CALL htmlfd_component_getFactory( - char const * pImplName, void * pServiceManager, void * pRegistryKey) -{ - return cppu::component_getFactoryHelper( - pImplName, pServiceManager, pRegistryKey, services); -} - -/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/filterdetect.cxx b/filter/source/htmlfilterdetect/filterdetect.cxx deleted file mode 100644 index 5b617c47bf4f..000000000000 --- a/filter/source/htmlfilterdetect/filterdetect.cxx +++ /dev/null @@ -1,232 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ - -#include "filterdetect.hxx" - -#include -#include -#include -#include -#include - -#include -#include - -#include - -using com::sun::star::io::XInputStream; -using com::sun::star::uno::Sequence; -using com::sun::star::uno::Reference; -using com::sun::star::uno::Any; -using com::sun::star::uno::XComponentContext; -using com::sun::star::uno::XInterface; -using com::sun::star::uno::Exception; -using com::sun::star::uno::RuntimeException; -using com::sun::star::ucb::XCommandEnvironment; - -using namespace com::sun::star; -using namespace com::sun::star::beans; - -namespace { - -enum DetectPhase { - BeforeTag, - TagOpened, - InTagName -}; - -bool isHTMLStream(const OString& aStreamHeader) -{ - const char* pHeader = aStreamHeader.getStr(); - const int nLength = aStreamHeader.getLength(); - int nStartOfTagIndex = 0; - int i = 0; - - DetectPhase dp = BeforeTag; - - for ( i = 0; i < nLength; ++i, ++pHeader ) - { - char c = *pHeader; - if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' ) - { - if ( dp == TagOpened ) - return false; // Invalid: Should start with a tag name - else if ( dp == InTagName ) - break; // End of tag name reached - } - else if ( c == '<' ) - { - if ( dp == BeforeTag ) - dp = TagOpened; - else - return false; // Invalid: Nested '<' - } - else if ( c == '>' ) - { - if ( dp == InTagName ) - break; // End of tag name reached - else - return false; // Invalid: Empty tag or before '<' - } - else if ( c == '!' ) - { - if ( dp == TagOpened ) - return true; // "& lDescriptor) - throw (RuntimeException, std::exception) -{ - OUString sUrl; - OUString sDocService; - OString resultString; - Reference xInStream; - - const PropertyValue *pValue = lDescriptor.getConstArray(); - sal_Int32 nLength = lDescriptor.getLength(); - sal_Int32 location = nLength; - - for ( sal_Int32 i = 0; i < nLength; ++i ) - { - if ( pValue[i].Name == utl::MediaDescriptor::PROP_URL() ) - pValue[i].Value >>= sUrl; - else if ( pValue[i].Name == utl::MediaDescriptor::PROP_INPUTSTREAM() ) - pValue[i].Value >>= xInStream; - else if ( pValue[i].Name == utl::MediaDescriptor::PROP_DOCUMENTSERVICE() ) - { - location = i; - pValue[i].Value >>= sDocService; - } - } - - try - { - if ( !xInStream.is() ) - { - ucbhelper::Content aContent( sUrl, Reference(), mxCtx ); - xInStream = aContent.openStream(); - if ( !xInStream.is() ) - return OUString(); - } - - boost::scoped_ptr pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) ); - if ( !pInStream || pInStream->GetError() ) - return OUString(); - - pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); - sal_Size nUniPos = pInStream->Tell(); - - const sal_uInt16 nSize = 4096; - - if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode - resultString = read_uInt8s_ToOString( *pInStream, nSize ); - else // UTF-16 - resultString = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US ); - - if ( isHTMLStream( resultString.toAsciiLowerCase() ) ) - { - // Some Apps/Web services use ".xls" extension to indicate that - // the given file should be opened by a spreadsheet software - if ( sDocService.isEmpty() ) - { - INetURLObject aParser( sUrl ); - OUString aExt = aParser.getExtension( INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET ); - aExt = aExt.toAsciiLowerCase(); - - if ( aExt == "xls" ) - { - if ( location == lDescriptor.getLength() ) - { - lDescriptor.realloc( location + 1 ); - lDescriptor[location].Name = utl::MediaDescriptor::PROP_DOCUMENTSERVICE(); - } - lDescriptor[location].Value <<= OUString( "com.sun.star.sheet.SpreadsheetDocument" ); - } - } - return OUString( "generic_HTML" ); - } - } - catch (const Exception &) - { - OSL_FAIL( "An Exception occurred while opening File stream" ); - } - - return OUString(); // Failed -} - -// XInitialization - -void SAL_CALL HtmlFilterDetect::initialize(const Sequence& /*aArguments*/) - throw (Exception, RuntimeException, std::exception) -{ -} - -OUString HtmlFilterDetect_getImplementationName() -{ - return OUString( "com.sun.star.comp.filters.HtmlFilterDetect" ); -} - -Sequence HtmlFilterDetect_getSupportedServiceNames() -{ - Sequence aRet(2); - OUString* pArray = aRet.getArray(); - pArray[0] = "com.sun.star.document.ExtendedTypeDetection"; - pArray[1] = "com.sun.star.comp.filters.HtmlFilterDetect"; - return aRet; -} - -Reference HtmlFilterDetect_createInstance(const Reference& rCtx) -{ - return (cppu::OWeakObject*) new HtmlFilterDetect( rCtx ); -} - -// XServiceInfo - -OUString SAL_CALL HtmlFilterDetect::getImplementationName() - throw (RuntimeException, std::exception) -{ - return HtmlFilterDetect_getImplementationName(); -} - -sal_Bool SAL_CALL HtmlFilterDetect::supportsService(const OUString& rServiceName) - throw (RuntimeException, std::exception) -{ - return cppu::supportsService( this, rServiceName ); -} - -Sequence SAL_CALL HtmlFilterDetect::getSupportedServiceNames() - throw (RuntimeException, std::exception) -{ - return HtmlFilterDetect_getSupportedServiceNames(); -} - -/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/filterdetect.hxx b/filter/source/htmlfilterdetect/filterdetect.hxx deleted file mode 100644 index f8327af28f4e..000000000000 --- a/filter/source/htmlfilterdetect/filterdetect.hxx +++ /dev/null @@ -1,64 +0,0 @@ -/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -/* - * This file is part of the LibreOffice project. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ - -#ifndef INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX -#define INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX - -#include -#include -#include -#include - -#include - -class HtmlFilterDetect : public cppu::WeakImplHelper3< - com::sun::star::document::XExtendedFilterDetection, - com::sun::star::lang::XInitialization, - com::sun::star::lang::XServiceInfo> -{ - com::sun::star::uno::Reference mxCtx; - -public: - - HtmlFilterDetect(const com::sun::star::uno::Reference& xCtx) : - mxCtx(xCtx) {} - virtual ~HtmlFilterDetect() {} - - // XExtendedFilterDetection - - virtual OUString SAL_CALL detect(com::sun::star::uno::Sequence& lDescriptor) - throw (com::sun::star::uno::RuntimeException, std::exception); - - // XInitialization - - virtual void SAL_CALL initialize(const ::com::sun::star::uno::Sequence& aArguments) - throw (com::sun::star::uno::Exception, com::sun::star::uno::RuntimeException, std::exception); - - // XServiceInfo - - virtual OUString SAL_CALL getImplementationName() - throw (com::sun::star::uno::RuntimeException, std::exception); - - virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName) - throw (com::sun::star::uno::RuntimeException, std::exception); - - virtual com::sun::star::uno::Sequence SAL_CALL getSupportedServiceNames() - throw (com::sun::star::uno::RuntimeException, std::exception); -}; - -OUString HtmlFilterDetect_getImplementationName(); - -com::sun::star::uno::Sequence HtmlFilterDetect_getSupportedServiceNames(); - -com::sun::star::uno::Reference -HtmlFilterDetect_createInstance(const com::sun::star::uno::Reference& rCtx); - -#endif - -/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/htmlfilterdetect/htmlfd.component b/filter/source/htmlfilterdetect/htmlfd.component deleted file mode 100644 index 32c41b8bef26..000000000000 --- a/filter/source/htmlfilterdetect/htmlfd.component +++ /dev/null @@ -1,15 +0,0 @@ - - - - - - - - diff --git a/filter/source/textfilterdetect/filterdetect.cxx b/filter/source/textfilterdetect/filterdetect.cxx index aea331f9140a..ffad7faa0282 100644 --- a/filter/source/textfilterdetect/filterdetect.cxx +++ b/filter/source/textfilterdetect/filterdetect.cxx @@ -9,32 +9,107 @@ #include "filterdetect.hxx" -#include "tools/urlobj.hxx" -#include "ucbhelper/content.hxx" +#include +#include +#include +#include +#include #include #include #include +#include #define WRITER_TEXT_FILTER "Text" #define CALC_TEXT_FILTER "Text - txt - csv (StarCalc)" +#define WEB_HTML_FILTER "HTML" +#define WRITER_HTML_FILTER "HTML (StarWriter)" +#define CALC_HTML_FILTER "calc_HTML_WebQuery" + +#define WRITER_DOCSERVICE "com.sun.star.text.TextDocument" +#define CALC_DOCSERVICE "com.sun.star.sheet.SpreadsheetDocument" + using namespace ::com::sun::star; +using utl::MediaDescriptor; namespace { -template -void setPropValue(uno::Sequence& rProps, sal_Int32 nPos, const char* pName, const T& rValue) +bool IsHTMLStream( const uno::Reference& xInStream ) { - if (nPos >= 0) - rProps[nPos].Value <<= rValue; - else + boost::scoped_ptr pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) ); + if ( !pInStream || pInStream->GetError() ) + // No stream + return false; + + // Read the stream header + pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); + const sal_Size nUniPos = pInStream->Tell(); + const sal_uInt16 nSize = 4096; + + OString sHeader; + if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode + sHeader = read_uInt8s_ToOString( *pInStream, nSize ); + else // UTF-16 (nUniPos = 2) + sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US ); + + // Now check whether the stream begins with a known HTML tag. + enum DetectPhase { BeforeTag, TagOpened, InTagName }; + DetectPhase dp = BeforeTag; + + const char* pHeader = sHeader.getStr(); + const int nLength = sHeader.getLength(); + int i = 0, nStartOfTagIndex = 0; + + for ( i = 0; i < nLength; ++i, ++pHeader ) { - sal_Int32 n = rProps.getLength(); - rProps.realloc(n+1); - rProps[n].Name = OUString::createFromAscii(pName); - rProps[n].Value <<= rValue; + char c = *pHeader; + if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' ) + { + if ( dp == TagOpened ) + return false; // Invalid: Should start with a tag name + else if ( dp == InTagName ) + break; // End of tag name reached + } + else if ( c == '<' ) + { + if ( dp == BeforeTag ) + dp = TagOpened; + else + return false; // Invalid: Nested '<' + } + else if ( c == '>' ) + { + if ( dp == InTagName ) + break; // End of tag name reached + else + return false; // Invalid: Empty tag or before '<' + } + else if ( c == '!' ) + { + if ( dp == TagOpened ) + return true; // "& lDescriptor) throw (uno::RuntimeException, std::exception) { - OUString aType; - OUString aDocService; - OUString aExt; - OUString aUrl; + MediaDescriptor aMediaDesc(lDescriptor); - sal_Int32 nFilter = -1; + OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() ); + OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString() ); + OUString aUrl = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() ); - for (sal_Int32 i = 0, n = lDescriptor.getLength(); i < n; ++i) - { - if (lDescriptor[i].Name == "TypeName") - lDescriptor[i].Value >>= aType; - else if (lDescriptor[i].Name == "FilterName") - nFilter = i; - else if (lDescriptor[i].Name == "DocumentService") - lDescriptor[i].Value >>= aDocService; - else if (lDescriptor[i].Name == "URL") - { - lDescriptor[i].Value >>= aUrl; + // Get the file name extension. + INetURLObject aParser(aUrl); + OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET); + aExt = aExt.toAsciiLowerCase(); - // Get the file name extension. - INetURLObject aParser(aUrl); - aExt = aParser.getExtension( - INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET); - aExt = aExt.toAsciiLowerCase(); - } - } - - if (aType == "generic_Text") + if (aType == "generic_HTML") { - // Generic text type. + uno::Reference xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY); + if (!xInStream.is() || !IsHTMLStream(xInStream)) + return OUString(); // Decide which filter to use based on the document service first, // then on extension if that's not available. - if (aDocService == "com.sun.star.sheet.SpreadsheetDocument") - // Open it in Calc. - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aDocService == "com.sun.star.text.TextDocument") - // Open it in Writer. - setPropValue(lDescriptor, nFilter, "FilterName", OUString(WRITER_TEXT_FILTER)); - else if (aExt == "csv") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aExt == "tsv") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aExt == "tab") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); + if (aDocService == CALC_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER); + else if (aDocService == WRITER_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_HTML_FILTER); else if (aExt == "xls") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aExt == "txt") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(WRITER_TEXT_FILTER)); + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER); else - // No clue. Open it in Writer by default. - setPropValue(lDescriptor, nFilter, "FilterName", OUString(WRITER_TEXT_FILTER)); + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WEB_HTML_FILTER); + } - return aType; + else if (aType == "generic_Text") + { + if (aDocService == CALC_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER); + else if (aDocService == WRITER_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER); + else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls") + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER); + else + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER); } - // failed! - return OUString(); + else + // Nothing to detect. + return OUString(); + + aMediaDesc >> lDescriptor; + return aType; } // XInitialization -- cgit