diff options
Diffstat (limited to 'filter/source/textfilterdetect/filterdetect.cxx')
-rw-r--r-- | filter/source/textfilterdetect/filterdetect.cxx | 178 |
1 files changed, 121 insertions, 57 deletions
diff --git a/filter/source/textfilterdetect/filterdetect.cxx b/filter/source/textfilterdetect/filterdetect.cxx index aea331f9140a..ffad7faa0282 100644 --- a/filter/source/textfilterdetect/filterdetect.cxx +++ b/filter/source/textfilterdetect/filterdetect.cxx @@ -9,32 +9,107 @@ #include "filterdetect.hxx" -#include "tools/urlobj.hxx" -#include "ucbhelper/content.hxx" +#include <svtools/htmltokn.h> +#include <tools/urlobj.hxx> +#include <ucbhelper/content.hxx> +#include <unotools/mediadescriptor.hxx> +#include <unotools/ucbstreamhelper.hxx> #include <com/sun/star/lang/XMultiServiceFactory.hpp> #include <com/sun/star/io/XInputStream.hpp> #include <cppuhelper/supportsservice.hxx> +#include <boost/scoped_ptr.hpp> #define WRITER_TEXT_FILTER "Text" #define CALC_TEXT_FILTER "Text - txt - csv (StarCalc)" +#define WEB_HTML_FILTER "HTML" +#define WRITER_HTML_FILTER "HTML (StarWriter)" +#define CALC_HTML_FILTER "calc_HTML_WebQuery" + +#define WRITER_DOCSERVICE "com.sun.star.text.TextDocument" +#define CALC_DOCSERVICE "com.sun.star.sheet.SpreadsheetDocument" + using namespace ::com::sun::star; +using utl::MediaDescriptor; namespace { -template<typename T> -void setPropValue(uno::Sequence<beans::PropertyValue>& rProps, sal_Int32 nPos, const char* pName, const T& rValue) +bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream ) { - if (nPos >= 0) - rProps[nPos].Value <<= rValue; - else + boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) ); + if ( !pInStream || pInStream->GetError() ) + // No stream + return false; + + // Read the stream header + pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); + const sal_Size nUniPos = pInStream->Tell(); + const sal_uInt16 nSize = 4096; + + OString sHeader; + if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode + sHeader = read_uInt8s_ToOString( *pInStream, nSize ); + else // UTF-16 (nUniPos = 2) + sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US ); + + // Now check whether the stream begins with a known HTML tag. + enum DetectPhase { BeforeTag, TagOpened, InTagName }; + DetectPhase dp = BeforeTag; + + const char* pHeader = sHeader.getStr(); + const int nLength = sHeader.getLength(); + int i = 0, nStartOfTagIndex = 0; + + for ( i = 0; i < nLength; ++i, ++pHeader ) { - sal_Int32 n = rProps.getLength(); - rProps.realloc(n+1); - rProps[n].Name = OUString::createFromAscii(pName); - rProps[n].Value <<= rValue; + char c = *pHeader; + if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' ) + { + if ( dp == TagOpened ) + return false; // Invalid: Should start with a tag name + else if ( dp == InTagName ) + break; // End of tag name reached + } + else if ( c == '<' ) + { + if ( dp == BeforeTag ) + dp = TagOpened; + else + return false; // Invalid: Nested '<' + } + else if ( c == '>' ) + { + if ( dp == InTagName ) + break; // End of tag name reached + else + return false; // Invalid: Empty tag or before '<' + } + else if ( c == '!' ) + { + if ( dp == TagOpened ) + return true; // "<!" - DOCTYPE or comments block + else + return false; // Invalid: '!' before '<' or inside tag name + } + else + { + if ( dp == BeforeTag ) + return false; // Invalid: Should start with a tag + else if ( dp == TagOpened ) + { + nStartOfTagIndex = i; + dp = InTagName; + } + } } + + // The string following '<' has to be a known HTML token. + OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex ); + if ( GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != 0 ) + return true; + + return false; } } @@ -46,65 +121,54 @@ PlainTextFilterDetect::~PlainTextFilterDetect() {} OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor) throw (uno::RuntimeException, std::exception) { - OUString aType; - OUString aDocService; - OUString aExt; - OUString aUrl; + MediaDescriptor aMediaDesc(lDescriptor); - sal_Int32 nFilter = -1; + OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() ); + OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString() ); + OUString aUrl = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() ); - for (sal_Int32 i = 0, n = lDescriptor.getLength(); i < n; ++i) - { - if (lDescriptor[i].Name == "TypeName") - lDescriptor[i].Value >>= aType; - else if (lDescriptor[i].Name == "FilterName") - nFilter = i; - else if (lDescriptor[i].Name == "DocumentService") - lDescriptor[i].Value >>= aDocService; - else if (lDescriptor[i].Name == "URL") - { - lDescriptor[i].Value >>= aUrl; + // Get the file name extension. + INetURLObject aParser(aUrl); + OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET); + aExt = aExt.toAsciiLowerCase(); - // Get the file name extension. - INetURLObject aParser(aUrl); - aExt = aParser.getExtension( - INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET); - aExt = aExt.toAsciiLowerCase(); - } - } - - if (aType == "generic_Text") + if (aType == "generic_HTML") { - // Generic text type. + uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY); + if (!xInStream.is() || !IsHTMLStream(xInStream)) + return OUString(); // Decide which filter to use based on the document service first, // then on extension if that's not available. - if (aDocService == "com.sun.star.sheet.SpreadsheetDocument") - // Open it in Calc. - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aDocService == "com.sun.star.text.TextDocument") - // Open it in Writer. - setPropValue(lDescriptor, nFilter, "FilterName", OUString(WRITER_TEXT_FILTER)); - else if (aExt == "csv") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aExt == "tsv") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aExt == "tab") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); + if (aDocService == CALC_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER); + else if (aDocService == WRITER_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_HTML_FILTER); else if (aExt == "xls") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(CALC_TEXT_FILTER)); - else if (aExt == "txt") - setPropValue(lDescriptor, nFilter, "FilterName", OUString(WRITER_TEXT_FILTER)); + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER); else - // No clue. Open it in Writer by default. - setPropValue(lDescriptor, nFilter, "FilterName", OUString(WRITER_TEXT_FILTER)); + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WEB_HTML_FILTER); + } - return aType; + else if (aType == "generic_Text") + { + if (aDocService == CALC_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER); + else if (aDocService == WRITER_DOCSERVICE) + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER); + else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls") + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER); + else + aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER); } - // failed! - return OUString(); + else + // Nothing to detect. + return OUString(); + + aMediaDesc >> lDescriptor; + return aType; } // XInitialization |