summaryrefslogtreecommitdiff
path: root/filter
diff options
context:
space:
mode:
author: Maxim Monastirsky <momonasmon@gmail.com> 2013-12-23 16:40:03 +0200
committer: David Tardon <dtardon@redhat.com> 2013-12-29 07:17:04 +0000
commit: bd1461e69330a5265dc8cb395cf1b554d31c4bea (patch)
tree: 8f75bb952bacb3dd96f00a1360ef6104c16e0835 /filter
parent: 7a4b8676a283d629d952ceb5c59327827838124d (diff)
Detect UTF-16 encoded XML files
This code doesn't support leading blank characters, since they are invalid before the XML declaration, and the filter doesn't handle them anyway.

Change-Id: I494e9f85351539d27577dc7df8be420c0c66570e
Reviewed-on: https://gerrit.libreoffice.org/7204
Reviewed-by: David Tardon <dtardon@redhat.com>
Tested-by: David Tardon <dtardon@redhat.com>
Diffstat (limited to 'filter')
-rw-r--r-- filter/Library_xmlfd.mk 2
-rw-r--r-- filter/source/xmlfilterdetect/filterdetect.cxx 86
2 files changed, 34 insertions, 54 deletions
diff --git a/filter/Library_xmlfd.mk b/filter/Library_xmlfd.mk
index 39f10c2ccded..b55f06ace261 100644
--- a/filter/Library_xmlfd.mk
+++ b/filter/Library_xmlfd.mk
@@ -28,6 +28,8 @@ $(eval $(call gb_Library_use_libraries,xmlfd,\
cppuhelper \
cppu \
sal \
+ utl \
+ tl \
$(gb_UWINAPI) \
))
diff --git a/filter/source/xmlfilterdetect/filterdetect.cxx b/filter/source/xmlfilterdetect/filterdetect.cxx
index 0b36b3d78776..c409d8227cc4 100644
--- a/filter/source/xmlfilterdetect/filterdetect.cxx
+++ b/filter/source/xmlfilterdetect/filterdetect.cxx
@@ -42,6 +42,8 @@
#include <com/sun/star/container/XNameAccess.hpp>
#include <com/sun/star/beans/PropertyState.hpp>
#include <ucbhelper/content.hxx>
+#include <unotools/ucbstreamhelper.hxx>
+#include <boost/scoped_ptr.hpp>
using com::sun::star::uno::Sequence;
using com::sun::star::uno::Reference;
@@ -72,57 +74,12 @@ using namespace com::sun::star::beans;
namespace {
-bool isXMLStream(const OString& aHeaderStrm)
-{
- const char* p = aHeaderStrm.getStr();
- size_t n = aHeaderStrm.getLength();
- size_t i = 0;
-
- // Skip UTF-8 BOM
- const unsigned char sBOM[] = {0xEF, 0xBB, 0xBF};
- for (i = 0; i < n; ++i, ++p)
- {
- if (i < 3 && (unsigned char)(*p) == sBOM[i])
- continue;
- else if (i == 3 || i == 0)
- break;
- else if (i > 0)
- return false;
- }
-
- n -= i;
-
- // Skip all preceding blank characters.
- for (i = 0; i < n; ++i, ++p)
- {
- char c = *p;
- if (c == ' ' || c == '\n' || c == '\t')
- continue;
- break;
- }
-
- n -= i;
-
- // First text must be '<?xml', else it's not a valid XML file stream.
- const char* sInitChars = "<?xml";
- const size_t nInitCharLen = std::strlen(sInitChars);
- for (i = 0; i < n; ++i, ++p)
- {
- if (i < nInitCharLen)
- {
- if (*p != sInitChars[i])
- return false;
- }
- }
- return true;
-}
-
-OUString supportedByType( const OUString clipBoardFormat , const OString resultString, const OUString checkType)
+OUString supportedByType( const OUString clipBoardFormat , const OUString resultString, const OUString checkType)
{
OUString sTypeName;
if ( clipBoardFormat.match("doctype:") )
{
- OString tryStr = OUStringToOString(clipBoardFormat.copy(8),RTL_TEXTENCODING_ASCII_US).getStr();
+ OUString tryStr = clipBoardFormat.copy(8);
if (resultString.indexOf(tryStr) >= 0)
{
sTypeName = checkType;
@@ -142,7 +99,7 @@ OUString SAL_CALL FilterDetect::detect( com::sun::star::uno::Sequence< com::sun:
com::sun::star::uno::Reference< com::sun::star::io::XInputStream > xInStream;
const PropertyValue * pValue = aArguments.getConstArray();
sal_Int32 nLength;
- OString resultString;
+ OUString resultString;
nLength = aArguments.getLength();
sal_Int32 location=nLength;
@@ -174,13 +131,34 @@ OUString SAL_CALL FilterDetect::detect( com::sun::star::uno::Sequence< com::sun:
return sTypeName;
}
}
- com::sun::star::uno::Sequence< sal_Int8 > aData;
- /* long nBytesToRead= */ xInStream->available();
- xInStream->skipBytes (0);
- long bytestRead =xInStream->readBytes (aData, 4000);
- resultString=OString((const sal_Char *)aData.getConstArray(),bytestRead) ;
- if (!isXMLStream(resultString))
+ ::boost::scoped_ptr< SvStream > pInStream( ::utl::UcbStreamHelper::CreateStream( xInStream ) );
+ pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
+ sal_Size nUniPos = pInStream->Tell();
+
+ const sal_uInt16 nSize = 4000;
+ bool bTryUtf16 = false;
+
+ if ( nUniPos == 0 ) // No BOM detected, try to guess UTF-16 endianness
+ {
+ sal_uInt16 sHeader = 0;
+ *pInStream >> sHeader;
+ if ( sHeader == 0x003C )
+ bTryUtf16 = true;
+ else if ( sHeader == 0x3C00 )
+ {
+ bTryUtf16 = true;
+ pInStream->SetEndianSwap( !pInStream->IsEndianSwap() );
+ }
+ pInStream->Seek( STREAM_SEEK_TO_BEGIN );
+ }
+
+ if ( nUniPos == 3 || ( nUniPos == 0 && !bTryUtf16 ) ) // UTF-8 or non-Unicode
+ resultString = OStringToOUString( read_uInt8s_ToOString( *pInStream, nSize ), RTL_TEXTENCODING_UTF8 );
+ else if ( nUniPos == 2 || bTryUtf16 ) // UTF-16
+ resultString = read_uInt16s_ToOUString( *pInStream, nSize );
+
+ if ( !resultString.startsWith( "<?xml" ) )
// This is not an XML stream. It makes no sense to try to detect
// a non-XML file type here.
return OUString();