diff options
author | Eike Rathke <erack@redhat.com> | 2012-04-14 18:57:31 +0200 |
---|---|---|
committer | Eike Rathke <erack@redhat.com> | 2012-04-14 18:57:31 +0200 |
commit | 7928b651965f747b02593d2a9fc73fac7c86dbf5 (patch) | |
tree | 079cabd464d84456fc63a44849c402fee1ccd65b /sc/source/ui/docshell/impex.cxx | |
parent | 95cc5de63b20c5986fe8f3913da86002eabd7cb1 (diff) |
resolved fdo#48621 better handling of broken CSV files
* non-escaped (not doubled) quotes in quoted strings are regarded as broken
representation and are taken literally, only a quote followed by a separator
ends a field. If not being a separator themselves, trailing blanks between
the ending quote and the separator are ignored, complementary to leading
blanks between a separator and a quote.
* quotes in a non-quoted string are taken literally
Diffstat (limited to 'sc/source/ui/docshell/impex.cxx')
-rw-r--r-- | sc/source/ui/docshell/impex.cxx | 164 |
1 files changed, 146 insertions, 18 deletions
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx index f33a0d7f5499..cf51c0703dd7 100644 --- a/sc/source/ui/docshell/impex.cxx +++ b/sc/source/ui/docshell/impex.cxx @@ -573,6 +573,77 @@ void ScImportExport::WriteUnicodeOrByteEndl( SvStream& rStrm ) } +enum QuoteType +{ + FIELDSTART_QUOTE, + FIRST_QUOTE, + SECOND_QUOTE, + FIELDEND_QUOTE, + DONTKNOW_QUOTE +}; + + +/** Determine if *p is a quote that ends a quoted field. + + Precondition: we are parsing a quoted field already and *p is a quote. + + @return + FIELDEND_QUOTE if end of field quote + DONTKNOW_QUOTE anything else + */ +static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps ) +{ + // Due to broken CSV generators that don't double embedded quotes check if + // a field separator immediately or with trailing spaces follows the quote, + // only then end the field, or at end of string. + while (p[1] == ' ') + ++p; + if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1])) + return FIELDEND_QUOTE; + return DONTKNOW_QUOTE; +} + + +/** Determine if *p is a quote that is escaped by being doubled or ends a + quoted field. + + Precondition: *p is a quote. + + @param nQuotes + Quote characters encountered so far. + Odd (after opening quote) means either no embedded quotes or only quote + pairs so far. + Even means either not in a quoted field or already one quote + encountered, the first of a pair. + + @return + FIELDSTART_QUOTE if first quote in a field, either starting content or + embedded so caller should check beforehand. + FIRST_QUOTE if first of a doubled quote + SECOND_QUOTE if second of a doubled quote + FIELDEND_QUOTE if end of field quote + DONTKNOW_QUOTE if an unescaped quote we don't consider as end of field, + do not increment nQuotes in caller then! + */ +static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p, + const sal_Unicode* pSeps, sal_Unicode cStr ) +{ + if ((nQuotes % 2) == 0) + { + if (p[-1] == cStr) + return SECOND_QUOTE; + else + { + SAL_WARN( "sc", "lcl_isEscapedOrFieldEndQuote: really want a FIELDSTART_QUOTE?"); + return FIELDSTART_QUOTE; + } + } + if (p[1] == cStr) + return FIRST_QUOTE; + return lcl_isFieldEndQuote( p, pSeps); +} + + /** Append characters of [p1,p2) to rField. @returns TRUE if ok; FALSE if data overflow, truncated @@ -606,7 +677,7 @@ enum DoubledQuoteMode }; static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString, - sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell ) + const sal_Unicode* pSeps, sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell ) { p++; //! jump over opening quote bool bCont; @@ -621,7 +692,18 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString, if( *p == cStr ) { if ( *++p != cStr ) - break; + { + // break or continue for loop + if (eMode == DQM_ESCAPE) + { + if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE) + break; + else + continue; + } + else + break; + } // doubled quote char switch ( eMode ) { @@ -815,6 +897,10 @@ bool ScImportExport::Text2Doc( SvStream& rStrm ) { bool bOk = true; + sal_Unicode pSeps[2]; + pSeps[0] = cSep; + pSeps[1] = 0; + SCCOL nStartCol = aRange.aStart.Col(); SCROW nStartRow = aRange.aStart.Row(); SCCOL nEndCol = aRange.aEnd.Col(); @@ -843,7 +929,7 @@ bool ScImportExport::Text2Doc( SvStream& rStrm ) aCell.Erase(); if( *p == cStr ) { - p = lcl_ScanString( p, aCell, cStr, DQM_KEEP, bOverflowCell ); + p = lcl_ScanString( p, aCell, pSeps, cStr, DQM_KEEP, bOverflowCell ); while( *p && *p != cSep ) p++; if( *p ) @@ -1277,7 +1363,7 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm ) for( ;; ) { aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr); - if ( rStrm.IsEof() ) + if ( rStrm.IsEof() && aLine.isEmpty() ) break; sal_Int32 nLineLen = aLine.getLength(); @@ -1445,7 +1531,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p { rbIsQuoted = true; const sal_Unicode* p1; - p1 = p = lcl_ScanString( p, rField, cStr, DQM_ESCAPE, rbOverflowCell ); + p1 = p = lcl_ScanString( p, rField, pSeps, cStr, DQM_ESCAPE, rbOverflowCell ); while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) ) p++; // Append remaining unquoted and undelimited data (dirty, dirty) to @@ -2212,9 +2298,8 @@ inline const sal_Unicode* lcl_UnicodeStrChr( const sal_Unicode* pStr, return 0; } -rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak, - const String& rFieldSeparators, sal_Unicode cFieldQuote, - bool bAllowBackslashEscape) +rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak, + const String& rFieldSeparators, sal_Unicode cFieldQuote ) { rtl::OUString aStr; rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit); @@ -2226,11 +2311,13 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak, // See if the separator(s) include tab. bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL; + QuoteType eQuoteState = FIELDEND_QUOTE; + bool bFieldStart = true; + sal_Int32 nLastOffset = 0; sal_Int32 nQuotes = 0; while (!rStream.IsEof() && aStr.getLength() < nArbitraryLineLengthLimit) { - bool bBackslashEscaped = false; const sal_Unicode *p, *pStart; p = pStart = aStr.getStr(); p += nLastOffset; @@ -2248,25 +2335,66 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak, break; } - if (*p == cFieldQuote && !bBackslashEscaped) - ++nQuotes; - else if (bAllowBackslashEscape) + if (*p == cFieldQuote) + { + if (bFieldStart) + { + ++nQuotes; + bFieldStart = false; + eQuoteState = FIELDSTART_QUOTE; + } + // Do not detect a FIELDSTART_QUOTE if not in + // bFieldStart mode, in which case for unquoted content + // we are in FIELDEND_QUOTE state. + else if (eQuoteState != FIELDEND_QUOTE) + { + eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote); + // DONTKNOW_QUOTE is an embedded unescaped quote we + // don't count for pairing. + if (eQuoteState != DONTKNOW_QUOTE) + ++nQuotes; + } + } + else if (eQuoteState == FIELDEND_QUOTE) + { + if (bFieldStart) + // If blank is a separator it starts a field, if it + // is not and thus maybe leading before quote we + // are still at start of field regarding quotes. + bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL); + else + bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL); + } + } + else + { + if (*p == cFieldQuote && bFieldStart) + { + nQuotes = 1; + eQuoteState = FIELDSTART_QUOTE; + bFieldStart = false; + } + else if (eQuoteState == FIELDEND_QUOTE) { - if (*p == '\\') - bBackslashEscaped = !bBackslashEscaped; + // This also skips leading blanks at beginning of line + // if followed by a quote. It's debatable whether we + // actually want that or not, but congruent with what + // ScanNextFieldFromString() does. + if (bFieldStart) + bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL); else - bBackslashEscaped = false; + bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL); } } - else if (*p == cFieldQuote && (p == pStart || - lcl_UnicodeStrChr( pSeps, p[-1]))) - nQuotes = 1; // A quote character inside a field content does not start // a quote. ++p; } if (nQuotes % 2 == 0) + // We still have a (theoretical?) problem here if due to + // nArbitraryLineLengthLimit we split a string right between a + // doubled quote pair. break; else { |