diff options
-rw-r--r-- | sc/source/ui/dbgui/scuiasciiopt.cxx | 4 | ||||
-rw-r--r-- | sc/source/ui/docshell/impex.cxx | 85 | ||||
-rw-r--r-- | sc/source/ui/inc/impex.hxx | 53 | ||||
-rw-r--r-- | tools/inc/tools/stream.hxx | 54 | ||||
-rw-r--r-- | tools/source/stream/stream.cxx | 87 |
5 files changed, 138 insertions, 145 deletions
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx index fdb1fa88121e..834d0cc17f9e 100644 --- a/sc/source/ui/dbgui/scuiasciiopt.cxx +++ b/sc/source/ui/dbgui/scuiasciiopt.cxx @@ -476,7 +476,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, String &rText ) bRet = false; break; } - mpDatStream->ReadCsvLine( rText, !bFixed, maFieldSeparators, + ReadCsvLine(*mpDatStream, rText, !bFixed, maFieldSeparators, mcTextSep); mnStreamPos = mpDatStream->Tell(); mpRowPosArray[++mnRowPosCount] = mnStreamPos; @@ -494,7 +494,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, String &rText ) else { Seek( mpRowPosArray[nLine]); - mpDatStream->ReadCsvLine( rText, !bFixed, maFieldSeparators, mcTextSep); + ReadCsvLine(*mpDatStream, rText, !bFixed, maFieldSeparators, mcTextSep); mnStreamPos = mpDatStream->Tell(); } diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx index a10ab3906e1a..b7922b3840e7 100644 --- a/sc/source/ui/docshell/impex.cxx +++ b/sc/source/ui/docshell/impex.cxx @@ -1180,7 +1180,7 @@ sal_Bool ScImportExport::ExtText2Doc( SvStream& rStrm ) while(--nSkipLines>0) { - rStrm.ReadCsvLine( aLine, !bFixed, rSeps, cStr); // content is ignored + ReadCsvLine(rStrm, aLine, !bFixed, rSeps, cStr); // content is ignored if ( rStrm.IsEof() ) break; } @@ -1203,7 +1203,7 @@ sal_Bool ScImportExport::ExtText2Doc( SvStream& rStrm ) { for( ;; ) { - rStrm.ReadCsvLine( aLine, !bFixed, rSeps, cStr); + ReadCsvLine(rStrm, aLine, !bFixed, rSeps, cStr); if ( rStrm.IsEof() ) break; @@ -2109,4 +2109,85 @@ ScFormatFilterPlugin &ScFormatFilter::Get() return *plugin; } +// Precondition: pStr is guaranteed to be non-NULL and points to a 0-terminated +// array. +inline const sal_Unicode* lcl_UnicodeStrChr( const sal_Unicode* pStr, + sal_Unicode c ) +{ + while (*pStr) + { + if (*pStr == c) + return pStr; + ++pStr; + } + return 0; +} + +void ReadCsvLine(SvStream &rStream, String& rStr, sal_Bool bEmbeddedLineBreak, + const String& rFieldSeparators, sal_Unicode cFieldQuote, + sal_Bool bAllowBackslashEscape) +{ + rStream.ReadUniOrByteStringLine(rStr, rStream.GetStreamCharSet()); + + if (bEmbeddedLineBreak) + { + const sal_Unicode* pSeps = rFieldSeparators.GetBuffer(); + + // See if the separator(s) include tab. + bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL; + + xub_StrLen nLastOffset = 0; + xub_StrLen nQuotes = 0; + while (!rStream.IsEof() && rStr.Len() < STRING_MAXLEN) + { + bool bBackslashEscaped = false; + const sal_Unicode *p, *pStart; + p = pStart = rStr.GetBuffer(); + p += nLastOffset; + while (*p) + { + if (nQuotes) + { + if (bTabSep && *p == '\t' && (nQuotes % 2) != 0) + { + // When tab-delimited, tab char ends quoted sequence + // even if we haven't reached the end quote. Doing + // this helps keep mal-formed rows from damaging + // other, well-formed rows. + nQuotes = 0; + break; + } + + if (*p == cFieldQuote && !bBackslashEscaped) + ++nQuotes; + else if (bAllowBackslashEscape) + { + if (*p == '\\') + bBackslashEscaped = !bBackslashEscaped; + else + bBackslashEscaped = false; + } + } + else if (*p == cFieldQuote && (p == pStart || + lcl_UnicodeStrChr( pSeps, p[-1]))) + nQuotes = 1; + // A quote character inside a field content does not start + // a quote. + ++p; + } + + if (nQuotes % 2 == 0) + break; + else + { + nLastOffset = rStr.Len(); + String aNext; + rStream.ReadUniOrByteStringLine(aNext, rStream.GetStreamCharSet()); + rStr += sal_Unicode(_LF); + rStr += aNext; + } + } + } +} + /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx index 302cf4e5c627..8ccfdd96403e 100644 --- a/sc/source/ui/inc/impex.hxx +++ b/sc/source/ui/inc/impex.hxx @@ -189,6 +189,59 @@ public: } }; +/** Read a CSV (comma separated values) data line using + ReadUniOrByteStringLine(). + + @param bEmbeddedLineBreak + If sal_True and a line-break occurs inside a field of data, + a line feed LF '\n' and the next line are appended. Repeats + until a line-break is not in a field. A field is determined + by delimiting rFieldSeparators and optionally surrounded by + a pair of cFieldQuote characters. For a line-break to be + within a field, the field content MUST be surrounded by + cFieldQuote characters, and the opening cFieldQuote MUST be + at the very start of a line or follow right behind a field + separator with no extra characters in between. Anything, + including field separators and escaped quotes (by doubling + them, or preceding them with a backslash if + bAllowBackslashEscape==sal_True) may appear in a quoted + field. + + If bEmbeddedLineBreak==sal_False, nothing is parsed and the + string returned is simply one ReadUniOrByteStringLine(). + + @param rFieldSeparators + A list of characters that each may act as a field separator. + + @param cFieldQuote + The quote character used. + + @param bAllowBackslashEscape + If sal_True, an embedded quote character inside a quoted + field may also be escaped with a preceding backslash. + Normally, quotes are escaped by doubling them. + + check Stream::good() to detect IO problems during read + + @ATTENTION + Note that the string returned may be truncated even inside + a quoted field if STRING_MAXLEN was reached. There + currently is no way to exactly determine the conditions, + whether this was at a line end, or whether open quotes + would have closed the field before the line end, as even a + ReadUniOrByteStringLine() may return prematurely but the + stream was positioned ahead until the real end of line. + Additionally, due to character encoding conversions, string + length and bytes read don't necessarily match, and + resyncing to a previous position matching the string's + length isn't always possible. As a result, a logical line + with embedded line breaks and more than STRING_MAXLEN + characters will be spoiled, and a subsequent ReadCsvLine() + may start under false preconditions. + */ +SC_DLLPUBLIC void ReadCsvLine(SvStream &rStream, String& rStr, sal_Bool bEmbeddedLineBreak, + const String& rFieldSeparators, sal_Unicode cFieldQuote, + sal_Bool bAllowBackslashEscape = sal_False); #endif diff --git a/tools/inc/tools/stream.hxx b/tools/inc/tools/stream.hxx index 85d5f124adc9..4f06610e2f00 100644 --- a/tools/inc/tools/stream.hxx +++ b/tools/inc/tools/stream.hxx @@ -432,60 +432,6 @@ public: sal_Bool WriteUniOrByteChar( sal_Unicode ch ) { return WriteUniOrByteChar( ch, GetStreamCharSet() ); } - /** Read a CSV (comma separated values) data line using - ReadUniOrByteStringLine(). - - @param bEmbeddedLineBreak - If sal_True and a line-break occurs inside a field of data, - a line feed LF '\n' and the next line are appended. Repeats - until a line-break is not in a field. A field is determined - by delimiting rFieldSeparators and optionally surrounded by - a pair of cFieldQuote characters. For a line-break to be - within a field, the field content MUST be surrounded by - cFieldQuote characters, and the opening cFieldQuote MUST be - at the very start of a line or follow right behind a field - separator with no extra characters in between. Anything, - including field separators and escaped quotes (by doubling - them, or preceding them with a backslash if - bAllowBackslashEscape==sal_True) may appear in a quoted - field. - - If bEmbeddedLineBreak==sal_False, nothing is parsed and the - string returned is simply one ReadUniOrByteStringLine(). - - @param rFieldSeparators - A list of characters that each may act as a field separator. - - @param cFieldQuote - The quote character used. - - @param bAllowBackslashEscape - If sal_True, an embedded quote character inside a quoted - field may also be escaped with a preceding backslash. - Normally, quotes are escaped by doubling them. - - check Stream::good() to detect IO problems during read - - @ATTENTION - Note that the string returned may be truncated even inside - a quoted field if STRING_MAXLEN was reached. There - currently is no way to exactly determine the conditions, - whether this was at a line end, or whether open quotes - would have closed the field before the line end, as even a - ReadUniOrByteStringLine() may return prematurely but the - stream was positioned ahead until the real end of line. - Additionally, due to character encoding conversions, string - length and bytes read don't necessarily match, and - resyncing to a previous position matching the string's - length isn't always possible. As a result, a logical line - with embedded line breaks and more than STRING_MAXLEN - characters will be spoiled, and a subsequent ReadCsvLine() - may start under false preconditions. - */ - void ReadCsvLine( String& rStr, sal_Bool bEmbeddedLineBreak, - const String& rFieldSeparators, sal_Unicode cFieldQuote, - sal_Bool bAllowBackslashEscape = sal_False); - void SetBufferSize( sal_uInt16 nBufSize ); sal_uInt16 GetBufferSize() const { return nBufSize; } diff --git a/tools/source/stream/stream.cxx b/tools/source/stream/stream.cxx index 8c0efb66c6f2..fea6025e5263 100644 --- a/tools/source/stream/stream.cxx +++ b/tools/source/stream/stream.cxx @@ -1019,93 +1019,6 @@ sal_Bool SvStream::StartReadingUnicodeText( rtl_TextEncoding eReadBomCharSet ) /************************************************************************* |* -|* Stream::ReadCsvLine() -|* -*************************************************************************/ - -// Precondition: pStr is guaranteed to be non-NULL and points to a 0-terminated -// array. -inline const sal_Unicode* lcl_UnicodeStrChr( const sal_Unicode* pStr, - sal_Unicode c ) -{ - while (*pStr) - { - if (*pStr == c) - return pStr; - ++pStr; - } - return 0; -} - -void SvStream::ReadCsvLine( String& rStr, sal_Bool bEmbeddedLineBreak, - const String& rFieldSeparators, sal_Unicode cFieldQuote, - sal_Bool bAllowBackslashEscape) -{ - ReadUniOrByteStringLine(rStr, GetStreamCharSet()); - - if (bEmbeddedLineBreak) - { - const sal_Unicode* pSeps = rFieldSeparators.GetBuffer(); - - // See if the separator(s) include tab. - bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL; - - xub_StrLen nLastOffset = 0; - xub_StrLen nQuotes = 0; - while (!IsEof() && rStr.Len() < STRING_MAXLEN) - { - bool bBackslashEscaped = false; - const sal_Unicode *p, *pStart; - p = pStart = rStr.GetBuffer(); - p += nLastOffset; - while (*p) - { - if (nQuotes) - { - if (bTabSep && *p == '\t' && (nQuotes % 2) != 0) - { - // When tab-delimited, tab char ends quoted sequence - // even if we haven't reached the end quote. Doing - // this helps keep mal-formed rows from damaging - // other, well-formed rows. - nQuotes = 0; - break; - } - - if (*p == cFieldQuote && !bBackslashEscaped) - ++nQuotes; - else if (bAllowBackslashEscape) - { - if (*p == '\\') - bBackslashEscaped = !bBackslashEscaped; - else - bBackslashEscaped = false; - } - } - else if (*p == cFieldQuote && (p == pStart || - lcl_UnicodeStrChr( pSeps, p[-1]))) - nQuotes = 1; - // A quote character inside a field content does not start - // a quote. - ++p; - } - - if (nQuotes % 2 == 0) - break; - else - { - nLastOffset = rStr.Len(); - String aNext; - ReadUniOrByteStringLine(aNext, GetStreamCharSet()); - rStr += sal_Unicode(_LF); - rStr += aNext; - } - } - } -} - -/************************************************************************* -|* |* Stream::SeekRel() |* *************************************************************************/ |