diff options
author | Eike Rathke <erack@redhat.com> | 2018-07-02 14:41:59 +0200 |
---|---|---|
committer | Eike Rathke <erack@redhat.com> | 2018-07-02 16:23:24 +0200 |
commit | c807e7ea7a0725a4d8375eda07d6f70870e0d50a (patch) | |
tree | c498d4b4052ad0991f55f1bdf7bc4830b1e72204 /sc | |
parent | 77f81dabfd75ef756f6ed7ba9086db19a58984c9 (diff) |
Resolves: tdf#56910 detect a Space (blank) separator if not selected
On populating the CSV import dialog for the first time, attempt to
detect a possible space (blank) separator if the field separators
don't include it already. This can be necessary because of the
"accept broken misquoted CSV fields" feature, which tries to ignore
trailing blanks after a quoted field and, if no separator follows,
continues to add content to the field, assuming the single double
quote was in error. If this blank separator is detected, it is
added to the field separators, and the line and subsequent lines are
reread with the new separators.
Change-Id: I3c6d74ce8883f1d279a810e800e54b349d85ac71
Reviewed-on: https://gerrit.libreoffice.org/56810
Reviewed-by: Eike Rathke <erack@redhat.com>
Tested-by: Jenkins
Diffstat (limited to 'sc')
-rw-r--r-- | sc/source/ui/dbgui/scuiasciiopt.cxx | 30 | ||||
-rw-r--r-- | sc/source/ui/docshell/impex.cxx | 54 | ||||
-rw-r--r-- | sc/source/ui/inc/impex.hxx | 13 | ||||
-rw-r--r-- | sc/source/ui/inc/scuiasciiopt.hxx | 3 |
4 files changed, 83 insertions, 17 deletions
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx index b885e9b9c7ec..aeb718be4d08 100644 --- a/sc/source/ui/dbgui/scuiasciiopt.cxx +++ b/sc/source/ui/dbgui/scuiasciiopt.cxx @@ -288,7 +288,8 @@ ScImportAsciiDlg::ScImportAsciiDlg( vcl::Window* pParent, const OUString& aDatNa aColumnUser ( ScResId( SCSTR_COLUMN_USER ) ), aTextSepList(SCSTR_TEXTSEP), mcTextSep ( ScAsciiOptions::cDefaultTextSep ), - meCall(eCall) + meCall(eCall), + mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS) { get(pFtCharSet, "textcharset"); get(pLbCharSet, "charset"); @@ -558,7 +559,7 @@ void ScImportAsciiDlg::dispose() ModalDialog::dispose(); } -bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText ) +bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& rcDetectSep ) { if (nLine >= ASCIIDLG_MAXROWS || !mpDatStream) return false; @@ -591,7 +592,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText ) break; } rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, - mcTextSep); + mcTextSep, rcDetectSep); mnStreamPos = mpDatStream->Tell(); mpRowPosArray[++mnRowPosCount] = mnStreamPos; } while (nLine >= mnRowPosCount && mpDatStream->good()); @@ -606,7 +607,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText ) else { Seek( mpRowPosArray[nLine]); - rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep); + rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep); mnStreamPos = mpDatStream->Tell(); } @@ -805,6 +806,12 @@ IMPL_LINK( ScImportAsciiDlg, LbColTypeHdl, ListBox&, rListBox, void ) IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void) { + // Checking the separator can only be done once for the very first time + // when the dialog wasn't already presented to the user. + // As a side effect this has the benefit that the check is only done on the + // first set of visible lines. 
+ sal_Unicode cDetectSep = (mbDetectSpaceSep && !pRbFixed->IsChecked() && !pCkbSpace->IsChecked() ? 0 : 0xffff); + sal_Int32 nBaseLine = mpTableBox->GetFirstVisLine(); sal_Int32 nRead = mpTableBox->GetVisLineCount(); // If mnRowPosCount==0, this is an initializing call, read ahead for row @@ -817,12 +824,25 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void) sal_Int32 i; for (i = 0; i < nRead; i++) { - if (!GetLine( nBaseLine + i, maPreviewLine[i])) + if (!GetLine( nBaseLine + i, maPreviewLine[i], cDetectSep)) break; } for (; i < CSV_PREVIEW_LINES; i++) maPreviewLine[i].clear(); + if (mbDetectSpaceSep) + { + mbDetectSpaceSep = false; + if (cDetectSep == ' ') + { + // Expect space to be appended by now so all subsequent + // GetLine()/ReadCsvLine() actually used it. + assert(maFieldSeparators.endsWith(" ")); + // Preselect Space in UI. + pCkbSpace->Check(); + } + } + mpTableBox->Execute( CSVCMD_SETLINECOUNT, mnRowPosCount); bool bMergeSep = pCkbAsOnce->IsChecked(); bool bRemoveSpace = pCkbRemoveSpace->IsChecked(); diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx index 3b3068764f24..854bc92b9635 100644 --- a/sc/source/ui/docshell/impex.cxx +++ b/sc/source/ui/docshell/impex.cxx @@ -564,7 +564,7 @@ enum QuoteType FIELDEND_QUOTE if end of field quote DONTKNOW_QUOTE anything else */ -static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps ) +static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps, sal_Unicode& rcDetectSep ) { // Due to broken CSV generators that don't double embedded quotes check if // a field separator immediately or with trailing spaces follows the quote, @@ -572,6 +572,10 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p const sal_Unicode cBlank = ' '; if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank)) return FIELDEND_QUOTE; + // Detect a possible blank separator if it's not already in the list 
(which + // was checked right above for p[1]==cBlank). + if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank) + rcDetectSep = cBlank; while (p[1] == cBlank) ++p; if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1])) @@ -601,7 +605,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p do not increment nQuotes in caller then! */ static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p, - const sal_Unicode* pSeps, sal_Unicode cStr ) + const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep ) { if ((nQuotes % 2) == 0) { @@ -615,7 +619,7 @@ static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unic } if (p[1] == cStr) return FIRST_QUOTE; - return lcl_isFieldEndQuote( p, pSeps); + return lcl_isFieldEndQuote( p, pSeps, rcDetectSep); } /** Append characters of [p1,p2) to rField. @@ -664,7 +668,8 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rStrin // break or continue for loop if (eMode == DoubledQuoteMode::ESCAPE) { - if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE) + sal_Unicode cDetectSep = 0xffff; // No separator detection here. + if (lcl_isFieldEndQuote( p-1, pSeps, cDetectSep) == FIELDEND_QUOTE) break; else continue; @@ -1299,8 +1304,8 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm ) SCTAB nTab = aRange.aStart.Tab(); bool bFixed = pExtOptions->IsFixedLen(); - const OUString& rSeps = pExtOptions->GetFieldSeps(); - const sal_Unicode* pSeps = rSeps.getStr(); + OUString aSeps = pExtOptions->GetFieldSeps(); // Need non-const for ReadCsvLine(), + const sal_Unicode* pSeps = aSeps.getStr(); // but it will be const anyway (asserted below). 
bool bMerge = pExtOptions->IsMergeSeps(); bool bRemoveSpace = pExtOptions->IsRemoveSpace(); sal_uInt16 nInfoCount = pExtOptions->GetInfoCount(); @@ -1336,10 +1341,11 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm ) OUString aCell; sal_uInt16 i; SCROW nRow = nStartRow; + sal_Unicode cDetectSep = 0xffff; // No separator detection here. while(--nSkipLines>0) { - aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr); // content is ignored + aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep); // content is ignored if ( rStrm.eof() ) break; } @@ -1362,10 +1368,12 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm ) { for( ;; ) { - aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr); + aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep); if ( rStrm.eof() && aLine.isEmpty() ) break; + assert(pSeps == aSeps.getStr()); + if ( nRow > MAXROW ) { bOverflowRow = true; // display warning on import @@ -2380,8 +2388,26 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr ) } OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak, - const OUString& rFieldSeparators, sal_Unicode cFieldQuote ) + OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep ) { + enum RetryState + { + FORBID, + ALLOW, + RETRY, + RETRIED + } eRetryState = (bEmbeddedLineBreak && rcDetectSep == 0 ? RetryState::ALLOW : RetryState::FORBID); + + sal_uInt64 nStreamPos = (eRetryState == RetryState::ALLOW ? rStream.Tell() : 0); + +Label_RetryWithNewSep: + + if (eRetryState == RetryState::RETRY) + { + eRetryState = RetryState::RETRIED; + rStream.Seek( nStreamPos); + } + OUString aStr; rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit); @@ -2416,7 +2442,15 @@ OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak, // we are in FIELDEND_QUOTE state. 
else if (eQuoteState != FIELDEND_QUOTE) { - eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote); + eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep); + + if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ') + { + eRetryState = RetryState::RETRY; + rFieldSeparators += OUString(' '); + goto Label_RetryWithNewSep; + } + // DONTKNOW_QUOTE is an embedded unescaped quote we // don't count for pairing. if (eQuoteState != DONTKNOW_QUOTE) diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx index 152ae2da98ca..e297c1b7498a 100644 --- a/sc/source/ui/inc/impex.hxx +++ b/sc/source/ui/inc/impex.hxx @@ -175,10 +175,21 @@ public: @param rFieldSeparators A list of characters that each may act as a field separator. + If rcDetectSep was 0 and a separator is detected then it is appended to + rFieldSeparators. @param cFieldQuote The quote character used. + @param rcDetectSep + If 0 then attempt to detect a possible space (blank) separator if + rFieldSeparators doesn't include it already. This can be necessary because + of the "accept broken misquoted CSV fields" feature that tries to ignore + trailing blanks after a quoted field and if no separator follows continues + to add content to the field assuming the single double quote was in error. 
+ If this blank separator is detected it is added to rFieldSeparators and the + line is reread with the new separators + check Stream::good() to detect IO problems during read @ATTENTION @@ -199,7 +210,7 @@ public: */ SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak, - const OUString& rFieldSeparators, sal_Unicode cFieldQuote ); + OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep ); #endif diff --git a/sc/source/ui/inc/scuiasciiopt.hxx b/sc/source/ui/inc/scuiasciiopt.hxx index 5a6ea8a1a0c7..c65fb81c3cec 100644 --- a/sc/source/ui/inc/scuiasciiopt.hxx +++ b/sc/source/ui/inc/scuiasciiopt.hxx @@ -89,6 +89,7 @@ class ScImportAsciiDlg : public ModalDialog rtl_TextEncoding meCharSet; /// Selected char set. bool mbCharSetSystem; /// Is System char set selected? ScImportAsciiCall meCall; /// How the dialog is called (see asciiopt.hxx) + bool mbDetectSpaceSep; /// Whether to detect a possible space separator. public: ScImportAsciiDlg( @@ -111,7 +112,7 @@ private: /** Enables or disables all separator checkboxes and edit fields. */ void SetupSeparatorCtrls(); - bool GetLine( sal_uLong nLine, OUString &rText ); + bool GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& rcDetectSep ); void UpdateVertical(); inline bool Seek( sal_uLong nPos ); // synced to and from mnStreamPos |