summaryrefslogtreecommitdiff
path: root/sc
diff options
context:
space:
mode:
author: Eike Rathke <erack@redhat.com>, 2018-07-02 14:41:59 +0200
committer: Eike Rathke <erack@redhat.com>, 2018-07-02 16:23:24 +0200
commit: c807e7ea7a0725a4d8375eda07d6f70870e0d50a (patch)
tree: c498d4b4052ad0991f55f1bdf7bc4830b1e72204 /sc
parent: 77f81dabfd75ef756f6ed7ba9086db19a58984c9 (diff)
Resolves: tdf#56910 detect a Space (blank) separator if not selected
When populating the CSV import dialog for the first time, attempt to detect a possible space (blank) separator if the field separators don't include it already. This can be necessary because of the "accept broken misquoted CSV fields" feature, which tries to ignore trailing blanks after a quoted field and, if no separator follows, continues to add content to the field, assuming the single double quote was in error. If this blank separator is detected, it is added to the field separators and the line and subsequent lines are reread with the new separators. Change-Id: I3c6d74ce8883f1d279a810e800e54b349d85ac71 Reviewed-on: https://gerrit.libreoffice.org/56810 Reviewed-by: Eike Rathke <erack@redhat.com> Tested-by: Jenkins
Diffstat (limited to 'sc')
-rw-r--r-- sc/source/ui/dbgui/scuiasciiopt.cxx 30
-rw-r--r-- sc/source/ui/docshell/impex.cxx 54
-rw-r--r-- sc/source/ui/inc/impex.hxx 13
-rw-r--r-- sc/source/ui/inc/scuiasciiopt.hxx 3
4 files changed, 83 insertions, 17 deletions
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx
index b885e9b9c7ec..aeb718be4d08 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -288,7 +288,8 @@ ScImportAsciiDlg::ScImportAsciiDlg( vcl::Window* pParent, const OUString& aDatNa
aColumnUser ( ScResId( SCSTR_COLUMN_USER ) ),
aTextSepList(SCSTR_TEXTSEP),
mcTextSep ( ScAsciiOptions::cDefaultTextSep ),
- meCall(eCall)
+ meCall(eCall),
+ mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS)
{
get(pFtCharSet, "textcharset");
get(pLbCharSet, "charset");
@@ -558,7 +559,7 @@ void ScImportAsciiDlg::dispose()
ModalDialog::dispose();
}
-bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText )
+bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& rcDetectSep )
{
if (nLine >= ASCIIDLG_MAXROWS || !mpDatStream)
return false;
@@ -591,7 +592,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText )
break;
}
rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators,
- mcTextSep);
+ mcTextSep, rcDetectSep);
mnStreamPos = mpDatStream->Tell();
mpRowPosArray[++mnRowPosCount] = mnStreamPos;
} while (nLine >= mnRowPosCount && mpDatStream->good());
@@ -606,7 +607,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText )
else
{
Seek( mpRowPosArray[nLine]);
- rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep);
+ rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep);
mnStreamPos = mpDatStream->Tell();
}
@@ -805,6 +806,12 @@ IMPL_LINK( ScImportAsciiDlg, LbColTypeHdl, ListBox&, rListBox, void )
IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
{
+ // Checking the separator can only be done once for the very first time
+ // when the dialog wasn't already presented to the user.
+ // As a side effect this has the benefit that the check is only done on the
+ // first set of visible lines.
+ sal_Unicode cDetectSep = (mbDetectSpaceSep && !pRbFixed->IsChecked() && !pCkbSpace->IsChecked() ? 0 : 0xffff);
+
sal_Int32 nBaseLine = mpTableBox->GetFirstVisLine();
sal_Int32 nRead = mpTableBox->GetVisLineCount();
// If mnRowPosCount==0, this is an initializing call, read ahead for row
@@ -817,12 +824,25 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
sal_Int32 i;
for (i = 0; i < nRead; i++)
{
- if (!GetLine( nBaseLine + i, maPreviewLine[i]))
+ if (!GetLine( nBaseLine + i, maPreviewLine[i], cDetectSep))
break;
}
for (; i < CSV_PREVIEW_LINES; i++)
maPreviewLine[i].clear();
+ if (mbDetectSpaceSep)
+ {
+ mbDetectSpaceSep = false;
+ if (cDetectSep == ' ')
+ {
+ // Expect space to be appended by now so all subsequent
+ // GetLine()/ReadCsvLine() actually used it.
+ assert(maFieldSeparators.endsWith(" "));
+ // Preselect Space in UI.
+ pCkbSpace->Check();
+ }
+ }
+
mpTableBox->Execute( CSVCMD_SETLINECOUNT, mnRowPosCount);
bool bMergeSep = pCkbAsOnce->IsChecked();
bool bRemoveSpace = pCkbRemoveSpace->IsChecked();
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 3b3068764f24..854bc92b9635 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -564,7 +564,7 @@ enum QuoteType
FIELDEND_QUOTE if end of field quote
DONTKNOW_QUOTE anything else
*/
-static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps )
+static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps, sal_Unicode& rcDetectSep )
{
// Due to broken CSV generators that don't double embedded quotes check if
// a field separator immediately or with trailing spaces follows the quote,
@@ -572,6 +572,10 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
const sal_Unicode cBlank = ' ';
if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
return FIELDEND_QUOTE;
+ // Detect a possible blank separator if it's not already in the list (which
+ // was checked right above for p[1]==cBlank).
+ if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank)
+ rcDetectSep = cBlank;
while (p[1] == cBlank)
++p;
if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
@@ -601,7 +605,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
do not increment nQuotes in caller then!
*/
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
- const sal_Unicode* pSeps, sal_Unicode cStr )
+ const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
{
if ((nQuotes % 2) == 0)
{
@@ -615,7 +619,7 @@ static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unic
}
if (p[1] == cStr)
return FIRST_QUOTE;
- return lcl_isFieldEndQuote( p, pSeps);
+ return lcl_isFieldEndQuote( p, pSeps, rcDetectSep);
}
/** Append characters of [p1,p2) to rField.
@@ -664,7 +668,8 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rStrin
// break or continue for loop
if (eMode == DoubledQuoteMode::ESCAPE)
{
- if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE)
+ sal_Unicode cDetectSep = 0xffff; // No separator detection here.
+ if (lcl_isFieldEndQuote( p-1, pSeps, cDetectSep) == FIELDEND_QUOTE)
break;
else
continue;
@@ -1299,8 +1304,8 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
SCTAB nTab = aRange.aStart.Tab();
bool bFixed = pExtOptions->IsFixedLen();
- const OUString& rSeps = pExtOptions->GetFieldSeps();
- const sal_Unicode* pSeps = rSeps.getStr();
+ OUString aSeps = pExtOptions->GetFieldSeps(); // Need non-const for ReadCsvLine(),
+ const sal_Unicode* pSeps = aSeps.getStr(); // but it will be const anyway (asserted below).
bool bMerge = pExtOptions->IsMergeSeps();
bool bRemoveSpace = pExtOptions->IsRemoveSpace();
sal_uInt16 nInfoCount = pExtOptions->GetInfoCount();
@@ -1336,10 +1341,11 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
OUString aCell;
sal_uInt16 i;
SCROW nRow = nStartRow;
+ sal_Unicode cDetectSep = 0xffff; // No separator detection here.
while(--nSkipLines>0)
{
- aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr); // content is ignored
+ aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep); // content is ignored
if ( rStrm.eof() )
break;
}
@@ -1362,10 +1368,12 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
{
for( ;; )
{
- aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr);
+ aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep);
if ( rStrm.eof() && aLine.isEmpty() )
break;
+ assert(pSeps == aSeps.getStr());
+
if ( nRow > MAXROW )
{
bOverflowRow = true; // display warning on import
@@ -2380,8 +2388,26 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr )
}
OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
- const OUString& rFieldSeparators, sal_Unicode cFieldQuote )
+ OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep )
{
+ enum RetryState
+ {
+ FORBID,
+ ALLOW,
+ RETRY,
+ RETRIED
+ } eRetryState = (bEmbeddedLineBreak && rcDetectSep == 0 ? RetryState::ALLOW : RetryState::FORBID);
+
+ sal_uInt64 nStreamPos = (eRetryState == RetryState::ALLOW ? rStream.Tell() : 0);
+
+Label_RetryWithNewSep:
+
+ if (eRetryState == RetryState::RETRY)
+ {
+ eRetryState = RetryState::RETRIED;
+ rStream.Seek( nStreamPos);
+ }
+
OUString aStr;
rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
@@ -2416,7 +2442,15 @@ OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
// we are in FIELDEND_QUOTE state.
else if (eQuoteState != FIELDEND_QUOTE)
{
- eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote);
+ eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep);
+
+ if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ')
+ {
+ eRetryState = RetryState::RETRY;
+ rFieldSeparators += OUString(' ');
+ goto Label_RetryWithNewSep;
+ }
+
// DONTKNOW_QUOTE is an embedded unescaped quote we
// don't count for pairing.
if (eQuoteState != DONTKNOW_QUOTE)
diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx
index 152ae2da98ca..e297c1b7498a 100644
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -175,10 +175,21 @@ public:
@param rFieldSeparators
A list of characters that each may act as a field separator.
+ If rcDetectSep was 0 and a separator is detected then it is appended to
+ rFieldSeparators.
@param cFieldQuote
The quote character used.
+ @param rcDetectSep
+ If 0 then attempt to detect a possible space (blank) separator if
+ rFieldSeparators doesn't include it already. This can be necessary because
+ of the "accept broken misquoted CSV fields" feature that tries to ignore
+ trailing blanks after a quoted field and if no separator follows continues
+ to add content to the field assuming the single double quote was in error.
+ If this blank separator is detected it is added to rFieldSeparators and the
+ line is reread with the new separators.
+
check Stream::good() to detect IO problems during read
@ATTENTION
@@ -199,7 +210,7 @@ public:
*/
SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
- const OUString& rFieldSeparators, sal_Unicode cFieldQuote );
+ OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep );
#endif
diff --git a/sc/source/ui/inc/scuiasciiopt.hxx b/sc/source/ui/inc/scuiasciiopt.hxx
index 5a6ea8a1a0c7..c65fb81c3cec 100644
--- a/sc/source/ui/inc/scuiasciiopt.hxx
+++ b/sc/source/ui/inc/scuiasciiopt.hxx
@@ -89,6 +89,7 @@ class ScImportAsciiDlg : public ModalDialog
rtl_TextEncoding meCharSet; /// Selected char set.
bool mbCharSetSystem; /// Is System char set selected?
ScImportAsciiCall meCall; /// How the dialog is called (see asciiopt.hxx)
+ bool mbDetectSpaceSep; /// Whether to detect a possible space separator.
public:
ScImportAsciiDlg(
@@ -111,7 +112,7 @@ private:
/** Enables or disables all separator checkboxes and edit fields. */
void SetupSeparatorCtrls();
- bool GetLine( sal_uLong nLine, OUString &rText );
+ bool GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& rcDetectSep );
void UpdateVertical();
inline bool Seek( sal_uLong nPos ); // synced to and from mnStreamPos