Parse CSV lines in the reader thread.

Change-Id: I6329a0e6e6fa6576df2ed473482d558bfd6cce08
author: Kohei Yoshida <kohei.yoshida@collabora.com> 2013-12-30 12:16:35 -0500
committer: Kohei Yoshida <kohei.yoshida@collabora.com> 2013-12-30 12:33:28 -0500
commit: 9a623cdca281a682d39b423aefac392c2cc22cf7 (patch)
tree: d8c7222f148818b163b40f1d9302f821542415c8 /sc/source
parent: 3ccb783be184102075fe1f9814f05e85d4968c32 (diff)
3 files changed, 293 insertions, 103 deletions
diff --git a/sc/source/core/tool/stringutil.cxx b/sc/source/core/tool/stringutil.cxx
index e711bcba40d4..5bdc2c2fe752 100644
--- a/sc/source/core/tool/stringutil.cxx
+++ b/sc/source/core/tool/stringutil.cxx
@@ -18,11 +18,13 @@
  */
 
 #include "stringutil.hxx"
-#include "rtl/ustrbuf.hxx"
-#include "rtl/math.hxx"
 #include "global.hxx"
 #include "svl/zforlist.hxx"
 
+#include <rtl/ustrbuf.hxx>
+#include <rtl/strbuf.hxx>
+#include <rtl/math.hxx>
+
 ScSetStringParam::ScSetStringParam() :
     mpNumFormatter(NULL),
     mbDetectNumberFormat(true),
@@ -194,6 +196,150 @@ bool ScStringUtil::parseSimpleNumber(
     return true;
 }
 
+/**
+ * Parse the n bytes at p as a simple number: digits, a leading sign, group
+ * separators (gsep) between groups of exactly three digits, a decimal
+ * separator (dsep) and an exponent; surrounding ASCII spaces are ignored.
+ * Returns true with the value in rVal only if the whole string converts.
+ */
+bool ScStringUtil::parseSimpleNumber(
+    const char* p, size_t n, char dsep, char gsep, double& rVal)
+{
+    // Actually almost the entire pre-check is unnecessary and we could call
+    // rtl::math::stringToDouble() just after having exchanged ascii space with
+    // non-breaking space, if it wasn't for check of grouped digits. The NaN
+    // and Inf cases that are accepted by stringToDouble() could be detected
+    // using rtl::math::isFinite() on the result.
+
+    /* TODO: The grouped digits check isn't even valid for locales that do not
+     * group in thousands ... e.g. Indian locales. But that's something also
+     * the number scanner doesn't implement yet, only the formatter. */
+
+    OStringBuffer aBuf;
+
+    size_t i = 0;
+    sal_Int32 nPosDSep = -1, nPosGSep = -1;
+    sal_uInt32 nDigitCount = 0;
+    sal_Int32 nPosExponent = -1;
+
+    // Skip preceding spaces.
+    for (i = 0; i < n; ++i, ++p)
+    {
+        char c = *p;
+        if (c != ' ')
+            // first non-space character.  Exit.
+            break;
+    }
+
+    if (i == n)
+        // the whole string is space, or it is empty.  Fail.
+        return false;
+
+    n -= i; // Subtract the length of the preceding spaces.
+
+    // Strip trailing spaces.  p[0] is a non-space here, hence n stays >= 1.
+    while (n > 1 && p[n-1] == ' ')
+        --n;
+
+    for (i = 0; i < n; ++i, ++p)
+    {
+        char c = *p;
+
+        if ('0' <= c && c <= '9')
+        {
+            // this is a digit.
+            aBuf.append(c);
+            ++nDigitCount;
+        }
+        else if (c == dsep)
+        {
+            // this is a decimal separator.
+
+            if (nPosDSep >= 0)
+                // a second decimal separator -> not a valid number.
+                return false;
+
+            if (nPosGSep >= 0 && i - nPosGSep != 4)
+                // the number has a group separator and the decimal sep is not
+                // positioned correctly.
+                return false;
+
+            nPosDSep = i;
+            nPosGSep = -1;
+            aBuf.append(c);
+            nDigitCount = 0;
+        }
+        else if (c == gsep)
+        {
+            // this is a group (thousand) separator.
+
+            if (i == 0)
+                // not allowed as the first character.
+                return false;
+
+            if (nPosDSep >= 0)
+                // not allowed after the decimal separator.
+                return false;
+
+            if (nPosGSep >= 0 && nDigitCount != 3)
+                // must be exactly 3 digits since the last group separator.
+                return false;
+
+            if (nPosExponent >= 0)
+                // not allowed in exponent.
+                return false;
+
+            nPosGSep = i;
+            nDigitCount = 0;
+        }
+        else if (c == '-' || c == '+')
+        {
+            // A sign must be the first character if it's given, or immediately
+            // follow the exponent character if present.
+            if (i == 0 || (nPosExponent >= 0 && i == static_cast<size_t>(nPosExponent+1)))
+                aBuf.append(c);
+            else
+                return false;
+        }
+        else if (c == 'E' || c == 'e')
+        {
+            // this is an exponent designator.
+
+            if (nPosExponent >= 0)
+                // Only one exponent allowed.
+                return false;
+
+            if (nPosGSep >= 0 && nDigitCount != 3)
+                // must be exactly 3 digits since the last group separator.
+                return false;
+
+            aBuf.append(c);
+            nPosExponent = i;
+            nPosDSep = -1;
+            nPosGSep = -1;
+            nDigitCount = 0;
+        }
+        else
+            return false;
+    }
+
+    // finished parsing the number.
+
+    if (nPosGSep >= 0 && nDigitCount != 3)
+        // must be exactly 3 digits since the last group separator.
+        return false;
+
+    rtl_math_ConversionStatus eStatus = rtl_math_ConversionStatus_Ok;
+    sal_Int32 nParseEnd = 0;
+    OString aString( aBuf.makeStringAndClear());
+    rVal = ::rtl::math::stringToDouble( aString, dsep, gsep, &eStatus, &nParseEnd);
+    if (eStatus != rtl_math_ConversionStatus_Ok || nParseEnd < aString.getLength())
+        // Not a valid number or not entire string consumed.
+        return false;
+
+    return true;
+}
+
 sal_Int32 ScStringUtil::GetQuotedTokenCount(const OUString &rIn, const OUString& rQuotedPairs, sal_Unicode cTok )
 {
     assert( !(rQuotedPairs.getLength()%2) );
diff --git a/sc/source/ui/docshell/datastream.cxx b/sc/source/ui/docshell/datastream.cxx
index 1b7f0d233f43..a21cba402d4b 100644
--- a/sc/source/ui/docshell/datastream.cxx
+++ b/sc/source/ui/docshell/datastream.cxx
@@ -56,6 +56,8 @@ double datastream_get_time(int nIdx)
     return fTimes[ nIdx ];
 }
 
+namespace {
+
 inline double getNow()
 {
     TimeValue now;
@@ -63,6 +65,50 @@ inline double getNow()
     return static_cast<double>(now.Seconds) + static_cast<double>(now.Nanosec) / 1000000000.0;
 }
 
+#if ENABLE_ORCUS
+
+// Handler for orcus::csv_parser: turns the fields of one line into
+// DataStream::Cell entries on that line, keeping at most nColCount of them.
+class CSVHandler
+{
+    DataStream::Line& mrLine;
+    size_t mnColCount;
+    size_t mnCols;
+    const char* mpLineHead;
+
+public:
+    CSVHandler( DataStream::Line& rLine, size_t nColCount ) :
+        mrLine(rLine), mnColCount(nColCount), mnCols(0), mpLineHead(rLine.maLine.getStr()) {}
+
+    // Hooks with nothing to do for this handler.
+    void begin_parse() {}
+    void end_parse() {}
+    void begin_row() {}
+    void end_row() {}
+
+    void cell(const char* p, size_t n)
+    {
+        // Fields past the requested column count are ignored.
+        if (mnCols >= mnColCount)
+            return;
+
+        DataStream::Cell aNew;
+        aNew.mbValue = ScStringUtil::parseSimpleNumber(p, n, '.', ',', aNew.mfValue);
+        if (!aNew.mbValue)
+        {
+            // Text cell: store its offset from the line start and its length.
+            aNew.maStr.Pos = std::distance(mpLineHead, p);
+            aNew.maStr.Size = n;
+        }
+        mrLine.maCells.push_back(aNew);
+        ++mnCols;
+    }
+};
+
+#endif
+
+}
+
 namespace datastreams {
 
 class CallerThread : public salhelper::Thread
@@ -96,7 +142,7 @@ private:
     }
 };
 
-void emptyLineQueue( std::queue<LinesList*>& rQueue )
+void emptyLineQueue( std::queue<DataStream::LinesType*>& rQueue )
 {
     while (!rQueue.empty())
     {
@@ -108,22 +154,34 @@ void emptyLineQueue( std::queue<LinesList*>& rQueue )
 class ReaderThread : public salhelper::Thread
 {
     SvStream *mpStream;
+    size_t mnColCount;
     bool mbTerminate;
     osl::Mutex maMtxTerminate;
 
-    std::queue<LinesList* > maPendingLines;
-    std::queue<LinesList* > maUsedLines;
+    std::queue<DataStream::LinesType*> maPendingLines;
+    std::queue<DataStream::LinesType*> maUsedLines;
     osl::Mutex maMtxLines;
 
     osl::Condition maCondReadStream;
     osl::Condition maCondConsume;
 
+#if ENABLE_ORCUS
+    orcus::csv_parser_config maConfig;
+#endif
+
 public:
 
-    ReaderThread(SvStream *pData):
+    ReaderThread(SvStream *pData, size_t nColCount):
         Thread("ReaderThread"),
         mpStream(pData),
-        mbTerminate(false) {}
+        mnColCount(nColCount),
+        mbTerminate(false)
+    {
+#if ENABLE_ORCUS
+        maConfig.delimiters.push_back(',');
+        maConfig.text_qualifier = '"';
+#endif
+    }
 
     virtual ~ReaderThread()
     {
@@ -156,9 +214,9 @@ public:
         maCondConsume.reset();
     }
 
-    LinesList* popNewLines()
+    DataStream::LinesType* popNewLines()
     {
-        LinesList* pLines = maPendingLines.front();
+        DataStream::LinesType* pLines = maPendingLines.front();
         maPendingLines.pop();
         return pLines;
     }
@@ -174,7 +232,7 @@ public:
         return !maPendingLines.empty();
     }
 
-    void pushUsedLines( LinesList* pLines )
+    void pushUsedLines( DataStream::LinesType* pLines )
     {
         maUsedLines.push(pLines);
     }
@@ -189,7 +247,7 @@ private:
     {
         while (!isTerminateRequested())
         {
-            LinesList* pLines = NULL;
+            DataStream::LinesType* pLines = NULL;
             osl::ResettableMutexGuard aGuard(maMtxLines);
 
             if (!maUsedLines.empty())
@@ -202,12 +260,20 @@ private:
             else
             {
                 aGuard.clear(); // unlock
-                pLines = new LinesList(10);
+                pLines = new DataStream::LinesType(10);
             }
 
             // Read & store new lines from stream.
-            for (size_t i = 0; i < pLines->size(); ++i)
-                mpStream->ReadLine( pLines->at(i) );
+            for (size_t i = 0, n = pLines->size(); i < n; ++i)
+            {
+                DataStream::Line& rLine = (*pLines)[i];
+                rLine.maCells.clear();
+                mpStream->ReadLine(rLine.maLine);
+
+                CSVHandler aHdl(rLine, mnColCount);
+                orcus::csv_parser<CSVHandler> parser(rLine.maLine.getStr(), rLine.maLine.getLength(), aHdl, maConfig);
+                parser.parse();
+            }
 
             aGuard.reset(); // lock
             while (!isTerminateRequested() && maPendingLines.size() >= 8)
@@ -228,6 +294,19 @@ private:
 
 }
 
+// A default-constructed cell is numeric and holds 0.0.
+DataStream::Cell::Cell() : mfValue(0.0), mbValue(true) {}
+
+// Copy construction transfers the flag first and then only the union member
+// that the flag marks as being in use.
+DataStream::Cell::Cell( const Cell& r ) : mbValue(r.mbValue)
+{
+    if (mbValue)
+        mfValue = r.mfValue;
+    else
+        maStr = r.maStr;
+}
+
 void DataStream::MakeToolbarVisible()
 {
     css::uno::Reference< css::frame::XFrame > xFrame =
@@ -312,13 +391,13 @@ DataStream::~DataStream()
     delete mpLines;
 }
 
-OString DataStream::ConsumeLine()
+DataStream::Line DataStream::ConsumeLine()
 {
     if (!mpLines || mnLinesCount >= mpLines->size())
     {
         mnLinesCount = 0;
         if (mxReaderThread->isTerminateRequested())
-            return OString();
+            return Line();
 
         osl::ResettableMutexGuard aGuard(mxReaderThread->getLinesMutex());
         if (mpLines)
@@ -402,7 +481,7 @@ void DataStream::StartImport()
             pStream = new SvScriptStream(msURL);
         else
             pStream = new SvFileStream(msURL, STREAM_READ);
-        mxReaderThread = new datastreams::ReaderThread( pStream );
+        mxReaderThread = new datastreams::ReaderThread(pStream, maStartRange.aEnd.Col() - maStartRange.aStart.Col() + 1);
         mxReaderThread->launch();
     }
     mbRunning = true;
@@ -476,79 +555,10 @@ void DataStream::MoveData()
 
 #if ENABLE_ORCUS
 
-namespace {
-
-struct StrVal
-{
-    ScAddress maPos;
-    OUString maStr;
-
-    StrVal( const ScAddress& rPos, const OUString& rStr ) : maPos(rPos), maStr(rStr) {}
-};
-
-struct NumVal
-{
-    ScAddress maPos;
-    double mfVal;
-
-    NumVal( const ScAddress& rPos, double fVal ) : maPos(rPos), mfVal(fVal) {}
-};
-
-typedef std::vector<StrVal> StrValArray;
-typedef std::vector<NumVal> NumValArray;
-
-/**
- * This handler handles a single line CSV input.
- */
-class CSVHandler
-{
-    ScAddress maPos;
-    SCCOL mnEndCol;
-
-    StrValArray maStrs;
-    NumValArray maNums;
-
-public:
-    CSVHandler( const ScAddress& rPos, SCCOL nEndCol ) : maPos(rPos), mnEndCol(nEndCol) {}
-
-    void begin_parse() {}
-    void end_parse() {}
-    void begin_row() {}
-    void end_row() {}
-
-    void cell(const char* p, size_t n)
-    {
-        if (maPos.Col() <= mnEndCol)
-        {
-            OUString aStr(p, n, RTL_TEXTENCODING_UTF8);
-            double fVal;
-            if (ScStringUtil::parseSimpleNumber(aStr, '.', ',', fVal))
-                maNums.push_back(NumVal(maPos, fVal));
-            else
-                maStrs.push_back(StrVal(maPos, aStr));
-        }
-        maPos.IncCol();
-    }
-
-    const StrValArray& getStrs() const { return maStrs; }
-    const NumValArray& getNums() const { return maNums; }
-};
-
-}
-
 void DataStream::Text2Doc()
 {
-    OString aLine = ConsumeLine();
-    orcus::csv_parser_config aConfig;
-    aConfig.delimiters.push_back(',');
-    aConfig.text_qualifier = '"';
-    CSVHandler aHdl(ScAddress(maStartRange.aStart.Col(), mnCurRow, maStartRange.aStart.Tab()), maStartRange.aEnd.Col());
-    orcus::csv_parser<CSVHandler> parser(aLine.getStr(), aLine.getLength(), aHdl, aConfig);
-    parser.parse();
-
-    const StrValArray& rStrs = aHdl.getStrs();
-    const NumValArray& rNums = aHdl.getNums();
-    if (rStrs.empty() && rNums.empty() && mbRefreshOnEmptyLine)
+    Line aLine = ConsumeLine();
+    if (aLine.maCells.empty() && mbRefreshOnEmptyLine)
     {
         // Empty line detected.  Trigger refresh and discard it.
         Refresh();
@@ -559,15 +569,24 @@ void DataStream::Text2Doc()
 
     MoveData();
     {
-        StrValArray::const_iterator it = rStrs.begin(), itEnd = rStrs.end();
-        for (; it != itEnd; ++it)
-            maDocAccess.setStringCell(it->maPos, it->maStr);
-    }
-
-    {
-        NumValArray::const_iterator it = rNums.begin(), itEnd = rNums.end();
-        for (; it != itEnd; ++it)
-            maDocAccess.setNumericCell(it->maPos, it->mfVal);
+        std::vector<Cell>::const_iterator it = aLine.maCells.begin(), itEnd = aLine.maCells.end();
+        SCCOL nCol = maStartRange.aStart.Col();
+        const char* pLineHead = aLine.maLine.getStr();
+        for (; it != itEnd; ++it, ++nCol)
+        {
+            const Cell& rCell = *it;
+            if (rCell.mbValue)
+            {
+                maDocAccess.setNumericCell(
+                    ScAddress(nCol, mnCurRow, maStartRange.aStart.Tab()), rCell.mfValue);
+            }
+            else
+            {
+                maDocAccess.setStringCell(
+                    ScAddress(nCol, mnCurRow, maStartRange.aStart.Tab()),
+                    OUString(pLineHead+rCell.maStr.Pos, rCell.maStr.Size, RTL_TEXTENCODING_UTF8));
+            }
+        }
     }
 
     fTimes[ DEBUG_TIME_IMPORT ] = getNow() - fStart;
diff --git a/sc/source/ui/inc/datastream.hxx b/sc/source/ui/inc/datastream.hxx
index 5a4d8cd444b7..5af2dc7c29a6 100644
--- a/sc/source/ui/inc/datastream.hxx
+++ b/sc/source/ui/inc/datastream.hxx
@@ -33,15 +33,37 @@ namespace datastreams {
     class ReaderThread;
 }
 
-typedef std::vector<OString> LinesList;
 
 class DataStream : boost::noncopyable
 {
-    OString ConsumeLine();
-    void MoveData();
-    void Text2Doc();
-
 public:
+    struct Cell // One parsed field of a line: a number or a slice of the line text.
+    {
+        struct Str
+        {
+            size_t Pos;  // Offset of the text from the start of Line::maLine.
+            size_t Size; // Length of the text.
+        };
+
+        union
+        {
+            Str maStr;      // Used when mbValue is false.
+            double mfValue; // Used when mbValue is true.
+        };
+
+        bool mbValue; // true: numeric cell (mfValue); false: text cell (maStr).
+
+        Cell(); // Numeric cell with value 0.0.
+        Cell( const Cell& r );
+    };
+
+    struct Line // One line read from the stream together with its parsed cells.
+    {
+        OString maLine;            // Raw text of the line.
+        std::vector<Cell> maCells; // Cells parsed from maLine, in column order.
+    };
+    typedef std::vector<Line> LinesType; // Batch of lines exchanged with the reader thread.
+
     enum MoveType { NO_MOVE, RANGE_DOWN, MOVE_DOWN, MOVE_UP };
     enum { SCRIPT_STREAM = 1, VALUES_IN_LINE = 2 };
 
@@ -75,6 +97,9 @@ public:
     void SetRefreshOnEmptyLine( bool bVal );
 
 private:
+    Line ConsumeLine();
+    void MoveData();
+    void Text2Doc();
     void Refresh();
 
 private:
@@ -89,7 +114,7 @@ private:
     bool mbRunning;
     bool mbValuesInLine;
     bool mbRefreshOnEmptyLine;
-    LinesList* mpLines;
+    LinesType* mpLines;
     size_t mnLinesCount;
     size_t mnLinesSinceRefresh;
     double mfLastRefreshTime;
author	Kohei Yoshida <kohei.yoshida@collabora.com>	2013-12-30 12:16:35 -0500
committer	Kohei Yoshida <kohei.yoshida@collabora.com>	2013-12-30 12:33:28 -0500
commit	9a623cdca281a682d39b423aefac392c2cc22cf7 (patch)
tree	d8c7222f148818b163b40f1d9302f821542415c8 /sc/source
parent	3ccb783be184102075fe1f9814f05e85d4968c32 (diff)