diff options
author | Kohei Yoshida <kohei.yoshida@collabora.com> | 2013-12-30 12:16:35 -0500 |
---|---|---|
committer | Kohei Yoshida <kohei.yoshida@collabora.com> | 2013-12-30 12:33:28 -0500 |
commit | 9a623cdca281a682d39b423aefac392c2cc22cf7 (patch) | |
tree | d8c7222f148818b163b40f1d9302f821542415c8 /sc/source | |
parent | 3ccb783be184102075fe1f9814f05e85d4968c32 (diff) |
Parse CSV lines in the reader thread.
Change-Id: I6329a0e6e6fa6576df2ed473482d558bfd6cce08
Diffstat (limited to 'sc/source')
mode | file | lines changed |
---|---|---|
-rw-r--r-- | sc/source/core/tool/stringutil.cxx | 150 |
-rw-r--r-- | sc/source/ui/docshell/datastream.cxx | 209 |
-rw-r--r-- | sc/source/ui/inc/datastream.hxx | 37 |
3 files changed, 293 insertions, 103 deletions
diff --git a/sc/source/core/tool/stringutil.cxx b/sc/source/core/tool/stringutil.cxx index e711bcba40d4..5bdc2c2fe752 100644 --- a/sc/source/core/tool/stringutil.cxx +++ b/sc/source/core/tool/stringutil.cxx @@ -18,11 +18,13 @@ */ #include "stringutil.hxx" -#include "rtl/ustrbuf.hxx" -#include "rtl/math.hxx" #include "global.hxx" #include "svl/zforlist.hxx" +#include <rtl/ustrbuf.hxx> +#include <rtl/strbuf.hxx> +#include <rtl/math.hxx> + ScSetStringParam::ScSetStringParam() : mpNumFormatter(NULL), mbDetectNumberFormat(true), @@ -194,6 +196,150 @@ bool ScStringUtil::parseSimpleNumber( return true; } +bool ScStringUtil::parseSimpleNumber( + const char* p, size_t n, char dsep, char gsep, double& rVal) +{ + // Actually almost the entire pre-check is unnecessary and we could call + // rtl::math::stringToDouble() just after having exchanged ascii space with + // non-breaking space, if it wasn't for check of grouped digits. The NaN + // and Inf cases that are accepted by stringToDouble() could be detected + // using rtl::math::isFinite() on the result. + + /* TODO: The grouped digits check isn't even valid for locales that do not + * group in thousands ... e.g. Indian locales. But that's something also + * the number scanner doesn't implement yet, only the formatter. */ + + OStringBuffer aBuf; + + size_t i = 0; + const char* pLast = p + (n-1); + sal_Int32 nPosDSep = -1, nPosGSep = -1; + sal_uInt32 nDigitCount = 0; + sal_Int32 nPosExponent = -1; + + // Skip preceding spaces. + for (i = 0; i < n; ++i, ++p) + { + char c = *p; + if (c != ' ') + // first non-space character. Exit. + break; + } + + if (i == n) + // the whole string is space. Fail. + return false; + + n -= i; // Subtract the length of the preceding spaces. + + // Determine the last non-space character. + for (; p != pLast; --pLast, --n) + { + char c = *pLast; + if (c != ' ') + // Non space character. Exit. 
+ break; + } + + for (i = 0; i < n; ++i, ++p) + { + char c = *p; + + if ('0' <= c && c <= '9') + { + // this is a digit. + aBuf.append(c); + ++nDigitCount; + } + else if (c == dsep) + { + // this is a decimal separator. + + if (nPosDSep >= 0) + // a second decimal separator -> not a valid number. + return false; + + if (nPosGSep >= 0 && i - nPosGSep != 4) + // the number has a group separator and the decimal sep is not + // positioned correctly. + return false; + + nPosDSep = i; + nPosGSep = -1; + aBuf.append(c); + nDigitCount = 0; + } + else if (c == gsep) + { + // this is a group (thousand) separator. + + if (i == 0) + // not allowed as the first character. + return false; + + if (nPosDSep >= 0) + // not allowed after the decimal separator. + return false; + + if (nPosGSep >= 0 && nDigitCount != 3) + // must be exactly 3 digits since the last group separator. + return false; + + if (nPosExponent >= 0) + // not allowed in exponent. + return false; + + nPosGSep = i; + nDigitCount = 0; + } + else if (c == '-' || c == '+') + { + // A sign must be the first character if it's given, or immediately + // follow the exponent character if present. + if (i == 0 || (nPosExponent >= 0 && i == static_cast<size_t>(nPosExponent+1))) + aBuf.append(c); + else + return false; + } + else if (c == 'E' || c == 'e') + { + // this is an exponent designator. + + if (nPosExponent >= 0) + // Only one exponent allowed. + return false; + + if (nPosGSep >= 0 && nDigitCount != 3) + // must be exactly 3 digits since the last group separator. + return false; + + aBuf.append(c); + nPosExponent = i; + nPosDSep = -1; + nPosGSep = -1; + nDigitCount = 0; + } + else + return false; + } + + // finished parsing the number. + + if (nPosGSep >= 0 && nDigitCount != 3) + // must be exactly 3 digits since the last group separator. 
+ return false; + + rtl_math_ConversionStatus eStatus = rtl_math_ConversionStatus_Ok; + sal_Int32 nParseEnd = 0; + OString aString( aBuf.makeStringAndClear()); + rVal = ::rtl::math::stringToDouble( aString, dsep, gsep, &eStatus, &nParseEnd); + if (eStatus != rtl_math_ConversionStatus_Ok || nParseEnd < aString.getLength()) + // Not a valid number or not entire string consumed. + return false; + + return true; +} + sal_Int32 ScStringUtil::GetQuotedTokenCount(const OUString &rIn, const OUString& rQuotedPairs, sal_Unicode cTok ) { assert( !(rQuotedPairs.getLength()%2) ); diff --git a/sc/source/ui/docshell/datastream.cxx b/sc/source/ui/docshell/datastream.cxx index 1b7f0d233f43..a21cba402d4b 100644 --- a/sc/source/ui/docshell/datastream.cxx +++ b/sc/source/ui/docshell/datastream.cxx @@ -56,6 +56,8 @@ double datastream_get_time(int nIdx) return fTimes[ nIdx ]; } +namespace { + inline double getNow() { TimeValue now; @@ -63,6 +65,50 @@ inline double getNow() return static_cast<double>(now.Seconds) + static_cast<double>(now.Nanosec) / 1000000000.0; } +#if ENABLE_ORCUS + +class CSVHandler +{ + DataStream::Line& mrLine; + size_t mnColCount; + size_t mnCols; + const char* mpLineHead; + +public: + CSVHandler( DataStream::Line& rLine, size_t nColCount ) : + mrLine(rLine), mnColCount(nColCount), mnCols(0), mpLineHead(rLine.maLine.getStr()) {} + + void begin_parse() {} + void end_parse() {} + void begin_row() {} + void end_row() {} + + void cell(const char* p, size_t n) + { + if (mnCols >= mnColCount) + return; + + DataStream::Cell aCell; + if (ScStringUtil::parseSimpleNumber(p, n, '.', ',', aCell.mfValue)) + { + aCell.mbValue = true; + } + else + { + aCell.mbValue = false; + aCell.maStr.Pos = std::distance(mpLineHead, p); + aCell.maStr.Size = n; + } + mrLine.maCells.push_back(aCell); + + ++mnCols; + } +}; + +#endif + +} + namespace datastreams { class CallerThread : public salhelper::Thread @@ -96,7 +142,7 @@ private: } }; -void emptyLineQueue( std::queue<LinesList*>& rQueue ) 
+void emptyLineQueue( std::queue<DataStream::LinesType*>& rQueue ) { while (!rQueue.empty()) { @@ -108,22 +154,34 @@ void emptyLineQueue( std::queue<LinesList*>& rQueue ) class ReaderThread : public salhelper::Thread { SvStream *mpStream; + size_t mnColCount; bool mbTerminate; osl::Mutex maMtxTerminate; - std::queue<LinesList* > maPendingLines; - std::queue<LinesList* > maUsedLines; + std::queue<DataStream::LinesType*> maPendingLines; + std::queue<DataStream::LinesType*> maUsedLines; osl::Mutex maMtxLines; osl::Condition maCondReadStream; osl::Condition maCondConsume; +#if ENABLE_ORCUS + orcus::csv_parser_config maConfig; +#endif + public: - ReaderThread(SvStream *pData): + ReaderThread(SvStream *pData, size_t nColCount): Thread("ReaderThread"), mpStream(pData), - mbTerminate(false) {} + mnColCount(nColCount), + mbTerminate(false) + { +#if ENABLE_ORCUS + maConfig.delimiters.push_back(','); + maConfig.text_qualifier = '"'; +#endif + } virtual ~ReaderThread() { @@ -156,9 +214,9 @@ public: maCondConsume.reset(); } - LinesList* popNewLines() + DataStream::LinesType* popNewLines() { - LinesList* pLines = maPendingLines.front(); + DataStream::LinesType* pLines = maPendingLines.front(); maPendingLines.pop(); return pLines; } @@ -174,7 +232,7 @@ public: return !maPendingLines.empty(); } - void pushUsedLines( LinesList* pLines ) + void pushUsedLines( DataStream::LinesType* pLines ) { maUsedLines.push(pLines); } @@ -189,7 +247,7 @@ private: { while (!isTerminateRequested()) { - LinesList* pLines = NULL; + DataStream::LinesType* pLines = NULL; osl::ResettableMutexGuard aGuard(maMtxLines); if (!maUsedLines.empty()) @@ -202,12 +260,20 @@ private: else { aGuard.clear(); // unlock - pLines = new LinesList(10); + pLines = new DataStream::LinesType(10); } // Read & store new lines from stream. 
- for (size_t i = 0; i < pLines->size(); ++i) - mpStream->ReadLine( pLines->at(i) ); + for (size_t i = 0, n = pLines->size(); i < n; ++i) + { + DataStream::Line& rLine = (*pLines)[i]; + rLine.maCells.clear(); + mpStream->ReadLine(rLine.maLine); + + CSVHandler aHdl(rLine, mnColCount); + orcus::csv_parser<CSVHandler> parser(rLine.maLine.getStr(), rLine.maLine.getLength(), aHdl, maConfig); + parser.parse(); + } aGuard.reset(); // lock while (!isTerminateRequested() && maPendingLines.size() >= 8) @@ -228,6 +294,19 @@ private: } +DataStream::Cell::Cell() : mfValue(0.0), mbValue(true) {} + +DataStream::Cell::Cell( const Cell& r ) : mbValue(r.mbValue) +{ + if (r.mbValue) + mfValue = r.mfValue; + else + { + maStr.Pos = r.maStr.Pos; + maStr.Size = r.maStr.Size; + } +} + void DataStream::MakeToolbarVisible() { css::uno::Reference< css::frame::XFrame > xFrame = @@ -312,13 +391,13 @@ DataStream::~DataStream() delete mpLines; } -OString DataStream::ConsumeLine() +DataStream::Line DataStream::ConsumeLine() { if (!mpLines || mnLinesCount >= mpLines->size()) { mnLinesCount = 0; if (mxReaderThread->isTerminateRequested()) - return OString(); + return Line(); osl::ResettableMutexGuard aGuard(mxReaderThread->getLinesMutex()); if (mpLines) @@ -402,7 +481,7 @@ void DataStream::StartImport() pStream = new SvScriptStream(msURL); else pStream = new SvFileStream(msURL, STREAM_READ); - mxReaderThread = new datastreams::ReaderThread( pStream ); + mxReaderThread = new datastreams::ReaderThread(pStream, maStartRange.aEnd.Col() - maStartRange.aStart.Col() + 1); mxReaderThread->launch(); } mbRunning = true; @@ -476,79 +555,10 @@ void DataStream::MoveData() #if ENABLE_ORCUS -namespace { - -struct StrVal -{ - ScAddress maPos; - OUString maStr; - - StrVal( const ScAddress& rPos, const OUString& rStr ) : maPos(rPos), maStr(rStr) {} -}; - -struct NumVal -{ - ScAddress maPos; - double mfVal; - - NumVal( const ScAddress& rPos, double fVal ) : maPos(rPos), mfVal(fVal) {} -}; - -typedef 
std::vector<StrVal> StrValArray; -typedef std::vector<NumVal> NumValArray; - -/** - * This handler handles a single line CSV input. - */ -class CSVHandler -{ - ScAddress maPos; - SCCOL mnEndCol; - - StrValArray maStrs; - NumValArray maNums; - -public: - CSVHandler( const ScAddress& rPos, SCCOL nEndCol ) : maPos(rPos), mnEndCol(nEndCol) {} - - void begin_parse() {} - void end_parse() {} - void begin_row() {} - void end_row() {} - - void cell(const char* p, size_t n) - { - if (maPos.Col() <= mnEndCol) - { - OUString aStr(p, n, RTL_TEXTENCODING_UTF8); - double fVal; - if (ScStringUtil::parseSimpleNumber(aStr, '.', ',', fVal)) - maNums.push_back(NumVal(maPos, fVal)); - else - maStrs.push_back(StrVal(maPos, aStr)); - } - maPos.IncCol(); - } - - const StrValArray& getStrs() const { return maStrs; } - const NumValArray& getNums() const { return maNums; } -}; - -} - void DataStream::Text2Doc() { - OString aLine = ConsumeLine(); - orcus::csv_parser_config aConfig; - aConfig.delimiters.push_back(','); - aConfig.text_qualifier = '"'; - CSVHandler aHdl(ScAddress(maStartRange.aStart.Col(), mnCurRow, maStartRange.aStart.Tab()), maStartRange.aEnd.Col()); - orcus::csv_parser<CSVHandler> parser(aLine.getStr(), aLine.getLength(), aHdl, aConfig); - parser.parse(); - - const StrValArray& rStrs = aHdl.getStrs(); - const NumValArray& rNums = aHdl.getNums(); - if (rStrs.empty() && rNums.empty() && mbRefreshOnEmptyLine) + Line aLine = ConsumeLine(); + if (aLine.maCells.empty() && mbRefreshOnEmptyLine) { // Empty line detected. Trigger refresh and discard it. 
Refresh(); @@ -559,15 +569,24 @@ void DataStream::Text2Doc() MoveData(); { - StrValArray::const_iterator it = rStrs.begin(), itEnd = rStrs.end(); - for (; it != itEnd; ++it) - maDocAccess.setStringCell(it->maPos, it->maStr); - } - - { - NumValArray::const_iterator it = rNums.begin(), itEnd = rNums.end(); - for (; it != itEnd; ++it) - maDocAccess.setNumericCell(it->maPos, it->mfVal); + std::vector<Cell>::const_iterator it = aLine.maCells.begin(), itEnd = aLine.maCells.end(); + SCCOL nCol = maStartRange.aStart.Col(); + const char* pLineHead = aLine.maLine.getStr(); + for (; it != itEnd; ++it, ++nCol) + { + const Cell& rCell = *it; + if (rCell.mbValue) + { + maDocAccess.setNumericCell( + ScAddress(nCol, mnCurRow, maStartRange.aStart.Tab()), rCell.mfValue); + } + else + { + maDocAccess.setStringCell( + ScAddress(nCol, mnCurRow, maStartRange.aStart.Tab()), + OUString(pLineHead+rCell.maStr.Pos, rCell.maStr.Size, RTL_TEXTENCODING_UTF8)); + } + } } fTimes[ DEBUG_TIME_IMPORT ] = getNow() - fStart; diff --git a/sc/source/ui/inc/datastream.hxx b/sc/source/ui/inc/datastream.hxx index 5a4d8cd444b7..5af2dc7c29a6 100644 --- a/sc/source/ui/inc/datastream.hxx +++ b/sc/source/ui/inc/datastream.hxx @@ -33,15 +33,37 @@ namespace datastreams { class ReaderThread; } -typedef std::vector<OString> LinesList; class DataStream : boost::noncopyable { - OString ConsumeLine(); - void MoveData(); - void Text2Doc(); - public: + struct Cell + { + struct Str + { + size_t Pos; + size_t Size; + }; + + union + { + Str maStr; + double mfValue; + }; + + bool mbValue; + + Cell(); + Cell( const Cell& r ); + }; + + struct Line + { + OString maLine; + std::vector<Cell> maCells; + }; + typedef std::vector<Line> LinesType; + enum MoveType { NO_MOVE, RANGE_DOWN, MOVE_DOWN, MOVE_UP }; enum { SCRIPT_STREAM = 1, VALUES_IN_LINE = 2 }; @@ -75,6 +97,9 @@ public: void SetRefreshOnEmptyLine( bool bVal ); private: + Line ConsumeLine(); + void MoveData(); + void Text2Doc(); void Refresh(); private: @@ -89,7 +114,7 @@ 
private: bool mbRunning; bool mbValuesInLine; bool mbRefreshOnEmptyLine; - LinesList* mpLines; + LinesType* mpLines; size_t mnLinesCount; size_t mnLinesSinceRefresh; double mfLastRefreshTime; |