count words in text with mixed script punctuation correctly

author: Caolán McNamara <caolanm@redhat.com> 2012-03-30 12:21:14 +0100
committer: Caolán McNamara <caolanm@redhat.com> 2012-03-30 14:07:21 +0100
commit: 417823d4c23100713813171865d70f5e2719ace2 (patch)
tree: 02021a89e8cc755957947916dfb51b18ce05d977 /sw
parent: 45e4f9e99b75ef8a125228bc42df7fb98af9d7bc (diff)
2 files changed, 76 insertions, 17 deletions
diff --git a/sw/qa/core/swdoc-test.cxx b/sw/qa/core/swdoc-test.cxx
index d7d453f4f9c6..6106da20c34f 100644
--- a/sw/qa/core/swdoc-test.cxx
+++ b/sw/qa/core/swdoc-test.cxx
@@ -36,6 +36,8 @@
 #include <tools/urlobj.hxx>
 #include <unotools/tempfile.hxx>
 
+#include <editeng/langitem.hxx>
+
 #include <sfx2/app.hxx>
 #include <sfx2/docfilt.hxx>
 #include <sfx2/docfile.hxx>
@@ -214,7 +216,6 @@ void SwDocTest::testUserPerceivedCharCount()
     CPPUNIT_ASSERT_MESSAGE("Surrogate Pair should be counted as single character", nCount == 1);
 }
 
-//See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation
 void SwDocTest::testSwScanner()
 {
     SwNodeIndex aIdx(m_pDoc->GetNodes().GetEndOfContent(), -1);
@@ -224,26 +225,80 @@ void SwDocTest::testSwScanner()
 
     CPPUNIT_ASSERT_MESSAGE("Has Text Node", pTxtNode);
 
+    //See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation
+    //See https://bugs.freedesktop.org/show_bug.cgi?id=39365 for motivation
     //Use a temporary rtl::OUString as the arg, as that's the trouble behind
     //fdo#40449 and fdo#39365
-    SwScanner aScanner(*pTxtNode,
-        rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("Hello World")),
-        0, 0, i18n::WordType::DICTIONARY_WORD, 0,
-        RTL_CONSTASCII_LENGTH("Hello World"));
-
-    bool bFirstOk = aScanner.NextWord();
-    CPPUNIT_ASSERT_MESSAGE("First Token", bFirstOk);
-    const rtl::OUString &rHello = aScanner.GetWord();
-    CPPUNIT_ASSERT_MESSAGE("Should be Hello",
-        rHello.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("Hello")));
-
-    bool bSecondOk = aScanner.NextWord();
-    CPPUNIT_ASSERT_MESSAGE("Second Token", bSecondOk);
-    const rtl::OUString &rWorld = aScanner.GetWord();
-    CPPUNIT_ASSERT_MESSAGE("Should be World",
-        rWorld.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("World")));
+    {
+        SwScanner aScanner(*pTxtNode,
+            rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("Hello World")),
+            0, 0, i18n::WordType::DICTIONARY_WORD, 0,
+            RTL_CONSTASCII_LENGTH("Hello World"));
+
+        bool bFirstOk = aScanner.NextWord();
+        CPPUNIT_ASSERT_MESSAGE("First Token", bFirstOk);
+        const rtl::OUString &rHello = aScanner.GetWord();
+        CPPUNIT_ASSERT_MESSAGE("Should be Hello",
+            rHello.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("Hello")));
+
+        bool bSecondOk = aScanner.NextWord();
+        CPPUNIT_ASSERT_MESSAGE("Second Token", bSecondOk);
+        const rtl::OUString &rWorld = aScanner.GetWord();
+        CPPUNIT_ASSERT_MESSAGE("Should be World",
+            rWorld.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("World")));
+    }
+
+    //See https://www.libreoffice.org/bugzilla/show_bug.cgi?id=45271 for motivation
+    {
+        const sal_Unicode IDEOGRAPHICFULLSTOP_D[] = { 0x3002, 'D' };
+
+        m_pDoc->InsertString(aPaM, rtl::OUString(IDEOGRAPHICFULLSTOP_D,
+            SAL_N_ELEMENTS(IDEOGRAPHICFULLSTOP_D)));
+
+        SvxLanguageItem aCJKLangItem( LANGUAGE_CHINESE_SIMPLIFIED, RES_CHRATR_CJK_LANGUAGE );
+        SvxLanguageItem aWestLangItem( LANGUAGE_ENGLISH_US, RES_CHRATR_LANGUAGE );
+        m_pDoc->InsertPoolItem(aPaM, aCJKLangItem, 0 );
+        m_pDoc->InsertPoolItem(aPaM, aWestLangItem, 0 );
+
+        SwDocStat aDocStat;
+        pTxtNode->CountWords(aDocStat, 0, SAL_N_ELEMENTS(IDEOGRAPHICFULLSTOP_D));
+
+        CPPUNIT_ASSERT_MESSAGE("Should be 2", aDocStat.nChar == 2);
+        CPPUNIT_ASSERT_MESSAGE("Should be 2", aDocStat.nCharExcludingSpaces == 2);
+    }
+
+    //See https://issues.apache.org/ooo/show_bug.cgi?id=89042 for motivation
+    {
+        SwDocStat aDocStat;
+
+        const sal_Unicode aShouldBeThree[] = {
+            0x0053, 0x0068, 0x006F, 0x0075, 0x006C, 0x0064, 0x0020,
+            0x2018, 0x0062, 0x0065, 0x0020, 0x0074, 0x0068, 0x0072,
+            0x0065, 0x0065, 0x2019
+        };
+
+        m_pDoc->AppendTxtNode(*aPaM.GetPoint());
+        m_pDoc->InsertString(aPaM, rtl::OUString(aShouldBeThree, SAL_N_ELEMENTS(aShouldBeThree)));
+        pTxtNode = aPaM.GetNode()->GetTxtNode();
+        pTxtNode->CountWords(aDocStat, 0, SAL_N_ELEMENTS(aShouldBeThree));
+        CPPUNIT_ASSERT_MESSAGE("Should be 3", aDocStat.nWord == 3);
+
+        const sal_Unicode aShouldBeFive[] = {
+            0x0046, 0x0072, 0x0065, 0x006E, 0x0063, 0x0068, 0x0020,
+            0x00AB, 0x00A0, 0x0073, 0x0061, 0x0076, 0x006F, 0x0069,
+            0x0072, 0x0020, 0x0063, 0x0061, 0x006C, 0x0063, 0x0075,
+            0x006C, 0x0065, 0x0072, 0x00A0, 0x00BB
+        };
+
+        m_pDoc->AppendTxtNode(*aPaM.GetPoint());
+        m_pDoc->InsertString(aPaM, rtl::OUString(aShouldBeFive, SAL_N_ELEMENTS(aShouldBeFive)));
+        pTxtNode = aPaM.GetNode()->GetTxtNode();
+        pTxtNode->CountWords(aDocStat, 0, SAL_N_ELEMENTS(aShouldBeFive));
+        CPPUNIT_ASSERT_MESSAGE("Should be 5", aDocStat.nWord == 5);
+    }
 }
 
+
 //See https://bugs.freedesktop.org/show_bug.cgi?id=40599 for motivation
 void SwDocTest::testGraphicAnchorDeletion()
 {
diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx
index 71749fdeb1bc..a186cc646857 100644
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -723,6 +723,10 @@ sal_Bool SwScanner::NextWord()
                 pBreakIt->GetLocale( aCurrLang ), nWordType, sal_True );
         OSL_ENSURE( aBound.endPos >= aBound.startPos, "broken aBound result" );
 
+        // we don't want to include preceding text
+        if (aBound.startPos < nBegin)
+            aBound.startPos = nBegin;
+
         //no word boundaries could be found
         if(aBound.endPos == aBound.startPos)
             return sal_False;
author	Caolán McNamara <caolanm@redhat.com>	2012-03-30 12:21:14 +0100
committer	Caolán McNamara <caolanm@redhat.com>	2012-03-30 14:07:21 +0100
commit	417823d4c23100713813171865d70f5e2719ace2 (patch)
tree	02021a89e8cc755957947916dfb51b18ce05d977 /sw
parent	45e4f9e99b75ef8a125228bc42df7fb98af9d7bc (diff)