summaryrefslogtreecommitdiff
path: root/sw
diff options
context:
space:
mode:
authorCaolán McNamara <caolanm@redhat.com>2011-11-03 11:00:38 +0000
committerCaolán McNamara <caolanm@redhat.com>2011-11-03 12:18:17 +0000
commit12db5315fca413ae66e88c4cd8212ee3b01667b7 (patch)
tree2e8abe3a62ee41a39b3a7aeaf261923f02bc41f6 /sw
parent56d5a48099165dabb72eb3a2655c2eb503684f1c (diff)
Follow UAX-29 and present user-perceived character counts
Present neither a count of code units nor a count of code points; instead try to follow http://unicode.org/reports/tr29/ and present the grapheme count. Add a few representative tests to try to avoid gotchas.
Diffstat (limited to 'sw')
-rw-r--r--sw/inc/breakit.hxx4
-rw-r--r--sw/qa/core/swdoc-test.cxx40
-rw-r--r--sw/source/core/bastyp/breakit.cxx17
-rw-r--r--sw/source/core/txtnode/txtedt.cxx9
4 files changed, 57 insertions, 13 deletions
diff --git a/sw/inc/breakit.hxx b/sw/inc/breakit.hxx
index ae05468aa16b..3075fc9dfe63 100644
--- a/sw/inc/breakit.hxx
+++ b/sw/inc/breakit.hxx
@@ -110,7 +110,9 @@ public:
}
sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const;
- sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const;
+ sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const;
+
+ // Returns the number of user-perceived characters (UAX#29 grapheme
+ // clusters) in rStr, as opposed to its number of code units or code points.
+ sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const;
};
#define SW_BREAKITER() SwBreakIt::Get()
diff --git a/sw/qa/core/swdoc-test.cxx b/sw/qa/core/swdoc-test.cxx
index 3d27fa071d39..d6f145f0d9bf 100644
--- a/sw/qa/core/swdoc-test.cxx
+++ b/sw/qa/core/swdoc-test.cxx
@@ -41,19 +41,20 @@
#include <sfx2/docfile.hxx>
#include <sfx2/sfxmodelfactory.hxx>
-#include "init.hxx"
-#include "swtypes.hxx"
-#include "docstat.hxx"
+#include "breakit.hxx"
#include "doc.hxx"
-#include "ndtxt.hxx"
#include "docsh.hxx"
-#include "shellres.hxx"
+#include "docstat.hxx"
#include "docufld.hxx"
#include "fmtanchr.hxx"
-#include "swscanner.hxx"
+#include "init.hxx"
+#include "ndtxt.hxx"
+#include "shellio.hxx"
+#include "shellres.hxx"
#include "swcrsr.hxx"
+#include "swscanner.hxx"
#include "swmodule.hxx"
-#include "shellio.hxx"
+#include "swtypes.hxx"
SO2_DECL_REF(SwDocShell)
SO2_IMPL_REF(SwDocShell)
@@ -73,14 +74,15 @@ public:
void testFileNameFields();
void testDocStat();
void testSwScanner();
+ void testUserPerceivedCharCount();
void testGraphicAnchorDeletion();
CPPUNIT_TEST_SUITE(SwDocTest);
CPPUNIT_TEST(randomTest);
CPPUNIT_TEST(testPageDescName);
CPPUNIT_TEST(testFileNameFields);
- CPPUNIT_TEST(testDocStat);
CPPUNIT_TEST(testSwScanner);
+ CPPUNIT_TEST(testUserPerceivedCharCount);
CPPUNIT_TEST(testGraphicAnchorDeletion);
CPPUNIT_TEST_SUITE_END();
@@ -189,6 +191,28 @@ void SwDocTest::testDocStat()
CPPUNIT_ASSERT_MESSAGE("And cache is updated too", m_pDoc->GetDocStat().nChar == nLen);
}
+//Character counts shown in the UI are meant to follow UAX#29
+//(http://unicode.org/reports/tr29/): what is counted is the user-perceived
+//character, which is neither the number of code points nor the number of
+//code units.
+void SwDocTest::testUserPerceivedCharCount()
+{
+    const SwBreakIt *pIter = SwBreakIt::Get();
+
+    //Combining sequence: two distinct code points (U+05D0 followed by U+05B8)
+    //which the user perceives as one single glyph
+    {
+        const sal_Unicode aCodeUnits[] = { 0x05D0, 0x05B8 };
+        const ::rtl::OUString sText(aCodeUnits, SAL_N_ELEMENTS(aCodeUnits));
+        const sal_Int32 nPerceived = pIter->getGraphemeCount(sText);
+        CPPUNIT_ASSERT_MESSAGE("Grapheme Count should be 1", nPerceived == 1);
+    }
+
+    //Surrogate pair: one single code point (U+1D11E) which takes two
+    //UTF-16 code units (0xD834 0xDD1E) to represent
+    {
+        const sal_Unicode aCodeUnits[] = { 0xD834, 0xDD1E };
+        const ::rtl::OUString sText(aCodeUnits, SAL_N_ELEMENTS(aCodeUnits));
+        const sal_Int32 nPerceived = pIter->getGraphemeCount(sText);
+        CPPUNIT_ASSERT_MESSAGE("Surrogate Pair should be counted as single character", nPerceived == 1);
+    }
+}
+
//See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation
void SwDocTest::testSwScanner()
{
diff --git a/sw/source/core/bastyp/breakit.cxx b/sw/source/core/bastyp/breakit.cxx
index 69a35b7f9f4b..2435aae33180 100644
--- a/sw/source/core/bastyp/breakit.cxx
+++ b/sw/source/core/bastyp/breakit.cxx
@@ -33,6 +33,7 @@
#include <unicode/uchar.h>
#include <com/sun/star/lang/XMultiServiceFactory.hpp>
#include <com/sun/star/i18n/ScriptType.hdl>
+#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
#include <unotools/localedatawrapper.hxx>
#include <editeng/unolingu.hxx>
@@ -169,4 +170,20 @@ sal_uInt16 SwBreakIt::GetAllScriptsOfText( const String& rTxt ) const
return nRet;
}
+// Counts the user-perceived characters (UAX#29 grapheme clusters) in rText.
+//
+// The text is walked one cell at a time with the break iterator's
+// nextCharacters() in SKIPCELL mode, using a default-constructed locale; each
+// step counts as one grapheme. An empty rText yields 0.
+//
+// NOTE(review): xBreak is dereferenced without a check here; presumably it is
+// always initialised before this is called - confirm against the callers.
+sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText) const
+{
+    // Loop invariants, hoisted out of the per-grapheme loop
+    const sal_Int32 nLen = rText.getLength();
+    const lang::Locale aLocale = lang::Locale();
+
+    sal_Int32 nGraphemeCount = 0;
+    sal_Int32 nCurPos = 0;
+    while (nCurPos < nLen)
+    {
+        // Ask for exactly one cell; the number of cells actually skipped is
+        // reported back through nDone, which is not needed for the count
+        sal_Int32 nDone = 0;
+        const sal_Int32 nNextPos = xBreak->nextCharacters(rText, nCurPos, aLocale,
+            i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+
+        // Termination guard: if the iterator ever fails to move forward, step
+        // over a single code unit instead of looping forever on the same spot
+        nCurPos = (nNextPos > nCurPos) ? nNextPos : nCurPos + 1;
+        ++nGraphemeCount;
+    }
+
+    return nGraphemeCount;
+}
+
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx
index 0c70225b8eee..8fe6051f4fb8 100644
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -1862,12 +1862,13 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() ))
{
++nTmpWords;
- nTmpCharsExcludingSpaces += aScanner.GetLen();
+ nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
}
}
}
- nTmpChars = nExpandEnd - nExpandBegin - nNumOfMaskedChars;
+ nTmpChars = pBreakIt->getGraphemeCount(aExpandText.copy(nExpandBegin, nExpandEnd - nExpandBegin));
+ nTmpChars -= nNumOfMaskedChars;
// no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars
// nor for mid-word selection - set scanner bClip = true at creation
@@ -1889,10 +1890,10 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
while ( aScanner.NextWord() )
{
++nTmpWords;
- nTmpCharsExcludingSpaces += aScanner.GetLen();
+ nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord());
}
- nTmpChars += nNumStringLen;
+ nTmpChars = pBreakIt->getGraphemeCount(aNumString);
}
else if ( HasBullet() )
{