diff options
author | Caolán McNamara <caolanm@redhat.com> | 2011-11-03 11:00:38 +0000 |
---|---|---|
committer | Caolán McNamara <caolanm@redhat.com> | 2011-11-03 12:18:17 +0000 |
commit | 12db5315fca413ae66e88c4cd8212ee3b01667b7 (patch) | |
tree | 2e8abe3a62ee41a39b3a7aeaf261923f02bc41f6 /sw | |
parent | 56d5a48099165dabb72eb3a2655c2eb503684f1c (diff) |
Follow UAX-29 and present user-perceived character counts
Present neither a count of code units nor a count of code points; instead, try to follow
http://unicode.org/reports/tr29/ and present the grapheme (user-perceived character) count.
Add a few representative tests to try and avoid gotchas.
Diffstat (limited to 'sw')
-rw-r--r-- | sw/inc/breakit.hxx | 4 | ||||
-rw-r--r-- | sw/qa/core/swdoc-test.cxx | 40 | ||||
-rw-r--r-- | sw/source/core/bastyp/breakit.cxx | 17 | ||||
-rw-r--r-- | sw/source/core/txtnode/txtedt.cxx | 9 |
4 files changed, 57 insertions, 13 deletions
diff --git a/sw/inc/breakit.hxx b/sw/inc/breakit.hxx index ae05468aa16b..3075fc9dfe63 100644 --- a/sw/inc/breakit.hxx +++ b/sw/inc/breakit.hxx @@ -110,7 +110,9 @@ public: } sal_uInt16 GetRealScriptOfText( const String& rTxt, xub_StrLen nPos ) const; - sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const; + sal_uInt16 GetAllScriptsOfText( const String& rTxt ) const; + + sal_Int32 getGraphemeCount(const rtl::OUString& rStr) const; }; #define SW_BREAKITER() SwBreakIt::Get() diff --git a/sw/qa/core/swdoc-test.cxx b/sw/qa/core/swdoc-test.cxx index 3d27fa071d39..d6f145f0d9bf 100644 --- a/sw/qa/core/swdoc-test.cxx +++ b/sw/qa/core/swdoc-test.cxx @@ -41,19 +41,20 @@ #include <sfx2/docfile.hxx> #include <sfx2/sfxmodelfactory.hxx> -#include "init.hxx" -#include "swtypes.hxx" -#include "docstat.hxx" +#include "breakit.hxx" #include "doc.hxx" -#include "ndtxt.hxx" #include "docsh.hxx" -#include "shellres.hxx" +#include "docstat.hxx" #include "docufld.hxx" #include "fmtanchr.hxx" -#include "swscanner.hxx" +#include "init.hxx" +#include "ndtxt.hxx" +#include "shellio.hxx" +#include "shellres.hxx" #include "swcrsr.hxx" +#include "swscanner.hxx" #include "swmodule.hxx" -#include "shellio.hxx" +#include "swtypes.hxx" SO2_DECL_REF(SwDocShell) SO2_IMPL_REF(SwDocShell) @@ -73,14 +74,15 @@ public: void testFileNameFields(); void testDocStat(); void testSwScanner(); + void testUserPerceivedCharCount(); void testGraphicAnchorDeletion(); CPPUNIT_TEST_SUITE(SwDocTest); CPPUNIT_TEST(randomTest); CPPUNIT_TEST(testPageDescName); CPPUNIT_TEST(testFileNameFields); - CPPUNIT_TEST(testDocStat); CPPUNIT_TEST(testSwScanner); + CPPUNIT_TEST(testUserPerceivedCharCount); CPPUNIT_TEST(testGraphicAnchorDeletion); CPPUNIT_TEST_SUITE_END(); @@ -189,6 +191,28 @@ void SwDocTest::testDocStat() CPPUNIT_ASSERT_MESSAGE("And cache is updated too", m_pDoc->GetDocStat().nChar == nLen); } +//For UI character counts we should follow UAX#29 and display the user +//perceived characters, not the number of 
codepoints, nor the number of code +//units http://unicode.org/reports/tr29/ +void SwDocTest::testUserPerceivedCharCount() +{ + SwBreakIt *pBreakIter = SwBreakIt::Get(); + + //Grapheme example, two different unicode code-points perceived by the user as a single + //glyph + const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 }; + ::rtl::OUString sALEF_QAMATS(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS)); + sal_Int32 nGraphemeCount = pBreakIter->getGraphemeCount(sALEF_QAMATS); + CPPUNIT_ASSERT_MESSAGE("Grapheme Count should be 1", nGraphemeCount == 1); + + //Surrogate pair example, one single unicode code-point (U+1D11E) + //represented as two code units in UTF-16 + const sal_Unicode GCLEF[] = { 0xD834, 0xDD1E }; + ::rtl::OUString sGCLEF(GCLEF, SAL_N_ELEMENTS(GCLEF)); + sal_Int32 nCount = pBreakIter->getGraphemeCount(sGCLEF); + CPPUNIT_ASSERT_MESSAGE("Surrogate Pair should be counted as single character", nCount == 1); +} + //See https://bugs.freedesktop.org/show_bug.cgi?id=40449 for motivation void SwDocTest::testSwScanner() { diff --git a/sw/source/core/bastyp/breakit.cxx b/sw/source/core/bastyp/breakit.cxx index 69a35b7f9f4b..2435aae33180 100644 --- a/sw/source/core/bastyp/breakit.cxx +++ b/sw/source/core/bastyp/breakit.cxx @@ -33,6 +33,7 @@ #include <unicode/uchar.h> #include <com/sun/star/lang/XMultiServiceFactory.hpp> #include <com/sun/star/i18n/ScriptType.hdl> +#include <com/sun/star/i18n/CharacterIteratorMode.hpp> #include <unotools/localedatawrapper.hxx> #include <editeng/unolingu.hxx> @@ -169,4 +170,20 @@ sal_uInt16 SwBreakIt::GetAllScriptsOfText( const String& rTxt ) const return nRet; } +sal_Int32 SwBreakIt::getGraphemeCount(const rtl::OUString& rText) const +{ + sal_Int32 nGraphemeCount = 0; + + sal_Int32 nCurPos = 0; + while (nCurPos < rText.getLength()) + { + sal_Int32 nCount2 = 1; + nCurPos = xBreak->nextCharacters(rText, nCurPos, lang::Locale(), + i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2); + ++nGraphemeCount; + } + + return nGraphemeCount; 
+} + /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx index 0c70225b8eee..8fe6051f4fb8 100644 --- a/sw/source/core/txtnode/txtedt.cxx +++ b/sw/source/core/txtnode/txtedt.cxx @@ -1862,12 +1862,13 @@ void SwTxtNode::CountWords( SwDocStat& rStat, if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() )) { ++nTmpWords; - nTmpCharsExcludingSpaces += aScanner.GetLen(); + nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord()); } } } - nTmpChars = nExpandEnd - nExpandBegin - nNumOfMaskedChars; + nTmpChars = pBreakIt->getGraphemeCount(aExpandText.copy(nExpandBegin, nExpandEnd - nExpandBegin)); + nTmpChars -= nNumOfMaskedChars; // no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars // nor for mid-word selection - set scanner bClip = true at creation @@ -1889,10 +1890,10 @@ void SwTxtNode::CountWords( SwDocStat& rStat, while ( aScanner.NextWord() ) { ++nTmpWords; - nTmpCharsExcludingSpaces += aScanner.GetLen(); + nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(aScanner.GetWord()); } - nTmpChars += nNumStringLen; + nTmpChars = pBreakIt->getGraphemeCount(aNumString); } else if ( HasBullet() ) { |