Implement PreserveSpaces boolean HTML/ReqIF export filter option

This option changes how HTML/ReqIF export handles paragraphs with leading/trailing spaces or multiple sequential spaces. Normally, export may insert newlines every ~256 characters in place of normal space characters; this relies on the default processing of spaces, where leading/trailing spaces are trimmed and runs of spaces are reduced to a single space. When PreserveSpaces is true, HTML/ReqIF export takes care not to alter spaces inside paragraphs. To do that, it checks whether paragraphs contain sequences of spaces that would normally be reduced; for those paragraphs, it adds "white-space: pre-wrap" to the style (in HTML), or the 'xml:space="preserve"' attribute (in ReqIF). Import of the 'xml:space' attribute and the "white-space: pre-wrap" style is implemented; when a paragraph has these, the spaces read from HTML/ReqIF are kept intact. Import does not currently support this attribute/style in elements other than 'p'.

Change-Id: I62dba5eaf313b965bf37d8fa5e3f5bbb8f5e8357
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/158362
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
author: Mike Kaganski <mike.kaganski@collabora.com> 2023-10-23 19:52:14 +0300
committer: Mike Kaganski <mike.kaganski@collabora.com> 2023-10-23 22:10:06 +0200
commit: 926826e40955175a8c115472e0d2f6c7f2f1a453 (patch)
tree: ba914ce7dcb96fa5f8ddb36a1b40e8d5bc1bb805 /svtools
parent: bae0736bf0ec54828766c3d903e2a27458643395 (diff)
2 files changed, 34 insertions, 26 deletions
diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx
index 5f81b3e3ca30..d1b0ea2ee03e 100644
--- a/svtools/source/svhtml/htmlkywd.cxx
+++ b/svtools/source/svhtml/htmlkywd.cxx
@@ -599,6 +599,7 @@ static HTML_OptionEntry aHTMLOptionTab[] = {
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_O_valign),        HtmlOptionId::VALIGN},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_O_valuetype), HtmlOptionId::VALUETYPE},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_O_wrap),      HtmlOptionId::WRAP},
+    {std::u16string_view(u"" OOO_STRING_SVTOOLS_XHTML_O_xml_space), HtmlOptionId::XML_SPACE},
 
 // Attributes with script code value
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_O_onblur),        HtmlOptionId::ONBLUR}, // JavaScript
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index 7e8ac63fc61e..d94a24632779 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -377,9 +377,14 @@ namespace {
 
 constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; }
 
+constexpr bool HTML_ISSPACE(sal_uInt32 c)
+{
+    return ' ' == c || '\t' == c || '\r' == c || '\n' == c || '\x0b' == c;
+}
+
 }
 
-HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
+HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak)
 {
     OUStringBuffer sTmpBuffer( MAX_LEN );
     bool bContinue = true;
@@ -705,37 +710,39 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
             {
                 break;
             }
-            nNextCh = ' ';
+            if (!m_bPreserveSpaces)
+                nNextCh = ' ';
             [[fallthrough]];
         case ' ':
-            sTmpBuffer.appendUtf32( nNextCh );
-            if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
-                                !bReadPRE && !bReadTextArea) )
+            if (!m_bPreserveSpaces)
             {
-                // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
-                do {
-                    nNextCh = GetNextChar();
-                    if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
+                sTmpBuffer.appendUtf32(nNextCh);
+                if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
+                {
+                    // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
+                    do
                     {
-                        if( !aToken.isEmpty() || sTmpBuffer.getLength() > 1 )
+                        nNextCh = GetNextChar();
+                        if (sal_Unicode(EOF) == nNextCh && rInput.eof())
                         {
-                            // Have seen s.th. aside from blanks?
-                            aToken.append( sTmpBuffer );
-                            sTmpBuffer.setLength(0);
-                            return HtmlTokenId::TEXTTOKEN;
+                            if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
+                            {
+                                // Have seen s.th. aside from blanks?
+                                aToken.append(sTmpBuffer);
+                                sTmpBuffer.setLength(0);
+                                return HtmlTokenId::TEXTTOKEN;
+                            }
+                            else
+                                // Only read blanks: no text must be returned
+                                // and GetNextToken_ has to read until EOF
+                                return HtmlTokenId::NONE;
                         }
-                        else
-                            // Only read blanks: no text must be returned
-                            // and GetNextToken_ has to read until EOF
-                            return HtmlTokenId::NONE;
-                    }
-                } while ( ' ' == nNextCh || '\t' == nNextCh ||
-                          '\r' == nNextCh || '\n' == nNextCh ||
-                          '\x0b' == nNextCh );
-                bNextCh = false;
+                    } while (HTML_ISSPACE(nNextCh));
+                    bNextCh = false;
+                }
+                break;
             }
-            break;
-
+            [[fallthrough]];
         default:
             bEqSignFound = false;
             if (nNextCh == cBreak && !cQuote)
@@ -743,7 +750,7 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
             else
             {
                 do {
-                    if (!linguistic::IsControlChar(nNextCh))
+                    if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
                     {
                     // All remaining characters make their way into the text.
                         sTmpBuffer.appendUtf32( nNextCh );
author	Mike Kaganski <mike.kaganski@collabora.com>	2023-10-23 19:52:14 +0300
committer	Mike Kaganski <mike.kaganski@collabora.com>	2023-10-23 22:10:06 +0200
commit	926826e40955175a8c115472e0d2f6c7f2f1a453 (patch)
tree	ba914ce7dcb96fa5f8ddb36a1b40e8d5bc1bb805 /svtools
parent	bae0736bf0ec54828766c3d903e2a27458643395 (diff)