vcl/osx/HtmlFmtFlt.cxx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include "HtmlFmtFlt.hxx"

#include <rtl/string.h>
#include <osl/diagnose.h>

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>

using namespace com::sun::star::uno;

// converts the openoffice text/html clipboard format to the HTML Format
// well known under MS Windows
// the MS HTML Format has a header before the real html data

// Version:1.0      Version number of the clipboard. Starting version is 0.9
// StartHTML:       Byte count from the beginning of the clipboard to the start
//                  of the context, or -1 if no context
// EndHTML:         Byte count from the beginning of the clipboard to the end
//                  of the context, or -1 if no context
// StartFragment:   Byte count from the beginning of the clipboard to the
//                  start of the fragment
// EndFragment:     Byte count from the beginning of the clipboard to the
//                  end of the fragment
// StartSelection:  Byte count from the beginning of the clipboard to the
//                  start of the selection
// EndSelection:    Byte count from the beginning of the clipboard to the
//                  end of the selection

// StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between "!--" and
// the text)

namespace
{
/** Builds the textual header that precedes the html data in the MS
    "HTML Format" clipboard flavor.

    Every line is terminated by CR LF. Each offset is written as a decimal
    number that is left-padded with '0' to a minimum of ten digits.

    @param startHtml      value written after "StartHTML:"
    @param endHtml        value written after "EndHTML:"
    @param startFragment  value written after "StartFragment:"
    @param endFragment    value written after "EndFragment:"

    @return the complete header as a string
*/
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    std::ostringstream out;

    // fill character and number base are sticky stream state, so setting
    // them once is enough; the field width is not and is set per number
    out << std::setfill('0') << std::dec;

    const auto writeOffset = [&out](const char* key, size_t offset)
    {
        out << key << std::setw(10) << offset << "\r\n";
    };

    out << "Version:1.0" << "\r\n";
    writeOffset("StartHTML:", startHtml);
    writeOffset("EndHTML:", endHtml);
    writeOffset("StartFragment:", startFragment);
    writeOffset("EndFragment:", endFragment);

    return out.str();
}

}

// Markers of the outer html element. They are looked up verbatim in the
// text/html data, i.e. without parameters and without embedded spaces.
// NOTE(review): the literals are lower case although this comment used to
// say the office writes these tags in upper case - confirm what is exported.
const std::string TAG_HTML("<html>");
const std::string TAG_END_HTML("</html>");

// The body tag may carry parameters, e.g. <BODY param> (#92840#). Hence
// only the beginning of the tag is stored here and the closing '>' has to
// be searched for separately.
const std::string TAG_BODY("<body");
const std::string TAG_END_BODY("</body");

/** Converts text/html clipboard data into the MS Windows "HTML Format".

    The result consists of the header produced by GetHtmlFormatHeader,
    followed by the unmodified input and one trailing '\0' byte. The header
    offsets are byte counts from the beginning of the result.

    When a tag cannot be found, the corresponding offset falls back to a
    sensible boundary instead of being computed from std::string::npos
    (which would silently wrap around to a meaningless value):
     - StartHTML      -> start of the html data
     - EndHTML        -> end of the html data
     - StartFragment  -> directly behind <html>, or start of the html data
     - EndFragment    -> start of </html>, or end of the html data

    @param aTextHtml  the text/html data; must not be empty

    @return the data in HTML Format, or an empty sequence if aTextHtml is
            empty
*/
Sequence<sal_Int8> TextHtmlToHTMLFormat(Sequence<sal_Int8> const & aTextHtml)
{
    OSL_ASSERT(aTextHtml.getLength() > 0);

    if (aTextHtml.getLength() <= 0)
        return Sequence<sal_Int8>();

    // the offsets are zero padded to ten digits, so a header built from
    // dummy values has the same length as the final one
    const size_t lHtmlFormatHeader = GetHtmlFormatHeader(0, 0, 0, 0).length();

    const sal_Char* pText = reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray());
    const std::string textHtml(pText, pText + aTextHtml.getLength());

    const std::string::size_type npos = std::string::npos;

    // offsets of the first and one-past-the-last byte of the html data
    const std::string::size_type nDataStart = lHtmlFormatHeader;
    const std::string::size_type nDataEnd = lHtmlFormatHeader + textHtml.length();

    const std::string::size_type posHtml = textHtml.find(TAG_HTML);
    const std::string::size_type posEndHtml = textHtml.find(TAG_END_HTML);

    // The body tag may have parameters so we need to search for the
    // closing '>' manually e.g. <BODY param> #92840#
    const std::string::size_type posBody = textHtml.find(TAG_BODY);
    const std::string::size_type posBodyClose = (posBody == npos) ? npos : textHtml.find('>', posBody);
    const std::string::size_type posEndBody = textHtml.find(TAG_END_BODY);

    // we start one before '<HTML>' Word 2000 does also so
    const std::string::size_type nStartHtml =
        (posHtml != npos) ? posHtml + lHtmlFormatHeader - 1 : nDataStart;

    // our SOffice 5.2 wants 2 behind </HTML>?
    const std::string::size_type nEndHtml =
        (posEndHtml != npos) ? posEndHtml + lHtmlFormatHeader + TAG_END_HTML.length() + 1 : nDataEnd;

    std::string::size_type nStartFragment = nDataStart;
    if (posBodyClose != npos)
        nStartFragment = posBodyClose + lHtmlFormatHeader + 1; // first byte behind <body ...>
    else if (posHtml != npos)
        nStartFragment = posHtml + TAG_HTML.length() + lHtmlFormatHeader;

    std::string::size_type nEndFragment = nDataEnd;
    if (posEndBody != npos)
        nEndFragment = posEndBody + lHtmlFormatHeader;
    else if (posEndHtml != npos)
        nEndFragment = posEndHtml + lHtmlFormatHeader;

    std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
    htmlFormat += textHtml;

    // one extra byte for the trailing '\0'; c_str() is guaranteed to provide
    // it, so it is simply copied along with the payload
    const size_t nBytes = htmlFormat.length() + 1;
    Sequence<sal_Int8> byteSequence(static_cast<sal_Int32>(nBytes));

    memcpy(
        static_cast<void*>(byteSequence.getArray()),
        static_cast<const void*>(htmlFormat.c_str()),
        nBytes);

    return byteSequence;
}

// beginning of the opening html tag; matched without regard to ASCII case
const char* const HtmlStartTag = "<html";

/** Strips the MS "HTML Format" header from clipboard data.

    Everything in front of the first occurrence of HtmlStartTag is removed.
    The tag is searched for ASCII case-insensitively and strictly within the
    bounds of the sequence, so the input does not have to be '\0' terminated.
    A single trailing '\0' byte, if present, is not part of the result.

    @param aHTMLFormat  data for which isHTMLFormat is expected to be true
                        (asserted in debug builds)

    @return the html data starting at HtmlStartTag, or aHTMLFormat unchanged
            if no such tag could be found
*/
Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
{
  assert(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");

  // read-only access is all that is needed, so the const input is not
  // cast away and no writable array is requested from it
  const sal_Char* const dataStart = reinterpret_cast<const sal_Char*>(aHTMLFormat.getConstArray());
  const sal_Char* dataEnd = dataStart + aHTMLFormat.getLength();

  // drop the terminator only if there really is one
  if (dataEnd != dataStart && *(dataEnd - 1) == '\0')
    --dataEnd;

  const sal_Char* const tagStart = HtmlStartTag;
  const sal_Char* const tagEnd = tagStart + strlen(tagStart);

  const auto toAsciiLower = [](sal_Char c) -> sal_Char
    {
      return (c >= 'A' && c <= 'Z') ? static_cast<sal_Char>(c - 'A' + 'a') : c;
    };

  // bounded search; yields dataEnd when the tag is not contained
  const sal_Char* const htmlStartTag = std::search(
    dataStart, dataEnd, tagStart, tagEnd,
    [&toAsciiLower](sal_Char lhs, sal_Char rhs) { return toAsciiLower(lhs) == toAsciiLower(rhs); });

  assert(htmlStartTag != dataEnd && "Seems to be no HTML at all");

  // It doesn't seem to be HTML? Well then simply return what has been
  // provided in non-debug builds
  if (htmlStartTag == dataEnd)
    {
    return aHTMLFormat;
    }

  const sal_Int32 len = static_cast<sal_Int32>(dataEnd - htmlStartTag);
  Sequence<sal_Int8> plainHtmlData(len);

  memcpy(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);

  return plainHtmlData;
}

/* A simple format detection. We are just comparing the first few bytes
   of the provided byte sequence to see whether or not it is the MS
   Office Html format. If it shows that this is not reliable enough we
   can improve this
*/
const char HtmlFormatStart[] = "Version:";
int const HtmlFormatStartLen = sizeof(HtmlFormatStart) - 1;

bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
{
  if (aHtmlSequence.getLength() < HtmlFormatStartLen)
    return false;

  return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
                                                   HtmlFormatStartLen,
                                                   reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
                                                   HtmlFormatStartLen) == 0;
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */