1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include "HtmlFmtFlt.hxx"
21 
22 #include <rtl/string.h>
23 #include <osl/diagnose.h>
24 
25 #include <string>
26 #include <sstream>
27 #include <vector>
28 #include <iomanip>
29 #include <cassert>
30 
31 using namespace com::sun::star::uno;
32 
33 // converts the openoffice text/html clipboard format to the HTML Format
34 // well known under MS Windows
35 // the MS HTML Format has a header before the real html data
36 
// Version:1.0      Version number of the clipboard. Starting version is 0.9
38 // StartHTML:       Byte count from the beginning of the clipboard to the start
39 //                  of the context, or -1 if no context
40 // EndHTML:         Byte count from the beginning of the clipboard to the end
41 //                  of the context, or -1 if no context
42 // StartFragment:   Byte count from the beginning of the clipboard to the
43 //                  start of the fragment
44 // EndFragment:     Byte count from the beginning of the clipboard to the
45 //                  end of the fragment
46 // StartSelection:  Byte count from the beginning of the clipboard to the
47 //                  start of the selection
48 // EndSelection:    Byte count from the beginning of the clipboard to the
49 //                  end of the selection
50 
51 // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)
55 
namespace
{

// Appends one "<key><offset>\r\n" line to 'header'.
//
// The offset is written as a decimal number that is left-padded with '0' to
// at least ten digits, so that the header has the same length no matter which
// (reasonably sized) offsets it carries. The digits are produced with
// std::to_string rather than a stream so that the global C++ locale (e.g. one
// with digit grouping) can never alter the header bytes.
void AppendOffsetField(std::string& header, const char* key, size_t offset)
{
    const std::string::size_type nFieldWidth = 10;

    std::string digits = std::to_string(offset);
    if (digits.length() < nFieldWidth)
        digits.insert(0, nFieldWidth - digits.length(), '0');

    header += key;
    header += digits;
    header += "\r\n";
}

// Builds the textual header that precedes the html data in the MS Windows
// "HTML Format" clipboard format.
//
// startHtml / endHtml:         byte offsets of the start / end of the context
// startFragment / endFragment: byte offsets of the start / end of the fragment
//
// Returns the header: a "Version:1.0" line followed by one line per offset,
// every line terminated by "\r\n".
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    std::string htmlHeader("Version:1.0\r\n");
    AppendOffsetField(htmlHeader, "StartHTML:", startHtml);
    AppendOffsetField(htmlHeader, "EndHTML:", endHtml);
    AppendOffsetField(htmlHeader, "StartFragment:", startFragment);
    AppendOffsetField(htmlHeader, "EndFragment:", endFragment);
    return htmlHeader;
}

}
70 
71 // the office always writes the start and end html tag in upper cases and
72 // without spaces both tags don't allow parameters
73 const std::string TAG_HTML("<html>");
74 const std::string TAG_END_HTML("</html>");
75 
76 // The body tag may have parameters so we need to search for the
77 // closing '>' manually e.g. <BODY param> #92840#
78 const std::string TAG_BODY("<body");
79 const std::string TAG_END_BODY("</body");
80 
TextHtmlToHTMLFormat(Sequence<sal_Int8> const & aTextHtml)81 Sequence<sal_Int8> TextHtmlToHTMLFormat(Sequence<sal_Int8> const & aTextHtml)
82 {
83     OSL_ASSERT(aTextHtml.getLength() > 0);
84 
85     if (aTextHtml.getLength() <= 0)
86         return Sequence<sal_Int8>();
87 
88     // fill the buffer with dummy values to calc the exact length
89     std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
90     size_t lHtmlFormatHeader = dummyHtmlHeader.length();
91 
92     std::string textHtml(
93         reinterpret_cast<const char*>(aTextHtml.getConstArray()),
94         reinterpret_cast<const char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
95 
96     std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
97     std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
98 
99     // The body tag may have parameters so we need to search for the
100     // closing '>' manually e.g. <BODY param> #92840#
101     std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
102     std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
103 
104     std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
105     htmlFormat += textHtml;
106 
107     Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
108     memset(byteSequence.getArray(), 0, byteSequence.getLength());
109 
110     memcpy(
111         static_cast<void*>(byteSequence.getArray()),
112         static_cast<const void*>(htmlFormat.c_str()),
113         htmlFormat.length());
114 
115     return byteSequence;
116 }
117 
118 const char* const HtmlStartTag = "<html";
119 
HTMLFormatToTextHtml(const Sequence<sal_Int8> & aHTMLFormat)120 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
121 {
122   assert(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
123 
124   Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
125   char* dataStart = reinterpret_cast<char*>(nonconstHTMLFormatRef.getArray());
126   char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
127   const char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
128 
129   assert(htmlStartTag && "Seems to be no HTML at all");
130 
131   // It doesn't seem to be HTML? Well then simply return what has been
132   // provided in non-debug builds
133   if (htmlStartTag == nullptr)
134     {
135     return aHTMLFormat;
136     }
137 
138   sal_Int32 len = dataEnd - htmlStartTag;
139   Sequence<sal_Int8> plainHtmlData(len);
140 
141   memcpy(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
142 
143   return plainHtmlData;
144 }
145 
146 /* A simple format detection. We are just comparing the first few bytes
147    of the provided byte sequence to see whether or not it is the MS
148    Office Html format. If it shows that this is not reliable enough we
149    can improve this
150 */
151 const char HtmlFormatStart[] = "Version:";
152 int const HtmlFormatStartLen = sizeof(HtmlFormatStart) - 1;
153 
isHTMLFormat(const Sequence<sal_Int8> & aHtmlSequence)154 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
155 {
156   if (aHtmlSequence.getLength() < HtmlFormatStartLen)
157     return false;
158 
159   return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
160                                                    HtmlFormatStartLen,
161                                                    reinterpret_cast<const char*>(aHtmlSequence.getConstArray()),
162                                                    HtmlFormatStartLen) == 0;
163 }
164 
165 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
166