1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <oox/vml/vmlinputstream.hxx>
21 
22 #include <com/sun/star/io/IOException.hpp>
23 #include <com/sun/star/io/XTextInputStream2.hpp>
24 #include <map>
25 #include <string.h>
26 #include <rtl/strbuf.hxx>
27 #include <osl/diagnose.h>
28 #include <oox/helper/textinputstream.hxx>
29 
30 namespace oox::vml {
31 
32 using namespace ::com::sun::star::io;
33 using namespace ::com::sun::star::uno;
34 
35 namespace {
36 
lclFindCharacter(const char * pcBeg,const char * pcEnd,char cChar)37 const char* lclFindCharacter( const char* pcBeg, const char* pcEnd, char cChar )
38 {
39     sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
40     return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
41 }
42 
/** Returns true for the space character and for all control characters, i.e.
    for the byte values 0 to 32.

    The test is done on the unsigned byte value. Whether plain 'char' is signed
    is implementation-defined; with a signed 'char', a direct comparison would
    classify every byte above 0x7F (for example the NBSP byte 0xA0) as
    whitespace, because its value is negative.
 */
bool lclIsWhiteSpace( char cChar )
{
    return static_cast< unsigned char >( cChar ) <= 32;
}
47 
lclFindWhiteSpace(const char * pcBeg,const char * pcEnd)48 const char* lclFindWhiteSpace( const char* pcBeg, const char* pcEnd )
49 {
50     for( ; pcBeg < pcEnd; ++pcBeg )
51         if( lclIsWhiteSpace( *pcBeg ) )
52             return pcBeg;
53     return pcEnd;
54 }
55 
lclFindNonWhiteSpace(const char * pcBeg,const char * pcEnd)56 const char* lclFindNonWhiteSpace( const char* pcBeg, const char* pcEnd )
57 {
58     for( ; pcBeg < pcEnd; ++pcBeg )
59         if( !lclIsWhiteSpace( *pcBeg ) )
60             return pcBeg;
61     return pcEnd;
62 }
63 
lclTrimWhiteSpaceFromEnd(const char * pcBeg,const char * pcEnd)64 const char* lclTrimWhiteSpaceFromEnd( const char* pcBeg, const char* pcEnd )
65 {
66     while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
67         --pcEnd;
68     return pcEnd;
69 }
70 
lclAppendToBuffer(OStringBuffer & rBuffer,const char * pcBeg,const char * pcEnd)71 void lclAppendToBuffer( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
72 {
73     rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
74 }
75 
lclProcessAttribs(OStringBuffer & rBuffer,const char * pcBeg,const char * pcEnd)76 void lclProcessAttribs( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
77 {
78     /*  Map attribute names to char-pointer of all attributes. This map is used
79         to find multiple occurrences of attributes with the same name. The
80         mapped pointers are used as map key in the next map below. */
81     typedef ::std::map< OString, const char* > AttributeNameMap;
82     AttributeNameMap aAttributeNames;
83 
84     /*  Map the char-pointers of all attributes to the full attribute definition
85         string. This preserves the original order of the used attributes. */
86     typedef ::std::map< const char*, OString > AttributeDataMap;
87     AttributeDataMap aAttributes;
88 
89     bool bOk = true;
90     const char* pcNameBeg = pcBeg;
91     while( bOk && (pcNameBeg < pcEnd) )
92     {
93         // pcNameBeg points to begin of attribute name, find equality sign
94         const char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
95         bOk = (pcEqualSign < pcEnd);
96         if (bOk)
97         {
98             // find end of attribute name (ignore whitespace between name and equality sign)
99             const char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
100             bOk = (pcNameBeg < pcNameEnd);
101             if( bOk )
102             {
103                 // find begin of attribute value (must be single or double quote)
104                 const char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
105                 bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'));
106                 if( bOk )
107                 {
108                     // find end of attribute value (matching quote character)
109                     const char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
110                     bOk = (pcValueEnd < pcEnd);
111                     if( bOk )
112                     {
113                         ++pcValueEnd;
114                         OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
115                         OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
116                         // search for an existing attribute with the same name
117                         AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
118                         // remove its definition from the data map
119                         if( aIt != aAttributeNames.end() )
120                             aAttributes.erase( aIt->second );
121                         // insert the attribute into both maps
122                         aAttributeNames[ aAttribName ] = pcNameBeg;
123                         aAttributes[ pcNameBeg ] = aAttribData;
124                         // continue with next attribute (skip whitespace after this attribute)
125                         pcNameBeg = pcValueEnd;
126                         if( pcNameBeg < pcEnd )
127                         {
128                             bOk = lclIsWhiteSpace( *pcNameBeg );
129                             if( bOk )
130                                 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
131                         }
132                     }
133                 }
134             }
135         }
136     }
137 
138     // if no error has occurred, build the resulting attribute list
139     if( bOk )
140         for (auto const& attrib : aAttributes)
141             rBuffer.append( ' ' ).append( attrib.second );
142     // on error, just append the complete passed string
143     else
144         lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
145 }
146 
lclProcessElement(OStringBuffer & rBuffer,const OString & rElement)147 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
148 {
149     // check that passed string starts and ends with the brackets of an XML element
150     sal_Int32 nElementLen = rElement.getLength();
151     if( nElementLen == 0 )
152         return;
153 
154     const char* pcOpen = rElement.getStr();
155     const char* pcClose = pcOpen + nElementLen - 1;
156 
157     // no complete element found
158     if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
159     {
160         // just append all passed characters
161         rBuffer.append( rElement );
162     }
163 
164     // skip parser instructions: '<![...]>'
165     else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
166     {
167         // do nothing
168     }
169 
170     // just append any xml prolog (text directive) or processing instructions: <?...?>
171     else if( (nElementLen >= 4) && (pcOpen[ 1 ] == '?') && (pcClose[ -1 ] == '?') )
172     {
173         rBuffer.append( rElement );
174     }
175 
176     // replace '<br>' element with newline
177     else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
178     {
179         rBuffer.append( '\n' );
180     }
181 
182     // check start elements and simple elements for repeated attributes
183     else if( pcOpen[ 1 ] != '/' )
184     {
185         // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
186         const char* pcContentBeg = pcOpen + 1;
187         bool bIsEmptyElement = pcClose[ -1 ] == '/';
188         const char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
189         // append opening bracket and element name to buffer
190         const char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
191         lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
192         // find begin of attributes, and process all attributes
193         const char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
194         if( pcAttribBeg < pcContentEnd )
195             lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
196         // close the element
197         if( bIsEmptyElement )
198             rBuffer.append( '/' );
199         rBuffer.append( '>' );
200     }
201 
202     // append end elements without further processing
203     else
204     {
205         rBuffer.append( rElement );
206     }
207 }
208 
lclProcessCharacters(OStringBuffer & rBuffer,const OString & rChars)209 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
210 {
211     /*  MSO has a very weird way to store and handle whitespaces. The stream
212         may contain lots of spaces, tabs, and newlines which have to be handled
213         as single space character. This will be done in this function.
214 
215         If the element text contains a literal line break, it will be stored as
216         <br> tag (without matching </br> element). This input stream wrapper
217         will replace this element with a literal LF character (see below).
218 
219         A single space character for its own is stored as is. Example: The
220         element
221             <font> </font>
222         represents a single space character. The XML parser will ignore this
223         space character completely without issuing a 'characters' event. The
224         VML import filter implementation has to react on this case manually.
225 
226         A single space character following another character is stored
227         literally and must not be stripped away here. Example: The element
228             <font>abc </font>
229         contains the three letters a, b, and c, followed by a space character.
230 
231         Consecutive space characters, or a leading single space character, are
232         stored in a <span> element. If there are N space characters (N > 1),
233         then the <span> element contains exactly (N-1) NBSP (non-breaking
234         space) characters, followed by a regular space character. Examples:
235         The element
236             <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
237         represents 4 consecutive space characters. Has to be handled by the
238         implementation. The element
239             <font><span style='mso-spacerun:yes'> abc</span></font>
240         represents a space characters followed by the letters a, b, c. These
241         strings have to be handled by the VML import filter implementation.
242      */
243 
244     // passed string ends with the leading opening bracket of an XML element
245     const char* pcBeg = rChars.getStr();
246     const char* pcEnd = pcBeg + rChars.getLength();
247     bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
248     if( bHasBracket ) --pcEnd;
249 
250     // skip leading whitespace
251     const char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
252     while( pcContentsBeg < pcEnd )
253     {
254         const char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
255         lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
256         if( pcWhitespaceBeg < pcEnd )
257             rBuffer.append( ' ' );
258         pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
259     }
260 
261     return bHasBracket;
262 }
263 
264 } // namespace
265 
266 constexpr OStringLiteral gaOpeningCData( "<![CDATA[" );
267 constexpr OStringLiteral gaClosingCData( "]]>" );
268 
InputStream(const Reference<XComponentContext> & rxContext,const Reference<XInputStream> & rxInStrm)269 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
270     // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
271     mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
272     maOpeningBracket( 1 ),
273     maClosingBracket( 1 ),
274     mnBufferPos( 0 )
275 {
276     if (!mxTextStrm.is())
277         throw IOException();
278     maOpeningBracket[ 0 ] = '<';
279     maClosingBracket[ 0 ] = '>';
280 }
281 
~InputStream()282 InputStream::~InputStream()
283 {
284 }
285 
readBytes(Sequence<sal_Int8> & rData,sal_Int32 nBytesToRead)286 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
287 {
288     if( nBytesToRead < 0 )
289         throw IOException();
290 
291     rData.realloc( nBytesToRead );
292     sal_Int8* pcDest = rData.getArray();
293     sal_Int32 nRet = 0;
294     while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
295     {
296         updateBuffer();
297         sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
298         if( nReadSize > 0 )
299         {
300             memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
301             mnBufferPos += nReadSize;
302             nBytesToRead -= nReadSize;
303             nRet += nReadSize;
304         }
305     }
306     if( nRet < rData.getLength() )
307         rData.realloc( nRet );
308     return nRet;
309 }
310 
readSomeBytes(Sequence<sal_Int8> & rData,sal_Int32 nMaxBytesToRead)311 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
312 {
313     return readBytes( rData, nMaxBytesToRead );
314 }
315 
skipBytes(sal_Int32 nBytesToSkip)316 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
317 {
318     if( nBytesToSkip < 0 )
319         throw IOException();
320 
321     while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
322     {
323         updateBuffer();
324         sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
325         mnBufferPos += nSkipSize;
326         nBytesToSkip -= nSkipSize;
327     }
328 }
329 
available()330 sal_Int32 SAL_CALL InputStream::available()
331 {
332     updateBuffer();
333     return maBuffer.getLength() - mnBufferPos;
334 }
335 
closeInput()336 void SAL_CALL InputStream::closeInput()
337 {
338     mxTextStrm->closeInput();
339 }
340 
341 // private --------------------------------------------------------------------
342 
updateBuffer()343 void InputStream::updateBuffer()
344 {
345     while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
346     {
347         // collect new contents in a string buffer
348         OStringBuffer aBuffer;
349 
350         // read and process characters until the opening bracket of the next XML element
351         OString aChars = readToElementBegin();
352         bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
353 
354         // read and process characters until (and including) closing bracket (an XML element)
355         OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
356         if( bHasOpeningBracket && !mxTextStrm->isEOF() )
357         {
358             // read the element text (add the leading opening bracket manually)
359             OString aElement = "<" + readToElementEnd();
360             // check for CDATA part, starting with '<![CDATA['
361             if( aElement.match( gaOpeningCData ) )
362             {
363                 // search the end tag ']]>'
364                 while( ((aElement.getLength() < gaClosingCData.getLength()) || !aElement.endsWith( gaClosingCData )) && !mxTextStrm->isEOF() )
365                     aElement += readToElementEnd();
366                 // copy the entire CDATA part
367                 aBuffer.append( aElement );
368             }
369             else
370             {
371                 // no CDATA part - process the contents of the element
372                 lclProcessElement( aBuffer, aElement );
373             }
374         }
375 
376         maBuffer = aBuffer.makeStringAndClear();
377         mnBufferPos = 0;
378     }
379 }
380 
readToElementBegin()381 OString InputStream::readToElementBegin()
382 {
383     return OUStringToOString( mxTextStrm->readString( maOpeningBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
384 }
385 
readToElementEnd()386 OString InputStream::readToElementEnd()
387 {
388     OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
389     OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
390     return aText;
391 }
392 
393 } // namespace oox::vml
394 
395 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
396