LibreOffice Module oox (master) 1
vmlinputstream.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
21
22#include <com/sun/star/io/IOException.hpp>
23#include <com/sun/star/io/XTextInputStream2.hpp>
24#include <map>
25#include <string.h>
26#include <rtl/strbuf.hxx>
27#include <osl/diagnose.h>
29
30namespace oox::vml {
31
32using namespace ::com::sun::star::io;
33using namespace ::com::sun::star::uno;
34
35namespace {
36
37const char* lclFindCharacter( const char* pcBeg, const char* pcEnd, char cChar )
38{
39 sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
40 return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
41}
42
/** Returns true for every character with a code in the range 0..32, i.e. the
    space character and all control characters below it (including NUL). */
bool lclIsWhiteSpace( char cChar )
{
    // Viewed as unsigned, negative char values become codes >= 128 and are
    // rejected by the single upper-bound comparison.
    return static_cast< unsigned char >( cChar ) <= 32;
}
47
48const char* lclFindWhiteSpace( const char* pcBeg, const char* pcEnd )
49{
50 for( ; pcBeg < pcEnd; ++pcBeg )
51 if( lclIsWhiteSpace( *pcBeg ) )
52 return pcBeg;
53 return pcEnd;
54}
55
56const char* lclFindNonWhiteSpace( const char* pcBeg, const char* pcEnd )
57{
58 for( ; pcBeg < pcEnd; ++pcBeg )
59 if( !lclIsWhiteSpace( *pcBeg ) )
60 return pcBeg;
61 return pcEnd;
62}
63
64const char* lclTrimWhiteSpaceFromEnd( const char* pcBeg, const char* pcEnd )
65{
66 while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
67 --pcEnd;
68 return pcEnd;
69}
70
71void lclAppendToBuffer( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
72{
73 rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
74}
75
76void lclProcessAttribs( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
77{
78 /* Map attribute names to char-pointer of all attributes. This map is used
79 to find multiple occurrences of attributes with the same name. The
80 mapped pointers are used as map key in the next map below. */
81 typedef ::std::map< OString, const char* > AttributeNameMap;
82 AttributeNameMap aAttributeNames;
83
84 /* Map the char-pointers of all attributes to the full attribute definition
85 string. This preserves the original order of the used attributes. */
86 typedef ::std::map< const char*, OString > AttributeDataMap;
87 AttributeDataMap aAttributes;
88
89 bool bOk = true;
90 const char* pcNameBeg = pcBeg;
91 while( bOk && (pcNameBeg < pcEnd) )
92 {
93 // pcNameBeg points to begin of attribute name, find equality sign
94 const char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
95 bOk = (pcEqualSign < pcEnd);
96 if (bOk)
97 {
98 // find end of attribute name (ignore whitespace between name and equality sign)
99 const char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
100 bOk = (pcNameBeg < pcNameEnd);
101 if( bOk )
102 {
103 // find begin of attribute value (must be single or double quote)
104 const char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
105 bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'));
106 if( bOk )
107 {
108 // find end of attribute value (matching quote character)
109 const char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
110 bOk = (pcValueEnd < pcEnd);
111 if( bOk )
112 {
113 ++pcValueEnd;
114 OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
115 OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
116 // search for an existing attribute with the same name
117 AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
118 // remove its definition from the data map
119 if( aIt != aAttributeNames.end() )
120 aAttributes.erase( aIt->second );
121 // insert the attribute into both maps
122 aAttributeNames[ aAttribName ] = pcNameBeg;
123 aAttributes[ pcNameBeg ] = aAttribData;
124 // continue with next attribute (skip whitespace after this attribute)
125 pcNameBeg = pcValueEnd;
126 if( pcNameBeg < pcEnd )
127 {
128 bOk = lclIsWhiteSpace( *pcNameBeg );
129 if( bOk )
130 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
131 }
132 }
133 }
134 }
135 }
136 }
137
138 // if no error has occurred, build the resulting attribute list
139 if( bOk )
140 for (auto const& attrib : aAttributes)
141 rBuffer.append( ' ' ).append( attrib.second );
142 // on error, just append the complete passed string
143 else
144 lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
145}
146
147void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
148{
149 // check that passed string starts and ends with the brackets of an XML element
150 sal_Int32 nElementLen = rElement.getLength();
151 if( nElementLen == 0 )
152 return;
153
154 const char* pcOpen = rElement.getStr();
155 const char* pcClose = pcOpen + nElementLen - 1;
156
157 // no complete element found
158 if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
159 {
160 // just append all passed characters
161 rBuffer.append( rElement );
162 }
163
164 // skip parser instructions: '<![...]>'
165 else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
166 {
167 // do nothing
168 }
169
170 // just append any xml prolog (text directive) or processing instructions: <?...?>
171 else if( (nElementLen >= 4) && (pcOpen[ 1 ] == '?') && (pcClose[ -1 ] == '?') )
172 {
173 rBuffer.append( rElement );
174 }
175
176 // replace '<br>' element with newline
177 else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
178 {
179 rBuffer.append( '\n' );
180 }
181
182 // check start elements and simple elements for repeated attributes
183 else if( pcOpen[ 1 ] != '/' )
184 {
185 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
186 const char* pcContentBeg = pcOpen + 1;
187 bool bIsEmptyElement = pcClose[ -1 ] == '/';
188 const char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
189 // append opening bracket and element name to buffer
190 const char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
191 lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
192 // find begin of attributes, and process all attributes
193 const char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
194 if( pcAttribBeg < pcContentEnd )
195 lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
196 // close the element
197 if( bIsEmptyElement )
198 rBuffer.append( '/' );
199 rBuffer.append( '>' );
200 }
201
202 // append end elements without further processing
203 else
204 {
205 rBuffer.append( rElement );
206 }
207}
208
209bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
210{
211 /* MSO has a very weird way to store and handle whitespaces. The stream
212 may contain lots of spaces, tabs, and newlines which have to be handled
213 as single space character. This will be done in this function.
214
215 If the element text contains a literal line break, it will be stored as
216 <br> tag (without matching </br> element). This input stream wrapper
217 will replace this element with a literal LF character (see below).
218
219 A single space character for its own is stored as is. Example: The
220 element
221 <font> </font>
222 represents a single space character. The XML parser will ignore this
223 space character completely without issuing a 'characters' event. The
224 VML import filter implementation has to react on this case manually.
225
226 A single space character following another character is stored
227 literally and must not be stripped away here. Example: The element
228 <font>abc </font>
229 contains the three letters a, b, and c, followed by a space character.
230
231 Consecutive space characters, or a leading single space character, are
232 stored in a <span> element. If there are N space characters (N > 1),
233 then the <span> element contains exactly (N-1) NBSP (non-breaking
234 space) characters, followed by a regular space character. Examples:
235 The element
236 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
237 represents 4 consecutive space characters. Has to be handled by the
238 implementation. The element
239 <font><span style='mso-spacerun:yes'> abc</span></font>
240 represents a space characters followed by the letters a, b, c. These
241 strings have to be handled by the VML import filter implementation.
242 */
243
244 // passed string ends with the leading opening bracket of an XML element
245 const char* pcBeg = rChars.getStr();
246 const char* pcEnd = pcBeg + rChars.getLength();
247 bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
248 if( bHasBracket ) --pcEnd;
249
250 // skip leading whitespace
251 const char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
252 while( pcContentsBeg < pcEnd )
253 {
254 const char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
255 lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
256 if( pcWhitespaceBeg < pcEnd )
257 rBuffer.append( ' ' );
258 pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
259 }
260
261 return bHasBracket;
262}
263
264} // namespace
265
// Start marker of a CDATA section; InputStream::updateBuffer() tests whether
// an element text begins with it.
constexpr OStringLiteral gaOpeningCData( "<![CDATA[" );
// End marker of a CDATA section; InputStream::updateBuffer() keeps reading
// until the collected text ends with it (or the stream is at EOF).
constexpr OStringLiteral gaClosingCData( "]]>" );
268
/** Creates the wrapper around the passed input stream.

    The stream is wrapped into a text input stream created via
    TextInputStream::createXTextInputStream(); the delimiter members used to
    read up to the next '<' or '>' are initialized here as well.

    @throws IOException  if no text input stream could be created.
 */
InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
    // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
    mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
    maOpeningBracket{ '<' },
    maClosingBracket{ '>' },
    mnBufferPos( 0 )
{
    // without a valid text stream none of the read functions can work
    if (!mxTextStrm.is())
        throw IOException();
}
279
// Nothing to release explicitly; the destructor body is intentionally empty.
InputStream::~InputStream()
{
}
283
284sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
285{
286 if( nBytesToRead < 0 )
287 throw IOException();
288
289 rData.realloc( nBytesToRead );
290 sal_Int8* pcDest = rData.getArray();
291 sal_Int32 nRet = 0;
292 while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
293 {
294 updateBuffer();
295 sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
296 if( nReadSize > 0 )
297 {
298 memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
299 mnBufferPos += nReadSize;
300 nBytesToRead -= nReadSize;
301 nRet += nReadSize;
302 }
303 }
304 if( nRet < rData.getLength() )
305 rData.realloc( nRet );
306 return nRet;
307}
308
309sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
310{
311 return readBytes( rData, nMaxBytesToRead );
312}
313
314void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
315{
316 if( nBytesToSkip < 0 )
317 throw IOException();
318
319 while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
320 {
321 updateBuffer();
322 sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
323 mnBufferPos += nSkipSize;
324 nBytesToSkip -= nSkipSize;
325 }
326}
327
328sal_Int32 SAL_CALL InputStream::available()
329{
330 updateBuffer();
331 return maBuffer.getLength() - mnBufferPos;
332}
333
/** Closes the stream by forwarding the request to the wrapped text input
    stream. The internal buffer is not touched. */
void SAL_CALL InputStream::closeInput()
{
    mxTextStrm->closeInput();
}
338
339// private --------------------------------------------------------------------
340
341void InputStream::updateBuffer()
342{
343 while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
344 {
345 // collect new contents in a string buffer
346 OStringBuffer aBuffer;
347
348 // read and process characters until the opening bracket of the next XML element
349 OString aChars = readToElementBegin();
350 bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
351
352 // read and process characters until (and including) closing bracket (an XML element)
353 OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
354 if( bHasOpeningBracket && !mxTextStrm->isEOF() )
355 {
356 // read the element text (add the leading opening bracket manually)
357 OString aElement = "<" + readToElementEnd();
358 // check for CDATA part, starting with '<![CDATA['
359 if( aElement.match( gaOpeningCData ) )
360 {
361 // search the end tag ']]>'
362 while( ((aElement.getLength() < gaClosingCData.getLength()) || !aElement.endsWith( gaClosingCData )) && !mxTextStrm->isEOF() )
363 aElement += readToElementEnd();
364 // copy the entire CDATA part
365 aBuffer.append( aElement );
366 }
367 else
368 {
369 // no CDATA part - process the contents of the element
370 lclProcessElement( aBuffer, aElement );
371 }
372 }
373
374 maBuffer = aBuffer.makeStringAndClear();
375 mnBufferPos = 0;
376 }
377}
378
379OString InputStream::readToElementBegin()
380{
381 return OUStringToOString( mxTextStrm->readString( maOpeningBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
382}
383
384OString InputStream::readToElementEnd()
385{
386 OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
387 OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
388 return aText;
389}
390
391} // namespace oox::vml
392
393/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
InputStream(const css::uno::Reference< css::uno::XComponentContext > &rxContext, const css::uno::Reference< css::io::XInputStream > &rxInStrm)
sal_Int32 nIndex
constexpr OStringLiteral gaOpeningCData("<![CDATA[")
constexpr OStringLiteral gaClosingCData("]]>")
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
signed char sal_Int8
std::unique_ptr< char[]> aBuffer