LibreOffice Module oox (master)  1
vmlinputstream.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
21 
22 #include <com/sun/star/io/IOException.hpp>
23 #include <com/sun/star/io/XTextInputStream2.hpp>
24 #include <map>
25 #include <string.h>
26 #include <rtl/strbuf.hxx>
27 #include <osl/diagnose.h>
29 
30 namespace oox::vml {
31 
32 using namespace ::com::sun::star::io;
33 using namespace ::com::sun::star::uno;
34 
35 namespace {
36 
37 const char* lclFindCharacter( const char* pcBeg, const char* pcEnd, char cChar )
38 {
39  sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
40  return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
41 }
42 
/** Returns true, if the passed character is an ASCII control character or
    the space character (byte values 0 to 32).

    The byte is compared as unsigned value. Whether plain 'char' is signed is
    implementation-defined; with a signed 'char', a direct comparison against
    32 would also classify every byte above 0x7F (e.g. the NBSP byte 0xA0, or
    the bytes of a UTF-8 multi-byte sequence) as whitespace.
 */
bool lclIsWhiteSpace( char cChar )
{
    return static_cast< unsigned char >( cChar ) <= 32;
}
47 
48 const char* lclFindWhiteSpace( const char* pcBeg, const char* pcEnd )
49 {
50  for( ; pcBeg < pcEnd; ++pcBeg )
51  if( lclIsWhiteSpace( *pcBeg ) )
52  return pcBeg;
53  return pcEnd;
54 }
55 
56 const char* lclFindNonWhiteSpace( const char* pcBeg, const char* pcEnd )
57 {
58  for( ; pcBeg < pcEnd; ++pcBeg )
59  if( !lclIsWhiteSpace( *pcBeg ) )
60  return pcBeg;
61  return pcEnd;
62 }
63 
64 const char* lclTrimWhiteSpaceFromEnd( const char* pcBeg, const char* pcEnd )
65 {
66  while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
67  --pcEnd;
68  return pcEnd;
69 }
70 
71 void lclAppendToBuffer( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
72 {
73  rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
74 }
75 
76 void lclProcessAttribs( OStringBuffer& rBuffer, const char* pcBeg, const char* pcEnd )
77 {
78  /* Map attribute names to char-pointer of all attributes. This map is used
79  to find multiple occurrences of attributes with the same name. The
80  mapped pointers are used as map key in the next map below. */
81  typedef ::std::map< OString, const char* > AttributeNameMap;
82  AttributeNameMap aAttributeNames;
83 
84  /* Map the char-pointers of all attributes to the full attribute definition
85  string. This preserves the original order of the used attributes. */
86  typedef ::std::map< const char*, OString > AttributeDataMap;
87  AttributeDataMap aAttributes;
88 
89  bool bOk = true;
90  const char* pcNameBeg = pcBeg;
91  while( bOk && (pcNameBeg < pcEnd) )
92  {
93  // pcNameBeg points to begin of attribute name, find equality sign
94  const char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
95  bOk = (pcEqualSign < pcEnd);
96  if (bOk)
97  {
98  // find end of attribute name (ignore whitespace between name and equality sign)
99  const char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
100  bOk = (pcNameBeg < pcNameEnd);
101  if( bOk )
102  {
103  // find begin of attribute value (must be single or double quote)
104  const char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
105  bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'));
106  if( bOk )
107  {
108  // find end of attribute value (matching quote character)
109  const char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
110  bOk = (pcValueEnd < pcEnd);
111  if( bOk )
112  {
113  ++pcValueEnd;
114  OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
115  OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
116  // search for an existing attribute with the same name
117  AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
118  // remove its definition from the data map
119  if( aIt != aAttributeNames.end() )
120  aAttributes.erase( aIt->second );
121  // insert the attribute into both maps
122  aAttributeNames[ aAttribName ] = pcNameBeg;
123  aAttributes[ pcNameBeg ] = aAttribData;
124  // continue with next attribute (skip whitespace after this attribute)
125  pcNameBeg = pcValueEnd;
126  if( pcNameBeg < pcEnd )
127  {
128  bOk = lclIsWhiteSpace( *pcNameBeg );
129  if( bOk )
130  pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
131  }
132  }
133  }
134  }
135  }
136  }
137 
138  // if no error has occurred, build the resulting attribute list
139  if( bOk )
140  for (auto const& attrib : aAttributes)
141  rBuffer.append( ' ' ).append( attrib.second );
142  // on error, just append the complete passed string
143  else
144  lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
145 }
146 
147 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
148 {
149  // check that passed string starts and ends with the brackets of an XML element
150  sal_Int32 nElementLen = rElement.getLength();
151  if( nElementLen == 0 )
152  return;
153 
154  const char* pcOpen = rElement.getStr();
155  const char* pcClose = pcOpen + nElementLen - 1;
156 
157  // no complete element found
158  if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
159  {
160  // just append all passed characters
161  rBuffer.append( rElement );
162  }
163 
164  // skip parser instructions: '<![...]>'
165  else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
166  {
167  // do nothing
168  }
169 
170  // just append any xml prolog (text directive) or processing instructions: <?...?>
171  else if( (nElementLen >= 4) && (pcOpen[ 1 ] == '?') && (pcClose[ -1 ] == '?') )
172  {
173  rBuffer.append( rElement );
174  }
175 
176  // replace '<br>' element with newline
177  else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
178  {
179  rBuffer.append( '\n' );
180  }
181 
182  // check start elements and simple elements for repeated attributes
183  else if( pcOpen[ 1 ] != '/' )
184  {
185  // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
186  const char* pcContentBeg = pcOpen + 1;
187  bool bIsEmptyElement = pcClose[ -1 ] == '/';
188  const char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
189  // append opening bracket and element name to buffer
190  const char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
191  lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
192  // find begin of attributes, and process all attributes
193  const char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
194  if( pcAttribBeg < pcContentEnd )
195  lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
196  // close the element
197  if( bIsEmptyElement )
198  rBuffer.append( '/' );
199  rBuffer.append( '>' );
200  }
201 
202  // append end elements without further processing
203  else
204  {
205  rBuffer.append( rElement );
206  }
207 }
208 
209 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
210 {
211  /* MSO has a very weird way to store and handle whitespaces. The stream
212  may contain lots of spaces, tabs, and newlines which have to be handled
213  as single space character. This will be done in this function.
214 
215  If the element text contains a literal line break, it will be stored as
216  <br> tag (without matching </br> element). This input stream wrapper
217  will replace this element with a literal LF character (see below).
218 
219  A single space character for its own is stored as is. Example: The
220  element
221  <font> </font>
222  represents a single space character. The XML parser will ignore this
223  space character completely without issuing a 'characters' event. The
224  VML import filter implementation has to react on this case manually.
225 
226  A single space character following another character is stored
227  literally and must not be stripped away here. Example: The element
228  <font>abc </font>
229  contains the three letters a, b, and c, followed by a space character.
230 
231  Consecutive space characters, or a leading single space character, are
232  stored in a <span> element. If there are N space characters (N > 1),
233  then the <span> element contains exactly (N-1) NBSP (non-breaking
234  space) characters, followed by a regular space character. Examples:
235  The element
236  <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
237  represents 4 consecutive space characters. Has to be handled by the
238  implementation. The element
239  <font><span style='mso-spacerun:yes'> abc</span></font>
240  represents a space characters followed by the letters a, b, c. These
241  strings have to be handled by the VML import filter implementation.
242  */
243 
244  // passed string ends with the leading opening bracket of an XML element
245  const char* pcBeg = rChars.getStr();
246  const char* pcEnd = pcBeg + rChars.getLength();
247  bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
248  if( bHasBracket ) --pcEnd;
249 
250  // skip leading whitespace
251  const char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
252  while( pcContentsBeg < pcEnd )
253  {
254  const char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
255  lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
256  if( pcWhitespaceBeg < pcEnd )
257  rBuffer.append( ' ' );
258  pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
259  }
260 
261  return bHasBracket;
262 }
263 
264 } // namespace
265 
// Start marker of a CDATA section; element text beginning with it is copied to the output unprocessed.
266 constexpr OStringLiteral gaOpeningCData( "<![CDATA[" );
// End marker of a CDATA section; reading of a CDATA section continues until the collected text ends with it.
267 constexpr OStringLiteral gaClosingCData( "]]>" );
268 
270  // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
271  mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
272  maOpeningBracket( 1 ),
273  maClosingBracket( 1 ),
274  mnBufferPos( 0 )
275 {
276  if (!mxTextStrm.is())
277  throw IOException();
278  maOpeningBracket[ 0 ] = '<';
279  maClosingBracket[ 0 ] = '>';
280 }
281 
283 {
284 }
285 
286 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
287 {
288  if( nBytesToRead < 0 )
289  throw IOException();
290 
291  rData.realloc( nBytesToRead );
292  sal_Int8* pcDest = rData.getArray();
293  sal_Int32 nRet = 0;
294  while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
295  {
296  updateBuffer();
297  sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
298  if( nReadSize > 0 )
299  {
300  memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
301  mnBufferPos += nReadSize;
302  nBytesToRead -= nReadSize;
303  nRet += nReadSize;
304  }
305  }
306  if( nRet < rData.getLength() )
307  rData.realloc( nRet );
308  return nRet;
309 }
310 
311 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
312 {
313  return readBytes( rData, nMaxBytesToRead );
314 }
315 
316 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
317 {
318  if( nBytesToSkip < 0 )
319  throw IOException();
320 
321  while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
322  {
323  updateBuffer();
324  sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
325  mnBufferPos += nSkipSize;
326  nBytesToSkip -= nSkipSize;
327  }
328 }
329 
330 sal_Int32 SAL_CALL InputStream::available()
331 {
332  updateBuffer();
333  return maBuffer.getLength() - mnBufferPos;
334 }
335 
336 void SAL_CALL InputStream::closeInput()
337 {
338  mxTextStrm->closeInput();
339 }
340 
341 // private --------------------------------------------------------------------
342 
344 {
345  while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
346  {
347  // collect new contents in a string buffer
348  OStringBuffer aBuffer;
349 
350  // read and process characters until the opening bracket of the next XML element
351  OString aChars = readToElementBegin();
352  bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
353 
354  // read and process characters until (and including) closing bracket (an XML element)
355  OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
356  if( bHasOpeningBracket && !mxTextStrm->isEOF() )
357  {
358  // read the element text (add the leading opening bracket manually)
359  OString aElement = "<" + readToElementEnd();
360  // check for CDATA part, starting with '<![CDATA['
361  if( aElement.match( gaOpeningCData ) )
362  {
363  // search the end tag ']]>'
364  while( ((aElement.getLength() < gaClosingCData.getLength()) || !aElement.endsWith( gaClosingCData )) && !mxTextStrm->isEOF() )
365  aElement += readToElementEnd();
366  // copy the entire CDATA part
367  aBuffer.append( aElement );
368  }
369  else
370  {
371  // no CDATA part - process the contents of the element
372  lclProcessElement( aBuffer, aElement );
373  }
374  }
375 
376  maBuffer = aBuffer.makeStringAndClear();
377  mnBufferPos = 0;
378  }
379 }
380 
382 {
383  return OUStringToOString( mxTextStrm->readString( maOpeningBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
384 }
385 
387 {
388  OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, false ), RTL_TEXTENCODING_ISO_8859_1 );
389  OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
390  return aText;
391 }
392 
393 } // namespace oox::vml
394 
395 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
sal_Int32 nIndex
virtual void SAL_CALL closeInput() override
signed char sal_Int8
virtual void SAL_CALL skipBytes(sal_Int32 nBytesToSkip) override
constexpr OStringLiteral gaOpeningCData("<![CDATA[")
virtual sal_Int32 SAL_CALL available() override
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
virtual sal_Int32 SAL_CALL readBytes(css::uno::Sequence< sal_Int8 > &rData, sal_Int32 nBytesToRead) override
std::unique_ptr< char[]> aBuffer
css::uno::Sequence< sal_Unicode > maClosingBracket
virtual sal_Int32 SAL_CALL readSomeBytes(css::uno::Sequence< sal_Int8 > &rData, sal_Int32 nMaxBytesToRead) override
InputStream(const css::uno::Reference< css::uno::XComponentContext > &rxContext, const css::uno::Reference< css::io::XInputStream > &rxInStrm)
virtual ~InputStream() override
css::uno::Reference< css::io::XTextInputStream2 > mxTextStrm
css::uno::Sequence< sal_Unicode > maOpeningBracket
constexpr OStringLiteral gaClosingCData("]]>")