LibreOffice Module sax (master) 1
xml2utf.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19#include <string.h>
20
21#include <algorithm>
22
23#include <sal/types.h>
24
25#include <rtl/textenc.h>
26#include <rtl/tencinfo.h>
27#include <com/sun/star/io/NotConnectedException.hpp>
28#include <com/sun/star/io/XInputStream.hpp>
29
30using namespace ::com::sun::star::uno;
31using namespace ::com::sun::star::io;
32
33
34#include <xml2utf.hxx>
35#include <memory>
36
37namespace sax_expatwrap {
38
// Reads the next chunk from m_in into seq and prepares it for the parser.
//
// Until m_bStarted is set, data is accumulated across reads until
// isEncodingRecognizable() accepts the buffer; scanForEncoding() is then run
// on it, and after the conversion step removeEncoding() is applied once.
//
// seq         receives the bytes read (replaced by the converted bytes when
//             the conversion step runs)
// nMaxToRead  requested read size; raised to at least 512 before the start
//             flag is set
// Returns the final length of seq, or the value returned by readSomeBytes()
// when nothing was read and nothing was pending.
// Throws NotConnectedException when m_in is not set.
//
// NOTE(review): listing lines 87 and 93 are absent from this view (the
// numbering jumps 86 -> 88 and 92 -> 94), so one statement and the first
// part of one condition cannot be seen here.
39sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
40{
41 if( ! m_in.is() ) {
42 throw NotConnectedException();
43 }
44 if( ! m_bStarted ) {
45 // it should be possible to find the encoding attribute
46 // within the first 512 bytes == 128 chars in UCS-4
47 nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
48 }
49
50 sal_Int32 nRead;
 // bytes collected by earlier loop rounds that were not yet recognizable
51 Sequence< sal_Int8 > seqStart;
52 while( true )
53 {
54 nRead = m_in->readSomeBytes( seq , nMaxToRead );
55
 // proceed when something was read now or is still pending from before
56 if( nRead + seqStart.getLength())
57 {
58 // if nRead is 0, the file is already eof.
59 if( ! m_bStarted && nRead )
60 {
61 // ensure that enough data is available to parse encoding
62 if( seqStart.hasElements() )
63 {
64 // prefix with what we had so far.
65 sal_Int32 nLength = seq.getLength();
66 seq.realloc( seqStart.getLength() + nLength );
67
 // shift the new bytes up first (ranges overlap, hence memmove),
 // then copy the pending bytes into the freed front part
68 memmove (seq.getArray() + seqStart.getLength(),
69 seq.getConstArray(),
70 nLength);
71 memcpy (seq.getArray(),
72 seqStart.getConstArray(),
73 seqStart.getLength());
74 }
75
76 // autodetection with the first bytes
77 if( ! isEncodingRecognizable( seq ) )
78 {
79 // remember what we have so far.
80 seqStart = seq;
81
82 // read more !
83 continue;
84 }
85 if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
86 // initialize decoding
 // NOTE(review): listing line 87 (the statement of this branch) is missing here
88 }
 // the pending bytes are now part of seq, so drop them
89 seqStart = Sequence < sal_Int8 > ();
90 }
91
92 // do the encoding
 // NOTE(review): listing line 93 (the opening of this condition) is missing here
94 m_pText2Unicode->canContinue() ) {
95
 // source encoding -> Unicode -> target text, replacing the content of seq
96 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
97 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
98 }
99
100 if( ! m_bStarted )
101 {
102 // it must now be ensured that no encoding attribute exists anymore
103 // ( otherwise the expat-Parser will crash )
104 // This must be done after decoding !
105 // ( e.g. Files decoded in ucs-4 cannot be read properly )
106 m_bStarted = true;
107 removeEncoding( seq );
108 }
 // report the size of what is handed back, not the raw read count
109 nRead = seq.getLength();
110 }
111
112 break;
113 }
114 return nRead;
115}
116
// Cuts the encoding attribute out of an XML declaration at the start of seq.
//
// Only buffers that begin with the five bytes <?xml are touched, and only the
// part up to the first line feed is searched. When a quoted value is found,
// the bytes from the space in front of the word encoding up to and including
// the closing quote are removed and seq is shrunk accordingly. In every other
// case seq is left unchanged.
117void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
118{
119 const sal_Int8 *pSource = seq.getArray();
 // strncmp returns non-zero on mismatch, so this bails out unless the prefix matches
120 if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5))
121 return;
122
123 // scan for encoding
 // str is a copy of the buffer, so offsets found in str are valid offsets into seq
124 OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
125
126 // restrict the search to the first line:
127 // cut the copy at the first line feed (character code 10), if any
128 int nMax = str.indexOf( 10 );
129 if( nMax >= 0 )
130 {
131 str = str.copy( 0 , nMax );
132 }
133
 // the search text starts with a space, so nFound is the offset of that space
134 int nFound = str.indexOf( " encoding" );
135 if( nFound < 0 ) return;
136
 // nStart / nStop: offsets of the opening and closing quote of the value
137 int nStop;
138 int nStart = str.indexOf( "\"" , nFound );
 // NOTE(review): a negative indexOf result (no single quote after nFound)
 // also compares less than nStart, so this branch appears to be taken for
 // a purely double-quoted attribute; nStart then becomes negative and the
 // guard below rejects it, leaving the attribute in place. scanForEncoding
 // uses the same test. Confirm whether this is intended before changing
 // either one.
139 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
140 {
141 nStart = str.indexOf( "'" , nFound );
142 nStop = str.indexOf( "'" , nStart +1 );
143 }
144 else
145 {
146 nStop = str.indexOf( "\"" , nStart +1);
147 }
148
 // both quotes found and at least one character between them
149 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
150 {
151 // close the gap: move everything behind the closing quote down to nFound
152 memmove( &( seq.getArray()[nFound] ) ,
153 &( seq.getArray()[nStop+1]) ,
154 seq.getLength() - nStop -1);
 // shrink by the number of removed bytes
155 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
156 }
157}
158
159// Checks whether enough data has been accumulated to recognize the encoding.
// Returns false when fewer than 8 bytes are available, or when the buffer
// looks like it starts with an XML declaration but contains no > byte yet.
// Returns true otherwise.
// NOTE(review): the signature line (listing line 160) is absent from this
// view; the index at the end of the listing names this definition
// isEncodingRecognizable(const Sequence< sal_Int8 > &seq).
161{
162 const sal_Int8 *pSource = seq.getConstArray();
163 bool bCheckIfFirstClosingBracketExists = false;
164
165 if( seq.getLength() < 8 ) {
166 // no recognition possible, when less than 8 bytes are available
167 return false;
168 }
169
170 if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) {
171 // scan if the <?xml tag finishes within this buffer
172 bCheckIfFirstClosingBracketExists = true;
173 }
174 else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
175 ('?' == pSource[4] || '?' == pSource[6] ) )
176 {
177 // check for utf-16
178 bCheckIfFirstClosingBracketExists = true;
179 }
180 else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
181 ( '?' == pSource[5] || '?' == pSource[7] ) )
182 {
183 // same test on the odd byte offsets (NOTE(review): presumably the other byte order - confirm)
184 bCheckIfFirstClosingBracketExists = true;
185 }
186
187 if( bCheckIfFirstClosingBracketExists )
188 {
189 // recognizable only once a closing > byte is somewhere in the buffer
190 return std::find(seq.begin(), seq.end(), '>') != seq.end();
191 }
192
193 // No <? tag in front, no need for a bigger buffer
194 return true;
195}
196
// Inspects the first bytes of seq and stores a detected encoding name in
// m_sEncoding.
//
// Returns false when fewer than 4 bytes are available or when none of the
// patterns below matches; returns true in all other cases, including an
// <?xml declaration in which no encoding value could be extracted.
// seq itself is modified in three cases: a byte order mark is inserted for
// UTF-16 data that lacks one (both byte orders), and a UTF-8 byte order mark
// is removed.
// NOTE(review): the signature line (listing line 197) is absent from this
// view; the index at the end of the listing names this definition
// scanForEncoding(Sequence< sal_Int8 > &seq).
198{
199 const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
200 bool bReturn = true;
201
202 if( seq.getLength() < 4 ) {
203 // no recognition possible, when less than 4 bytes are available
204 return false;
205 }
206
207 // first level : detect possible file formats
208 if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) {
209 // scan for encoding
210 OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
211
212 // restrict the search to the first line:
213 // cut the copy at the first line feed (character code 10), if any
214 int nMax = str.indexOf( 10 );
215 if( nMax >= 0 )
216 {
217 str = str.copy( 0 , nMax );
218 }
219
220 int nFound = str.indexOf( " encoding" );
221 if( nFound >= 0 ) {
 // nStart / nStop: offsets of the opening and closing quote of the value
222 int nStop;
223 int nStart = str.indexOf( "\"" , nFound );
 // NOTE(review): a negative indexOf result (no single quote after
 // nFound) also compares less than nStart, so this branch appears to
 // be taken for a purely double-quoted attribute; nStart then becomes
 // negative and the guard below rejects it, so m_sEncoding is not set.
 // removeEncoding uses the same test. Confirm whether this is intended
 // before changing either one.
224 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
225 {
226 nStart = str.indexOf( "'" , nFound );
227 nStop = str.indexOf( "'" , nStart +1 );
228 }
229 else
230 {
231 nStop = str.indexOf( "\"" , nStart +1);
232 }
 // both quotes found and at least one character between them
233 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
234 {
235 // encoding found finally: take the text between the quotes
236 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
237 }
238 }
239 }
240 else if( 0xFE == pSource[0] &&
241 0xFF == pSource[1] ) {
242 // UTF-16 big endian
243 // conversion is done so that encoding information can be easily extracted
244 m_sEncoding = "utf-16";
245 }
246 else if( 0xFF == pSource[0] &&
247 0xFE == pSource[1] ) {
248 // UTF-16 little endian
249 // conversion is done so that encoding information can be easily extracted
250 m_sEncoding = "utf-16";
251 }
252 else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
253 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
254 // The byte order mark is simply added
255
256 // grow by two bytes, shift the content up and write FE FF in front
257 seq.realloc( seq.getLength() + 2 );
258 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
259 reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
260 reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
261
262 m_sEncoding = "utf-16";
263 }
264 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
265 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
266 // The byte order mark is simply added
267
 // grow by two bytes, shift the content up and write FF FE in front
268 seq.realloc( seq.getLength() + 2 );
269 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
270 reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
271 reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
272
273 m_sEncoding = "utf-16";
274 }
275 else if( 0xEF == pSource[0] &&
276 0xBB == pSource[1] &&
277 0xBF == pSource[2] )
278 {
279 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
280 // The BOM is removed.
281 memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
282 seq.realloc( seq.getLength() - 3 );
283 m_sEncoding = "utf-8";
284 }
285 else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
286 // UCS-4 big endian
287 m_sEncoding = "ucs-4";
288 }
289 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
290 // UCS-4 little endian
291 m_sEncoding = "ucs-4";
292 }
293/* TODO: no need to test for the moment since we return sal_False like default case anyway
294 else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
295 0xa7 == static_cast<unsigned char> (pSource[2]) &&
296 0x94 == static_cast<unsigned char> (pSource[3]) ) {
297 // EBCDIC
298 bReturn = sal_False; // must be extended
299 }
300*/
301 else {
302 // other
303 // UTF8 is directly recognized by the parser.
304 bReturn = false;
305 }
306
307 return bReturn;
308}
309
// Creates the converter pair used by the conversion step when a non-UTF-8
// encoding name is stored in m_sEncoding; does nothing otherwise.
// NOTE(review): the signature line (listing line 310) is absent from this
// view and not listed in the index; presumably this is the decoding
// initializer invoked from readAndConvert - confirm against the header.
311{
312
313 if( !m_sEncoding.isEmpty() )
314 {
 // map the stored charset name to an rtl text encoding id
315 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
 // data that is already UTF-8 gets no converters
316 if( encoding != RTL_TEXTENCODING_UTF8 )
317 {
 // source encoding -> Unicode, then Unicode -> UTF-8
318 m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding );
319 m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 );
320 }
321 }
322}
323
324
325// Text2UnicodeConverter
326
327
// Constructor body: resolves the charset name in sEncoding and, when it is
// known, sets up the converter through init(). For an unknown name both
// m_bCanContinue and m_bInitialized are set to false and the handles stay null.
// NOTE(review): the signature line (listing line 328) is absent from this
// view; the index at the end of the listing gives it as
// Text2UnicodeConverter(const OString &sEncoding).
329 : m_convText2Unicode(nullptr)
330 , m_contextText2Unicode(nullptr)
331{
332 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
333 if( RTL_TEXTENCODING_DONTKNOW == encoding )
334 {
 // unknown charset name: mark the object as unusable
335 m_bCanContinue = false;
336 m_bInitialized = false;
337 }
338 else
339 {
340 init( encoding );
341 }
342}
343
// Destructor body: releases the conversion context and the converter, but
// only when init() created them (m_bInitialized is set).
// NOTE(review): the signature line (listing line 344) is absent from this view.
345{
346 if( m_bInitialized )
347 {
 // release the context before the converter it was created from
348 rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
 // the handle comes from rtl_createTextToUnicodeConverter (see init), so it
 // is released with the matching text-to-Unicode destroy function rather
 // than the Unicode-to-text one
349 rtl_destroyTextToUnicodeConverter( m_convText2Unicode );
350 }
351}
352
// Sets up the text-to-Unicode conversion for the given encoding and marks
// the object as initialized and able to convert.
353void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
354{
 // converter first; the context is created from it
355 m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
356 m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
357
 // state flags read elsewhere in this class
358 m_bInitialized = true;
359 m_bCanContinue = true;
360}
361
362
// Converts the bytes in seqText to a sequence of sal_Unicode.
//
// Bytes kept in m_seqSource by an earlier call are placed in front of
// seqText before converting. When the converter reports that the source
// ended too early, the unconsumed bytes are stored in m_seqSource again.
// Returns the converted characters, trimmed to the number actually written.
//
// NOTE(review): listing lines 385 and 392-393 are absent from this view
// (one statement and the first arguments of the conversion call).
363Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
364{
365 sal_uInt32 uiInfo;
366 sal_Size nSrcCvtBytes = 0;
 // running totals: characters written / source bytes consumed so far
367 sal_Size nTargetCount = 0;
368 sal_Size nSourceCount = 0;
369
370 // the whole source size
371 sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
 // initial target size: one character per source byte
372 Sequence<sal_Unicode> seqUnicode ( nSourceSize );
373
374 const sal_Int8 *pbSource = seqText.getConstArray();
375 std::unique_ptr<sal_Int8[]> pbTempMem;
376
377 if( m_seqSource.hasElements() ) {
378 // put old rest and new byte sequence into one array
379 pbTempMem.reset(new sal_Int8[ nSourceSize ]);
380 memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() );
381 memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
382 pbSource = pbTempMem.get();
383
384 // set to zero again
 // NOTE(review): listing line 385 (the statement this comment refers to) is missing here
386 }
387
388 while( true ) {
389
390 /* All invalid characters are transformed to the unicode undefined char */
391 nTargetCount += rtl_convertTextToUnicode(
 // NOTE(review): listing lines 392-393 (leading arguments) are missing here
394 reinterpret_cast<const char *>(&( pbSource[nSourceCount] )),
395 nSourceSize - nSourceCount ,
396 &( seqUnicode.getArray()[ nTargetCount ] ),
397 seqUnicode.getLength() - nTargetCount,
398 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
399 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
400 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
401 &uiInfo,
402 &nSrcCvtBytes );
403 nSourceCount += nSrcCvtBytes;
404
405 if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) {
406 // target buffer was too small: double it and resume at the current offsets
407 seqUnicode.realloc( seqUnicode.getLength() * 2 );
408 continue;
409 }
410 break;
411 }
 // keep the unconsumed tail of the source for the next call
412 if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL ) {
413 m_seqSource.realloc( nSourceSize - nSourceCount );
414 memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
415 }
416
417 // set to correct unicode size
418 seqUnicode.realloc( nTargetCount );
419
420 return seqUnicode;
421}
422
423
424// Unicode2TextConverter
425
426
// Constructor body: creates the Unicode-to-text converter for the given
// encoding and a conversion context that belongs to it.
// NOTE(review): the signature line (listing line 427) is absent from this
// view; the index at the end of the listing gives it as
// Unicode2TextConverter(rtl_TextEncoding encoding).
428{
429 m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
430 m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
431}
432
433
// Releases the conversion context first and then the converter it was
// created from.
// NOTE(review): the signature line (listing line 434) is absent from this
// view; presumably this is the Unicode2TextConverter destructor - confirm.
435{
436 rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
437 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
438}
439
440
// Converts nSourceSize sal_Unicode units starting at puSource to bytes in
// the target encoding of this converter.
//
// Units kept in m_seqSource by an earlier call are placed in front of the
// new input. When the converter reports that the source ended too early,
// the unconsumed units are stored in m_seqSource again.
// Returns the converted bytes, trimmed to the number actually written.
//
// NOTE(review): listing lines 482-483 (the first arguments of the conversion
// call) are absent from this view.
441Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
442{
443 std::unique_ptr<sal_Unicode[]> puTempMem;
444
445 if( m_seqSource.hasElements() ) {
446 // For surrogates !
447 // put old rest and new byte sequence into one array
448 // In general when surrogates are used, they should be rarely
449 // cut off between two convert()-calls. So this code is used
450 // rarely and the extra copy is acceptable.
451 puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
452 memcpy( puTempMem.get() ,
453 m_seqSource.getConstArray() ,
454 m_seqSource.getLength() * sizeof( sal_Unicode ) );
455 memcpy(
456 &(puTempMem[ m_seqSource.getLength() ]) ,
457 puSource ,
458 nSourceSize*sizeof( sal_Unicode ) );
 // from here on the joined buffer is the source
459 puSource = puTempMem.get();
460 nSourceSize += m_seqSource.getLength();
461
 // the leftover has been consumed
462 m_seqSource = Sequence< sal_Unicode > ();
463 }
464
465
 // running totals: bytes written / source units consumed so far
466 sal_Size nTargetCount = 0;
467 sal_Size nSourceCount = 0;
468
469 sal_uInt32 uiInfo;
470 sal_Size nSrcCvtChars;
471
472 // take nSourceSize * 3 as preference
473 // this is an upper boundary for converting to utf8,
474 // which is most often used as the target.
475 sal_Int32 nSeqSize = nSourceSize * 3;
476
477 Sequence<sal_Int8> seqText( nSeqSize );
478 char *pTarget = reinterpret_cast<char *>(seqText.getArray());
479 while( true ) {
480
481 nTargetCount += rtl_convertUnicodeToText(
 // NOTE(review): listing lines 482-483 (leading arguments) are missing here
484 &( puSource[nSourceCount] ),
485 nSourceSize - nSourceCount ,
486 &( pTarget[nTargetCount] ),
487 nSeqSize - nTargetCount,
488 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
489 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
490 &uiInfo,
491 &nSrcCvtChars);
492 nSourceCount += nSrcCvtChars;
493
494 if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
495 nSeqSize = nSeqSize *2;
496 seqText.realloc( nSeqSize ); // double array size
 // the pointer is fetched again after the realloc
497 pTarget = reinterpret_cast<char *>(seqText.getArray());
498 continue;
499 }
500 break;
501 }
502
503 // for surrogates
504 if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
505 m_seqSource.realloc( nSourceSize - nSourceCount );
506 memcpy( m_seqSource.getArray() ,
507 &(puSource[nSourceCount]),
508 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
509 }
510
511 // reduce the size of the buffer (fast, no copy necessary)
512 seqText.realloc( nTargetCount );
513
514 return seqText;
515}
516
517}
518
519/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
css::uno::Sequence< sal_Int8 > m_seqSource
Definition: xml2utf.hxx:51
void init(rtl_TextEncoding encoding)
Definition: xml2utf.cxx:353
rtl_TextToUnicodeContext m_contextText2Unicode
Definition: xml2utf.hxx:48
rtl_TextToUnicodeConverter m_convText2Unicode
Definition: xml2utf.hxx:47
Text2UnicodeConverter(const OString &sEncoding)
Definition: xml2utf.cxx:328
css::uno::Sequence< sal_Unicode > convert(const css::uno::Sequence< sal_Int8 > &)
Definition: xml2utf.cxx:363
css::uno::Sequence< sal_Int8 > convert(const sal_Unicode *, sal_Int32 nLength)
Definition: xml2utf.cxx:441
Unicode2TextConverter(rtl_TextEncoding encoding)
Definition: xml2utf.cxx:427
rtl_UnicodeToTextConverter m_convUnicode2Text
Definition: xml2utf.hxx:68
rtl_UnicodeToTextContext m_contextUnicode2Text
Definition: xml2utf.hxx:69
css::uno::Sequence< sal_Unicode > m_seqSource
Definition: xml2utf.hxx:70
css::uno::Reference< css::io::XInputStream > m_in
Definition: xml2utf.hxx:117
bool scanForEncoding(css::uno::Sequence< sal_Int8 > &seq)
Definition: xml2utf.cxx:197
sal_Int32 readAndConvert(css::uno::Sequence< sal_Int8 > &seq, sal_Int32 nMaxToRead)
Definition: xml2utf.cxx:39
std::unique_ptr< Text2UnicodeConverter > m_pText2Unicode
Definition: xml2utf.hxx:122
static void removeEncoding(css::uno::Sequence< sal_Int8 > &seq)
Definition: xml2utf.cxx:117
std::unique_ptr< Unicode2TextConverter > m_pUnicode2Text
Definition: xml2utf.hxx:123
static bool isEncodingRecognizable(const css::uno::Sequence< sal_Int8 > &seq)
Definition: xml2utf.cxx:160
unsigned char sal_uInt8
sal_uInt16 sal_Unicode
signed char sal_Int8
sal_Int32 nLength