sax/html/xml2utf_8cxx_source.html

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/*

 * This file is part of the LibreOffice project.

 *

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/.

 *

 * This file incorporates work covered by the following license notice:

 *

 *   Licensed to the Apache Software Foundation (ASF) under one or more

 *   contributor license agreements. See the NOTICE file distributed

 *   with this work for additional information regarding copyright

 *   ownership. The ASF licenses this file to you under the Apache

 *   License, Version 2.0 (the "License"); you may not use this file

 *   except in compliance with the License. You may obtain a copy of

 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .

 */

#include <string.h>


#include <algorithm>


#include <sal/types.h>


#include <rtl/textenc.h>

#include <rtl/tencinfo.h>

#include <com/sun/star/io/NotConnectedException.hpp>

#include <com/sun/star/io/XInputStream.hpp>


using namespace ::com::sun::star::uno;

using namespace ::com::sun::star::io;


#include <xml2utf.hxx>

#include <memory>


namespace sax_expatwrap {


/**
 * Reads the next chunk from the input stream into @p seq and, if a decoder
 * has been set up, recodes it to UTF-8.
 *
 * On the first successful call the start of the document is inspected via
 * isEncodingRecognizable() / scanForEncoding(), decoding is initialised when
 * an encoding is known, and removeEncoding() is applied to the (already
 * recoded) data.
 *
 * @param seq         receives the bytes to hand on to the parser
 * @param nMaxToRead  upper bound passed to readSomeBytes(); raised to at
 *                    least 512 until the first chunk has been delivered
 * @return the length of @p seq after processing, or the value returned by
 *         readSomeBytes() when nothing was read and nothing was buffered
 * @throws NotConnectedException if no input stream is set
 */
sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
{
    if( ! m_in.is() ) {
        throw NotConnectedException();
    }
    if( ! m_bStarted ) {
        // it should be possible to find the encoding attribute
        // within the first 512 bytes == 128 chars in UCS-4
        nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
    }

    sal_Int32 nRead;
    // bytes collected so far while isEncodingRecognizable() still wants more
    Sequence< sal_Int8 > seqStart;
    while( true )
    {
        nRead = m_in->readSomeBytes( seq , nMaxToRead );

        if( nRead + seqStart.getLength())
        {
            if( ! m_bStarted )
            {
                if( nRead == 0 )
                {
                    // Nothing more was read (the stream is at its end) but
                    // bytes are still buffered. Waiting for more data is
                    // pointless, so process what was collected instead of
                    // silently dropping it.
                    seq = seqStart;
                }
                else
                {
                    // ensure that enough data is available to parse encoding
                    if( seqStart.hasElements() )
                    {
                        // prefix with what we had so far.
                        sal_Int32 nLength = seq.getLength();
                        seq.realloc( seqStart.getLength() + nLength );

                        // shift the new bytes back (regions overlap, hence
                        // memmove), then copy the buffered bytes in front
                        memmove (seq.getArray() + seqStart.getLength(),
                                 seq.getConstArray(),
                                 nLength);
                        memcpy  (seq.getArray(),
                                 seqStart.getConstArray(),
                                 seqStart.getLength());
                    }

                    // autodetection with the first bytes
                    if( ! isEncodingRecognizable( seq ) )
                    {
                        // remember what we have so far.
                        seqStart = seq;

                        // read more !
                        continue;
                    }
                }

                if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
                    // initialize decoding
                    initializeDecoding();
                }
                seqStart = Sequence < sal_Int8 > ();
            }

            // do the encoding
            if( m_pText2Unicode && m_pUnicode2Text &&
                m_pText2Unicode->canContinue() ) {

                Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
                seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
            }

            if( ! m_bStarted )
            {
                // it must now be ensured, that no encoding attribute exist anymore
                // ( otherwise the expat-Parser will crash )
                // This must be done after decoding !
                // ( e.g. Files decoded in ucs-4 cannot be read properly )
                m_bStarted = true;
                removeEncoding( seq );
            }
            nRead = seq.getLength();
        }

        break;
    }
    return nRead;
}


void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )

{

    const sal_Int8 *pSource = seq.getArray();

    if (seq.getLength() < 5 || strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5))

        return;


    // scan for encoding

    OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );


    // cut sequence to first line break

    // find first line break;

    int nMax = str.indexOf( 10 );

    if( nMax >= 0 )

    {

        str = str.copy( 0 , nMax );

    }


    int nFound = str.indexOf( " encoding" );

    if( nFound < 0 )        return;


    int nStop;

    int nStart = str.indexOf( "\"" , nFound );

    if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )

    {

        nStart = str.indexOf( "'" , nFound );

        nStop  = str.indexOf( "'" , nStart +1 );

    }

    else

    {

        nStop  = str.indexOf( "\"" , nStart +1);

    }


    if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )

    {

        // remove encoding tag from file

        memmove(        &( seq.getArray()[nFound] ) ,

                        &( seq.getArray()[nStop+1]) ,

                        seq.getLength() - nStop -1);

        seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );

    }

}


// Checks, if enough data has been accumulated to recognize the encoding

bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)

{

    const sal_Int8 *pSource = seq.getConstArray();

    bool bCheckIfFirstClosingBracketExists = false;


    if( seq.getLength() < 8 ) {

        // no recognition possible, when less than 8 bytes are available

        return false;

    }


    if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 5 ) ) {

        // scan if the <?xml tag finishes within this buffer

        bCheckIfFirstClosingBracketExists = true;

    }

    else if( ('<' == pSource[0] || '<' == pSource[2] ) &&

             ('?' == pSource[4] || '?' == pSource[6] ) )

    {

        // check for utf-16

        bCheckIfFirstClosingBracketExists = true;

    }

    else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&

             ( '?' == pSource[5] || '?' == pSource[7] ) )

    {

        // check for

        bCheckIfFirstClosingBracketExists = true;

    }


    if( bCheckIfFirstClosingBracketExists )

    {

        // whole <?xml tag is valid

        return std::find(seq.begin(), seq.end(), '>') != seq.end();

    }


    // No <? tag in front, no need for a bigger buffer

    return true;

}


bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )

{

    const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );

    bool bReturn = true;


    if( seq.getLength() < 4 ) {

        // no recognition possible, when less than 4 bytes are available

        return false;

    }


    // first level : detect possible file formats

    if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<const char *>(pSource), "<?xml", 5)) {

        // scan for encoding

        OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );


        // cut sequence to first line break

        //find first line break;

        int nMax = str.indexOf( 10 );

        if( nMax >= 0 )

        {

            str = str.copy( 0 , nMax );

        }


        int nFound = str.indexOf( " encoding" );

        if( nFound >= 0 ) {

            int nStop;

            int nStart = str.indexOf( "\"" , nFound );

            if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )

            {

                nStart = str.indexOf( "'" , nFound );

                nStop  = str.indexOf( "'" , nStart +1 );

            }

            else

            {

                nStop  = str.indexOf( "\"" , nStart +1);

            }

            if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )

            {

                // encoding found finally

                m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );

            }

        }

    }

    else if( 0xFE == pSource[0] &&

             0xFF == pSource[1] ) {

        // UTF-16 big endian

        // conversion is done so that encoding information can be easily extracted

        m_sEncoding = "utf-16";

    }

    else if( 0xFF == pSource[0] &&

             0xFE == pSource[1] ) {

        // UTF-16 little endian

        // conversion is done so that encoding information can be easily extracted

        m_sEncoding = "utf-16";

    }

    else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {

        // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)

        // The byte order mark is simply added


        // simply add the byte order mark !

        seq.realloc( seq.getLength() + 2 );

        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );

        reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;

        reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;


        m_sEncoding = "utf-16";

    }

    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {

        // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)

        // The byte order mark is simply added


        seq.realloc( seq.getLength() + 2 );

        memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );

        reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;

        reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;


        m_sEncoding = "utf-16";

    }

    else if( 0xEF == pSource[0] &&

             0xBB == pSource[1] &&

             0xBF == pSource[2] )

    {

        // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order

        // The BOM is removed.

        memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );

        seq.realloc( seq.getLength() - 3 );

        m_sEncoding = "utf-8";

    }

    else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {

        // UCS-4 big endian

        m_sEncoding = "ucs-4";

    }

    else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {

        // UCS-4 little endian

        m_sEncoding = "ucs-4";

    }

/* TODO: no need to test for the moment since we return sal_False like default case anyway

    else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&

             0xa7 == static_cast<unsigned char> (pSource[2]) &&

             0x94 == static_cast<unsigned char> (pSource[3]) ) {

        // EBCDIC

        bReturn = sal_False;   // must be extended

    }

*/

    else {

        // other

        // UTF8 is directly recognized by the parser.

        bReturn = false;

    }


    return bReturn;

}


void XMLFile2UTFConverter::initializeDecoding()

{


    if( !m_sEncoding.isEmpty() )

    {

        rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );

        if( encoding != RTL_TEXTENCODING_UTF8 )

        {

            m_pText2Unicode = std::make_unique<Text2UnicodeConverter>( m_sEncoding );

            m_pUnicode2Text = std::make_unique<Unicode2TextConverter>( RTL_TEXTENCODING_UTF8 );

        }

    }

}


// Text2UnicodeConverter


/**
 * Sets up a converter from the encoding named by @p sEncoding to Unicode.
 *
 * @param sEncoding  MIME charset name; when it maps to
 *                   RTL_TEXTENCODING_DONTKNOW the object is left
 *                   uninitialised and flagged as unable to continue
 */
Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
    : m_convText2Unicode(nullptr)
    , m_contextText2Unicode(nullptr)
{
    const rtl_TextEncoding eResolved = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );

    if( eResolved != RTL_TEXTENCODING_DONTKNOW )
    {
        // known encoding: create converter and context
        init( eResolved );
        return;
    }

    // unknown encoding name: no converter is created
    m_bCanContinue = false;
    m_bInitialized = false;
}


/**
 * Releases the conversion context and the converter, if they were created.
 */
Text2UnicodeConverter::~Text2UnicodeConverter()
{
    if( m_bInitialized )
    {
        rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
        // m_convText2Unicode was obtained from
        // rtl_createTextToUnicodeConverter(), so it has to be released with
        // the text-to-unicode destroy function, not the unicode-to-text one.
        rtl_destroyTextToUnicodeConverter( m_convText2Unicode );
    }
}


/**
 * Creates the text-to-unicode converter and its context for @p encoding.
 *
 * If no converter can be created, the object is flagged as uninitialised
 * and unable to continue, so that neither convert() nor the destructor
 * operate on a null converter handle.
 *
 * @param encoding  source text encoding
 */
void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
{
    m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
    if( !m_convText2Unicode )
    {
        // no converter available for this encoding
        m_bCanContinue = false;
        m_bInitialized = false;
        return;
    }

    m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );

    m_bCanContinue = true;
    m_bInitialized = true;
}


/**
 * Converts a chunk of encoded text to Unicode.
 *
 * Bytes left over from the previous call (kept in m_seqSource) are put in
 * front of @p seqText. Bytes that cannot be converted yet because the source
 * buffer ended too early are stored in m_seqSource for the next call.
 *
 * @param seqText  next chunk of encoded input
 * @return the converted characters
 */
Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
{
    // total input size: carried-over remainder plus the new chunk
    sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();

    // start with one target character per source byte; grown below if needed
    Sequence<sal_Unicode> seqUnicode ( nSourceSize );

    const sal_Int8 *pbSource = seqText.getConstArray();
    std::unique_ptr<sal_Int8[]> pbTempMem;

    if( m_seqSource.hasElements() )
    {
        // join the old remainder and the new bytes in one temporary buffer
        const sal_Int32 nCarried = m_seqSource.getLength();
        pbTempMem.reset(new sal_Int8[ nSourceSize ]);
        memcpy( pbTempMem.get() , m_seqSource.getConstArray() , nCarried );
        memcpy( pbTempMem.get() + nCarried , seqText.getConstArray() , seqText.getLength() );
        pbSource = pbTempMem.get();

        // the remainder has been consumed
        m_seqSource = Sequence< sal_Int8 >();
    }

    sal_uInt32 uiInfo;
    sal_Size nSrcCvtBytes   = 0;
    sal_Size nTargetCount   = 0;
    sal_Size nSourceCount   = 0;

    bool bTargetTooSmall;
    do
    {
        /* All invalid characters are transformed to the unicode undefined char */
        nTargetCount +=     rtl_convertTextToUnicode(
                                    m_convText2Unicode,
                                    m_contextText2Unicode,
                                    reinterpret_cast<const char *>( pbSource + nSourceCount ),
                                    nSourceSize - nSourceCount ,
                                    seqUnicode.getArray() + nTargetCount,
                                    seqUnicode.getLength() - nTargetCount,
                                    RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
                                    RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
                                    RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
                                    &uiInfo,
                                    &nSrcCvtBytes );
        nSourceCount += nSrcCvtBytes;

        bTargetTooSmall = ( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL ) != 0;
        if( bTargetTooSmall )
        {
            // target buffer exhausted: double it and continue where we stopped
            seqUnicode.realloc( seqUnicode.getLength() * 2 );
        }
    }
    while( bTargetTooSmall );

    if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL )
    {
        // keep the unconverted tail for the next call
        m_seqSource.realloc( nSourceSize - nSourceCount );
        memcpy( m_seqSource.getArray() , pbSource + nSourceCount , nSourceSize-nSourceCount );
    }

    // shrink to the number of characters actually produced
    seqUnicode.realloc( nTargetCount );

    return seqUnicode;
}


// Unicode2TextConverter


/**
 * Creates a unicode-to-text converter and its context for @p encoding.
 *
 * @param encoding  target text encoding
 */
Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
    : m_convUnicode2Text( rtl_createUnicodeToTextConverter( encoding ) )
    // the context is created from the converter initialised just above
    , m_contextUnicode2Text( rtl_createUnicodeToTextContext( m_convUnicode2Text ) )
{
}


/**
 * Releases the conversion context first, then the converter it belongs to.
 */
Unicode2TextConverter::~Unicode2TextConverter()
{
    // context before converter
    rtl_destroyUnicodeToTextContext( m_convUnicode2Text, m_contextUnicode2Text );
    rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
}


/**
 * Converts Unicode characters to the target text encoding.
 *
 * Characters left over from the previous call (kept in m_seqSource) are put
 * in front of the new input. Characters that cannot be converted yet
 * because the source buffer ended too early are stored in m_seqSource for
 * the next call.
 *
 * @param puSource     characters to convert
 * @param nSourceSize  number of characters at @p puSource
 * @return the encoded bytes
 */
Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
{
    std::unique_ptr<sal_Unicode[]> puTempMem;

    if( m_seqSource.hasElements() )
    {
        // For surrogates !
        // A surrogate pair is expected to be split between two calls only
        // rarely, so the extra copy into a joined buffer is acceptable.
        const sal_Int32 nPending = m_seqSource.getLength();
        puTempMem.reset(new sal_Unicode[ nSourceSize + nPending ]);
        memcpy( puTempMem.get() ,
                m_seqSource.getConstArray() ,
                nPending * sizeof( sal_Unicode ) );
        memcpy( puTempMem.get() + nPending ,
                puSource ,
                nSourceSize*sizeof( sal_Unicode ) );
        puSource = puTempMem.get();
        nSourceSize += nPending;

        // the remainder has been consumed
        m_seqSource = Sequence< sal_Unicode > ();
    }

    // Start with three target bytes per source character (an upper bound
    // when converting to utf8, the most common target); grown below if
    // the converter still reports a too small target buffer.
    sal_Int32 nSeqSize =  nSourceSize * 3;
    Sequence<sal_Int8>  seqText( nSeqSize );
    char *pTarget = reinterpret_cast<char *>(seqText.getArray());

    sal_Size nTargetCount = 0;
    sal_Size nSourceCount = 0;
    sal_uInt32 uiInfo;
    sal_Size nSrcCvtChars;

    bool bTargetTooSmall;
    do
    {
        nTargetCount += rtl_convertUnicodeToText(
                                    m_convUnicode2Text,
                                    m_contextUnicode2Text,
                                    puSource + nSourceCount,
                                    nSourceSize - nSourceCount ,
                                    pTarget + nTargetCount,
                                    nSeqSize - nTargetCount,
                                    RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
                                    RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
                                    &uiInfo,
                                    &nSrcCvtChars);
        nSourceCount += nSrcCvtChars;

        bTargetTooSmall = ( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) != 0;
        if( bTargetTooSmall )
        {
            // double the target buffer and refresh the pointer into it
            nSeqSize = nSeqSize *2;
            seqText.realloc( nSeqSize );
            pTarget = reinterpret_cast<char *>(seqText.getArray());
        }
    }
    while( bTargetTooSmall );

    // for surrogates
    if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL )
    {
        // keep the unconverted tail for the next call
        m_seqSource.realloc( nSourceSize - nSourceCount );
        memcpy( m_seqSource.getArray() ,
                puSource + nSourceCount,
                (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
    }

    // shrink to the number of bytes actually produced
    seqText.realloc( nTargetCount );

    return seqText;
}


}


/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

com::sun::star::uno::Sequence< sal_Int8 >

sax_expatwrap::Text2UnicodeConverter::m_seqSource
css::uno::Sequence< sal_Int8 > m_seqSource
Definition: xml2utf.hxx:51

sax_expatwrap::Text2UnicodeConverter::init
void init(rtl_TextEncoding encoding)
Definition: xml2utf.cxx:353

sax_expatwrap::Text2UnicodeConverter::m_contextText2Unicode
rtl_TextToUnicodeContext m_contextText2Unicode
Definition: xml2utf.hxx:48

sax_expatwrap::Text2UnicodeConverter::m_bInitialized
bool m_bInitialized
Definition: xml2utf.hxx:50

sax_expatwrap::Text2UnicodeConverter::m_convText2Unicode
rtl_TextToUnicodeConverter m_convText2Unicode
Definition: xml2utf.hxx:47

sax_expatwrap::Text2UnicodeConverter::Text2UnicodeConverter
Text2UnicodeConverter(const OString &sEncoding)
Definition: xml2utf.cxx:328

sax_expatwrap::Text2UnicodeConverter::m_bCanContinue
bool m_bCanContinue
Definition: xml2utf.hxx:49

sax_expatwrap::Text2UnicodeConverter::~Text2UnicodeConverter
~Text2UnicodeConverter()
Definition: xml2utf.cxx:344

sax_expatwrap::Text2UnicodeConverter::convert
css::uno::Sequence< sal_Unicode > convert(const css::uno::Sequence< sal_Int8 > &)
Definition: xml2utf.cxx:363

sax_expatwrap::Unicode2TextConverter::convert
css::uno::Sequence< sal_Int8 > convert(const sal_Unicode *, sal_Int32 nLength)
Definition: xml2utf.cxx:441

sax_expatwrap::Unicode2TextConverter::~Unicode2TextConverter
~Unicode2TextConverter()
Definition: xml2utf.cxx:434

sax_expatwrap::Unicode2TextConverter::Unicode2TextConverter
Unicode2TextConverter(rtl_TextEncoding encoding)
Definition: xml2utf.cxx:427

sax_expatwrap::Unicode2TextConverter::m_convUnicode2Text
rtl_UnicodeToTextConverter m_convUnicode2Text
Definition: xml2utf.hxx:68

sax_expatwrap::Unicode2TextConverter::m_contextUnicode2Text
rtl_UnicodeToTextContext m_contextUnicode2Text
Definition: xml2utf.hxx:69

sax_expatwrap::Unicode2TextConverter::m_seqSource
css::uno::Sequence< sal_Unicode > m_seqSource
Definition: xml2utf.hxx:70

sax_expatwrap::XMLFile2UTFConverter::m_in
css::uno::Reference< css::io::XInputStream > m_in
Definition: xml2utf.hxx:117

sax_expatwrap::XMLFile2UTFConverter::scanForEncoding
bool scanForEncoding(css::uno::Sequence< sal_Int8 > &seq)
Definition: xml2utf.cxx:197

sax_expatwrap::XMLFile2UTFConverter::readAndConvert
sal_Int32 readAndConvert(css::uno::Sequence< sal_Int8 > &seq, sal_Int32 nMaxToRead)
Definition: xml2utf.cxx:39

sax_expatwrap::XMLFile2UTFConverter::m_sEncoding
OString m_sEncoding
Definition: xml2utf.hxx:120

sax_expatwrap::XMLFile2UTFConverter::initializeDecoding
void initializeDecoding()
Definition: xml2utf.cxx:310

sax_expatwrap::XMLFile2UTFConverter::m_pText2Unicode
std::unique_ptr< Text2UnicodeConverter > m_pText2Unicode
Definition: xml2utf.hxx:122

sax_expatwrap::XMLFile2UTFConverter::removeEncoding
static void removeEncoding(css::uno::Sequence< sal_Int8 > &seq)
Definition: xml2utf.cxx:117

sax_expatwrap::XMLFile2UTFConverter::m_bStarted
bool m_bStarted
Definition: xml2utf.hxx:119

sax_expatwrap::XMLFile2UTFConverter::m_pUnicode2Text
std::unique_ptr< Unicode2TextConverter > m_pUnicode2Text
Definition: xml2utf.hxx:123

sax_expatwrap::XMLFile2UTFConverter::isEncodingRecognizable
static bool isEncodingRecognizable(const css::uno::Sequence< sal_Int8 > &seq)
Definition: xml2utf.cxx:160

com::sun::star::io

com::sun::star::uno

sax_expatwrap
Definition: xml2utf.cxx:37

types.h

sal_uInt8
unsigned char sal_uInt8

sal_Unicode
sal_uInt16 sal_Unicode

sal_Int8
signed char sal_Int8

xml2utf.hxx

nLength
sal_Int32 nLength