i18npool/html/breakiterator__unicode_8cxx_source.html

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/*

 * This file is part of the LibreOffice project.

 *

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/.

 *

 * This file incorporates work covered by the following license notice:

 *

 *   Licensed to the Apache Software Foundation (ASF) under one or more

 *   contributor license agreements. See the NOTICE file distributed

 *   with this work for additional information regarding copyright

 *   ownership. The ASF licenses this file to you under the Apache

 *   License, Version 2.0 (the "License"); you may not use this file

 *   except in compliance with the License. You may obtain a copy of

 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .

 */


#include <breakiterator_unicode.hxx>

#include <cppuhelper/supportsservice.hxx>

#include <localedata.hxx>

#include <i18nlangtag/languagetag.hxx>

#include <i18nlangtag/languagetagicu.hxx>

#include <unicode/uchar.h>

#include <unicode/locid.h>

#include <unicode/rbbi.h>

#include <unicode/udata.h>

#include <rtl/strbuf.hxx>

#include <rtl/ustring.hxx>


#include <com/sun/star/i18n/BreakType.hpp>

#include <com/sun/star/i18n/CharacterIteratorMode.hpp>

#include <com/sun/star/i18n/WordType.hpp>


// Data blob defined elsewhere; it is handed to ICU via udata_setAppData()
// under the package name "OpenOffice" in loadICUBreakIterator() so that
// udata_open("OpenOffice", "brk", ...) can resolve break rule data from it.
U_CDECL_BEGIN

extern const char OpenOffice_dat[];

U_CDECL_END


using namespace ::com::sun::star;

using namespace ::com::sun::star::i18n;

using namespace ::com::sun::star::lang;


namespace i18npool {


// Cache map of breakiterators, keyed by the OString keys built in
// loadICUBreakIterator(). The cached values carry state information (the

// iterator and the text it is currently set to), so the map has to be

// thread_local.

thread_local static BreakIterator_Unicode::BIMap theBIMap;


BreakIterator_Unicode::BreakIterator_Unicode()

    : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" )    // implementation name

    , lineRule( "line" )

    , icuBI( nullptr )

{

}


// Nothing to release by hand; members clean up after themselves.
BreakIterator_Unicode::~BreakIterator_Unicode() = default;


namespace {

// Thin subclass of icu::RuleBasedBreakIterator. Its only member is a public
// constructor that forwards a UDataMemory image and the status argument to
// the base class constructor.
class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
{
public:
    OOoRuleBasedBreakIterator(UDataMemory* image, UErrorCode &status)
        : icu::RuleBasedBreakIterator(image, status)
    {
    }
};

}


// Loads the ICU breakiterator on demand and points it at rText.
//
// Selects the BI_Data slot matching rBreakType (character, words[nWordType],
// sentence or line), stores its address in icuBI, makes sure that slot holds
// a break iterator for rLocale (taken from the thread_local cache theBIMap
// where possible, created otherwise), and finally sets the iterator's text
// to rText unless it is already set to the very same string buffer.
//
// rLocale     locale the iterator is wanted for
// rBreakType  one of the LOAD_*_BREAKITERATOR constants
// nWordType   a WordType constant; selects the words[] slot and the rule
//             for LOAD_WORD_BREAKITERATOR, and is always part of the cache key
// rule        name of the rule data to load, or nullptr for none; overridden
//             for all word types except WordType::ANY_WORD
// rText       text the iterator is to operate on
//
// Throws uno::RuntimeException if registering the application data fails,
// if no iterator could be obtained, or if ICU fails to accept the text.
void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,

        sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText)

{

    bool bNewBreak = false;

    UErrorCode status = U_ZERO_ERROR;

    // breakType is the index into the locale's break rule sequence (see
    // breakRules[breakType] below) and also part of the cache keys.
    sal_Int16 breakType = 0;

    switch (rBreakType) {

        case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;

        case LOAD_WORD_BREAKITERATOR:

            assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);

            icuBI=&words[nWordType];

            switch (nWordType) {

                // ANY_WORD keeps the rule passed in by the caller and breakType 0.
                case WordType::ANY_WORD: break; // odd but previous behavior

                case WordType::ANYWORD_IGNOREWHITESPACES:

                    breakType = 0; rule = "edit_word"; break;

                case WordType::DICTIONARY_WORD:

                    breakType = 1; rule = "dict_word"; break;

                default:

                case WordType::WORD_COUNT:

                    breakType = 2; rule = "count_word"; break;

            }

            break;

        case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;

        case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;

    }


    // Using the cache map prevents accessing the file system for each

    // udata_open() where ICU tries first files then data objects. And that for

    // two fallbacks worst case... for each new allocated EditEngine, layout

    // cell, ... *ouch*  Also non-rule locale based iterators can be mapped.

    // This also speeds up loading iterators for alternating or generally more

    // than one language/locale in that iterators are not constructed and

    // destroyed en masse.

    // Four possible keys, locale rule based with break type, locale rule based

    // only, rule based only, locale based with break type. A fifth global key

    // for the initial lookup.

    // Multiple global keys may map to identical value data.

    // All enums used here should be in the range 0..9 so assert that and avoid

    // expensive numeric conversion in append() for faster construction of the

    // always used global key.

    assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);

    const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());

    OStringBuffer aKeyBuf(64);

    aKeyBuf.append( aLangtagStr + ";" );

    if (rule)

        aKeyBuf.append(rule);

    aKeyBuf.append(";" + OStringChar(static_cast<char>('0'+breakType)) + ";"

        + OStringChar(static_cast<char>('0'+rBreakType)) + ";"

        + OStringChar( static_cast<char>('0'+nWordType)));

    // langtag;rule;breakType;rBreakType;nWordType

    const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());


    // Only do any lookup or loading work if the selected slot does not
    // already hold an iterator for exactly this global key.
    if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)

    {


        // First try: the global key in the cache.
        auto aMapIt( theBIMap.find( aBIMapGlobalKey));

        bool bInMap = (aMapIt != theBIMap.end());

        if (bInMap)

            icuBI->mpValue = aMapIt->second;

        else

            icuBI->mpValue.reset();


        // Second try: rule based iterators from the "OpenOffice" data. The
        // do { } while (false) only exists so that "break" can leave early
        // once a cached value has been found.
        if (!bInMap && rule)

            do

            {

                const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);


                status = U_ZERO_ERROR;

                udata_setAppData("OpenOffice", OpenOffice_dat, &status);

                if ( !U_SUCCESS(status) ) throw uno::RuntimeException();


                std::shared_ptr<OOoRuleBasedBreakIterator> rbi;


                // The locale data names a rule file for this break type.
                if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())

                {

                    // langtag;rule;breakType

                    const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));

                    aMapIt = theBIMap.find( aBIMapRuleTypeKey);

                    bInMap = (aMapIt != theBIMap.end());

                    if (bInMap)

                    {

                        icuBI->mpValue = aMapIt->second;

                        icuBI->maBIMapKey = aBIMapGlobalKey;

                        theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));

                        break;  // do

                    }


                    // Here the result of udata_open() is passed on without a
                    // separate check; only the final status is inspected.
                    rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk",

                        OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);


                    if (U_SUCCESS(status))

                    {

                        icuBI->mpValue = std::make_shared<BI_ValueData>();

                        icuBI->mpValue->mpBreakIterator = rbi;

                        theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));

                    }

                    else

                    {

                        rbi.reset();

                    }

                }

                // Use ICU's own breakiterator (created further below) for
                // Thai (th), Lao (lo), Tibetan (bo), Dzongkha (dz) and Khmer
                // (km): these languages skip the rule file lookup here.

                else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")

                {

                    // language;rule (not langtag, unless we'd actually load such)

                    OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());

                    const OString aBIMapRuleKey( aLanguage + ";" + rule);

                    aMapIt = theBIMap.find( aBIMapRuleKey);

                    bInMap = (aMapIt != theBIMap.end());

                    if (bInMap)

                    {

                        icuBI->mpValue = aMapIt->second;

                        icuBI->maBIMapKey = aBIMapGlobalKey;

                        theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));

                        break;  // do

                    }


                    // Try data named "<rule>_<language>" first.
                    status = U_ZERO_ERROR;

                    OString aUDName = OString::Concat(rule) + "_" + aLanguage;

                    UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);

                    if( U_SUCCESS(status) )

                        rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);

                    if ( U_SUCCESS(status) )

                    {

                        icuBI->mpValue = std::make_shared<BI_ValueData>();

                        icuBI->mpValue->mpBreakIterator = rbi;

                        theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));

                    }

                    else

                    {

                        rbi.reset();


                        // Fall back to data named just "<rule>".
                        // ;rule (only)

                        const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule);

                        aMapIt = theBIMap.find( aBIMapRuleOnlyKey);

                        bInMap = (aMapIt != theBIMap.end());

                        if (bInMap)

                        {

                            icuBI->mpValue = aMapIt->second;

                            icuBI->maBIMapKey = aBIMapGlobalKey;

                            theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));

                            break;  // do

                        }


                        status = U_ZERO_ERROR;

                        pUData = udata_open("OpenOffice", "brk", rule, &status);

                        if( U_SUCCESS(status) )

                            rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);

                        if ( U_SUCCESS(status) )

                        {

                            icuBI->mpValue = std::make_shared<BI_ValueData>();

                            icuBI->mpValue->mpBreakIterator = rbi;

                            theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));

                        }

                        else

                        {

                            rbi.reset();

                        }

                    }

                }

            } while (false);


        // Last resort: ICU's locale based iterator for the requested type.
        if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)

            do

            {

                // langtag;;;rBreakType (empty rule; empty breakType)

                const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));

                aMapIt = theBIMap.find( aBIMapLocaleTypeKey);

                bInMap = (aMapIt != theBIMap.end());

                if (bInMap)

                {

                    icuBI->mpValue = aMapIt->second;

                    icuBI->maBIMapKey = aBIMapGlobalKey;

                    theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));

                    break;  // do

                }


                icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));

                std::shared_ptr< icu::BreakIterator > pBI;


                status = U_ZERO_ERROR;

                switch (rBreakType) {

                    case LOAD_CHARACTER_BREAKITERATOR:

                        pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );

                        break;

                    case LOAD_WORD_BREAKITERATOR:

                        pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );

                        break;

                    case LOAD_SENTENCE_BREAKITERATOR:

                        pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );

                        break;

                    case LOAD_LINE_BREAKITERATOR:

                        pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );

                        break;

                }

                if ( !U_SUCCESS(status) || !pBI ) {

                    throw uno::RuntimeException();

                }

                icuBI->mpValue = std::make_shared<BI_ValueData>();

                icuBI->mpValue->mpBreakIterator = pBI;

                theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));

            } while (false);

        if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {

            throw uno::RuntimeException();

        }

        icuBI->maBIMapKey = aBIMapGlobalKey;

        // bInMap reflects the most recent cache lookup; when that lookup
        // hit, the global key has already been handled above.
        if (!bInMap)

            theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));

        bNewBreak=true;

    }


    // Nothing more to do if the iterator was not switched and is already
    // set to this very string buffer (pointer comparison, not content).
    if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData))

        return;


    const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());


    status = U_ZERO_ERROR;

    // Passing the previous mpUt lets utext_openUChars() reuse it.
    icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);


    if (!U_SUCCESS(status))

        throw uno::RuntimeException();


    icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);


    if (!U_SUCCESS(status))

        throw uno::RuntimeException();


    // Remember the string so the pointer comparison above can skip this
    // work on the next call with the same text.
    icuBI->mpValue->maICUText = rText;

}


// Moves forward from nStartPos by up to nCount steps and returns the new
// index. In SKIPCELL mode a step is one boundary of the character break
// iterator, otherwise it is one code point. nDone receives the number of
// steps actually taken. If the cell iterator runs out of boundaries the
// text length is returned.
sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
        sal_Int32 nStartPos, const lang::Locale &rLocale,
        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
{
    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL)
    {
        // CHARACTER mode: plain code point stepping, bounded by the text end.
        nDone = 0;
        while (nDone < nCount && nStartPos < Text.getLength())
        {
            Text.iterateCodePoints(&nStartPos);
            ++nDone;
        }
        return nStartPos;
    }

    // CELL mode: let the character break iterator find each next boundary.
    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
    icu::BreakIterator& rCellBI = *character.mpValue->mpBreakIterator;
    nDone = 0;
    while (nDone < nCount)
    {
        nStartPos = rCellBI.following(nStartPos);
        if (nStartPos == icu::BreakIterator::DONE)
            return Text.getLength();
        ++nDone;
    }
    return nStartPos;
}


// Moves backward from nStartPos by up to nCount steps and returns the new
// index. In SKIPCELL mode a step is one boundary of the character break
// iterator, otherwise it is one code point. nDone receives the number of
// steps actually taken. If the cell iterator runs out of boundaries 0 is
// returned.
sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
        sal_Int32 nStartPos, const lang::Locale& rLocale,
        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
{
    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL)
    {
        // for BS to delete one char and CHARACTER mode.
        nDone = 0;
        while (nDone < nCount && nStartPos > 0)
        {
            Text.iterateCodePoints(&nStartPos, -1);
            ++nDone;
        }
        return nStartPos;
    }

    // CELL mode: let the character break iterator find each previous boundary.
    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
    icu::BreakIterator& rCellBI = *character.mpValue->mpBreakIterator;
    nDone = 0;
    while (nDone < nCount)
    {
        nStartPos = rCellBI.preceding(nStartPos);
        if (nStartPos == icu::BreakIterator::DONE)
            return 0;
        ++nDone;
    }
    return nStartPos;
}


// Returns the boundary of the word following nStartPos.
//
// Text       text to search in
// nStartPos  index after which the next word boundary is looked for
// rLocale    locale used to pick the word break iterator
// rWordType  a WordType constant; for ANYWORD_IGNOREWHITESPACES and
//            DICTIONARY_WORD a boundary that lands on white space is skipped
//
// If there is no boundary after nStartPos, or it lies at or beyond the end
// of the text, startPos and endPos of the result are equal. Exceptions
// thrown by loadICUBreakIterator() propagate to the caller.
Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
    const lang::Locale& rLocale, sal_Int16 rWordType )
{
    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);

    Boundary rv;
    rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
    if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
    {
        // Collapse to an empty boundary. This has to read the local rv; the
        // inherited member "result" is not updated by this function and
        // would yield an endPos unrelated to startPos.
        rv.endPos = rv.startPos;
    }
    else {
        // The two word types use different ICU white space predicates.
        if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
             && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
            || (rWordType == WordType::DICTIONARY_WORD
                && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
            rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);

        rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
        if(rv.endPos == icu::BreakIterator::DONE)
            rv.endPos = rv.startPos;
    }
    return rv;
}


// Returns the boundary of the word preceding nStartPos. For
// ANYWORD_IGNOREWHITESPACES and DICTIONARY_WORD a boundary that lands on
// white space is skipped. If there is no preceding boundary, startPos and
// endPos of the result both carry the negative value reported by ICU.
Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
        const lang::Locale& rLocale, sal_Int16 rWordType)
{
    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);

    icu::BreakIterator& rWordBI = *icuBI->mpValue->mpBreakIterator;

    Boundary aBoundary;
    aBoundary.startPos = rWordBI.preceding(nStartPos);
    if (aBoundary.startPos < 0)
    {
        aBoundary.endPos = aBoundary.startPos;
        return aBoundary;
    }

    // The two word types use different ICU white space predicates.
    bool bOnWhiteSpace = false;
    if (rWordType == WordType::ANYWORD_IGNOREWHITESPACES)
        bOnWhiteSpace = u_isUWhiteSpace(Text.iterateCodePoints(&aBoundary.startPos, 0));
    else if (rWordType == WordType::DICTIONARY_WORD)
        bOnWhiteSpace = u_isWhitespace(Text.iterateCodePoints(&aBoundary.startPos, 0));

    if (bOnWhiteSpace)
        aBoundary.startPos = rWordBI.preceding(aBoundary.startPos);

    aBoundary.endPos = rWordBI.following(aBoundary.startPos);
    if (aBoundary.endPos == icu::BreakIterator::DONE)
        aBoundary.endPos = aBoundary.startPos;

    return aBoundary;
}


// Returns the boundary of the word around nPos. When nPos itself is a
// boundary, bDirection decides whether the word after it (true) or before
// it (false) is reported; position 0 always looks forward and the end of
// the text always looks backward. An ICU "DONE" on one side is replaced by
// the value of the other side.
Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
        sal_Int16 rWordType, sal_Bool bDirection )
{
    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);

    icu::BreakIterator& rWordBI = *icuBI->mpValue->mpBreakIterator;
    const sal_Int32 nLen = Text.getLength();

    Boundary aBoundary;
    if (rWordBI.isBoundary(nPos))
    {
        aBoundary.startPos = nPos;
        aBoundary.endPos = nPos;
        const bool bForward = (bDirection || nPos == 0) && nPos < nLen;
        if (bForward)
            aBoundary.endPos = rWordBI.following(nPos);
        else
            aBoundary.startPos = rWordBI.preceding(nPos);
    }
    else if (nPos <= 0)
    {
        // Before the text: report the first word, if there is any text.
        aBoundary.startPos = 0;
        aBoundary.endPos = nLen ? rWordBI.following(sal_Int32(0)) : 0;
    }
    else if (nPos >= nLen)
    {
        // Behind the text: report the last word.
        aBoundary.startPos = rWordBI.preceding(nLen);
        aBoundary.endPos = nLen;
    }
    else
    {
        // Inside a word: its boundaries lie on either side of nPos.
        aBoundary.startPos = rWordBI.preceding(nPos);
        aBoundary.endPos = rWordBI.following(nPos);
    }

    if (aBoundary.startPos == icu::BreakIterator::DONE)
        aBoundary.startPos = aBoundary.endPos;
    else if (aBoundary.endPos == icu::BreakIterator::DONE)
        aBoundary.endPos = aBoundary.startPos;

    return aBoundary;
}


// Returns the index at which the sentence containing nStartPos begins,
// with white space leading that sentence skipped.
//
// Text       text to search in
// nStartPos  index inside the sentence; the end of the text counts as
//            part of the last sentence
// rLocale    locale used to pick the sentence break iterator
//
// Returns 0 for empty text. Exceptions thrown by loadICUBreakIterator()
// propagate to the caller.
sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
        const lang::Locale &rLocale )
{
    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);

    const sal_Int32 len = Text.getLength();
    // Empty text has no code point to look at: the forward
    // iterateCodePoints() below would be asked to read at index == length.
    if (len == 0)
        return 0;

    if (nStartPos == len)
        Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
    if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
        nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);

    // skip preceding space: read ahead while white space is seen, then
    // step back onto the last code point that was read.
    sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
    while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
    Text.iterateCodePoints(&nStartPos, -1);

    return nStartPos;
}


// Returns the index at which the sentence containing nStartPos ends, with
// white space trailing that sentence excluded. The end of the text counts
// as part of the last sentence.
sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
        const lang::Locale &rLocale )
{
    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);

    const sal_Int32 nTextLen = Text.getLength();
    if (nTextLen > 0 && nStartPos == nTextLen)
    {
        // issue #i27703# treat end position as part of last sentence
        Text.iterateCodePoints(&nStartPos, -1);
    }
    nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);

    // Walk backwards over trailing white space, pulling the result along.
    sal_Int32 nProbe = nStartPos;
    while (nProbe > 0)
    {
        if (!u_isWhitespace(Text.iterateCodePoints(&nProbe, -1)))
            break;
        nStartPos = nProbe;
    }

    return nStartPos;
}


// Determines where to break a line that would otherwise run up to nStartPos.
//
// Text          text of the line
// nStartPos     index at or before which the break has to happen
// rLocale       locale used for the line break iterator and hyphenation
// nMinBreakPos  only used when hyphenating: a hyphenation position before
//               it yields a break index of -1
// hOptions      hyphenation options; hyphenation is attempted only if
//               hOptions.rHyphenator is set
// (last param)  user options, not used by this implementation
//
// Returns the break index together with the break type (WORDBOUNDARY or
// HYPHENATION) and, for hyphenation, the hyphenated word. Exceptions thrown
// by loadICUBreakIterator() propagate to the caller.
LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(

        const OUString& Text, sal_Int32 nStartPos,

        const lang::Locale& rLocale, sal_Int32 nMinBreakPos,

        const LineBreakHyphenationOptions& hOptions,

        const LineBreakUserOptions& /*rOptions*/ )

{

    LineBreakResults lbr;


    // At or beyond the end of the text there is nothing to search.
    if (nStartPos >= Text.getLength()) {

        lbr.breakIndex = Text.getLength();

        lbr.breakType = BreakType::WORDBOUNDARY;

        return lbr;

    }


    loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);


    icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();

    // The loop is repeated, from an earlier nStartPos, for as long as the
    // break found is glued to what follows by a Word Joiner (see below).
    bool GlueSpace=true;

    while (GlueSpace) {

        // don't break with Slash U+002F SOLIDUS at end of line; see "else" below!

        // preceding(nStartPos + 1) == nStartPos means that nStartPos itself
        // is a line break opportunity.
        if (pLineBI->preceding(nStartPos + 1) == nStartPos

                && (nStartPos == 0 || Text[nStartPos - 1] != '/'))

        { //Line boundary break

            lbr.breakIndex = nStartPos;

            lbr.breakType = BreakType::WORDBOUNDARY;

        } else if (hOptions.rHyphenator.is()) { //Hyphenation break

            sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;

            pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"


            // Step back over punctuation so that the word lookup starts
            // inside the word itself.
            sal_Int32 nStartPosWordEnd = nStartPos;

            while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation

                nStartPosWordEnd --;


            Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,

                WordType::DICTIONARY_WORD, false);


            // From here on nStartPosWordEnd is reused: first as an index
            // running over punctuation behind the word, then as the count
            // of those punctuation characters.
            nStartPosWordEnd = wBoundary.endPos;

            while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation

                nStartPosWordEnd ++;

            nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;

            if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;

// Note: as a preprocessor macro SPACE stays defined beyond this function.
#define SPACE 0x0020

            // Empty loop body: moves boundary_with_punctuation back over
            // spaces. NOTE(review): the variable is not read again after
            // this loop in this function.
            while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);

            uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,

                        wBoundary.endPos - wBoundary.startPos), rLocale,

                    static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);

            if (aHyphenatedWord.is()) {

                lbr.rHyphenatedWord = aHyphenatedWord;

                if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )

                    lbr.breakIndex = -1;

                else

                    lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();

                lbr.breakType = BreakType::HYPHENATION;


                // check not optimal hyphenation of "word-word" (word with hyphens)

                if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {

                    lbr.breakIndex = pLineBI->current();

                    lbr.breakType = BreakType::WORDBOUNDARY;

                }


            } else {

                // The hyphenator found nothing: use the previous line
                // break opportunity instead.
                lbr.breakIndex = pLineBI->preceding(nStartPos);

                lbr.breakType = BreakType::WORDBOUNDARY;

            }

        } else { //word boundary break

            lbr.breakIndex = pLineBI->preceding(nStartPos);

            lbr.breakType = BreakType::WORDBOUNDARY;


            // Special case for Slash U+002F SOLIDUS in URI and path names.

            // TR14 defines that as SY: Symbols Allowing Break After (A).

            // This is unwanted in paths, see also i#17155

            if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')

            {

                // Look backward and take any whitespace before as a break

                // opportunity. This also glues something like "w/o".

                // Avoid an overly long path and break it as was indicated.

                // Overly long here is arbitrarily defined.

                const sal_Int32 nOverlyLong = 66;

                sal_Int32 nPos = lbr.breakIndex - 1;

                while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)

                {

                    if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))

                    {

                        lbr.breakIndex = nPos + 1;

                        break;

                    }

                }

            }

        }


// Note: as a preprocessor macro WJ stays defined beyond this function.
#define WJ 0x2060   // Word Joiner

        GlueSpace=false;

        // For a word boundary break, walk backwards from the break index
        // over white space and Word Joiners. Seeing a Word Joiner sets
        // GlueSpace, which repeats the outer loop from the position reached.
        if (lbr.breakType == BreakType::WORDBOUNDARY) {

            nStartPos = lbr.breakIndex;

            if (nStartPos >= 0 && Text[nStartPos--] == WJ)

                GlueSpace=true;

            while (nStartPos >= 0 &&

                    (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {

                if (Text[nStartPos--] == WJ)

                    GlueSpace=true;

            }

            // Ran off the start of the text while glued: break at index 0.
            if (GlueSpace && nStartPos < 0)  {

                lbr.breakIndex = 0;

                break;

            }

        }

    }


    return lbr;

}


// Returns the implementation name stored in cBreakIterator as an OUString.
OUString SAL_CALL
BreakIterator_Unicode::getImplementationName()
{
    OUString aImplName = OUString::createFromAscii(cBreakIterator);
    return aImplName;
}


// Delegates to cppu::supportsService() to check rServiceName against this
// object.
sal_Bool SAL_CALL
BreakIterator_Unicode::supportsService(const OUString& rServiceName)
{
    const bool bSupported = cppu::supportsService(this, rServiceName);
    return bSupported;
}


// Returns a one-element sequence holding the name stored in cBreakIterator.
uno::Sequence< OUString > SAL_CALL
BreakIterator_Unicode::getSupportedServiceNames()
{
    return uno::Sequence< OUString >{ OUString::createFromAscii(cBreakIterator) };
}


}


// Exported C entry point: creates a new BreakIterator_Unicode and returns it
// through cppu::acquire(). Both arguments are ignored.
extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
    css::uno::XComponentContext *,
    css::uno::Sequence<css::uno::Any> const &)
{
    auto* pInstance = new i18npool::BreakIterator_Unicode();
    return cppu::acquire(pInstance);
}


/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

com_sun_star_i18n_BreakIterator_Unicode_get_implementation
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_Unicode_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
Definition: breakiterator_unicode.cxx:600

SPACE
#define SPACE

WJ
#define WJ

OpenOffice_dat
U_CDECL_BEGIN const char OpenOffice_dat[]

breakiterator_unicode.hxx

LOAD_WORD_BREAKITERATOR
#define LOAD_WORD_BREAKITERATOR
Definition: breakiterator_unicode.hxx:31

LOAD_SENTENCE_BREAKITERATOR
#define LOAD_SENTENCE_BREAKITERATOR
Definition: breakiterator_unicode.hxx:32

LOAD_LINE_BREAKITERATOR
#define LOAD_LINE_BREAKITERATOR
Definition: breakiterator_unicode.hxx:33

LOAD_CHARACTER_BREAKITERATOR
#define LOAD_CHARACTER_BREAKITERATOR
Definition: breakiterator_unicode.hxx:30

LanguageTagIcu::getIcuLocale
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)

LanguageTag

LanguageTag::convertToBcp47
static OUString convertToBcp47(LanguageType nLangID)

i18npool::BreakIteratorImpl::result
css::i18n::Boundary result
Definition: breakiteratorImpl.hxx:102

i18npool::BreakIterator_Unicode
Definition: breakiterator_unicode.hxx:38

i18npool::BreakIterator_Unicode::previousCharacters
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
Definition: breakiterator_unicode.cxx:329

i18npool::BreakIterator_Unicode::BreakIterator_Unicode
BreakIterator_Unicode()
Definition: breakiterator_unicode.cxx:50

i18npool::BreakIterator_Unicode::loadICUBreakIterator
void loadICUBreakIterator(const css::lang::Locale &rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const char *name, const OUString &rText)
Definition: breakiterator_unicode.cxx:80

i18npool::BreakIterator_Unicode::supportsService
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
Definition: breakiterator_unicode.cxx:585

i18npool::BreakIterator_Unicode::~BreakIterator_Unicode
virtual ~BreakIterator_Unicode() override
Definition: breakiterator_unicode.cxx:57

i18npool::BreakIterator_Unicode::nextCharacters
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
Definition: breakiterator_unicode.cxx:310

i18npool::BreakIterator_Unicode::getLineBreak
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
Definition: breakiterator_unicode.cxx:467

i18npool::BreakIterator_Unicode::words
BI_Data words[4]
Definition: breakiterator_unicode.hxx:96

i18npool::BreakIterator_Unicode::icuBI
struct i18npool::BreakIterator_Unicode::BI_Data * icuBI

i18npool::BreakIterator_Unicode::sentence
struct i18npool::BreakIterator_Unicode::BI_Data sentence

i18npool::BreakIterator_Unicode::endOfSentence
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
Definition: breakiterator_unicode.cxx:451

i18npool::BreakIterator_Unicode::line
struct i18npool::BreakIterator_Unicode::BI_Data line

i18npool::BreakIterator_Unicode::getImplementationName
virtual OUString SAL_CALL getImplementationName() override
Definition: breakiterator_unicode.cxx:579

i18npool::BreakIterator_Unicode::cBreakIterator
const char * cBreakIterator
Definition: breakiterator_unicode.hxx:73

i18npool::BreakIterator_Unicode::getWordBoundary
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
Definition: breakiterator_unicode.cxx:398

i18npool::BreakIterator_Unicode::lineRule
const char * lineRule
Definition: breakiterator_unicode.hxx:73

i18npool::BreakIterator_Unicode::previousWord
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
Definition: breakiterator_unicode.cxx:373

i18npool::BreakIterator_Unicode::BIMap
std::unordered_map< OString, std::shared_ptr< BI_ValueData > > BIMap
Definition: breakiterator_unicode.hxx:103

i18npool::BreakIterator_Unicode::nextWord
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
Definition: breakiterator_unicode.cxx:349

i18npool::BreakIterator_Unicode::character
struct i18npool::BreakIterator_Unicode::BI_Data character

i18npool::BreakIterator_Unicode::getSupportedServiceNames
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
Definition: breakiterator_unicode.cxx:591

i18npool::BreakIterator_Unicode::beginOfSentence
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
Definition: breakiterator_unicode.cxx:432

i18npool::LocaleDataImpl::get
static rtl::Reference< LocaleDataImpl > get()
Definition: localedata.hxx:77

nCount
int nCount

languagetag.hxx

languagetagicu.hxx

nPos
sal_uInt16 nPos

localedata.hxx

com::sun::star::i18n

com::sun::star::lang

com::sun::star

cppu::supportsService
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)

i18npool
Constant values shared between i18npool and, for example, the number formatter.
Definition: breakiterator_cjk.cxx:30

i18npool::theBIMap
static thread_local BreakIterator_Unicode::BIMap theBIMap
Definition: breakiterator_unicode.cxx:48

Length::ch
@ ch

OUStringToOString
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)

SwNodeType::Text
@ Text

i18npool::BreakIterator_Unicode::BI_Data::maBIMapKey
OString maBIMapKey
Definition: breakiterator_unicode.hxx:94

i18npool::BreakIterator_Unicode::BI_Data::mpValue
std::shared_ptr< BI_ValueData > mpValue
Definition: breakiterator_unicode.hxx:93

supportsservice.hxx

sal_Bool
unsigned char sal_Bool