// i18npool/html/textsearch_8cxx_source.html

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/*

 * This file is part of the LibreOffice project.

 *

 * This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/.

 *

 * This file incorporates work covered by the following license notice:

 *

 *   Licensed to the Apache Software Foundation (ASF) under one or more

 *   contributor license agreements. See the NOTICE file distributed

 *   with this work for additional information regarding copyright

 *   ownership. The ASF licenses this file to you under the Apache

 *   License, Version 2.0 (the "License"); you may not use this file

 *   except in compliance with the License. You may obtain a copy of

 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .

 */


#include "textsearch.hxx"

#include "levdis.hxx"

#include <com/sun/star/i18n/BreakIterator.hpp>

#include <com/sun/star/util/SearchAlgorithms2.hpp>

#include <com/sun/star/util/SearchFlags.hpp>

#include <com/sun/star/i18n/WordType.hpp>

#include <com/sun/star/i18n/ScriptType.hpp>

#include <com/sun/star/i18n/CharacterIteratorMode.hpp>

#include <com/sun/star/i18n/CharacterClassification.hpp>

#include <com/sun/star/i18n/KCharacterType.hpp>

#include <com/sun/star/i18n/Transliteration.hpp>

#include <cppuhelper/supportsservice.hxx>

#include <cppuhelper/weak.hxx>

#include <i18nutil/transliteration.hxx>

#include <rtl/ustrbuf.hxx>

#include <sal/log.hxx>


#include <unicode/regex.h>


using namespace ::com::sun::star::util;

using namespace ::com::sun::star::uno;

using namespace ::com::sun::star::lang;

using namespace ::com::sun::star::i18n;

using namespace ::com::sun::star;


const TransliterationFlags COMPLEX_TRANS_MASK =

    TransliterationFlags::ignoreBaFa_ja_JP |

    TransliterationFlags::ignoreIterationMark_ja_JP |

    TransliterationFlags::ignoreTiJi_ja_JP |

    TransliterationFlags::ignoreHyuByu_ja_JP |

    TransliterationFlags::ignoreSeZe_ja_JP |

    TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |

    TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |

    TransliterationFlags::ignoreProlongedSoundMark_ja_JP;


namespace
{

// Reduces a flag set to the bits passed to the second ("complex")
// transliteration instance.
TransliterationFlags maskComplexTrans( TransliterationFlags n )
{
    // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
    // in complex transliteration.
    const TransliterationFlags nKeep =
        COMPLEX_TRANS_MASK |                            // all set ignore bits
        TransliterationFlags::IGNORE_KANA |             // plus IGNORE_KANA bit
        TransliterationFlags::FULLWIDTH_HALFWIDTH;      // and the FULLWIDTH_HALFWIDTH value
    return n & nKeep;
}

// True if at least one bit of COMPLEX_TRANS_MASK is set in n.
bool isComplexTrans( TransliterationFlags n )
{
    return static_cast<bool>(n & COMPLEX_TRANS_MASK);
}

// Removes every COMPLEX_TRANS_MASK bit from n.
TransliterationFlags maskSimpleTrans( TransliterationFlags n )
{
    const TransliterationFlags nSimple = n & ~COMPLEX_TRANS_MASK;
    return nSimple;
}

// True if anything is left of n once the complex bits are removed.
bool isSimpleTrans( TransliterationFlags n )
{
    return static_cast<bool>(maskSimpleTrans(n));
}

// Regex patterns are case sensitive.
// Like maskSimpleTrans(), but additionally drops IGNORE_CASE and replaces a
// pure UPPERCASE_LOWERCASE / LOWERCASE_UPPERCASE value by NONE.
TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
{
    const TransliterationFlags nIgnoreBits =
        (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
    const TransliterationFlags nValue = n & TransliterationFlags::NON_IGNORE_MASK;
    const bool bCaseMapping =
        nValue == TransliterationFlags::UPPERCASE_LOWERCASE ||
        nValue == TransliterationFlags::LOWERCASE_UPPERCASE;
    const TransliterationFlags nEffectiveValue =
        bCaseMapping ? TransliterationFlags::NONE : nValue;
    return (nIgnoreBits | nEffectiveValue) & ~COMPLEX_TRANS_MASK;
}

// True if maskSimpleRegexTrans() leaves any bit set.
bool isSimpleRegexTrans( TransliterationFlags n )
{
    return static_cast<bool>(maskSimpleRegexTrans(n));
}

} // namespace


// Stores the component context and installs default options: ABSOLUTE
// algorithm (both the legacy and the "2" field) with ALL_IGNORE_CASE.
TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
        : m_xContext( rxContext )
{
    SearchOptions2 aDefaults;
    aDefaults.searchFlag = SearchFlags::ALL_IGNORE_CASE;
    aDefaults.algorithmType = SearchAlgorithms_ABSOLUTE;
    aDefaults.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
    // aDefaults.Locale is intentionally left at its default value.

    // Goes through the legacy entry point, which forwards to setOptions2().
    setOptions( aDefaults );
}


// Explicitly releases the owned search helpers: regex matcher first, then the
// weighted Levenshtein helper, then both jump tables.
TextSearch::~TextSearch()
{
    pRegexMatcher.reset();   // compiled regular expression
    pWLD.reset();            // approximate-search helper
    pJumpTable.reset();      // jump table for sSrchStr
    pJumpTable2.reset();     // jump table for sSrchStr2
}


void TextSearch::setOptions2( const SearchOptions2& rOptions )

{

    std::unique_lock g(m_aMutex);


    aSrchPara = rOptions;


    pRegexMatcher.reset();

    pWLD.reset();

    pJumpTable.reset();

    pJumpTable2.reset();

    maWildcardReversePattern.clear();

    maWildcardReversePattern2.clear();

    TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);

    bSearchApostrophe = false;

    bool bReplaceApostrophe = false;

    if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)

    {

        // RESrchPrepare will consider aSrchPara.transliterateFlags when

        // picking the actual regex pattern

        // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting

        // case-insensitivity. Create transliteration instance, if any, without

        // ignore-case so later in TextSearch::searchForward() the string to

        // match is not case-altered, leave case-(in)sensitive to regex engine.

        transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;

    }

    else if ( aSrchPara.searchString.indexOf('\'') > - 1 )

    {

        bSearchApostrophe = true;

        bReplaceApostrophe = aSrchPara.searchString.indexOf(u'\u2019') > -1;

    }


    // Create Transliteration class

    if( isSimpleTrans( transliterateFlags) )

    {

        if( !xTranslit.is() )

            xTranslit.set( Transliteration::create( m_xContext ) );

        xTranslit->loadModule(

             static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),

             aSrchPara.Locale);

    }

    else if( xTranslit.is() )

        xTranslit = nullptr;


    // Create Transliteration for 2<->1, 2<->2 transliteration

    if ( isComplexTrans( transliterateFlags) )

    {

        if( !xTranslit2.is() )

            xTranslit2.set( Transliteration::create( m_xContext ) );

        // Load transliteration module

        xTranslit2->loadModule(

             static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),

             aSrchPara.Locale);

    }


    if ( !xBreak.is() )

        xBreak = css::i18n::BreakIterator::create( m_xContext );


    sSrchStr = aSrchPara.searchString;


    // Transliterate search string.

    if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)

    {

        if (isSimpleRegexTrans(transliterateFlags))

        {

            if (maskSimpleRegexTrans(transliterateFlags) !=

                    maskSimpleTrans(transliterateFlags))

            {

                css::uno::Reference< XExtendedTransliteration > xTranslitPattern(

                         Transliteration::create( m_xContext ));

                if (xTranslitPattern.is())

                {

                    xTranslitPattern->loadModule(

                            static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),

                            aSrchPara.Locale);

                    sSrchStr = xTranslitPattern->transliterateString2String(

                            aSrchPara.searchString, 0, aSrchPara.searchString.getLength());

                }

            }

            else

            {

                if (xTranslit.is())

                    sSrchStr = xTranslit->transliterateString2String(

                            aSrchPara.searchString, 0, aSrchPara.searchString.getLength());

            }

            // xTranslit2 complex transliterated sSrchStr2 is not used in

            // regex, see TextSearch::searchForward() and

            // TextSearch::searchBackward()

        }

    }

    else

    {

        if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )

            sSrchStr = xTranslit->transliterateString2String(

                    aSrchPara.searchString, 0, aSrchPara.searchString.getLength());


        if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )

            sSrchStr2 = xTranslit2->transliterateString2String(

                    aSrchPara.searchString, 0, aSrchPara.searchString.getLength());

    }


    if ( bReplaceApostrophe )

        sSrchStr = sSrchStr.replace(u'\u2019', '\'');


    // Take the new SearchOptions2::AlgorithmType2 field and ignore

    // SearchOptions::algorithmType

    switch( aSrchPara.AlgorithmType2)

    {

        case SearchAlgorithms2::REGEXP:

            fnForward = &TextSearch::RESrchFrwrd;

            fnBackward = &TextSearch::RESrchBkwrd;

            RESrchPrepare( aSrchPara);

            break;


        case SearchAlgorithms2::APPROXIMATE:

            fnForward = &TextSearch::ApproxSrchFrwrd;

            fnBackward = &TextSearch::ApproxSrchBkwrd;


            pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,

                    aSrchPara.insertedChars, aSrchPara.deletedChars,

                    0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );


            nLimit = pWLD->GetLimit();

            break;


        case SearchAlgorithms2::WILDCARD:

            mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);

            mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);

            fnForward = &TextSearch::WildcardSrchFrwrd;

            fnBackward = &TextSearch::WildcardSrchBkwrd;

            break;


        default:

            SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");

            [[fallthrough]];

        case SearchAlgorithms2::ABSOLUTE:

            fnForward = &TextSearch::NSrchFrwrd;

            fnBackward = &TextSearch::NSrchBkwrd;

            break;

    }

}


void TextSearch::setOptions( const SearchOptions& rOptions )

{

    sal_Int16 nAlgorithmType2;

    switch (rOptions.algorithmType)

    {

        case SearchAlgorithms_REGEXP:

            nAlgorithmType2 = SearchAlgorithms2::REGEXP;

            break;

        case SearchAlgorithms_APPROXIMATE:

            nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;

            break;

        default:

            SAL_WARN("i18npool","TextSearch::setOptions - default what?");

            [[fallthrough]];

        case SearchAlgorithms_ABSOLUTE:

            nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;

            break;

    }

    // It would be nice if an inherited struct had a ctor that takes an

    // instance of the object the struct derived from...

    SearchOptions2 aOptions2(

            rOptions.algorithmType,

            rOptions.searchFlag,

            rOptions.searchString,

            rOptions.replaceString,

            rOptions.Locale,

            rOptions.changedChars,

            rOptions.deletedChars,

            rOptions.insertedChars,

            rOptions.transliterateFlags,

            nAlgorithmType2,

            0   // no wildcard search, no escape character...

            );

    setOptions2( aOptions2);

}


// Returns the index of the first element of rOff that is >= nPos, or the
// length of rOff if there is no such element.
static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
{
    const sal_Int32 nCount = rOff.getLength();
    for (sal_Int32 nIdx = 0; nIdx < nCount; ++nIdx)
    {
        if (rOff[nIdx] >= nPos)
            return nIdx;
    }
    return nCount;
}


// Searches searchStr between startPos and endPos with the forward search
// function selected by setOptions2() (fnForward), holding m_aMutex.
//
// If a simple transliteration (xTranslit) is loaded, the text is
// transliterated first, the search runs on the transliterated text and the
// resulting offsets are mapped back to positions in searchStr through the
// offset sequence filled by transliterate().
//
// If a complex transliteration (xTranslit2) is loaded and the algorithm is
// not REGEXP, a second search is run on the xTranslit2-transliterated text
// with bUsePrimarySrchStr == false. That second result is returned when the
// first search found nothing, or when it found something and starts earlier,
// or starts at the same position and ends later ("first and long one").
SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )

{

    std::unique_lock g(m_aMutex);


    SearchResult sres;


    OUString in_str(searchStr);


    // in non-regex mode, allow searching typographical apostrophe with the ASCII one

    // to avoid regression after using automatic conversion to U+2019 during typing in Writer

    bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;


    bUsePrimarySrchStr = true;


    if ( xTranslit.is() )

    {

        // apply normal transliteration (1<->1, 1<->0)


        sal_Int32 nInStartPos = startPos;

        if (pRegexMatcher && startPos > 0)

        {

            // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet

            // transliterate enough of the leading text to allow sensible look-behind assertions.

            // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.

            // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.

            // When search regex doesn't start with an assertion, 3 is to allow startPos to be in

            // the middle of a surrogate pair, preceded by another surrogate pair.

            const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;

            nInStartPos -= std::min(nMaxLeadingLen, startPos);

        }

        sal_Int32 nInEndPos = endPos;

        if (pRegexMatcher && endPos < searchStr.getLength())

        {

            // tdf#65038: ditto for look-ahead assertions

            const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;

            nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);

        }

        // Only the range [nInStartPos, nInEndPos) of searchStr is transliterated.

        css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);

        in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);


        if ( bReplaceApostrophe )

            in_str = in_str.replace(u'\u2019', '\'');


        // JP 20.6.2001: also the start and end positions must be corrected!

        sal_Int32 newStartPos =

            (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );


        sal_Int32 newEndPos = (endPos < searchStr.getLength())

            ? FindPosInSeq_Impl( offset, endPos )

            : in_str.getLength();


        sres = (this->*fnForward)( in_str, newStartPos, newEndPos );


        // Map offsets back to untransliterated string.

        const sal_Int32 nOffsets = offset.getLength();

        if (nOffsets)

        {

            auto sres_startOffsetRange = asNonConstRange(sres.startOffset);

            auto sres_endOffsetRange = asNonConstRange(sres.endOffset);

            // For regex nGroups is the number of groups+1 with group 0 being

            // the entire match.

            const sal_Int32 nGroups = sres.startOffset.getLength();

            for ( sal_Int32 k = 0; k < nGroups; k++ )

            {

                const sal_Int32 nStart = sres.startOffset[k];

                // Result offsets are negative (-1) if a group expression was

                // not matched.

                if (nStart >= 0)

                    sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));

                // JP 20.6.2001: end is ever exclusive and then don't return

                //               the position of the next character - return the

                //               next position behind the last found character!

                //               "a b c" find "b" must return 2,3 and not 2,4!!!

                const sal_Int32 nStop = sres.endOffset[k];

                if (nStop >= 0)

                {

                    if (nStop > 0)

                        sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;

                    else

                        sres_endOffsetRange[k] = offset[0];

                }

            }

        }

    }

    else

    {

        // No simple transliteration: search the (possibly apostrophe-normalized) text as is.

        if ( bReplaceApostrophe )

            in_str = in_str.replace(u'\u2019', '\'');


        sres = (this->*fnForward)( in_str, startPos, endPos );

    }

    // Second pass with the complex transliteration; skipped for regex searches.

    if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)

    {

        SearchResult sres2;


        in_str = searchStr;

        css::uno::Sequence <sal_Int32> offset( in_str.getLength());

        // Unlike the first pass, the whole of searchStr is transliterated here.

        in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );

        // From here on startPos/endPos are positions in the transliterated text.

        if( startPos )

            startPos = FindPosInSeq_Impl( offset, startPos );


        if( endPos < searchStr.getLength() )

            endPos = FindPosInSeq_Impl( offset, endPos );

        else

            endPos = in_str.getLength();

        // Makes the search function use sSrchStr2 and its jump table.

        bUsePrimarySrchStr = false;

        sres2 = (this->*fnForward)( in_str, startPos, endPos );

        auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);

        auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);

        // Map non-zero result offsets back to positions in searchStr.

        for ( int k = 0; k < sres2.startOffset.getLength(); k++ )

        {

            if (sres2.startOffset[k])

                sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;

            if (sres2.endOffset[k])

                sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;

        }


        // pick first and long one

        if ( sres.subRegExpressions == 0)

            return sres2;

        if ( sres2.subRegExpressions == 1)

        {

            if ( sres.startOffset[0] > sres2.startOffset[0])

                return sres2;

            else if ( sres.startOffset[0] == sres2.startOffset[0] &&

                    sres.endOffset[0] < sres2.endOffset[0])

                return sres2;

        }

    }


    return sres;

}


// Backward counterpart of searchForward(): searches searchStr with the
// backward search function selected by setOptions2() (fnBackward), holding
// m_aMutex. Here startPos is the upper and endPos the lower position (the
// transliterated range is [endPos, startPos)).
//
// With a simple transliteration (xTranslit) loaded the search runs on the
// transliterated text and result offsets are mapped back through the offset
// sequence. With a complex transliteration (xTranslit2) loaded and a
// non-REGEXP algorithm, a second search with bUsePrimarySrchStr == false is
// run; its result is returned when the first search found nothing, or when
// it found something and its startOffset[0] is larger, or equal with a
// smaller endOffset[0] ("last and long one").
SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )

{

    std::unique_lock g(m_aMutex);


    SearchResult sres;


    OUString in_str(searchStr);


    // in non-regex mode, allow searching typographical apostrophe with the ASCII one

    // to avoid regression after using automatic conversion to U+2019 during typing in Writer

    bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;


    bUsePrimarySrchStr = true;


    if ( xTranslit.is() )

    {

        // apply only simple 1<->1 transliteration here

        css::uno::Sequence<sal_Int32> offset(startPos - endPos);

        in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );


        if ( bReplaceApostrophe )

            in_str = in_str.replace(u'\u2019', '\'');


        // JP 20.6.2001: also the start and end positions must be corrected!

        sal_Int32 const newStartPos = (startPos < searchStr.getLength())

            ? FindPosInSeq_Impl( offset, startPos )

            : in_str.getLength();


        sal_Int32 const newEndPos =

            (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );


        // TODO: this would need nExtraOffset handling to avoid $ matching

        // if (pRegexMatcher && startPos < searchStr.getLength())

        // but that appears to be impossible with ICU regex


        sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );


        // Map offsets back to untransliterated string.

        const sal_Int32 nOffsets = offset.getLength();

        if (nOffsets)

        {

            auto sres_startOffsetRange = asNonConstRange(sres.startOffset);

            auto sres_endOffsetRange = asNonConstRange(sres.endOffset);

            // For regex nGroups is the number of groups+1 with group 0 being

            // the entire match.

            const sal_Int32 nGroups = sres.startOffset.getLength();

            for ( sal_Int32 k = 0; k < nGroups; k++ )

            {

                const sal_Int32 nStart = sres.startOffset[k];

                // Result offsets are negative (-1) if a group expression was

                // not matched.

                if (nStart >= 0)

                {

                    if (nStart > 0)

                        sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;

                    else

                        sres_startOffsetRange[k] = offset[0];

                }

                // JP 20.6.2001: end is ever exclusive and then don't return

                //               the position of the next character - return the

                //               next position behind the last found character!

                //               "a b c" find "b" must return 2,3 and not 2,4!!!

                const sal_Int32 nStop = sres.endOffset[k];

                if (nStop >= 0)

                    sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));

            }

        }

    }

    else

    {

        // No simple transliteration: search the (possibly apostrophe-normalized) text as is.

        if ( bReplaceApostrophe )

            in_str = in_str.replace(u'\u2019', '\'');


        sres = (this->*fnBackward)( in_str, startPos, endPos );

    }

    // Second pass with the complex transliteration; skipped for regex searches.

    if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )

    {

        SearchResult sres2;


        in_str = searchStr;

        css::uno::Sequence <sal_Int32> offset( in_str.getLength());

        // Unlike the first pass, the whole of searchStr is transliterated here.

        in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);

        // From here on startPos/endPos are positions in the transliterated text.

        if( startPos < searchStr.getLength() )

            startPos = FindPosInSeq_Impl( offset, startPos );

        else

            startPos = in_str.getLength();


        if( endPos )

            endPos = FindPosInSeq_Impl( offset, endPos );

        // Makes the search function use sSrchStr2 and its jump table.

        bUsePrimarySrchStr = false;

        sres2 = (this->*fnBackward)( in_str, startPos, endPos );

        auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);

        auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);

        // Map non-zero result offsets back to positions in searchStr.

        for( int k = 0; k < sres2.startOffset.getLength(); k++ )

        {

            if (sres2.startOffset[k])

                sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;

            if (sres2.endOffset[k])

                sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;

        }


        // pick last and long one

        if ( sres.subRegExpressions == 0 )

            return sres2;

        if ( sres2.subRegExpressions == 1 )

        {

            if ( sres.startOffset[0] < sres2.startOffset[0] )

                return sres2;

            if ( sres.startOffset[0] == sres2.startOffset[0] &&

                    sres.endOffset[0] > sres2.endOffset[0] )

                return sres2;

        }

    }


    return sres;

}


bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const

{

    bool bRet = true;

    if( '\x7f' != rStr[nPos])

    {

        if ( !xCharClass.is() )

             xCharClass = CharacterClassification::create( m_xContext );

        sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,

                aSrchPara.Locale );

        if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |

                        KCharacterType::LETTER ) & nCType ) )

            bRet = false;

    }

    return bRet;

}


// --------- helper methods for Boyer-Moore like text searching ----------

// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available


void TextSearch::MakeForwardTab()

{

    // create the jumptable for the search text


    if( pJumpTable && bIsForwardTab )

    {

        return; // the jumpTable is ok

    }

    bIsForwardTab = true;


    sal_Int32 n, nLen = sSrchStr.getLength();

    pJumpTable.reset( new TextSearchJumpTable );


    for( n = 0; n < nLen - 1; ++n )

    {

        sal_Unicode cCh = sSrchStr[n];

        sal_Int32 nDiff = nLen - n - 1;

        TextSearchJumpTable::value_type aEntry( cCh, nDiff );


        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =

            pJumpTable->insert( aEntry );

        if ( !aPair.second )

            (*(aPair.first)).second = nDiff;

    }

}


void TextSearch::MakeForwardTab2()

{

    // create the jumptable for the search text

    if( pJumpTable2 && bIsForwardTab )

    {

        return;        // the jumpTable is ok

    }

    bIsForwardTab = true;


    sal_Int32 n, nLen = sSrchStr2.getLength();

    pJumpTable2.reset( new TextSearchJumpTable );


    for( n = 0; n < nLen - 1; ++n )

    {

        sal_Unicode cCh = sSrchStr2[n];

        sal_Int32 nDiff = nLen - n - 1;


        TextSearchJumpTable::value_type aEntry( cCh, nDiff );

        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =

            pJumpTable2->insert( aEntry );

        if ( !aPair.second )

            (*(aPair.first)).second = nDiff;

    }

}


void TextSearch::MakeBackwardTab()

{

    // create the jumptable for the search text

    if( pJumpTable && !bIsForwardTab)

    {

        return;   // the jumpTable is ok

    }

    bIsForwardTab = false;


    sal_Int32 n, nLen = sSrchStr.getLength();

    pJumpTable.reset( new TextSearchJumpTable );


    for( n = nLen-1; n > 0; --n )

    {

        sal_Unicode cCh = sSrchStr[n];

        TextSearchJumpTable::value_type aEntry( cCh, n );

        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =

            pJumpTable->insert( aEntry );

        if ( !aPair.second )

            (*(aPair.first)).second = n;

    }

}


void TextSearch::MakeBackwardTab2()

{

    // create the jumptable for the search text

    if( pJumpTable2 && !bIsForwardTab )

    {

        return;    // the jumpTable is ok

    }

    bIsForwardTab = false;


    sal_Int32 n, nLen = sSrchStr2.getLength();

    pJumpTable2.reset( new TextSearchJumpTable );


    for( n = nLen-1; n > 0; --n )

    {

        sal_Unicode cCh = sSrchStr2[n];

        TextSearchJumpTable::value_type aEntry( cCh, n );

        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =

            pJumpTable2->insert( aEntry );

        if ( !aPair.second )

            (*(aPair.first)).second = n;

    }

}


// Looks up the skip distance for cChr in the jump table that belongs to the
// currently active search key (primary or secondary, per bUsePrimarySrchStr).
// Characters not in the table yield the full length of that key.
sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
{
    const bool bPrimary = bUsePrimarySrchStr;
    const TextSearchJumpTable* pTable = bPrimary ? pJumpTable.get() : pJumpTable2.get();
    const OUString& rActiveKey = bPrimary ? sSrchStr : sSrchStr2;

    const auto aHit = pTable->find( cChr );
    if ( aHit != pTable->end() )
        return aHit->second;
    return rActiveKey.getLength();
}


// Literal forward search of the active key (sSrchStr or sSrchStr2, selected
// by bUsePrimarySrchStr) in searchStr, starting at startPos and not letting
// a match extend beyond endPos.
//
// Candidate positions are compared right to left; after each candidate the
// position advances by the jump-table distance (GetDiff) of the text
// character aligned with the last key character.
//
// Returns a result with subRegExpressions == 0 if nothing is found;
// otherwise subRegExpressions == 1, startOffset = { match start } and
// endOffset = { match start + key length }. With SearchFlags::NORM_WORD_ONLY
// set, a match is only accepted if it is bounded on both sides by the search
// range or by a delimiter (see IsDelimiter).
SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )

{

    SearchResult aRet;

    aRet.subRegExpressions = 0;


    OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;


    sal_Int32 nSuchIdx = searchStr.getLength();

    sal_Int32 nEnd = endPos;

    // Nothing to find in an empty text, with an empty key, or with a key longer than the text.

    if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )

        return aRet;


    if( nEnd < sSearchKey.getLength() )   // position inside the search region ?

        return aRet;

    // nEnd becomes the last position at which a match may start.

    nEnd -= sSearchKey.getLength();


    if (bUsePrimarySrchStr)

      MakeForwardTab();                   // create the jumptable

    else

      MakeForwardTab2();


    for (sal_Int32 nCmpIdx = startPos; // start position for the search

            nCmpIdx <= nEnd;

            nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))

    {

        // Compare from the last key character towards the first.

        nSuchIdx = sSearchKey.getLength() - 1;

        while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])

        {

            if( nSuchIdx == 0 )

            {

                // All key characters matched at nCmpIdx.

                if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )

                {

                    sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();

                    bool bAtStart = !nCmpIdx;

                    bool bAtEnd = nFndEnd == endPos;

                    bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );

                    bool bDelimBehind = bAtEnd || IsDelimiter(  searchStr, nFndEnd );

                    //  *       1 -> only one word in the paragraph

                    //  *       2 -> at begin of paragraph

                    //  *       3 -> at end of paragraph

                    //  *       4 -> inside the paragraph

                    if( !(  ( bAtStart && bAtEnd ) ||           // 1

                                ( bAtStart && bDelimBehind ) ||     // 2

                                ( bAtEnd && bDelimBefore ) ||       // 3

                                ( bDelimBefore && bDelimBehind )))  // 4

                        // Not a whole word: leave the compare loop and try the next candidate.

                        break;

                }


                aRet.subRegExpressions = 1;

                aRet.startOffset = { nCmpIdx };

                aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };


                return aRet;

            }

            else

                nSuchIdx--;

        }

    }

    return aRet;

}


// Literal backward search of the active key (sSrchStr or sSrchStr2, selected
// by bUsePrimarySrchStr) in searchStr. nCmpIdx starts at startPos and moves
// towards lower positions; a candidate match occupies the range
// [nCmpIdx - key length, nCmpIdx).
//
// Returns a result with subRegExpressions == 0 if nothing is found;
// otherwise subRegExpressions == 1 with startOffset = { nCmpIdx } and
// endOffset = { nCmpIdx - key length }, i.e. startOffset is the upper and
// endOffset the lower bound of the match. With SearchFlags::NORM_WORD_ONLY
// set, a match is only accepted if it is bounded on both sides by the search
// range or by a delimiter (see IsDelimiter).
SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )

{

    SearchResult aRet;

    aRet.subRegExpressions = 0;


    OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;


    sal_Int32 nSuchIdx = searchStr.getLength();

    sal_Int32 nEnd = endPos;

    // Nothing to find in an empty text, with an empty key, or with a key longer than the text.

    if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)

        return aRet;


    if (bUsePrimarySrchStr)

        MakeBackwardTab();                  // create the jumptable

    else

        MakeBackwardTab2();

    // nEnd becomes the lowest value of nCmpIdx that is still examined.

    if( nEnd == nSuchIdx )                  // end position for the search

        nEnd = sSearchKey.getLength();

    else

        nEnd += sSearchKey.getLength();


    sal_Int32 nCmpIdx = startPos;          // start position for the search


    while (nCmpIdx >= nEnd)

    {

        // Compare the key left to right against the text ending at nCmpIdx.

        nSuchIdx = 0;

        while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==

                searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )

            nSuchIdx++;

        // nSuchIdx reached the key length: every key character matched.

        if( nSuchIdx >= sSearchKey.getLength() )

        {

            if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )

            {

                sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();

                bool bAtStart = !nFndStt;

                bool bAtEnd = nCmpIdx == startPos;

                bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );

                bool bDelimBefore = bAtStart || // begin of paragraph

                    IsDelimiter( searchStr, nFndStt-1 );

                //  *       1 -> only one word in the paragraph

                //  *       2 -> at begin of paragraph

                //  *       3 -> at end of paragraph

                //  *       4 -> inside the paragraph

                if( ( bAtStart && bAtEnd ) ||           // 1

                        ( bAtStart && bDelimBehind ) ||     // 2

                        ( bAtEnd && bDelimBefore ) ||       // 3

                        ( bDelimBefore && bDelimBehind ))   // 4

                {

                    aRet.subRegExpressions = 1;

                    aRet.startOffset = { nCmpIdx };

                    aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };

                    return aRet;

                }

                // Not a whole word: fall through to the skip below and keep searching.

            }

            else

            {

                aRet.subRegExpressions = 1;

                aRet.startOffset = { nCmpIdx };

                aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };

                return aRet;

            }

        }

        // Skip distance is looked up for the text character aligned with the first key character.

        nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );

        // Stop if the skip would move before the beginning of the text.

        if( nCmpIdx < nSuchIdx )

            return aRet;

        nCmpIdx -= nSuchIdx;

    }

    return aRet;

}


/** Compile the regular expression described by the search options.

    The pattern text is the transliterated search string that corresponds to
    the transliteration flags (sSrchStr or sSrchStr2), or the untouched
    rOptions.searchString if neither kind of transliteration applies.

    On success pRegexMatcher holds the compiled matcher with a time limit
    set. If ICU reports a failure, pRegexMatcher is reset to null, which makes
    RESrchFrwrd() and RESrchBkwrd() report "no match".

    @param rOptions  search options supplying the pattern, the search flags
                     and the transliteration flags
 */
void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
{
    TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
    // select the transliterated pattern string
    const OUString& rPatternStr =
        (isSimpleTrans(transliterateFlags) ? sSrchStr
        : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));

    sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
    // map css::util::SearchFlags to ICU uregex.h flags
    // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
    // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
    // REG_NOSUB is not used anywhere => not implemented
    // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
    // LEV_RELAXED is only used for SearchAlgorithm==Approximate
    // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
    // probably because the transliteration flag IGNORE_CASE handles it as well.
    if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
    ||  (transliterateFlags & TransliterationFlags::IGNORE_CASE))
        nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
    UErrorCode nIcuErr = U_ZERO_ERROR;
    // assumption: transliteration didn't mangle regexp control chars
    icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
#ifndef DISABLE_WORDBOUND_EMULATION
    // for convenience specific syntax elements of the old regex engine are emulated
    // NOTE(review): the chevron matchers below are function-local statics and
    // therefore shared by all callers of this function; presumably access is
    // serialized by the callers - confirm.
    // - by replacing < with "word-break followed by a look-ahead word-char"
    static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
    static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
    static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
    aChevronMatcherB.reset( aIcuSearchPatStr);
    aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
    aChevronMatcherB.reset();
    // - by replacing > with "look-behind word-char followed by a word-break"
    static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
    static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
    static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
    aChevronMatcherE.reset( aIcuSearchPatStr);
    aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
    aChevronMatcherE.reset();
#endif
    pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
    // ICU warning codes are non-zero but still denote success, so only a real
    // failure may invalidate the freshly compiled matcher.
    if (U_FAILURE(nIcuErr))
    {
        SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
        pRegexMatcher.reset();
    }
    else
    {
        // Pathological patterns may result in exponential run time making the
        // application appear to be frozen. Limit that. Documentation for this
        // call says
        // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
        // "The units of the limit are steps of the match engine.
        // Correspondence with actual processor time will depend on the speed
        // of the processor and the details of the specific pattern, but will
        // typically be on the order of milliseconds."
        // Just what is a good value? 42 is always an answer ... the 23 enigma
        // as well... which on the dev's machine is roughly 50 seconds with the
        // pattern of fdo#70627.
        /* TODO: make this a configuration settable value and possibly take
         * complexity of expression into account and maybe even length of text
         * to be matched; currently (2013-11-25) that is at most one 64k
         * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
        pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
    }
}


/** Look for the next match of the matcher within [nStartPos, nEndPos).

    The region restricts where a match may be found, but anchors refer to the
    whole text and look-ahead/look-behind assertions may inspect text outside
    of the region.

    @param pRegexMatcher  matcher that already has its input text set
    @param nStartPos      start of the region
    @param nEndPos        end of the region
    @param rIcuErr        ICU status, passed on to region() and find()
    @return true if find() reported a match, false otherwise
 */
static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
                          sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
{
    icu::RegexMatcher& rMatcher = *pRegexMatcher;

    rMatcher.region(nStartPos, nEndPos, rIcuErr);
    // Anchors are evaluated against the whole text, not the region.
    rMatcher.useAnchoringBounds(false);
    // Look-ahead/behind assertions may see text outside of the region.
    rMatcher.useTransparentBounds(true);

    const bool bFound = rMatcher.find(rIcuErr);
    if (bFound)
        return true;

    /* TODO: future versions could pass the UErrorCode or translations
     * thereof to the caller, for example to inform the user of
     * U_REGEX_TIME_OUT. The strange thing though is that an error is set
     * only after the second call that returns immediately and not if
     * timeout occurred on the first call?!? */
    SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
    return false;
}


/** Regular expression search in forward direction.

    @param searchStr  text to search in
    @param startPos   first position a match may start at
    @param endPos     end of the search range; clamped to the text length
    @return a result with subRegExpressions == 0 if there is no usable match
            or no compiled matcher, otherwise the offsets of the whole match
            (index 0) followed by those of every capture group
 */
SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
                                      sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aResult;
    aResult.subRegExpressions = 0;
    if (!pRegexMatcher)
        return aResult;

    const sal_Int32 nTextLen = searchStr.getLength();
    if (nTextLen < endPos)
        endPos = nTextLen;

    // Hand the text to the ICU matcher.
    UErrorCode nStatus = U_ZERO_ERROR;
    const icu::UnicodeString aTarget(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
                                     nTextLen);
    pRegexMatcher->reset( aTarget);

    // Advance until an acceptable match shows up.
    while (true)
    {
        if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nStatus))
            return aResult;

        const int nMatchBegin = pRegexMatcher->start( nStatus);
        const int nMatchEnd = pRegexMatcher->end( nStatus);

        // #i118887# zero-length matches, e.g. "a*" in "bc", are ignored ...
        const bool bNonEmpty = nMatchBegin < nMatchEnd;
        // ... except for one behind the string, which is a "$" anchor. Accept
        // it right away instead of matching it again and again until startPos
        // gets there.
        const bool bBehindString = (nMatchBegin == endPos);
        if (bNonEmpty || bBehindString)
            break;

        // Zero-length match elsewhere: retry one position further.
        ++startPos;
        if (startPos >= endPos)
            return aResult;
    }

    // Copy the whole match (group 0) and all capture groups to the result.
    const int nGroups = pRegexMatcher->groupCount();
    aResult.subRegExpressions = nGroups + 1;
    aResult.startOffset.realloc( aResult.subRegExpressions);
    aResult.endOffset.realloc( aResult.subRegExpressions);
    auto pBegins = aResult.startOffset.getArray();
    auto pEnds = aResult.endOffset.getArray();
    for (int nGroup = 0; nGroup <= nGroups; ++nGroup)
    {
        pBegins[nGroup] = pRegexMatcher->start( nGroup, nStatus);
        pEnds[nGroup] = pRegexMatcher->end( nGroup, nStatus);
    }

    return aResult;
}


/** Regular expression search in backward direction.

    There is no backward search in the ICU matcher, so the matches in the
    range are enumerated by forward searches and the last one is reported.

    NOTE: callers pass the positions inverted, i.e. the range searched is
    [endPos, startPos), and the offsets in the result are inverted as well
    (startOffset holds the end of a match, endOffset its start).

    @param searchStr  text to search in
    @param startPos   end of the range to search; clamped to the text length
    @param endPos     start of the range to search
    @return a result with subRegExpressions == 0 if there is no usable match,
            no compiled matcher, or the match details could not be obtained;
            otherwise the offsets of the whole match (index 0) followed by
            those of every capture group
 */
SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
                                      sal_Int32 startPos, sal_Int32 endPos )
{
    // NOTE: for backwards search callers provide startPos/endPos inverted!
    SearchResult aRet;
    aRet.subRegExpressions = 0;
    if( !pRegexMatcher)
        return aRet;

    if( startPos > searchStr.getLength())
        startPos = searchStr.getLength();

    // use the ICU RegexMatcher to find the matches
    // TODO: use ICU's backward searching once it becomes available
    //       as its replacement using forward search is not as good as the real thing
    UErrorCode nIcuErr = U_ZERO_ERROR;
    const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
                                        searchStr.getLength());
    pRegexMatcher->reset( aSearchTargetStr);
    if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
        return aRet;

    // find the last match
    int nLastPos = 0;
    int nFoundEnd = 0;
    int nGoodPos = 0, nGoodEnd = 0;
    bool bFirst = true;
    do {
        nLastPos = pRegexMatcher->start( nIcuErr);
        nFoundEnd = pRegexMatcher->end( nIcuErr);
        if (nLastPos < nFoundEnd)
        {
            // remember last non-zero-length match
            nGoodPos = nLastPos;
            nGoodEnd = nFoundEnd;
        }
        if( nFoundEnd >= startPos)
            break;
        bFirst = false;
        // step over a zero-length match so that the next search makes progress
        if( nFoundEnd == nLastPos)
            ++nFoundEnd;
    } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));

    // Ignore all zero-length matches except "$" anchor on first match.
    if (nGoodPos == nGoodEnd)
    {
        if (bFirst && nLastPos == startPos)
            nGoodPos = nLastPos;
        else
            return aRet;
    }

    // find last match again to get its details
    // If that fails, e.g. because nIcuErr already carries a failure from the
    // enumeration above, start()/end() would only deliver -1; report "no
    // match" instead of a result with bogus offsets.
    if (!lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr))
        return aRet;

    // fill in the details of the last match
    const int nGroupCount = pRegexMatcher->groupCount();
    aRet.subRegExpressions = nGroupCount + 1;
    aRet.startOffset.realloc( aRet.subRegExpressions);
    auto pstartOffset = aRet.startOffset.getArray();
    aRet.endOffset.realloc( aRet.subRegExpressions);
    auto pendOffset = aRet.endOffset.getArray();
    // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
    pstartOffset[0] = pRegexMatcher->end( nIcuErr);
    pendOffset[0]   = pRegexMatcher->start( nIcuErr);
    for( int i = 1; i <= nGroupCount; ++i) {
        pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
        pendOffset[i]   = pRegexMatcher->start( i, nIcuErr);
    }

    return aRet;
}


/** Approximate (similarity) search in forward direction.

    Walks word by word through [startPos, endPos) and reports the first word,
    clipped to that range, for which pWLD->WLD() yields at most nLimit.

    @param searchStr  text to search in
    @param startPos   start of the range to search
    @param endPos     end of the range to search
    @return a result with subRegExpressions == 0 if nothing was found or no
            break iterator is available, otherwise one start/end offset pair
 */
SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
                                          sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aResult;
    aResult.subRegExpressions = 0;

    if (!xBreak.is())
        return aResult;

    Boundary aWord = xBreak->getWordBoundary( searchStr, startPos,
            aSrchPara.Locale,
            WordType::ANYWORD_IGNOREWHITESPACES, true );

    for (;;)
    {
        // Words starting at or behind the end of the range are of no interest.
        if (aWord.startPos >= endPos)
            break;

        // Clip the word to the range to search.
        const sal_Int32 nWordBegin = std::max(aWord.startPos, startPos);
        const sal_Int32 nWordEnd = std::min(aWord.endPos, endPos);

        if (nWordBegin < nWordEnd &&
                pWLD->WLD( searchStr.getStr() + nWordBegin, nWordEnd - nWordBegin ) <= nLimit)
        {
            aResult.subRegExpressions = 1;
            aResult.startOffset = { nWordBegin };
            aResult.endOffset = { nWordEnd };
            break;
        }

        aWord = xBreak->nextWord( searchStr, nWordEnd - 1, aSrchPara.Locale,
                WordType::ANYWORD_IGNOREWHITESPACES);

        // Stop on an empty boundary at the end of the text.
        // #i50244# Also stop on an empty boundary at nWordEnd: in case there
        // is _no_ word (only whitespace) in searchStr, getWordBoundary()
        // returned startPos,startPos and nextWord() does also => don't loop
        // forever.
        const bool bEmptyBoundary = (aWord.startPos == aWord.endPos);
        if (bEmptyBoundary &&
                (aWord.endPos == searchStr.getLength() || aWord.endPos == nWordEnd))
            break;
    }
    return aResult;
}


/** Approximate (similarity) search in backward direction.

    Walks word by word backwards through the range between endPos and
    startPos and reports the first word, clipped to that range, for which
    pWLD->WLD() yields at most nLimit. The offsets in the result are inverted:
    startOffset holds the end of the word, endOffset its start.

    @param searchStr  text to search in
    @param startPos   upper end of the range to search
    @param endPos     lower end of the range to search
    @return a result with subRegExpressions == 0 if nothing was found or no
            break iterator is available, otherwise one start/end offset pair
 */
SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
                                          sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aResult;
    aResult.subRegExpressions = 0;

    if (!xBreak.is())
        return aResult;

    Boundary aWord = xBreak->getWordBoundary( searchStr, startPos,
            aSrchPara.Locale,
            WordType::ANYWORD_IGNOREWHITESPACES, true );

    for (;;)
    {
        // Words ending at or before the lower end of the range are of no interest.
        if (aWord.endPos <= endPos)
            break;

        // Clip the word to the range to search.
        const sal_Int32 nWordBegin = std::max(aWord.startPos, endPos);
        const sal_Int32 nWordEnd = std::min(aWord.endPos, startPos);

        if (nWordBegin < nWordEnd &&
                pWLD->WLD( searchStr.getStr() + nWordBegin, nWordEnd - nWordBegin ) <= nLimit)
        {
            aResult.subRegExpressions = 1;
            aResult.startOffset = { nWordEnd };
            aResult.endOffset = { nWordBegin };
            break;
        }

        // Nothing can precede the start of the text.
        if (nWordBegin == 0)
            break;

        aWord = xBreak->previousWord( searchStr, nWordBegin, aSrchPara.Locale,
                WordType::ANYWORD_IGNOREWHITESPACES);

        // Stop on an empty boundary at the end of the text.
        if (aWord.startPos == aWord.endPos && aWord.endPos == searchStr.getLength())
            break;
    }
    return aResult;
}


namespace {

/** Store a single wildcard match in the search result.

    @param rRes          result to fill
    @param nStartOffset  becomes the only element of rRes.startOffset
    @param nEndOffset    becomes the only element of rRes.endOffset
 */
void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
{
    rRes.endOffset = css::uno::Sequence<sal_Int32>{ nEndOffset };
    rRes.startOffset = css::uno::Sequence<sal_Int32>{ nStartOffset };
    rRes.subRegExpressions = 1;
}

}


/** Wildcard search in forward direction.

    Matches the pattern (sSrchStr or sSrchStr2, selected by
    bUsePrimarySrchStr) against searchStr in the range [nStartPos, nEndPos).
    An unescaped '*' matches any run of characters including none, an
    unescaped '?' matches exactly one code point, and a character preceded by
    mcWildcardEscapeChar is compared literally. If mbWildcardAllowSubstring
    is set, the pattern is treated as if it had a leading '*' and the match
    ends where the pattern is consumed; otherwise the pattern has to match up
    to nEndPos.

    @param searchStr  text to search in
    @param nStartPos  start of the range, inclusive
    @param nEndPos    end of the range, exclusive
    @return a result with subRegExpressions == 0 if there is no match or the
            range is invalid, otherwise one start/end offset pair
 */
SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
{
    SearchResult aRes;
    aRes.subRegExpressions = 0;     // no match
    sal_Int32 nStartOffset = nStartPos;
    sal_Int32 nEndOffset = nEndPos;

    const sal_Int32 nStringLen = searchStr.getLength();

    // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
    // string match with [0,0).
    if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
            (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
        return aRes;

    const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
    const sal_Int32 nPatternLen = rPattern.getLength();

    // Handle special cases empty pattern and/or string outside of the loop to
    // not add performance penalties there and simplify.
    if (nStartPos == nEndPos)
    {
        // An empty range is matched only by a pattern that is empty or
        // consists solely of '*'.
        sal_Int32 i = 0;
        while (i < nPatternLen && rPattern[i] == '*')
            ++i;
        if (i == nPatternLen)
            setWildcardMatch( aRes, nStartOffset, nEndOffset);
        return aRes;
    }

    // Empty pattern does not match any non-empty string.
    if (!nPatternLen)
        return aRes;

    bool bRewind = false;       // true: reuse cPattern instead of reading the next pattern character
    sal_uInt32 cPattern = 0;    // current pattern code point
    sal_Int32 nPattern = 0;     // index behind cPattern in rPattern
    // Index behind the first pattern character (including its escape, if
    // any); compared against nPat below to detect a rewind to the faked '*'.
    sal_Int32 nAfterFakePattern = nPattern;
    if (mbWildcardAllowSubstring)
    {
        // Fake a leading '*' wildcard.
        cPattern = '*';
        bRewind = true;
        // Assume a non-'*' pattern character follows. If it is a '*' instead
        // that will be handled in the loop by setting nPat.
        sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
        if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
            rPattern.iterateCodePoints( &nAfterFakePattern);
    }

    // nString: current index in searchStr; nPat/nStr: pattern and string
    // positions remembered at the last '*' for backtracking (-1: none yet);
    // nLastAsterisk: pattern index behind the last run of '*'.
    sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
    sal_uInt32 cPatternAfterAsterisk = 0;
    bool bEscaped = false, bEscapedAfterAsterisk = false;

    // The loop code tries to avoid costly calls to iterateCodePoints() when
    // possible.

    do
    {
        if (bRewind)
        {
            // Reuse cPattern after '*', nPattern was correspondingly
            // incremented to point behind cPattern.
            bRewind = false;
        }
        else if (nPattern < nPatternLen)
        {
            // nPattern will be incremented by iterateCodePoints().
            cPattern = rPattern.iterateCodePoints( &nPattern);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
            {
                bEscaped = true;
                cPattern = rPattern.iterateCodePoints( &nPattern);
            }
        }
        else
        {
            // A trailing '*' is handled below.
            if (mbWildcardAllowSubstring)
            {
                // If the pattern is consumed and substring match allowed we're good.
                setWildcardMatch( aRes, nStartOffset, nString);
                return aRes;
            }
            else if (nString < nEndPos && nLastAsterisk >= 0)
            {
                // If substring match is not allowed try a greedy '*' match.
                nPattern = nLastAsterisk;
                continue;   // do
            }
            else
                return aRes;
        }

        if (cPattern == '*' && !bEscaped)
        {
            // '*' is one code unit, so not using iterateCodePoints() is ok.
            while (nPattern < nPatternLen && rPattern[nPattern] == '*')
                ++nPattern;

            if (nPattern >= nPatternLen)
            {
                // Last pattern is '*', remaining string matches.
                setWildcardMatch( aRes, nStartOffset, nEndOffset);
                return aRes;
            }

            nLastAsterisk = nPattern;   // Remember last encountered '*'.

            // cPattern will be the next non-'*' character, nPattern
            // incremented.
            cPattern = rPattern.iterateCodePoints( &nPattern);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
            {
                bEscaped = true;
                cPattern = rPattern.iterateCodePoints( &nPattern);
            }

            cPatternAfterAsterisk = cPattern;
            bEscapedAfterAsterisk = bEscaped;
            nPat = nPattern;    // Remember position of pattern behind '*', already incremented.
            nStr = nString;     // Remember the current string to be matched.
        }

        if (nString >= nEndPos)
            // Whatever follows in pattern, string will not match.
            return aRes;

        // nString will be incremented by iterateCodePoints().
        sal_uInt32 cString = searchStr.iterateCodePoints( &nString);

        if ((cPattern != '?' || bEscaped) && cPattern != cString)
        {
            if (nPat == -1)
                // Non-match already without any '*' pattern.
                return aRes;

            // Backtrack: let the last '*' swallow one more string character
            // and retry with the pattern character that followed it.
            bRewind = true;
            nPattern = nPat;                    // Rewind pattern to character behind '*', already incremented.
            cPattern = cPatternAfterAsterisk;
            bEscaped = bEscapedAfterAsterisk;
            searchStr.iterateCodePoints( &nStr);
            nString = nStr;                     // Restore incremented remembered string position.
            if (nPat == nAfterFakePattern)
            {
                // Next start offset will be the next character.
                nStartOffset = nString;
            }
        }
        else
        {
            // An unescaped '?' pattern matched any character, or characters
            // matched. Reset only escaped state.
            bEscaped = false;
        }
    }
    while (nString < nEndPos);

    // The string ended on a mismatch, so the pattern did not match.
    if (bRewind)
        return aRes;

    // Eat trailing '*' pattern that matches anything, including nothing.
    // '*' is one code unit, so not using iterateCodePoints() is ok.
    while (nPattern < nPatternLen && rPattern[nPattern] == '*')
        ++nPattern;

    // A match requires the pattern to be consumed completely.
    if (nPattern == nPatternLen)
        setWildcardMatch( aRes, nStartOffset, nEndOffset);
    return aRes;
}


/** Wildcard search in backward direction.

    Matches the pattern (sSrchStr or sSrchStr2, selected by
    bUsePrimarySrchStr) against searchStr, walking from nStartPos down to
    nEndPos. An unescaped '*' matches any run of characters including none,
    an unescaped '?' matches exactly one code point, and a character escaped
    with mcWildcardEscapeChar is compared literally. If
    mbWildcardAllowSubstring is set, the pattern is treated as if it had a
    trailing '*' and the match ends where the pattern is consumed; otherwise
    the pattern has to match down to nEndPos.

    To read escapes while iterating backwards, a copy of the pattern with
    every escape sequence swapped (escaped character first, escape character
    second) is built once and cached in maWildcardReversePattern or
    maWildcardReversePattern2.

    @param searchStr  text to search in
    @param nStartPos  upper end of the range, exclusive
    @param nEndPos    lower end of the range, inclusive
    @return a result with subRegExpressions == 0 if there is no match or the
            range is invalid, otherwise one start/end offset pair whose start
            offset is not less than its end offset
 */
SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
{
    SearchResult aRes;
    aRes.subRegExpressions = 0;     // no match

    sal_Int32 nStartOffset = nStartPos;
    sal_Int32 nEndOffset = nEndPos;

    const sal_Int32 nStringLen = searchStr.getLength();

    // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
    // string match with (0,0].
    if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
            (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
        return aRes;

    const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
    sal_Int32 nPatternLen = rPattern.getLength();

    // Handle special cases empty pattern and/or string outside of the loop to
    // not add performance penalties there and simplify.
    if (nStartPos == nEndPos)
    {
        // An empty range is matched only by a pattern that is empty or
        // consists solely of '*'.
        sal_Int32 i = 0;
        while (i < nPatternLen && rPattern[i] == '*')
            ++i;
        if (i == nPatternLen)
            setWildcardMatch( aRes, nStartOffset, nEndOffset);
        return aRes;
    }

    // Empty pattern does not match any non-empty string.
    if (!nPatternLen)
        return aRes;

    // Reverse escaped patterns to ease the handling of escapes, keeping escape
    // and following character as one sequence in backward direction.
    if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
            (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
    {
        OUStringBuffer aPatternBuf( rPattern);
        sal_Int32 nIndex = 0;
        while (nIndex < nPatternLen)
        {
            const sal_Int32 nOld = nIndex;
            const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
            // As everywhere else, an escape character of 0 means that there
            // is no escape character at all.
            if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar)
            {
                if (nIndex < nPatternLen)
                {
                    if (nIndex - nOld == 1)
                    {
                        // Simply move code units, we already memorized the one
                        // in 'cu'.
                        const sal_Int32 nOld2 = nIndex;
                        rPattern.iterateCodePoints( &nIndex);
                        for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
                            aPatternBuf[nOld+i] = rPattern[nOld2+i];
                        aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
                    }
                    else
                    {
                        // Copy the escape character code units first in the
                        // unlikely case that it would not be of BMP.
                        assert(nIndex - nOld == 2);  // it's UTF-16, so...
                        sal_Unicode buf[2];
                        buf[0] = rPattern[nOld];
                        buf[1] = rPattern[nOld+1];
                        const sal_Int32 nOld2 = nIndex;
                        rPattern.iterateCodePoints( &nIndex);
                        for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
                            aPatternBuf[nOld+i] = rPattern[nOld2+i];
                        aPatternBuf[nIndex-2] = buf[0];
                        aPatternBuf[nIndex-1] = buf[1];
                    }
                }
                else
                {
                    // Trailing escape would become leading escape, do what?
                    // Eliminate.
                    aPatternBuf.remove( nOld, nIndex - nOld);
                }
            }
        }
        if (bUsePrimarySrchStr)
            maWildcardReversePattern = aPatternBuf.makeStringAndClear();
        else
            maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
    }
    const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
    nPatternLen = rReversePattern.getLength();

    // A pattern consisting only of the escape character was reduced to an
    // empty pattern above. Like any empty pattern it does not match a
    // non-empty string; return before iterating backwards from index 0 of an
    // empty string below.
    if (!nPatternLen)
        return aRes;

    bool bRewind = false;       // true: reuse cPattern instead of reading the previous pattern character
    sal_uInt32 cPattern = 0;    // current pattern code point
    sal_Int32 nPattern = nPatternLen;   // index of cPattern in rReversePattern
    // Index of the last pattern character (including its escape, if any);
    // compared against nPat below to detect a rewind to the faked '*'.
    sal_Int32 nAfterFakePattern = nPattern;
    if (mbWildcardAllowSubstring)
    {
        // Fake a trailing '*' wildcard.
        cPattern = '*';
        bRewind = true;
        // Assume a non-'*' pattern character follows. If it is a '*' instead
        // that will be handled in the loop by setting nPat.
        sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
        if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
            rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
    }

    // nString: current index in searchStr; nPat/nStr: pattern and string
    // positions remembered at the last '*' for backtracking (-1: none yet);
    // nLastAsterisk: pattern index before the last run of '*'.
    sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
    sal_uInt32 cPatternAfterAsterisk = 0;
    bool bEscaped = false, bEscapedAfterAsterisk = false;

    // The loop code tries to avoid costly calls to iterateCodePoints() when
    // possible.

    do
    {
        if (bRewind)
        {
            // Reuse cPattern after '*', nPattern was correspondingly
            // decremented to point before cPattern.
            bRewind = false;
        }
        else if (nPattern > 0)
        {
            // nPattern will be decremented by iterateCodePoints().
            cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
            {
                bEscaped = true;
                cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            }
        }
        else
        {
            // A trailing '*' is handled below.
            if (mbWildcardAllowSubstring)
            {
                // If the pattern is consumed and substring match allowed we're good.
                setWildcardMatch( aRes, nStartOffset, nString);
                return aRes;
            }
            else if (nString > nEndPos && nLastAsterisk >= 0)
            {
                // If substring match is not allowed try a greedy '*' match.
                nPattern = nLastAsterisk;
                continue;   // do
            }
            else
                return aRes;
        }

        if (cPattern == '*' && !bEscaped)
        {
            // '*' is one code unit, so not using iterateCodePoints() is ok.
            while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
                --nPattern;

            if (nPattern <= 0)
            {
                // First pattern is '*', remaining string matches.
                setWildcardMatch( aRes, nStartOffset, nEndOffset);
                return aRes;
            }

            nLastAsterisk = nPattern;   // Remember last encountered '*'.

            // cPattern will be the previous non-'*' character, nPattern
            // decremented.
            cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
            {
                bEscaped = true;
                cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            }

            cPatternAfterAsterisk = cPattern;
            bEscapedAfterAsterisk = bEscaped;
            nPat = nPattern;    // Remember position of pattern before '*', already decremented.
            nStr = nString;     // Remember the current string to be matched.
        }

        if (nString <= nEndPos)
            // Whatever leads in pattern, string will not match.
            return aRes;

        // nString will be decremented by iterateCodePoints().
        sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);

        if ((cPattern != '?' || bEscaped) && cPattern != cString)
        {
            if (nPat == -1)
                // Non-match already without any '*' pattern.
                return aRes;

            // Backtrack: let the last '*' swallow one more string character
            // and retry with the pattern character that preceded it.
            bRewind = true;
            nPattern = nPat;                    // Rewind pattern to character before '*', already decremented.
            cPattern = cPatternAfterAsterisk;
            bEscaped = bEscapedAfterAsterisk;
            searchStr.iterateCodePoints( &nStr, -1);
            nString = nStr;                     // Restore decremented remembered string position.
            if (nPat == nAfterFakePattern)
            {
                // Next start offset will be this character (exclusive).
                nStartOffset = nString;
            }
        }
        else
        {
            // An unescaped '?' pattern matched any character, or characters
            // matched. Reset only escaped state.
            bEscaped = false;
        }
    }
    while (nString > nEndPos);

    // The string ended on a mismatch, so the pattern did not match.
    if (bRewind)
        return aRes;

    // Eat leading '*' pattern that matches anything, including nothing.
    // '*' is one code unit, so not using iterateCodePoints() is ok.
    while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
        --nPattern;

    // A match requires the pattern to be consumed completely.
    if (nPattern == 0)
        setWildcardMatch( aRes, nStartOffset, nEndOffset);
    return aRes;
}


/** @return the implementation name of this service */
OUString SAL_CALL
TextSearch::getImplementationName()
{
    return OUString("com.sun.star.util.TextSearch_i18n");
}


/** Check whether this implementation supports the given service.

    @param rServiceName  name of the service asked for
    @return the answer of cppu::supportsService() for this object
 */
sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
{
    const bool bSupported = cppu::supportsService(this, rServiceName);
    return bSupported;
}


/** @return the names of the services this implementation supports */
Sequence< OUString > SAL_CALL
TextSearch::getSupportedServiceNames()
{
    Sequence< OUString > aServiceNames{ "com.sun.star.util.TextSearch", "com.sun.star.util.TextSearch2" };
    return aServiceNames;
}


/** Exported C entry point creating a TextSearch instance.

    @param context  component context handed to the TextSearch constructor;
                    the second (unnamed) argument sequence is not used
    @return the new instance, acquired via cppu::acquire()
 */
extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
i18npool_TextSearch_get_implementation(
    css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
{
    TextSearch* const pInstance = new TextSearch(context);
    return cppu::acquire(pInstance);
}


/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

m_xContext
Reference< XComponentContext > m_xContext

TextSearch
Definition: textsearch.hxx:49

TextSearch::m_xContext
css::uno::Reference< css::uno::XComponentContext > m_xContext
Definition: textsearch.hxx:51

TextSearch::searchForward
virtual css::util::SearchResult SAL_CALL searchForward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:301

TextSearch::startPos
sal_Int32 startPos
Definition: textsearch.hxx:65

TextSearch::pWLD
std::unique_ptr< WLevDistance > pWLD
Definition: textsearch.hxx:106

TextSearch::RESrchPrepare
void RESrchPrepare(const css::util::SearchOptions2 &)
Definition: textsearch.cxx:833

TextSearch::fnBackward
FnSrch fnBackward
Definition: textsearch.hxx:68

TextSearch::RESrchFrwrd
css::util::SearchResult SAL_CALL RESrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:922

TextSearch::bUsePrimarySrchStr
bool bUsePrimarySrchStr
Definition: textsearch.hxx:77

TextSearch::pJumpTable
std::unique_ptr< TextSearchJumpTable > pJumpTable
Definition: textsearch.hxx:74

TextSearch::bIsForwardTab
bool bIsForwardTab
Definition: textsearch.hxx:76

TextSearch::maWildcardReversePattern
OUString maWildcardReversePattern
Definition: textsearch.hxx:118

TextSearch::IsDelimiter
bool IsDelimiter(const OUString &rStr, sal_Int32 nPos) const
Definition: textsearch.cxx:563

TextSearch::WildcardSrchFrwrd
css::util::SearchResult SAL_CALL WildcardSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:1142

TextSearch::maWildcardReversePattern2
OUString maWildcardReversePattern2
Definition: textsearch.hxx:119

TextSearch::RESrchBkwrd
css::util::SearchResult SAL_CALL RESrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:976

TextSearch::nLimit
int nLimit
Definition: textsearch.hxx:105

TextSearch::~TextSearch
virtual ~TextSearch() override
Definition: textsearch.cxx:109

TextSearch::aSrchPara
css::util::SearchOptions2 aSrchPara
Definition: textsearch.hxx:53

TextSearch::searchBackward
virtual css::util::SearchResult SAL_CALL searchBackward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:440

TextSearch::pJumpTable2
std::unique_ptr< TextSearchJumpTable > pJumpTable2
Definition: textsearch.hxx:75

TextSearch::setOptions2
virtual void SAL_CALL setOptions2(const css::util::SearchOptions2 &options) override
Definition: textsearch.cxx:117

TextSearch::bSearchApostrophe
bool bSearchApostrophe
Definition: textsearch.hxx:71

TextSearch::MakeBackwardTab
void MakeBackwardTab()
Definition: textsearch.cxx:633

TextSearch::xTranslit2
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit2
Definition: textsearch.hxx:60

TextSearch::sSrchStr2
OUString sSrchStr2
Definition: textsearch.hxx:55

TextSearch::mbWildcardAllowSubstring
bool mbWildcardAllowSubstring
Definition: textsearch.hxx:121

TextSearch::NSrchBkwrd
css::util::SearchResult SAL_CALL NSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:762

TextSearch::mcWildcardEscapeChar
sal_uInt32 mcWildcardEscapeChar
Definition: textsearch.hxx:120

TextSearch::getImplementationName
virtual OUString SAL_CALL getImplementationName() override
Definition: textsearch.cxx:1544

TextSearch::ApproxSrchBkwrd
css::util::SearchResult SAL_CALL ApproxSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:1093

TextSearch::GetDiff
sal_Int32 GetDiff(const sal_Unicode) const
Definition: textsearch.cxx:679

TextSearch::MakeBackwardTab2
void MakeBackwardTab2()
Definition: textsearch.cxx:656

TextSearch::sSrchStr
OUString sSrchStr
Definition: textsearch.hxx:54

TextSearch::endPos
sal_Int32 sal_Int32 endPos
Definition: textsearch.hxx:65

TextSearch::fnForward
FnSrch fnForward
Definition: textsearch.hxx:67

TextSearch::WildcardSrchBkwrd
css::util::SearchResult SAL_CALL WildcardSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:1313

TextSearch::supportsService
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
Definition: textsearch.cxx:1549

TextSearch::MakeForwardTab
void MakeForwardTab()
Definition: textsearch.cxx:582

TextSearch::MakeForwardTab2
void MakeForwardTab2()
Definition: textsearch.cxx:608

TextSearch::xTranslit
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit
Definition: textsearch.hxx:59

TextSearch::m_aMutex
std::mutex m_aMutex
Definition: textsearch.hxx:50

TextSearch::NSrchFrwrd
css::util::SearchResult SAL_CALL NSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:699

TextSearch::setOptions
virtual void SAL_CALL setOptions(const css::util::SearchOptions &options) override
Definition: textsearch.cxx:258

TextSearch::xCharClass
css::uno::Reference< css::i18n::XCharacterClassification > xCharClass
Definition: textsearch.hxx:57

TextSearch::ApproxSrchFrwrd
css::util::SearchResult SAL_CALL ApproxSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:1051

TextSearch::pRegexMatcher
std::unique_ptr< icu::RegexMatcher > pRegexMatcher
Definition: textsearch.hxx:93

TextSearch::getSupportedServiceNames
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
Definition: textsearch.cxx:1555

TextSearch::TextSearch
TextSearch(const css::uno::Reference< css::uno::XComponentContext > &rxContext)

TextSearch::xBreak
css::uno::Reference< css::i18n::XBreakIterator > xBreak
Definition: textsearch.hxx:107

WLevDistance
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:134

v
float v

u
float u

nIndex
sal_Int32 nIndex

n
sal_Int64 n

levdis.hxx

nPos
sal_uInt16 nPos

log.hxx

SAL_WARN
#define SAL_WARN(area, stream)

SAL_INFO
#define SAL_INFO(area, stream)

com::sun::star::i18n

com::sun::star::lang

com::sun::star::uno

com::sun::star::util

com::sun::star

cppu::supportsService
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)

i
int i

m
m

supportsservice.hxx

COMPLEX_TRANS_MASK
const TransliterationFlags COMPLEX_TRANS_MASK
Definition: textsearch.cxx:45

i18npool_TextSearch_get_implementation
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * i18npool_TextSearch_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
Definition: textsearch.cxx:1561

lcl_findRegex
static bool lcl_findRegex(std::unique_ptr< icu::RegexMatcher > const &pRegexMatcher, sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode &rIcuErr)
Definition: textsearch.cxx:901

FindPosInSeq_Impl
static sal_Int32 FindPosInSeq_Impl(const Sequence< sal_Int32 > &rOff, sal_Int32 nPos)
Definition: textsearch.cxx:294

textsearch.hxx

TextSearchJumpTable
::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable
Definition: textsearch.hxx:41

transliteration.hxx

TransliterationFlags
TransliterationFlags

sal_Bool
unsigned char sal_Bool

sal_Unicode
sal_uInt16 sal_Unicode

weak.hxx