LibreOffice Module i18npool (master) 1
textsearch.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include "textsearch.hxx"
21#include "levdis.hxx"
22#include <com/sun/star/i18n/BreakIterator.hpp>
23#include <com/sun/star/util/SearchAlgorithms2.hpp>
24#include <com/sun/star/util/SearchFlags.hpp>
25#include <com/sun/star/i18n/WordType.hpp>
26#include <com/sun/star/i18n/ScriptType.hpp>
27#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
28#include <com/sun/star/i18n/CharacterClassification.hpp>
29#include <com/sun/star/i18n/KCharacterType.hpp>
30#include <com/sun/star/i18n/Transliteration.hpp>
32#include <cppuhelper/weak.hxx>
34#include <rtl/ustrbuf.hxx>
35#include <sal/log.hxx>
36
37#include <unicode/regex.h>
38
39using namespace ::com::sun::star::util;
40using namespace ::com::sun::star::uno;
41using namespace ::com::sun::star::lang;
42using namespace ::com::sun::star::i18n;
43using namespace ::com::sun::star;
44
46 TransliterationFlags::ignoreBaFa_ja_JP |
47 TransliterationFlags::ignoreIterationMark_ja_JP |
48 TransliterationFlags::ignoreTiJi_ja_JP |
49 TransliterationFlags::ignoreHyuByu_ja_JP |
50 TransliterationFlags::ignoreSeZe_ja_JP |
51 TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
52 TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
53 TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
54
55namespace
56{
58{
59 // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
60 // in complex transliteration.
61 return
62 n & (COMPLEX_TRANS_MASK | // all set ignore bits
63 TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit
64 TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
65}
66
67bool isComplexTrans( TransliterationFlags n )
68{
69 return bool(n & COMPLEX_TRANS_MASK);
70}
71
73{
74 return n & ~COMPLEX_TRANS_MASK;
75}
76
77bool isSimpleTrans( TransliterationFlags n )
78{
79 return bool(maskSimpleTrans(n));
80}
81
82// Regex patterns are case sensitive.
83TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
84{
85 TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
86 TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
87 if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
88 v = TransliterationFlags::NONE;
89 return (m | v) & ~COMPLEX_TRANS_MASK;
90}
91
92bool isSimpleRegexTrans( TransliterationFlags n )
93{
94 return bool(maskSimpleRegexTrans(n));
95}
96};
97
98TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
99 : m_xContext( rxContext )
100{
101 SearchOptions2 aOpt;
102 aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
103 aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
104 aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
105 //aOpt.Locale = ???;
106 setOptions( aOpt );
107}
108
110{
111 pRegexMatcher.reset();
112 pWLD.reset();
113 pJumpTable.reset();
114 pJumpTable2.reset();
115}
116
117void TextSearch::setOptions2( const SearchOptions2& rOptions )
118{
119 std::unique_lock g(m_aMutex);
120
121 aSrchPara = rOptions;
122
123 pRegexMatcher.reset();
124 pWLD.reset();
125 pJumpTable.reset();
126 pJumpTable2.reset();
129 TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
130 bSearchApostrophe = false;
131 bool bReplaceApostrophe = false;
132 if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
133 {
134 // RESrchPrepare will consider aSrchPara.transliterateFlags when
135 // picking the actual regex pattern
136 // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
137 // case-insensitivity. Create transliteration instance, if any, without
138 // ignore-case so later in TextSearch::searchForward() the string to
139 // match is not case-altered, leave case-(in)sensitive to regex engine.
140 transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
141 }
142 else if ( aSrchPara.searchString.indexOf('\'') > - 1 )
143 {
144 bSearchApostrophe = true;
145 bReplaceApostrophe = aSrchPara.searchString.indexOf(u'\u2019') > -1;
146 }
147
148 // Create Transliteration class
149 if( isSimpleTrans( transliterateFlags) )
150 {
151 if( !xTranslit.is() )
152 xTranslit.set( Transliteration::create( m_xContext ) );
153 xTranslit->loadModule(
154 static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
155 aSrchPara.Locale);
156 }
157 else if( xTranslit.is() )
158 xTranslit = nullptr;
159
160 // Create Transliteration for 2<->1, 2<->2 transliteration
161 if ( isComplexTrans( transliterateFlags) )
162 {
163 if( !xTranslit2.is() )
164 xTranslit2.set( Transliteration::create( m_xContext ) );
165 // Load transliteration module
166 xTranslit2->loadModule(
167 static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
168 aSrchPara.Locale);
169 }
170
171 if ( !xBreak.is() )
172 xBreak = css::i18n::BreakIterator::create( m_xContext );
173
174 sSrchStr = aSrchPara.searchString;
175
176 // Transliterate search string.
177 if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
178 {
179 if (isSimpleRegexTrans(transliterateFlags))
180 {
181 if (maskSimpleRegexTrans(transliterateFlags) !=
182 maskSimpleTrans(transliterateFlags))
183 {
184 css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
185 Transliteration::create( m_xContext ));
186 if (xTranslitPattern.is())
187 {
188 xTranslitPattern->loadModule(
189 static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
190 aSrchPara.Locale);
191 sSrchStr = xTranslitPattern->transliterateString2String(
192 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
193 }
194 }
195 else
196 {
197 if (xTranslit.is())
198 sSrchStr = xTranslit->transliterateString2String(
199 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
200 }
201 // xTranslit2 complex transliterated sSrchStr2 is not used in
202 // regex, see TextSearch::searchForward() and
203 // TextSearch::searchBackward()
204 }
205 }
206 else
207 {
208 if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
209 sSrchStr = xTranslit->transliterateString2String(
210 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
211
212 if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
213 sSrchStr2 = xTranslit2->transliterateString2String(
214 aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
215 }
216
217 if ( bReplaceApostrophe )
218 sSrchStr = sSrchStr.replace(u'\u2019', '\'');
219
220 // Take the new SearchOptions2::AlgorithmType2 field and ignore
221 // SearchOptions::algorithmType
222 switch( aSrchPara.AlgorithmType2)
223 {
224 case SearchAlgorithms2::REGEXP:
228 break;
229
230 case SearchAlgorithms2::APPROXIMATE:
233
234 pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
235 aSrchPara.insertedChars, aSrchPara.deletedChars,
236 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
237
238 nLimit = pWLD->GetLimit();
239 break;
240
241 case SearchAlgorithms2::WILDCARD:
242 mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
243 mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
246 break;
247
248 default:
249 SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
250 [[fallthrough]];
251 case SearchAlgorithms2::ABSOLUTE:
254 break;
255 }
256}
257
258void TextSearch::setOptions( const SearchOptions& rOptions )
259{
260 sal_Int16 nAlgorithmType2;
261 switch (rOptions.algorithmType)
262 {
263 case SearchAlgorithms_REGEXP:
264 nAlgorithmType2 = SearchAlgorithms2::REGEXP;
265 break;
266 case SearchAlgorithms_APPROXIMATE:
267 nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
268 break;
269 default:
270 SAL_WARN("i18npool","TextSearch::setOptions - default what?");
271 [[fallthrough]];
272 case SearchAlgorithms_ABSOLUTE:
273 nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
274 break;
275 }
276 // It would be nice if an inherited struct had a ctor that takes an
277 // instance of the object the struct derived from...
278 SearchOptions2 aOptions2(
279 rOptions.algorithmType,
280 rOptions.searchFlag,
281 rOptions.searchString,
282 rOptions.replaceString,
283 rOptions.Locale,
284 rOptions.changedChars,
285 rOptions.deletedChars,
286 rOptions.insertedChars,
287 rOptions.transliterateFlags,
288 nAlgorithmType2,
289 0 // no wildcard search, no escape character...
290 );
291 setOptions2( aOptions2);
292}
293
294static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
295{
296 auto pOff = std::find_if(rOff.begin(), rOff.end(),
297 [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
298 return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
299}
300
301SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
302{
303 std::unique_lock g(m_aMutex);
304
305 SearchResult sres;
306
307 OUString in_str(searchStr);
308
309 // in non-regex mode, allow searching typographical apostrophe with the ASCII one
310 // to avoid regression after using automatic conversion to U+2019 during typing in Writer
311 bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
312
313 bUsePrimarySrchStr = true;
314
315 if ( xTranslit.is() )
316 {
317 // apply normal transliteration (1<->1, 1<->0)
318
319 sal_Int32 nInStartPos = startPos;
320 if (pRegexMatcher && startPos > 0)
321 {
322 // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
323 // transliterate enough of the leading text to allow sensible look-behind assertions.
324 // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
325 // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
326 // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
327 // the middle of a surrogate pair, preceded by another surrogate pair.
328 const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
329 nInStartPos -= std::min(nMaxLeadingLen, startPos);
330 }
331 sal_Int32 nInEndPos = endPos;
332 if (pRegexMatcher && endPos < searchStr.getLength())
333 {
334 // tdf#65038: ditto for look-ahead assertions
335 const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
336 nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
337 }
338
339 css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
340 in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
341
342 if ( bReplaceApostrophe )
343 in_str = in_str.replace(u'\u2019', '\'');
344
345 // JP 20.6.2001: also the start and end positions must be corrected!
346 sal_Int32 newStartPos =
347 (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
348
349 sal_Int32 newEndPos = (endPos < searchStr.getLength())
350 ? FindPosInSeq_Impl( offset, endPos )
351 : in_str.getLength();
352
353 sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
354
355 // Map offsets back to untransliterated string.
356 const sal_Int32 nOffsets = offset.getLength();
357 if (nOffsets)
358 {
359 auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
360 auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
361 // For regex nGroups is the number of groups+1 with group 0 being
362 // the entire match.
363 const sal_Int32 nGroups = sres.startOffset.getLength();
364 for ( sal_Int32 k = 0; k < nGroups; k++ )
365 {
366 const sal_Int32 nStart = sres.startOffset[k];
367 // Result offsets are negative (-1) if a group expression was
368 // not matched.
369 if (nStart >= 0)
370 sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
371 // JP 20.6.2001: end is ever exclusive and then don't return
372 // the position of the next character - return the
373 // next position behind the last found character!
374 // "a b c" find "b" must return 2,3 and not 2,4!!!
375 const sal_Int32 nStop = sres.endOffset[k];
376 if (nStop >= 0)
377 {
378 if (nStop > 0)
379 sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
380 else
381 sres_endOffsetRange[k] = offset[0];
382 }
383 }
384 }
385 }
386 else
387 {
388 if ( bReplaceApostrophe )
389 in_str = in_str.replace(u'\u2019', '\'');
390
391 sres = (this->*fnForward)( in_str, startPos, endPos );
392 }
393
394 if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
395 {
396 SearchResult sres2;
397
398 in_str = searchStr;
399 css::uno::Sequence <sal_Int32> offset( in_str.getLength());
400
401 in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
402
403 if( startPos )
405
406 if( endPos < searchStr.getLength() )
407 endPos = FindPosInSeq_Impl( offset, endPos );
408 else
409 endPos = in_str.getLength();
410
411 bUsePrimarySrchStr = false;
412 sres2 = (this->*fnForward)( in_str, startPos, endPos );
413 auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
414 auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
415
416 for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
417 {
418 if (sres2.startOffset[k])
419 sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
420 if (sres2.endOffset[k])
421 sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
422 }
423
424 // pick first and long one
425 if ( sres.subRegExpressions == 0)
426 return sres2;
427 if ( sres2.subRegExpressions == 1)
428 {
429 if ( sres.startOffset[0] > sres2.startOffset[0])
430 return sres2;
431 else if ( sres.startOffset[0] == sres2.startOffset[0] &&
432 sres.endOffset[0] < sres2.endOffset[0])
433 return sres2;
434 }
435 }
436
437 return sres;
438}
439
440SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
441{
442 std::unique_lock g(m_aMutex);
443
444 SearchResult sres;
445
446 OUString in_str(searchStr);
447
448 // in non-regex mode, allow searching typographical apostrophe with the ASCII one
449 // to avoid regression after using automatic conversion to U+2019 during typing in Writer
450 bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
451
452 bUsePrimarySrchStr = true;
453
454 if ( xTranslit.is() )
455 {
456 // apply only simple 1<->1 transliteration here
457 css::uno::Sequence<sal_Int32> offset(startPos - endPos);
458 in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
459
460 if ( bReplaceApostrophe )
461 in_str = in_str.replace(u'\u2019', '\'');
462
463 // JP 20.6.2001: also the start and end positions must be corrected!
464 sal_Int32 const newStartPos = (startPos < searchStr.getLength())
465 ? FindPosInSeq_Impl( offset, startPos )
466 : in_str.getLength();
467
468 sal_Int32 const newEndPos =
469 (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
470
471 // TODO: this would need nExtraOffset handling to avoid $ matching
472 // if (pRegexMatcher && startPos < searchStr.getLength())
473 // but that appears to be impossible with ICU regex
474
475 sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
476
477 // Map offsets back to untransliterated string.
478 const sal_Int32 nOffsets = offset.getLength();
479 if (nOffsets)
480 {
481 auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
482 auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
483 // For regex nGroups is the number of groups+1 with group 0 being
484 // the entire match.
485 const sal_Int32 nGroups = sres.startOffset.getLength();
486 for ( sal_Int32 k = 0; k < nGroups; k++ )
487 {
488 const sal_Int32 nStart = sres.startOffset[k];
489 // Result offsets are negative (-1) if a group expression was
490 // not matched.
491 if (nStart >= 0)
492 {
493 if (nStart > 0)
494 sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
495 else
496 sres_startOffsetRange[k] = offset[0];
497 }
498 // JP 20.6.2001: end is ever exclusive and then don't return
499 // the position of the next character - return the
500 // next position behind the last found character!
501 // "a b c" find "b" must return 2,3 and not 2,4!!!
502 const sal_Int32 nStop = sres.endOffset[k];
503 if (nStop >= 0)
504 sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
505 }
506 }
507 }
508 else
509 {
510 if ( bReplaceApostrophe )
511 in_str = in_str.replace(u'\u2019', '\'');
512
513 sres = (this->*fnBackward)( in_str, startPos, endPos );
514 }
515
516 if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
517 {
518 SearchResult sres2;
519
520 in_str = searchStr;
521 css::uno::Sequence <sal_Int32> offset( in_str.getLength());
522
523 in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
524
525 if( startPos < searchStr.getLength() )
527 else
528 startPos = in_str.getLength();
529
530 if( endPos )
531 endPos = FindPosInSeq_Impl( offset, endPos );
532
533 bUsePrimarySrchStr = false;
534 sres2 = (this->*fnBackward)( in_str, startPos, endPos );
535 auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
536 auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
537
538 for( int k = 0; k < sres2.startOffset.getLength(); k++ )
539 {
540 if (sres2.startOffset[k])
541 sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
542 if (sres2.endOffset[k])
543 sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
544 }
545
546 // pick last and long one
547 if ( sres.subRegExpressions == 0 )
548 return sres2;
549 if ( sres2.subRegExpressions == 1 )
550 {
551 if ( sres.startOffset[0] < sres2.startOffset[0] )
552 return sres2;
553 if ( sres.startOffset[0] == sres2.startOffset[0] &&
554 sres.endOffset[0] > sres2.endOffset[0] )
555 return sres2;
556 }
557 }
558
559 return sres;
560}
561
562
563bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
564{
565 bool bRet = true;
566 if( '\x7f' != rStr[nPos])
567 {
568 if ( !xCharClass.is() )
569 xCharClass = CharacterClassification::create( m_xContext );
570 sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
571 aSrchPara.Locale );
572 if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
573 KCharacterType::LETTER ) & nCType ) )
574 bRet = false;
575 }
576 return bRet;
577}
578
579// --------- helper methods for Boyer-Moore like text searching ----------
580// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
581
583{
584 // create the jumptable for the search text
585
587 {
588 return; // the jumpTable is ok
589 }
590 bIsForwardTab = true;
591
592 sal_Int32 n, nLen = sSrchStr.getLength();
593 pJumpTable.reset( new TextSearchJumpTable );
594
595 for( n = 0; n < nLen - 1; ++n )
596 {
597 sal_Unicode cCh = sSrchStr[n];
598 sal_Int32 nDiff = nLen - n - 1;
599 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
600
601 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
602 pJumpTable->insert( aEntry );
603 if ( !aPair.second )
604 (*(aPair.first)).second = nDiff;
605 }
606}
607
609{
610 // create the jumptable for the search text
612 {
613 return; // the jumpTable is ok
614 }
615 bIsForwardTab = true;
616
617 sal_Int32 n, nLen = sSrchStr2.getLength();
618 pJumpTable2.reset( new TextSearchJumpTable );
619
620 for( n = 0; n < nLen - 1; ++n )
621 {
622 sal_Unicode cCh = sSrchStr2[n];
623 sal_Int32 nDiff = nLen - n - 1;
624
625 TextSearchJumpTable::value_type aEntry( cCh, nDiff );
626 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
627 pJumpTable2->insert( aEntry );
628 if ( !aPair.second )
629 (*(aPair.first)).second = nDiff;
630 }
631}
632
634{
635 // create the jumptable for the search text
637 {
638 return; // the jumpTable is ok
639 }
640 bIsForwardTab = false;
641
642 sal_Int32 n, nLen = sSrchStr.getLength();
643 pJumpTable.reset( new TextSearchJumpTable );
644
645 for( n = nLen-1; n > 0; --n )
646 {
647 sal_Unicode cCh = sSrchStr[n];
648 TextSearchJumpTable::value_type aEntry( cCh, n );
649 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
650 pJumpTable->insert( aEntry );
651 if ( !aPair.second )
652 (*(aPair.first)).second = n;
653 }
654}
655
657{
658 // create the jumptable for the search text
659 if( pJumpTable2 && !bIsForwardTab )
660 {
661 return; // the jumpTable is ok
662 }
663 bIsForwardTab = false;
664
665 sal_Int32 n, nLen = sSrchStr2.getLength();
666 pJumpTable2.reset( new TextSearchJumpTable );
667
668 for( n = nLen-1; n > 0; --n )
669 {
670 sal_Unicode cCh = sSrchStr2[n];
671 TextSearchJumpTable::value_type aEntry( cCh, n );
672 ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
673 pJumpTable2->insert( aEntry );
674 if ( !aPair.second )
675 (*(aPair.first)).second = n;
676 }
677}
678
679sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
680{
681 TextSearchJumpTable *pJump;
682 OUString sSearchKey;
683
684 if ( bUsePrimarySrchStr ) {
685 pJump = pJumpTable.get();
686 sSearchKey = sSrchStr;
687 } else {
688 pJump = pJumpTable2.get();
689 sSearchKey = sSrchStr2;
690 }
691
692 TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
693 if ( iLook == pJump->end() )
694 return sSearchKey.getLength();
695 return (*iLook).second;
696}
697
698
699SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
700{
701 SearchResult aRet;
702 aRet.subRegExpressions = 0;
703
704 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
705
706 sal_Int32 nSuchIdx = searchStr.getLength();
707 sal_Int32 nEnd = endPos;
708 if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
709 return aRet;
710
711
712 if( nEnd < sSearchKey.getLength() ) // position inside the search region ?
713 return aRet;
714
715 nEnd -= sSearchKey.getLength();
716
718 MakeForwardTab(); // create the jumptable
719 else
721
722 for (sal_Int32 nCmpIdx = startPos; // start position for the search
723 nCmpIdx <= nEnd;
724 nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
725 {
726 nSuchIdx = sSearchKey.getLength() - 1;
727 while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
728 {
729 if( nSuchIdx == 0 )
730 {
731 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
732 {
733 sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
734 bool bAtStart = !nCmpIdx;
735 bool bAtEnd = nFndEnd == endPos;
736 bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
737 bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd );
738 // * 1 -> only one word in the paragraph
739 // * 2 -> at begin of paragraph
740 // * 3 -> at end of paragraph
741 // * 4 -> inside the paragraph
742 if( !( ( bAtStart && bAtEnd ) || // 1
743 ( bAtStart && bDelimBehind ) || // 2
744 ( bAtEnd && bDelimBefore ) || // 3
745 ( bDelimBefore && bDelimBehind ))) // 4
746 break;
747 }
748
749 aRet.subRegExpressions = 1;
750 aRet.startOffset = { nCmpIdx };
751 aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };
752
753 return aRet;
754 }
755 else
756 nSuchIdx--;
757 }
758 }
759 return aRet;
760}
761
762SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
763{
764 SearchResult aRet;
765 aRet.subRegExpressions = 0;
766
767 OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
768
769 sal_Int32 nSuchIdx = searchStr.getLength();
770 sal_Int32 nEnd = endPos;
771 if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
772 return aRet;
773
775 MakeBackwardTab(); // create the jumptable
776 else
778
779 if( nEnd == nSuchIdx ) // end position for the search
780 nEnd = sSearchKey.getLength();
781 else
782 nEnd += sSearchKey.getLength();
783
784 sal_Int32 nCmpIdx = startPos; // start position for the search
785
786 while (nCmpIdx >= nEnd)
787 {
788 nSuchIdx = 0;
789 while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
790 searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
791 nSuchIdx++;
792 if( nSuchIdx >= sSearchKey.getLength() )
793 {
794 if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
795 {
796 sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
797 bool bAtStart = !nFndStt;
798 bool bAtEnd = nCmpIdx == startPos;
799 bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
800 bool bDelimBefore = bAtStart || // begin of paragraph
801 IsDelimiter( searchStr, nFndStt-1 );
802 // * 1 -> only one word in the paragraph
803 // * 2 -> at begin of paragraph
804 // * 3 -> at end of paragraph
805 // * 4 -> inside the paragraph
806 if( ( bAtStart && bAtEnd ) || // 1
807 ( bAtStart && bDelimBehind ) || // 2
808 ( bAtEnd && bDelimBefore ) || // 3
809 ( bDelimBefore && bDelimBehind )) // 4
810 {
811 aRet.subRegExpressions = 1;
812 aRet.startOffset = { nCmpIdx };
813 aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
814 return aRet;
815 }
816 }
817 else
818 {
819 aRet.subRegExpressions = 1;
820 aRet.startOffset = { nCmpIdx };
821 aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
822 return aRet;
823 }
824 }
825 nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
826 if( nCmpIdx < nSuchIdx )
827 return aRet;
828 nCmpIdx -= nSuchIdx;
829 }
830 return aRet;
831}
832
833void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
834{
835 TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
836 // select the transliterated pattern string
837 const OUString& rPatternStr =
838 (isSimpleTrans(transliterateFlags) ? sSrchStr
839 : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
840
841 sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
842 // map css::util::SearchFlags to ICU uregex.h flags
843 // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
844 // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
845 // REG_NOSUB is not used anywhere => not implemented
846 // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
847 // LEV_RELAXED is only used for SearchAlgorithm==Approximate
848 // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
849 // probably because the transliteration flag IGNORE_CASE handles it as well.
850 if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
851 || (transliterateFlags & TransliterationFlags::IGNORE_CASE))
852 nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
853 UErrorCode nIcuErr = U_ZERO_ERROR;
854 // assumption: transliteration didn't mangle regexp control chars
855 icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
856#ifndef DISABLE_WORDBOUND_EMULATION
857 // for convenience specific syntax elements of the old regex engine are emulated
858 // - by replacing < with "word-break followed by a look-ahead word-char"
859 static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
860 static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
861 static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
862 aChevronMatcherB.reset( aIcuSearchPatStr);
863 aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
864 aChevronMatcherB.reset();
865 // - by replacing > with "look-behind word-char followed by a word-break"
866 static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
867 static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
868 static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
869 aChevronMatcherE.reset( aIcuSearchPatStr);
870 aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
871 aChevronMatcherE.reset();
872#endif
873 pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
874 if (nIcuErr)
875 {
876 SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
877 pRegexMatcher.reset();
878 }
879 else
880 {
881 // Pathological patterns may result in exponential run time making the
882 // application appear to be frozen. Limit that. Documentation for this
883 // call says
884 // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
885 // "The units of the limit are steps of the match engine.
886 // Correspondence with actual processor time will depend on the speed
887 // of the processor and the details of the specific pattern, but will
888 // typically be on the order of milliseconds."
889 // Just what is a good value? 42 is always an answer ... the 23 enigma
890 // as well... which on the dev's machine is roughly 50 seconds with the
891 // pattern of fdo#70627.
892 /* TODO: make this a configuration settable value and possibly take
893 * complexity of expression into account and maybe even length of text
894 * to be matched; currently (2013-11-25) that is at most one 64k
895 * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
896 pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
897 }
898}
899
900
901static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
902 sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
903{
904 pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
905 pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
906 pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
907 // look-ahead/behind assertions
908
909 if (!pRegexMatcher->find(rIcuErr))
910 {
911 /* TODO: future versions could pass the UErrorCode or translations
912 * thereof to the caller, for example to inform the user of
913 * U_REGEX_TIME_OUT. The strange thing though is that an error is set
914 * only after the second call that returns immediately and not if
915 * timeout occurred on the first call?!? */
916 SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
917 return false;
918 }
919 return true;
920}
921
922SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
923 sal_Int32 startPos, sal_Int32 endPos )
924{
925 SearchResult aRet;
926 aRet.subRegExpressions = 0;
927 if( !pRegexMatcher)
928 return aRet;
929
930 if( endPos > searchStr.getLength())
931 endPos = searchStr.getLength();
932
933 // use the ICU RegexMatcher to find the matches
934 UErrorCode nIcuErr = U_ZERO_ERROR;
935 const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
936 searchStr.getLength());
937 pRegexMatcher->reset( aSearchTargetStr);
938 // search until there is a valid match
939 for(;;)
940 {
941 if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
942 return aRet;
943
944 // #i118887# ignore zero-length matches e.g. "a*" in "bc"
945 int nStartOfs = pRegexMatcher->start( nIcuErr);
946 int nEndOfs = pRegexMatcher->end( nIcuErr);
947 if( nStartOfs < nEndOfs)
948 break;
949 // If the zero-length match is behind the string, do not match it again
950 // and again until startPos reaches there. A match behind the string is
951 // a "$" anchor.
952 if (nStartOfs == endPos)
953 break;
954 // try at next position if there was a zero-length match
955 if( ++startPos >= endPos)
956 return aRet;
957 }
958
959 // extract the result of the search
960 const int nGroupCount = pRegexMatcher->groupCount();
961 aRet.subRegExpressions = nGroupCount + 1;
962 aRet.startOffset.realloc( aRet.subRegExpressions);
963 auto pstartOffset = aRet.startOffset.getArray();
964 aRet.endOffset.realloc( aRet.subRegExpressions);
965 auto pendOffset = aRet.endOffset.getArray();
966 pstartOffset[0] = pRegexMatcher->start( nIcuErr);
967 pendOffset[0] = pRegexMatcher->end( nIcuErr);
968 for( int i = 1; i <= nGroupCount; ++i) {
969 pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
970 pendOffset[i] = pRegexMatcher->end( i, nIcuErr);
971 }
972
973 return aRet;
974}
975
976SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
977 sal_Int32 startPos, sal_Int32 endPos )
978{
979 // NOTE: for backwards search callers provide startPos/endPos inverted!
980 SearchResult aRet;
981 aRet.subRegExpressions = 0;
982 if( !pRegexMatcher)
983 return aRet;
984
985 if( startPos > searchStr.getLength())
986 startPos = searchStr.getLength();
987
988 // use the ICU RegexMatcher to find the matches
989 // TODO: use ICU's backward searching once it becomes available
990 // as its replacement using forward search is not as good as the real thing
991 UErrorCode nIcuErr = U_ZERO_ERROR;
992 const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
993 searchStr.getLength());
994 pRegexMatcher->reset( aSearchTargetStr);
995 if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
996 return aRet;
997
998 // find the last match
999 int nLastPos = 0;
1000 int nFoundEnd = 0;
1001 int nGoodPos = 0, nGoodEnd = 0;
1002 bool bFirst = true;
1003 do {
1004 nLastPos = pRegexMatcher->start( nIcuErr);
1005 nFoundEnd = pRegexMatcher->end( nIcuErr);
1006 if (nLastPos < nFoundEnd)
1007 {
1008 // remember last non-zero-length match
1009 nGoodPos = nLastPos;
1010 nGoodEnd = nFoundEnd;
1011 }
1012 if( nFoundEnd >= startPos)
1013 break;
1014 bFirst = false;
1015 if( nFoundEnd == nLastPos)
1016 ++nFoundEnd;
1017 } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
1018
1019 // Ignore all zero-length matches except "$" anchor on first match.
1020 if (nGoodPos == nGoodEnd)
1021 {
1022 if (bFirst && nLastPos == startPos)
1023 nGoodPos = nLastPos;
1024 else
1025 return aRet;
1026 }
1027
1028 // find last match again to get its details
1029 lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
1030
1031 // fill in the details of the last match
1032 const int nGroupCount = pRegexMatcher->groupCount();
1033 aRet.subRegExpressions = nGroupCount + 1;
1034 aRet.startOffset.realloc( aRet.subRegExpressions);
1035 auto pstartOffset = aRet.startOffset.getArray();
1036 aRet.endOffset.realloc( aRet.subRegExpressions);
1037 auto pendOffset = aRet.endOffset.getArray();
1038 // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1039 pstartOffset[0] = pRegexMatcher->end( nIcuErr);
1040 pendOffset[0] = pRegexMatcher->start( nIcuErr);
1041 for( int i = 1; i <= nGroupCount; ++i) {
1042 pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
1043 pendOffset[i] = pRegexMatcher->start( i, nIcuErr);
1044 }
1045
1046 return aRet;
1047}
1048
1049
1050// search for words phonetically
1051SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
1052 sal_Int32 startPos, sal_Int32 endPos )
1053{
1054 SearchResult aRet;
1055 aRet.subRegExpressions = 0;
1056
1057 if( !xBreak.is() )
1058 return aRet;
1059
1060 sal_Int32 nStt, nEnd;
1061
1062 Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1063 aSrchPara.Locale,
1064 WordType::ANYWORD_IGNOREWHITESPACES, true );
1065
1066 do
1067 {
1068 if( aWBnd.startPos >= endPos )
1069 break;
1070 nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
1071 nEnd = std::min(aWBnd.endPos, endPos);
1072
1073 if( nStt < nEnd &&
1074 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1075 {
1076 aRet.subRegExpressions = 1;
1077 aRet.startOffset = { nStt };
1078 aRet.endOffset = { nEnd };
1079 break;
1080 }
1081
1082 nStt = nEnd - 1;
1083 aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
1084 WordType::ANYWORD_IGNOREWHITESPACES);
1085 } while( aWBnd.startPos != aWBnd.endPos ||
1086 (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
1087 // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1088 // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1089 // and nextWord() does also => don't loop forever.
1090 return aRet;
1091}
1092
1093SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
1094 sal_Int32 startPos, sal_Int32 endPos )
1095{
1096 SearchResult aRet;
1097 aRet.subRegExpressions = 0;
1098
1099 if( !xBreak.is() )
1100 return aRet;
1101
1102 sal_Int32 nStt, nEnd;
1103
1104 Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1105 aSrchPara.Locale,
1106 WordType::ANYWORD_IGNOREWHITESPACES, true );
1107
1108 do
1109 {
1110 if( aWBnd.endPos <= endPos )
1111 break;
1112 nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
1113 nEnd = std::min(aWBnd.endPos, startPos);
1114
1115 if( nStt < nEnd &&
1116 pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1117 {
1118 aRet.subRegExpressions = 1;
1119 aRet.startOffset = { nEnd };
1120 aRet.endOffset = { nStt };
1121 break;
1122 }
1123 if( !nStt )
1124 break;
1125
1126 aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
1127 WordType::ANYWORD_IGNOREWHITESPACES);
1128 } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
1129 return aRet;
1130}
1131
1132
1133namespace {
1134void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
1135{
1136 rRes.subRegExpressions = 1;
1137 rRes.startOffset = { nStartOffset };
1138 rRes.endOffset = { nEndOffset };
1139}
1140}
1141
1142SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1143{
1144 SearchResult aRes;
1145 aRes.subRegExpressions = 0; // no match
1146 sal_Int32 nStartOffset = nStartPos;
1147 sal_Int32 nEndOffset = nEndPos;
1148
1149 const sal_Int32 nStringLen = searchStr.getLength();
1150
1151 // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1152 // string match with [0,0).
1153 if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
1154 (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1155 return aRes;
1156
1157 const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1158 const sal_Int32 nPatternLen = rPattern.getLength();
1159
1160 // Handle special cases empty pattern and/or string outside of the loop to
1161 // not add performance penalties there and simplify.
1162 if (nStartPos == nEndPos)
1163 {
1164 sal_Int32 i = 0;
1165 while (i < nPatternLen && rPattern[i] == '*')
1166 ++i;
1167 if (i == nPatternLen)
1168 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1169 return aRes;
1170 }
1171
1172 // Empty pattern does not match any non-empty string.
1173 if (!nPatternLen)
1174 return aRes;
1175
1176 bool bRewind = false;
1177 sal_uInt32 cPattern = 0;
1178 sal_Int32 nPattern = 0;
1179 sal_Int32 nAfterFakePattern = nPattern;
1181 {
1182 // Fake a leading '*' wildcard.
1183 cPattern = '*';
1184 bRewind = true;
1185 // Assume a non-'*' pattern character follows. If it is a '*' instead
1186 // that will be handled in the loop by setting nPat.
1187 sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
1188 if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
1189 rPattern.iterateCodePoints( &nAfterFakePattern);
1190 }
1191
1192 sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1193 sal_uInt32 cPatternAfterAsterisk = 0;
1194 bool bEscaped = false, bEscapedAfterAsterisk = false;
1195
1196 // The loop code tries to avoid costly calls to iterateCodePoints() when
1197 // possible.
1198
1199 do
1200 {
1201 if (bRewind)
1202 {
1203 // Reuse cPattern after '*', nPattern was correspondingly
1204 // incremented to point behind cPattern.
1205 bRewind = false;
1206 }
1207 else if (nPattern < nPatternLen)
1208 {
1209 // nPattern will be incremented by iterateCodePoints().
1210 cPattern = rPattern.iterateCodePoints( &nPattern);
1211 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1212 {
1213 bEscaped = true;
1214 cPattern = rPattern.iterateCodePoints( &nPattern);
1215 }
1216 }
1217 else
1218 {
1219 // A trailing '*' is handled below.
1221 {
1222 // If the pattern is consumed and substring match allowed we're good.
1223 setWildcardMatch( aRes, nStartOffset, nString);
1224 return aRes;
1225 }
1226 else if (nString < nEndPos && nLastAsterisk >= 0)
1227 {
1228 // If substring match is not allowed try a greedy '*' match.
1229 nPattern = nLastAsterisk;
1230 continue; // do
1231 }
1232 else
1233 return aRes;
1234 }
1235
1236 if (cPattern == '*' && !bEscaped)
1237 {
1238 // '*' is one code unit, so not using iterateCodePoints() is ok.
1239 while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1240 ++nPattern;
1241
1242 if (nPattern >= nPatternLen)
1243 {
1244 // Last pattern is '*', remaining string matches.
1245 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1246 return aRes;
1247 }
1248
1249 nLastAsterisk = nPattern; // Remember last encountered '*'.
1250
1251 // cPattern will be the next non-'*' character, nPattern
1252 // incremented.
1253 cPattern = rPattern.iterateCodePoints( &nPattern);
1254 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1255 {
1256 bEscaped = true;
1257 cPattern = rPattern.iterateCodePoints( &nPattern);
1258 }
1259
1260 cPatternAfterAsterisk = cPattern;
1261 bEscapedAfterAsterisk = bEscaped;
1262 nPat = nPattern; // Remember position of pattern behind '*', already incremented.
1263 nStr = nString; // Remember the current string to be matched.
1264 }
1265
1266 if (nString >= nEndPos)
1267 // Whatever follows in pattern, string will not match.
1268 return aRes;
1269
1270 // nString will be incremented by iterateCodePoints().
1271 sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
1272
1273 if ((cPattern != '?' || bEscaped) && cPattern != cString)
1274 {
1275 if (nPat == -1)
1276 // Non-match already without any '*' pattern.
1277 return aRes;
1278
1279 bRewind = true;
1280 nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
1281 cPattern = cPatternAfterAsterisk;
1282 bEscaped = bEscapedAfterAsterisk;
1283 searchStr.iterateCodePoints( &nStr);
1284 nString = nStr; // Restore incremented remembered string position.
1285 if (nPat == nAfterFakePattern)
1286 {
1287 // Next start offset will be the next character.
1288 nStartOffset = nString;
1289 }
1290 }
1291 else
1292 {
1293 // An unescaped '?' pattern matched any character, or characters
1294 // matched. Reset only escaped state.
1295 bEscaped = false;
1296 }
1297 }
1298 while (nString < nEndPos);
1299
1300 if (bRewind)
1301 return aRes;
1302
1303 // Eat trailing '*' pattern that matches anything, including nothing.
1304 // '*' is one code unit, so not using iterateCodePoints() is ok.
1305 while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1306 ++nPattern;
1307
1308 if (nPattern == nPatternLen)
1309 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1310 return aRes;
1311}
1312
1313SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1314{
1315 SearchResult aRes;
1316 aRes.subRegExpressions = 0; // no match
1317
1318 sal_Int32 nStartOffset = nStartPos;
1319 sal_Int32 nEndOffset = nEndPos;
1320
1321 const sal_Int32 nStringLen = searchStr.getLength();
1322
1323 // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1324 // string match with (0,0].
1325 if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
1326 (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1327 return aRes;
1328
1329 const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1330 sal_Int32 nPatternLen = rPattern.getLength();
1331
1332 // Handle special cases empty pattern and/or string outside of the loop to
1333 // not add performance penalties there and simplify.
1334 if (nStartPos == nEndPos)
1335 {
1336 sal_Int32 i = 0;
1337 while (i < nPatternLen && rPattern[i] == '*')
1338 ++i;
1339 if (i == nPatternLen)
1340 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1341 return aRes;
1342 }
1343
1344 // Empty pattern does not match any non-empty string.
1345 if (!nPatternLen)
1346 return aRes;
1347
1348 // Reverse escaped patterns to ease the handling of escapes, keeping escape
1349 // and following character as one sequence in backward direction.
1350 if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
1352 {
1353 OUStringBuffer aPatternBuf( rPattern);
1354 sal_Int32 nIndex = 0;
1355 while (nIndex < nPatternLen)
1356 {
1357 const sal_Int32 nOld = nIndex;
1358 const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
1359 if (cu == mcWildcardEscapeChar)
1360 {
1361 if (nIndex < nPatternLen)
1362 {
1363 if (nIndex - nOld == 1)
1364 {
1365 // Simply move code units, we already memorized the one
1366 // in 'cu'.
1367 const sal_Int32 nOld2 = nIndex;
1368 rPattern.iterateCodePoints( &nIndex);
1369 for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1370 aPatternBuf[nOld+i] = rPattern[nOld2+i];
1371 aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
1372 }
1373 else
1374 {
1375 // Copy the escape character code units first in the
1376 // unlikely case that it would not be of BMP.
1377 assert(nIndex - nOld == 2); // it's UTF-16, so...
1378 sal_Unicode buf[2];
1379 buf[0] = rPattern[nOld];
1380 buf[1] = rPattern[nOld+1];
1381 const sal_Int32 nOld2 = nIndex;
1382 rPattern.iterateCodePoints( &nIndex);
1383 for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1384 aPatternBuf[nOld+i] = rPattern[nOld2+i];
1385 aPatternBuf[nIndex-2] = buf[0];
1386 aPatternBuf[nIndex-1] = buf[1];
1387 }
1388 }
1389 else
1390 {
1391 // Trailing escape would become leading escape, do what?
1392 // Eliminate.
1393 aPatternBuf.remove( nOld, nIndex - nOld);
1394 }
1395 }
1396 }
1398 maWildcardReversePattern = aPatternBuf.makeStringAndClear();
1399 else
1400 maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
1401 }
1402 const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
1403 nPatternLen = rReversePattern.getLength();
1404
1405 bool bRewind = false;
1406 sal_uInt32 cPattern = 0;
1407 sal_Int32 nPattern = nPatternLen;
1408 sal_Int32 nAfterFakePattern = nPattern;
1410 {
1411 // Fake a trailing '*' wildcard.
1412 cPattern = '*';
1413 bRewind = true;
1414 // Assume a non-'*' pattern character follows. If it is a '*' instead
1415 // that will be handled in the loop by setting nPat.
1416 sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1417 if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
1418 rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1419 }
1420
1421 sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1422 sal_uInt32 cPatternAfterAsterisk = 0;
1423 bool bEscaped = false, bEscapedAfterAsterisk = false;
1424
1425 // The loop code tries to avoid costly calls to iterateCodePoints() when
1426 // possible.
1427
1428 do
1429 {
1430 if (bRewind)
1431 {
1432 // Reuse cPattern after '*', nPattern was correspondingly
1433 // decremented to point before cPattern.
1434 bRewind = false;
1435 }
1436 else if (nPattern > 0)
1437 {
1438 // nPattern will be decremented by iterateCodePoints().
1439 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1440 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1441 {
1442 bEscaped = true;
1443 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1444 }
1445 }
1446 else
1447 {
1448 // A trailing '*' is handled below.
1450 {
1451 // If the pattern is consumed and substring match allowed we're good.
1452 setWildcardMatch( aRes, nStartOffset, nString);
1453 return aRes;
1454 }
1455 else if (nString > nEndPos && nLastAsterisk >= 0)
1456 {
1457 // If substring match is not allowed try a greedy '*' match.
1458 nPattern = nLastAsterisk;
1459 continue; // do
1460 }
1461 else
1462 return aRes;
1463 }
1464
1465 if (cPattern == '*' && !bEscaped)
1466 {
1467 // '*' is one code unit, so not using iterateCodePoints() is ok.
1468 while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1469 --nPattern;
1470
1471 if (nPattern <= 0)
1472 {
1473 // First pattern is '*', remaining string matches.
1474 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1475 return aRes;
1476 }
1477
1478 nLastAsterisk = nPattern; // Remember last encountered '*'.
1479
1480 // cPattern will be the previous non-'*' character, nPattern
1481 // decremented.
1482 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1483 if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1484 {
1485 bEscaped = true;
1486 cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1487 }
1488
1489 cPatternAfterAsterisk = cPattern;
1490 bEscapedAfterAsterisk = bEscaped;
1491 nPat = nPattern; // Remember position of pattern before '*', already decremented.
1492 nStr = nString; // Remember the current string to be matched.
1493 }
1494
1495 if (nString <= nEndPos)
1496 // Whatever leads in pattern, string will not match.
1497 return aRes;
1498
1499 // nString will be decremented by iterateCodePoints().
1500 sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
1501
1502 if ((cPattern != '?' || bEscaped) && cPattern != cString)
1503 {
1504 if (nPat == -1)
1505 // Non-match already without any '*' pattern.
1506 return aRes;
1507
1508 bRewind = true;
1509 nPattern = nPat; // Rewind pattern to character before '*', already decremented.
1510 cPattern = cPatternAfterAsterisk;
1511 bEscaped = bEscapedAfterAsterisk;
1512 searchStr.iterateCodePoints( &nStr, -1);
1513 nString = nStr; // Restore decremented remembered string position.
1514 if (nPat == nAfterFakePattern)
1515 {
1516 // Next start offset will be this character (exclusive).
1517 nStartOffset = nString;
1518 }
1519 }
1520 else
1521 {
1522 // An unescaped '?' pattern matched any character, or characters
1523 // matched. Reset only escaped state.
1524 bEscaped = false;
1525 }
1526 }
1527 while (nString > nEndPos);
1528
1529 if (bRewind)
1530 return aRes;
1531
1532 // Eat leading '*' pattern that matches anything, including nothing.
1533 // '*' is one code unit, so not using iterateCodePoints() is ok.
1534 while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1535 --nPattern;
1536
1537 if (nPattern == 0)
1538 setWildcardMatch( aRes, nStartOffset, nEndOffset);
1539 return aRes;
1540}
1541
1542
1543OUString SAL_CALL
1545{
1546 return "com.sun.star.util.TextSearch_i18n";
1547}
1548
1549sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
1550{
1551 return cppu::supportsService(this, rServiceName);
1552}
1553
1554Sequence< OUString > SAL_CALL
1556{
1557 return { "com.sun.star.util.TextSearch", "com.sun.star.util.TextSearch2" };
1558}
1559
1560extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
1562 css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
1563{
1564 return cppu::acquire(new TextSearch(context));
1565}
1566
1567/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Reference< XComponentContext > m_xContext
css::uno::Reference< css::uno::XComponentContext > m_xContext
Definition: textsearch.hxx:51
virtual css::util::SearchResult SAL_CALL searchForward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:301
sal_Int32 startPos
Definition: textsearch.hxx:65
std::unique_ptr< WLevDistance > pWLD
Definition: textsearch.hxx:106
void RESrchPrepare(const css::util::SearchOptions2 &)
Definition: textsearch.cxx:833
FnSrch fnBackward
Definition: textsearch.hxx:68
css::util::SearchResult SAL_CALL RESrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:922
bool bUsePrimarySrchStr
Definition: textsearch.hxx:77
std::unique_ptr< TextSearchJumpTable > pJumpTable
Definition: textsearch.hxx:74
bool bIsForwardTab
Definition: textsearch.hxx:76
OUString maWildcardReversePattern
Definition: textsearch.hxx:118
bool IsDelimiter(const OUString &rStr, sal_Int32 nPos) const
Definition: textsearch.cxx:563
css::util::SearchResult SAL_CALL WildcardSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
OUString maWildcardReversePattern2
Definition: textsearch.hxx:119
css::util::SearchResult SAL_CALL RESrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:976
virtual ~TextSearch() override
Definition: textsearch.cxx:109
css::util::SearchOptions2 aSrchPara
Definition: textsearch.hxx:53
virtual css::util::SearchResult SAL_CALL searchBackward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:440
std::unique_ptr< TextSearchJumpTable > pJumpTable2
Definition: textsearch.hxx:75
virtual void SAL_CALL setOptions2(const css::util::SearchOptions2 &options) override
Definition: textsearch.cxx:117
bool bSearchApostrophe
Definition: textsearch.hxx:71
void MakeBackwardTab()
Definition: textsearch.cxx:633
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit2
Definition: textsearch.hxx:60
OUString sSrchStr2
Definition: textsearch.hxx:55
bool mbWildcardAllowSubstring
Definition: textsearch.hxx:121
css::util::SearchResult SAL_CALL NSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:762
sal_uInt32 mcWildcardEscapeChar
Definition: textsearch.hxx:120
virtual OUString SAL_CALL getImplementationName() override
css::util::SearchResult SAL_CALL ApproxSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
sal_Int32 GetDiff(const sal_Unicode) const
Definition: textsearch.cxx:679
void MakeBackwardTab2()
Definition: textsearch.cxx:656
OUString sSrchStr
Definition: textsearch.hxx:54
sal_Int32 sal_Int32 endPos
Definition: textsearch.hxx:65
FnSrch fnForward
Definition: textsearch.hxx:67
css::util::SearchResult SAL_CALL WildcardSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
void MakeForwardTab()
Definition: textsearch.cxx:582
void MakeForwardTab2()
Definition: textsearch.cxx:608
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit
Definition: textsearch.hxx:59
std::mutex m_aMutex
Definition: textsearch.hxx:50
css::util::SearchResult SAL_CALL NSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:699
virtual void SAL_CALL setOptions(const css::util::SearchOptions &options) override
Definition: textsearch.cxx:258
css::uno::Reference< css::i18n::XCharacterClassification > xCharClass
Definition: textsearch.hxx:57
css::util::SearchResult SAL_CALL ApproxSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
std::unique_ptr< icu::RegexMatcher > pRegexMatcher
Definition: textsearch.hxx:93
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
TextSearch(const css::uno::Reference< css::uno::XComponentContext > &rxContext)
css::uno::Reference< css::i18n::XBreakIterator > xBreak
Definition: textsearch.hxx:107
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:134
float v
float u
sal_Int32 nIndex
sal_Int64 n
sal_uInt16 nPos
#define SAL_WARN(area, stream)
#define SAL_INFO(area, stream)
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
int i
m
const TransliterationFlags COMPLEX_TRANS_MASK
Definition: textsearch.cxx:45
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * i18npool_TextSearch_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
static bool lcl_findRegex(std::unique_ptr< icu::RegexMatcher > const &pRegexMatcher, sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode &rIcuErr)
Definition: textsearch.cxx:901
static sal_Int32 FindPosInSeq_Impl(const Sequence< sal_Int32 > &rOff, sal_Int32 nPos)
Definition: textsearch.cxx:294
::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable
Definition: textsearch.hxx:41
TransliterationFlags
unsigned char sal_Bool
sal_uInt16 sal_Unicode