LibreOffice Module i18npool (master)  1
textsearch.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include "textsearch.hxx"
21 #include "levdis.hxx"
22 #include <com/sun/star/i18n/BreakIterator.hpp>
23 #include <com/sun/star/util/SearchAlgorithms2.hpp>
24 #include <com/sun/star/util/SearchFlags.hpp>
25 #include <com/sun/star/i18n/WordType.hpp>
26 #include <com/sun/star/i18n/ScriptType.hpp>
27 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
28 #include <com/sun/star/i18n/CharacterClassification.hpp>
29 #include <com/sun/star/i18n/KCharacterType.hpp>
30 #include <com/sun/star/i18n/Transliteration.hpp>
32 #include <cppuhelper/weak.hxx>
34 #include <rtl/ustrbuf.hxx>
35 #include <sal/log.hxx>
36 
37 #include <unicode/regex.h>
38 
39 using namespace ::com::sun::star::util;
40 using namespace ::com::sun::star::uno;
41 using namespace ::com::sun::star::lang;
42 using namespace ::com::sun::star::i18n;
43 using namespace ::com::sun::star;
44 
46  TransliterationFlags::ignoreBaFa_ja_JP |
47  TransliterationFlags::ignoreIterationMark_ja_JP |
48  TransliterationFlags::ignoreTiJi_ja_JP |
49  TransliterationFlags::ignoreHyuByu_ja_JP |
50  TransliterationFlags::ignoreSeZe_ja_JP |
51  TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
52  TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
53  TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
54 
55 namespace
56 {
58 {
59  // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
60  // in complex transliteration.
61  return
62  n & (COMPLEX_TRANS_MASK | // all set ignore bits
63  TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit
64  TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
65 }
66 
67 bool isComplexTrans( TransliterationFlags n )
68 {
69  return bool(n & COMPLEX_TRANS_MASK);
70 }
71 
72 TransliterationFlags maskSimpleTrans( TransliterationFlags n )
73 {
74  return n & ~COMPLEX_TRANS_MASK;
75 }
76 
77 bool isSimpleTrans( TransliterationFlags n )
78 {
79  return bool(maskSimpleTrans(n));
80 }
81 
82 // Regex patterns are case sensitive.
83 TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
84 {
85  TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
86  TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
87  if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
88  v = TransliterationFlags::NONE;
89  return (m | v) & ~COMPLEX_TRANS_MASK;
90 }
91 
92 bool isSimpleRegexTrans( TransliterationFlags n )
93 {
94  return bool(maskSimpleRegexTrans(n));
95 }
96 };
97 
99  : m_xContext( rxContext )
100 {
101  SearchOptions2 aOpt;
102  aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
103  aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
104  aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
105  //aOpt.Locale = ???;
106  setOptions( aOpt );
107 }
108 
110 {
111  pRegexMatcher.reset();
112  pWLD.reset();
113  pJumpTable.reset();
114  pJumpTable2.reset();
115 }
116 
117 void TextSearch::setOptions2( const SearchOptions2& rOptions )
118 {
119  osl::MutexGuard g(m_aMutex);
120 
121  aSrchPara = rOptions;
122 
123  pRegexMatcher.reset();
124  pWLD.reset();
125  pJumpTable.reset();
126  pJumpTable2.reset();
127  maWildcardReversePattern.clear();
129  TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
130  bSearchApostrophe = false;
131  bool bReplaceApostrophe = false;
132  if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
133  {
134  // RESrchPrepare will consider aSrchPara.transliterateFlags when
135  // picking the actual regex pattern
136  // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
137  // case-insensitivity. Create transliteration instance, if any, without
138  // ignore-case so later in TextSearch::searchForward() the string to
139  // match is not case-altered, leave case-(in)sensitive to regex engine.
140  transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
141  }
142  else if ( aSrchPara.searchString.indexOf('\'') > - 1 )
143  {
144  bSearchApostrophe = true;
145  bReplaceApostrophe = aSrchPara.searchString.indexOf(u'\u2019') > -1;
146  }
147 
148  // Create Transliteration class
149  if( isSimpleTrans( transliterateFlags) )
150  {
151  if( !xTranslit.is() )
152  xTranslit.set( Transliteration::create( m_xContext ) );
153  xTranslit->loadModule(
154  static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
155  aSrchPara.Locale);
156  }
157  else if( xTranslit.is() )
158  xTranslit = nullptr;
159 
160  // Create Transliteration for 2<->1, 2<->2 transliteration
161  if ( isComplexTrans( transliterateFlags) )
162  {
163  if( !xTranslit2.is() )
164  xTranslit2.set( Transliteration::create( m_xContext ) );
165  // Load transliteration module
166  xTranslit2->loadModule(
167  static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
168  aSrchPara.Locale);
169  }
170 
171  if ( !xBreak.is() )
172  xBreak = css::i18n::BreakIterator::create( m_xContext );
173 
174  sSrchStr = aSrchPara.searchString;
175 
176  // Transliterate search string.
177  if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
178  {
179  if (isSimpleRegexTrans(transliterateFlags))
180  {
181  if (maskSimpleRegexTrans(transliterateFlags) !=
182  maskSimpleTrans(transliterateFlags))
183  {
184  css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
185  Transliteration::create( m_xContext ));
186  if (xTranslitPattern.is())
187  {
188  xTranslitPattern->loadModule(
189  static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
190  aSrchPara.Locale);
191  sSrchStr = xTranslitPattern->transliterateString2String(
192  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
193  }
194  }
195  else
196  {
197  if (xTranslit.is())
198  sSrchStr = xTranslit->transliterateString2String(
199  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
200  }
201  // xTranslit2 complex transliterated sSrchStr2 is not used in
202  // regex, see TextSearch::searchForward() and
203  // TextSearch::searchBackward()
204  }
205  }
206  else
207  {
208  if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
209  sSrchStr = xTranslit->transliterateString2String(
210  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
211 
212  if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
213  sSrchStr2 = xTranslit2->transliterateString2String(
214  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
215  }
216 
217  // When start or end of search string is a complex script type, we need to
218  // make sure the result boundary is not located in the middle of cell.
219  checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
220  ScriptType::COMPLEX));
221  checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
222  sSrchStr.getLength()-1) == ScriptType::COMPLEX));
223 
224  if ( bReplaceApostrophe )
225  sSrchStr = sSrchStr.replace(u'\u2019', '\'');
226 
227  // Take the new SearchOptions2::AlgorithmType2 field and ignore
228  // SearchOptions::algorithmType
229  switch( aSrchPara.AlgorithmType2)
230  {
231  case SearchAlgorithms2::REGEXP:
235  break;
236 
237  case SearchAlgorithms2::APPROXIMATE:
240 
241  pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
242  aSrchPara.insertedChars, aSrchPara.deletedChars,
243  0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
244 
245  nLimit = pWLD->GetLimit();
246  break;
247 
248  case SearchAlgorithms2::WILDCARD:
249  mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
250  mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
253  break;
254 
255  default:
256  SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
257  [[fallthrough]];
258  case SearchAlgorithms2::ABSOLUTE:
261  break;
262  }
263 }
264 
265 void TextSearch::setOptions( const SearchOptions& rOptions )
266 {
267  osl::MutexGuard g(m_aMutex);
268 
269  sal_Int16 nAlgorithmType2;
270  switch (rOptions.algorithmType)
271  {
272  case SearchAlgorithms_REGEXP:
273  nAlgorithmType2 = SearchAlgorithms2::REGEXP;
274  break;
275  case SearchAlgorithms_APPROXIMATE:
276  nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
277  break;
278  default:
279  SAL_WARN("i18npool","TextSearch::setOptions - default what?");
280  [[fallthrough]];
281  case SearchAlgorithms_ABSOLUTE:
282  nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
283  break;
284  }
285  // It would be nice if an inherited struct had a ctor that takes an
286  // instance of the object the struct derived from...
287  SearchOptions2 aOptions2(
288  rOptions.algorithmType,
289  rOptions.searchFlag,
290  rOptions.searchString,
291  rOptions.replaceString,
292  rOptions.Locale,
293  rOptions.changedChars,
294  rOptions.deletedChars,
295  rOptions.insertedChars,
296  rOptions.transliterateFlags,
297  nAlgorithmType2,
298  0 // no wildcard search, no escape character...
299  );
300  setOptions2( aOptions2);
301 }
302 
303 static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
304 {
305  auto pOff = std::find_if(rOff.begin(), rOff.end(),
306  [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
307  return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
308 }
309 
310 bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
311 {
312  sal_Int32 nDone;
313  return nPos == xBreak->previousCharacters(searchStr, nPos+1,
314  aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
315 }
316 
317 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
318 {
319  osl::MutexGuard g(m_aMutex);
320 
321  SearchResult sres;
322 
323  OUString in_str(searchStr);
324 
325  // in non-regex mode, allow searching typographical apostrophe with the ASCII one
326  // to avoid regression after using automatic conversion to U+2019 during typing in Writer
327  bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
328 
329  bUsePrimarySrchStr = true;
330 
331  if ( xTranslit.is() )
332  {
333  // apply normal transliteration (1<->1, 1<->0)
334 
335  sal_Int32 nInStartPos = startPos;
336  if (pRegexMatcher && startPos > 0)
337  {
338  // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
339  // transliterate enough of the leading text to allow sensible look-behind assertions.
340  // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
341  // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
342  // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
343  // the middle of a surrogate pair, preceded by another surrogate pair.
344  const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
345  nInStartPos -= std::min(nMaxLeadingLen, startPos);
346  }
347  sal_Int32 nInEndPos = endPos;
348  if (pRegexMatcher && endPos < searchStr.getLength())
349  {
350  // tdf#65038: ditto for look-ahead assertions
351  const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
352  nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
353  }
354 
355  css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
356  in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
357 
358  if ( bReplaceApostrophe )
359  in_str = in_str.replace(u'\u2019', '\'');
360 
361  // JP 20.6.2001: also the start and end positions must be corrected!
362  sal_Int32 newStartPos =
363  (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
364 
365  sal_Int32 newEndPos = (endPos < searchStr.getLength())
366  ? FindPosInSeq_Impl( offset, endPos )
367  : in_str.getLength();
368 
369  sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
370 
371  // Map offsets back to untransliterated string.
372  const sal_Int32 nOffsets = offset.getLength();
373  if (nOffsets)
374  {
375  // For regex nGroups is the number of groups+1 with group 0 being
376  // the entire match.
377  const sal_Int32 nGroups = sres.startOffset.getLength();
378  for ( sal_Int32 k = 0; k < nGroups; k++ )
379  {
380  const sal_Int32 nStart = sres.startOffset[k];
381  // Result offsets are negative (-1) if a group expression was
382  // not matched.
383  if (nStart >= 0)
384  sres.startOffset[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
385  // JP 20.6.2001: end is ever exclusive and then don't return
386  // the position of the next character - return the
387  // next position behind the last found character!
388  // "a b c" find "b" must return 2,3 and not 2,4!!!
389  const sal_Int32 nStop = sres.endOffset[k];
390  if (nStop >= 0)
391  {
392  if (nStop > 0)
393  sres.endOffset[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
394  else
395  sres.endOffset[k] = offset[0];
396  }
397  }
398  }
399  }
400  else
401  {
402  if ( bReplaceApostrophe )
403  in_str = in_str.replace(u'\u2019', '\'');
404 
405  sres = (this->*fnForward)( in_str, startPos, endPos );
406  }
407 
408  if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
409  {
410  SearchResult sres2;
411 
412  in_str = searchStr;
413  css::uno::Sequence <sal_Int32> offset( in_str.getLength());
414 
415  in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
416 
417  if( startPos )
418  startPos = FindPosInSeq_Impl( offset, startPos );
419 
420  if( endPos < searchStr.getLength() )
421  endPos = FindPosInSeq_Impl( offset, endPos );
422  else
423  endPos = in_str.getLength();
424 
425  bUsePrimarySrchStr = false;
426  sres2 = (this->*fnForward)( in_str, startPos, endPos );
427 
428  for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
429  {
430  if (sres2.startOffset[k])
431  sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
432  if (sres2.endOffset[k])
433  sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
434  }
435 
436  // pick first and long one
437  if ( sres.subRegExpressions == 0)
438  return sres2;
439  if ( sres2.subRegExpressions == 1)
440  {
441  if ( sres.startOffset[0] > sres2.startOffset[0])
442  return sres2;
443  else if ( sres.startOffset[0] == sres2.startOffset[0] &&
444  sres.endOffset[0] < sres2.endOffset[0])
445  return sres2;
446  }
447  }
448 
449  return sres;
450 }
451 
452 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
453 {
454  osl::MutexGuard g(m_aMutex);
455 
456  SearchResult sres;
457 
458  OUString in_str(searchStr);
459 
460  // in non-regex mode, allow searching typographical apostrophe with the ASCII one
461  // to avoid regression after using automatic conversion to U+2019 during typing in Writer
462  bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
463 
464  bUsePrimarySrchStr = true;
465 
466  if ( xTranslit.is() )
467  {
468  // apply only simple 1<->1 transliteration here
469  css::uno::Sequence<sal_Int32> offset(startPos - endPos);
470  in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
471 
472  if ( bReplaceApostrophe )
473  in_str = in_str.replace(u'\u2019', '\'');
474 
475  // JP 20.6.2001: also the start and end positions must be corrected!
476  sal_Int32 const newStartPos = (startPos < searchStr.getLength())
477  ? FindPosInSeq_Impl( offset, startPos )
478  : in_str.getLength();
479 
480  sal_Int32 const newEndPos =
481  (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
482 
483  // TODO: this would need nExtraOffset handling to avoid $ matching
484  // if (pRegexMatcher && startPos < searchStr.getLength())
485  // but that appears to be impossible with ICU regex
486 
487  sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
488 
489  // Map offsets back to untransliterated string.
490  const sal_Int32 nOffsets = offset.getLength();
491  if (nOffsets)
492  {
493  // For regex nGroups is the number of groups+1 with group 0 being
494  // the entire match.
495  const sal_Int32 nGroups = sres.startOffset.getLength();
496  for ( sal_Int32 k = 0; k < nGroups; k++ )
497  {
498  const sal_Int32 nStart = sres.startOffset[k];
499  // Result offsets are negative (-1) if a group expression was
500  // not matched.
501  if (nStart >= 0)
502  {
503  if (nStart > 0)
504  sres.startOffset[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
505  else
506  sres.startOffset[k] = offset[0];
507  }
508  // JP 20.6.2001: end is ever exclusive and then don't return
509  // the position of the next character - return the
510  // next position behind the last found character!
511  // "a b c" find "b" must return 2,3 and not 2,4!!!
512  const sal_Int32 nStop = sres.endOffset[k];
513  if (nStop >= 0)
514  sres.endOffset[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
515  }
516  }
517  }
518  else
519  {
520  if ( bReplaceApostrophe )
521  in_str = in_str.replace(u'\u2019', '\'');
522 
523  sres = (this->*fnBackward)( in_str, startPos, endPos );
524  }
525 
526  if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
527  {
528  SearchResult sres2;
529 
530  in_str = searchStr;
531  css::uno::Sequence <sal_Int32> offset( in_str.getLength());
532 
533  in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
534 
535  if( startPos < searchStr.getLength() )
536  startPos = FindPosInSeq_Impl( offset, startPos );
537  else
538  startPos = in_str.getLength();
539 
540  if( endPos )
541  endPos = FindPosInSeq_Impl( offset, endPos );
542 
543  bUsePrimarySrchStr = false;
544  sres2 = (this->*fnBackward)( in_str, startPos, endPos );
545 
546  for( int k = 0; k < sres2.startOffset.getLength(); k++ )
547  {
548  if (sres2.startOffset[k])
549  sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
550  if (sres2.endOffset[k])
551  sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
552  }
553 
554  // pick last and long one
555  if ( sres.subRegExpressions == 0 )
556  return sres2;
557  if ( sres2.subRegExpressions == 1 )
558  {
559  if ( sres.startOffset[0] < sres2.startOffset[0] )
560  return sres2;
561  if ( sres.startOffset[0] == sres2.startOffset[0] &&
562  sres.endOffset[0] > sres2.endOffset[0] )
563  return sres2;
564  }
565  }
566 
567  return sres;
568 }
569 
570 
571 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
572 {
573  bool bRet = true;
574  if( '\x7f' != rStr[nPos])
575  {
576  if ( !xCharClass.is() )
577  xCharClass = CharacterClassification::create( m_xContext );
578  sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
579  aSrchPara.Locale );
580  if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
581  KCharacterType::LETTER ) & nCType ) )
582  bRet = false;
583  }
584  return bRet;
585 }
586 
587 // --------- helper methods for Boyer-Moore like text searching ----------
588 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
589 
591 {
592  // create the jumptable for the search text
593 
594  if( pJumpTable && bIsForwardTab )
595  {
596  return; // the jumpTable is ok
597  }
598  bIsForwardTab = true;
599 
600  sal_Int32 n, nLen = sSrchStr.getLength();
601  pJumpTable.reset( new TextSearchJumpTable );
602 
603  for( n = 0; n < nLen - 1; ++n )
604  {
605  sal_Unicode cCh = sSrchStr[n];
606  sal_Int32 nDiff = nLen - n - 1;
607  TextSearchJumpTable::value_type aEntry( cCh, nDiff );
608 
609  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
610  pJumpTable->insert( aEntry );
611  if ( !aPair.second )
612  (*(aPair.first)).second = nDiff;
613  }
614 }
615 
617 {
618  // create the jumptable for the search text
619  if( pJumpTable2 && bIsForwardTab )
620  {
621  return; // the jumpTable is ok
622  }
623  bIsForwardTab = true;
624 
625  sal_Int32 n, nLen = sSrchStr2.getLength();
626  pJumpTable2.reset( new TextSearchJumpTable );
627 
628  for( n = 0; n < nLen - 1; ++n )
629  {
630  sal_Unicode cCh = sSrchStr2[n];
631  sal_Int32 nDiff = nLen - n - 1;
632 
633  TextSearchJumpTable::value_type aEntry( cCh, nDiff );
634  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
635  pJumpTable2->insert( aEntry );
636  if ( !aPair.second )
637  (*(aPair.first)).second = nDiff;
638  }
639 }
640 
642 {
643  // create the jumptable for the search text
644  if( pJumpTable && !bIsForwardTab)
645  {
646  return; // the jumpTable is ok
647  }
648  bIsForwardTab = false;
649 
650  sal_Int32 n, nLen = sSrchStr.getLength();
651  pJumpTable.reset( new TextSearchJumpTable );
652 
653  for( n = nLen-1; n > 0; --n )
654  {
655  sal_Unicode cCh = sSrchStr[n];
656  TextSearchJumpTable::value_type aEntry( cCh, n );
657  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
658  pJumpTable->insert( aEntry );
659  if ( !aPair.second )
660  (*(aPair.first)).second = n;
661  }
662 }
663 
665 {
666  // create the jumptable for the search text
667  if( pJumpTable2 && !bIsForwardTab )
668  {
669  return; // the jumpTable is ok
670  }
671  bIsForwardTab = false;
672 
673  sal_Int32 n, nLen = sSrchStr2.getLength();
674  pJumpTable2.reset( new TextSearchJumpTable );
675 
676  for( n = nLen-1; n > 0; --n )
677  {
678  sal_Unicode cCh = sSrchStr2[n];
679  TextSearchJumpTable::value_type aEntry( cCh, n );
680  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
681  pJumpTable2->insert( aEntry );
682  if ( !aPair.second )
683  (*(aPair.first)).second = n;
684  }
685 }
686 
687 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
688 {
689  TextSearchJumpTable *pJump;
690  OUString sSearchKey;
691 
692  if ( bUsePrimarySrchStr ) {
693  pJump = pJumpTable.get();
694  sSearchKey = sSrchStr;
695  } else {
696  pJump = pJumpTable2.get();
697  sSearchKey = sSrchStr2;
698  }
699 
700  TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
701  if ( iLook == pJump->end() )
702  return sSearchKey.getLength();
703  return (*iLook).second;
704 }
705 
706 
707 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
708 {
709  SearchResult aRet;
710  aRet.subRegExpressions = 0;
711 
712  OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
713 
714  sal_Int32 nSuchIdx = searchStr.getLength();
715  sal_Int32 nEnd = endPos;
716  if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
717  return aRet;
718 
719 
720  if( nEnd < sSearchKey.getLength() ) // position inside the search region ?
721  return aRet;
722 
723  nEnd -= sSearchKey.getLength();
724 
725  if (bUsePrimarySrchStr)
726  MakeForwardTab(); // create the jumptable
727  else
728  MakeForwardTab2();
729 
730  for (sal_Int32 nCmpIdx = startPos; // start position for the search
731  nCmpIdx <= nEnd;
732  nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
733  {
734  // if the match would be the completed cells, skip it.
735  if ( (checkCTLStart && !isCellStart( searchStr, nCmpIdx )) || (checkCTLEnd
736  && !isCellStart( searchStr, nCmpIdx + sSearchKey.getLength())) )
737  continue;
738 
739  nSuchIdx = sSearchKey.getLength() - 1;
740  while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
741  {
742  if( nSuchIdx == 0 )
743  {
744  if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
745  {
746  sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
747  bool bAtStart = !nCmpIdx;
748  bool bAtEnd = nFndEnd == endPos;
749  bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
750  bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd );
751  // * 1 -> only one word in the paragraph
752  // * 2 -> at begin of paragraph
753  // * 3 -> at end of paragraph
754  // * 4 -> inside the paragraph
755  if( !( ( bAtStart && bAtEnd ) || // 1
756  ( bAtStart && bDelimBehind ) || // 2
757  ( bAtEnd && bDelimBefore ) || // 3
758  ( bDelimBefore && bDelimBehind ))) // 4
759  break;
760  }
761 
762  aRet.subRegExpressions = 1;
763  aRet.startOffset.realloc( 1 );
764  aRet.startOffset[ 0 ] = nCmpIdx;
765  aRet.endOffset.realloc( 1 );
766  aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
767 
768  return aRet;
769  }
770  else
771  nSuchIdx--;
772  }
773  }
774  return aRet;
775 }
776 
777 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
778 {
779  SearchResult aRet;
780  aRet.subRegExpressions = 0;
781 
782  OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
783 
784  sal_Int32 nSuchIdx = searchStr.getLength();
785  sal_Int32 nEnd = endPos;
786  if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
787  return aRet;
788 
789  if (bUsePrimarySrchStr)
790  MakeBackwardTab(); // create the jumptable
791  else
793 
794  if( nEnd == nSuchIdx ) // end position for the search
795  nEnd = sSearchKey.getLength();
796  else
797  nEnd += sSearchKey.getLength();
798 
799  sal_Int32 nCmpIdx = startPos; // start position for the search
800 
801  while (nCmpIdx >= nEnd)
802  {
803  // if the match would be the completed cells, skip it.
804  if ( (!checkCTLStart || isCellStart( searchStr, nCmpIdx -
805  sSearchKey.getLength() )) && (!checkCTLEnd ||
806  isCellStart( searchStr, nCmpIdx)))
807  {
808  nSuchIdx = 0;
809  while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
810  searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
811  nSuchIdx++;
812  if( nSuchIdx >= sSearchKey.getLength() )
813  {
814  if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
815  {
816  sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
817  bool bAtStart = !nFndStt;
818  bool bAtEnd = nCmpIdx == startPos;
819  bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
820  bool bDelimBefore = bAtStart || // begin of paragraph
821  IsDelimiter( searchStr, nFndStt-1 );
822  // * 1 -> only one word in the paragraph
823  // * 2 -> at begin of paragraph
824  // * 3 -> at end of paragraph
825  // * 4 -> inside the paragraph
826  if( ( bAtStart && bAtEnd ) || // 1
827  ( bAtStart && bDelimBehind ) || // 2
828  ( bAtEnd && bDelimBefore ) || // 3
829  ( bDelimBefore && bDelimBehind )) // 4
830  {
831  aRet.subRegExpressions = 1;
832  aRet.startOffset.realloc( 1 );
833  aRet.startOffset[ 0 ] = nCmpIdx;
834  aRet.endOffset.realloc( 1 );
835  aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
836  return aRet;
837  }
838  }
839  else
840  {
841  aRet.subRegExpressions = 1;
842  aRet.startOffset.realloc( 1 );
843  aRet.startOffset[ 0 ] = nCmpIdx;
844  aRet.endOffset.realloc( 1 );
845  aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
846  return aRet;
847  }
848  }
849  }
850  nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
851  if( nCmpIdx < nSuchIdx )
852  return aRet;
853  nCmpIdx -= nSuchIdx;
854  }
855  return aRet;
856 }
857 
858 void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
859 {
860  TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
861  // select the transliterated pattern string
862  const OUString& rPatternStr =
863  (isSimpleTrans(transliterateFlags) ? sSrchStr
864  : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
865 
866  sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
867  // map css::util::SearchFlags to ICU uregex.h flags
868  // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
869  // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
870  // REG_NOSUB is not used anywhere => not implemented
871  // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
872  // LEV_RELAXED is only used for SearchAlgorithm==Approximate
873  // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
874  // probably because the transliteration flag IGNORE_CASE handles it as well.
875  if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
876  || (transliterateFlags & TransliterationFlags::IGNORE_CASE))
877  nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
878  UErrorCode nIcuErr = U_ZERO_ERROR;
879  // assumption: transliteration didn't mangle regexp control chars
880  IcuUniString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
881 #ifndef DISABLE_WORDBOUND_EMULATION
882  // for convenience specific syntax elements of the old regex engine are emulated
883  // - by replacing < with "word-break followed by a look-ahead word-char"
884  static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
885  static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
886  static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
887  aChevronMatcherB.reset( aIcuSearchPatStr);
888  aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
889  aChevronMatcherB.reset();
890  // - by replacing > with "look-behind word-char followed by a word-break"
891  static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
892  static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
893  static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
894  aChevronMatcherE.reset( aIcuSearchPatStr);
895  aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
896  aChevronMatcherE.reset();
897 #endif
898  pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
899  if (nIcuErr)
900  {
901  SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
902  pRegexMatcher.reset();
903  }
904  else
905  {
906  // Pathological patterns may result in exponential run time making the
907  // application appear to be frozen. Limit that. Documentation for this
908  // call says
909  // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
910  // "The units of the limit are steps of the match engine.
911  // Correspondence with actual processor time will depend on the speed
912  // of the processor and the details of the specific pattern, but will
913  // typically be on the order of milliseconds."
914  // Just what is a good value? 42 is always an answer ... the 23 enigma
915  // as well... which on the dev's machine is roughly 50 seconds with the
916  // pattern of fdo#70627.
917  /* TODO: make this a configuration settable value and possibly take
918  * complexity of expression into account and maybe even length of text
919  * to be matched; currently (2013-11-25) that is at most one 64k
920  * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
921  pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
922  }
923 }
924 
925 
926 static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
927  sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
928 {
929  pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
930  pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
931  pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
932  // look-ahead/behind assertions
933 
934  if (!pRegexMatcher->find(rIcuErr))
935  {
936  /* TODO: future versions could pass the UErrorCode or translations
937  * thereof to the caller, for example to inform the user of
938  * U_REGEX_TIME_OUT. The strange thing though is that an error is set
939  * only after the second call that returns immediately and not if
940  * timeout occurred on the first call?!? */
941  SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
942  return false;
943  }
944  return true;
945 }
946 
947 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
948  sal_Int32 startPos, sal_Int32 endPos )
949 {
950  SearchResult aRet;
951  aRet.subRegExpressions = 0;
952  if( !pRegexMatcher)
953  return aRet;
954 
955  if( endPos > searchStr.getLength())
956  endPos = searchStr.getLength();
957 
958  // use the ICU RegexMatcher to find the matches
959  UErrorCode nIcuErr = U_ZERO_ERROR;
960  const IcuUniString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
961  searchStr.getLength());
962  pRegexMatcher->reset( aSearchTargetStr);
963  // search until there is a valid match
964  for(;;)
965  {
966  if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
967  return aRet;
968 
969  // #i118887# ignore zero-length matches e.g. "a*" in "bc"
970  int nStartOfs = pRegexMatcher->start( nIcuErr);
971  int nEndOfs = pRegexMatcher->end( nIcuErr);
972  if( nStartOfs < nEndOfs)
973  break;
974  // If the zero-length match is behind the string, do not match it again
975  // and again until startPos reaches there. A match behind the string is
976  // a "$" anchor.
977  if (nStartOfs == endPos)
978  break;
979  // try at next position if there was a zero-length match
980  if( ++startPos >= endPos)
981  return aRet;
982  }
983 
984  // extract the result of the search
985  const int nGroupCount = pRegexMatcher->groupCount();
986  aRet.subRegExpressions = nGroupCount + 1;
987  aRet.startOffset.realloc( aRet.subRegExpressions);
988  aRet.endOffset.realloc( aRet.subRegExpressions);
989  aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
990  aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
991  for( int i = 1; i <= nGroupCount; ++i) {
992  aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
993  aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr);
994  }
995 
996  return aRet;
997 }
998 
999 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
1000  sal_Int32 startPos, sal_Int32 endPos )
1001 {
1002  // NOTE: for backwards search callers provide startPos/endPos inverted!
1003  SearchResult aRet;
1004  aRet.subRegExpressions = 0;
1005  if( !pRegexMatcher)
1006  return aRet;
1007 
1008  if( startPos > searchStr.getLength())
1009  startPos = searchStr.getLength();
1010 
1011  // use the ICU RegexMatcher to find the matches
1012  // TODO: use ICU's backward searching once it becomes available
1013  // as its replacement using forward search is not as good as the real thing
1014  UErrorCode nIcuErr = U_ZERO_ERROR;
1015  const IcuUniString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
1016  searchStr.getLength());
1017  pRegexMatcher->reset( aSearchTargetStr);
1018  if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
1019  return aRet;
1020 
1021  // find the last match
1022  int nLastPos = 0;
1023  int nFoundEnd = 0;
1024  int nGoodPos = 0, nGoodEnd = 0;
1025  bool bFirst = true;
1026  do {
1027  nLastPos = pRegexMatcher->start( nIcuErr);
1028  nFoundEnd = pRegexMatcher->end( nIcuErr);
1029  if (nLastPos < nFoundEnd)
1030  {
1031  // remember last non-zero-length match
1032  nGoodPos = nLastPos;
1033  nGoodEnd = nFoundEnd;
1034  }
1035  if( nFoundEnd >= startPos)
1036  break;
1037  bFirst = false;
1038  if( nFoundEnd == nLastPos)
1039  ++nFoundEnd;
1040  } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
1041 
1042  // Ignore all zero-length matches except "$" anchor on first match.
1043  if (nGoodPos == nGoodEnd)
1044  {
1045  if (bFirst && nLastPos == startPos)
1046  nGoodPos = nLastPos;
1047  else
1048  return aRet;
1049  }
1050 
1051  // find last match again to get its details
1052  lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
1053 
1054  // fill in the details of the last match
1055  const int nGroupCount = pRegexMatcher->groupCount();
1056  aRet.subRegExpressions = nGroupCount + 1;
1057  aRet.startOffset.realloc( aRet.subRegExpressions);
1058  aRet.endOffset.realloc( aRet.subRegExpressions);
1059  // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1060  aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
1061  aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
1062  for( int i = 1; i <= nGroupCount; ++i) {
1063  aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
1064  aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr);
1065  }
1066 
1067  return aRet;
1068 }
1069 
1070 
1071 // search for words phonetically
1072 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
1073  sal_Int32 startPos, sal_Int32 endPos )
1074 {
1075  SearchResult aRet;
1076  aRet.subRegExpressions = 0;
1077 
1078  if( !xBreak.is() )
1079  return aRet;
1080 
1081  sal_Int32 nStt, nEnd;
1082 
1083  Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1084  aSrchPara.Locale,
1085  WordType::ANYWORD_IGNOREWHITESPACES, true );
1086 
1087  do
1088  {
1089  if( aWBnd.startPos >= endPos )
1090  break;
1091  nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
1092  nEnd = std::min(aWBnd.endPos, endPos);
1093 
1094  if( nStt < nEnd &&
1095  pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1096  {
1097  aRet.subRegExpressions = 1;
1098  aRet.startOffset.realloc( 1 );
1099  aRet.startOffset[ 0 ] = nStt;
1100  aRet.endOffset.realloc( 1 );
1101  aRet.endOffset[ 0 ] = nEnd;
1102  break;
1103  }
1104 
1105  nStt = nEnd - 1;
1106  aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
1107  WordType::ANYWORD_IGNOREWHITESPACES);
1108  } while( aWBnd.startPos != aWBnd.endPos ||
1109  (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
1110  // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1111  // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1112  // and nextWord() does also => don't loop forever.
1113  return aRet;
1114 }
1115 
1116 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
1117  sal_Int32 startPos, sal_Int32 endPos )
1118 {
1119  SearchResult aRet;
1120  aRet.subRegExpressions = 0;
1121 
1122  if( !xBreak.is() )
1123  return aRet;
1124 
1125  sal_Int32 nStt, nEnd;
1126 
1127  Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1128  aSrchPara.Locale,
1129  WordType::ANYWORD_IGNOREWHITESPACES, true );
1130 
1131  do
1132  {
1133  if( aWBnd.endPos <= endPos )
1134  break;
1135  nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
1136  nEnd = std::min(aWBnd.endPos, startPos);
1137 
1138  if( nStt < nEnd &&
1139  pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1140  {
1141  aRet.subRegExpressions = 1;
1142  aRet.startOffset.realloc( 1 );
1143  aRet.startOffset[ 0 ] = nEnd;
1144  aRet.endOffset.realloc( 1 );
1145  aRet.endOffset[ 0 ] = nStt;
1146  break;
1147  }
1148  if( !nStt )
1149  break;
1150 
1151  aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
1152  WordType::ANYWORD_IGNOREWHITESPACES);
1153  } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
1154  return aRet;
1155 }
1156 
1157 
1158 namespace {
1159 void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
1160 {
1161  rRes.subRegExpressions = 1;
1162  rRes.startOffset.realloc(1);
1163  rRes.endOffset.realloc(1);
1164  rRes.startOffset[0] = nStartOffset;
1165  rRes.endOffset[0] = nEndOffset;
1166 }
1167 }
1168 
1169 SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1170 {
1171  SearchResult aRes;
1172  aRes.subRegExpressions = 0; // no match
1173  sal_Int32 nStartOffset = nStartPos;
1174  sal_Int32 nEndOffset = nEndPos;
1175 
1176  const sal_Int32 nStringLen = searchStr.getLength();
1177 
1178  // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1179  // string match with [0,0).
1180  if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
1181  (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1182  return aRes;
1183 
1184  const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1185  const sal_Int32 nPatternLen = rPattern.getLength();
1186 
1187  // Handle special cases empty pattern and/or string outside of the loop to
1188  // not add performance penalties there and simplify.
1189  if (nStartPos == nEndPos)
1190  {
1191  sal_Int32 i = 0;
1192  while (i < nPatternLen && rPattern[i] == '*')
1193  ++i;
1194  if (i == nPatternLen)
1195  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1196  return aRes;
1197  }
1198 
1199  // Empty pattern does not match any non-empty string.
1200  if (!nPatternLen)
1201  return aRes;
1202 
1203  bool bRewind = false;
1204  sal_uInt32 cPattern = 0;
1205  sal_Int32 nPattern = 0;
1206  sal_Int32 nAfterFakePattern = nPattern;
1208  {
1209  // Fake a leading '*' wildcard.
1210  cPattern = '*';
1211  bRewind = true;
1212  // Assume a non-'*' pattern character follows. If it is a '*' instead
1213  // that will be handled in the loop by setting nPat.
1214  sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
1215  if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
1216  rPattern.iterateCodePoints( &nAfterFakePattern);
1217  }
1218 
1219  sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1220  sal_uInt32 cPatternAfterAsterisk = 0;
1221  bool bEscaped = false, bEscapedAfterAsterisk = false;
1222 
1223  // The loop code tries to avoid costly calls to iterateCodePoints() when
1224  // possible.
1225 
1226  do
1227  {
1228  if (bRewind)
1229  {
1230  // Reuse cPattern after '*', nPattern was correspondingly
1231  // incremented to point behind cPattern.
1232  bRewind = false;
1233  }
1234  else if (nPattern < nPatternLen)
1235  {
1236  // nPattern will be incremented by iterateCodePoints().
1237  cPattern = rPattern.iterateCodePoints( &nPattern);
1238  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1239  {
1240  bEscaped = true;
1241  cPattern = rPattern.iterateCodePoints( &nPattern);
1242  }
1243  }
1244  else
1245  {
1246  // A trailing '*' is handled below.
1248  {
1249  // If the pattern is consumed and substring match allowed we're good.
1250  setWildcardMatch( aRes, nStartOffset, nString);
1251  return aRes;
1252  }
1253  else if (nString < nEndPos && nLastAsterisk >= 0)
1254  {
1255  // If substring match is not allowed try a greedy '*' match.
1256  nPattern = nLastAsterisk;
1257  continue; // do
1258  }
1259  else
1260  return aRes;
1261  }
1262 
1263  if (cPattern == '*' && !bEscaped)
1264  {
1265  // '*' is one code unit, so not using iterateCodePoints() is ok.
1266  while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1267  ++nPattern;
1268 
1269  if (nPattern >= nPatternLen)
1270  {
1271  // Last pattern is '*', remaining string matches.
1272  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1273  return aRes;
1274  }
1275 
1276  nLastAsterisk = nPattern; // Remember last encountered '*'.
1277 
1278  // cPattern will be the next non-'*' character, nPattern
1279  // incremented.
1280  cPattern = rPattern.iterateCodePoints( &nPattern);
1281  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1282  {
1283  bEscaped = true;
1284  cPattern = rPattern.iterateCodePoints( &nPattern);
1285  }
1286 
1287  cPatternAfterAsterisk = cPattern;
1288  bEscapedAfterAsterisk = bEscaped;
1289  nPat = nPattern; // Remember position of pattern behind '*', already incremented.
1290  nStr = nString; // Remember the current string to be matched.
1291  }
1292 
1293  if (nString >= nEndPos)
1294  // Whatever follows in pattern, string will not match.
1295  return aRes;
1296 
1297  // nString will be incremented by iterateCodePoints().
1298  sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
1299 
1300  if ((cPattern != '?' || bEscaped) && cPattern != cString)
1301  {
1302  if (nPat == -1)
1303  // Non-match already without any '*' pattern.
1304  return aRes;
1305 
1306  bRewind = true;
1307  nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
1308  cPattern = cPatternAfterAsterisk;
1309  bEscaped = bEscapedAfterAsterisk;
1310  searchStr.iterateCodePoints( &nStr);
1311  nString = nStr; // Restore incremented remembered string position.
1312  if (nPat == nAfterFakePattern)
1313  {
1314  // Next start offset will be the next character.
1315  nStartOffset = nString;
1316  }
1317  }
1318  else
1319  {
1320  // An unescaped '?' pattern matched any character, or characters
1321  // matched. Reset only escaped state.
1322  bEscaped = false;
1323  }
1324  }
1325  while (nString < nEndPos);
1326 
1327  if (bRewind)
1328  return aRes;
1329 
1330  // Eat trailing '*' pattern that matches anything, including nothing.
1331  // '*' is one code unit, so not using iterateCodePoints() is ok.
1332  while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1333  ++nPattern;
1334 
1335  if (nPattern == nPatternLen)
1336  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1337  return aRes;
1338 }
1339 
1340 SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1341 {
1342  SearchResult aRes;
1343  aRes.subRegExpressions = 0; // no match
1344 
1345  sal_Int32 nStartOffset = nStartPos;
1346  sal_Int32 nEndOffset = nEndPos;
1347 
1348  const sal_Int32 nStringLen = searchStr.getLength();
1349 
1350  // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1351  // string match with (0,0].
1352  if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
1353  (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1354  return aRes;
1355 
1356  const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1357  sal_Int32 nPatternLen = rPattern.getLength();
1358 
1359  // Handle special cases empty pattern and/or string outside of the loop to
1360  // not add performance penalties there and simplify.
1361  if (nStartPos == nEndPos)
1362  {
1363  sal_Int32 i = 0;
1364  while (i < nPatternLen && rPattern[i] == '*')
1365  ++i;
1366  if (i == nPatternLen)
1367  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1368  return aRes;
1369  }
1370 
1371  // Empty pattern does not match any non-empty string.
1372  if (!nPatternLen)
1373  return aRes;
1374 
1375  // Reverse escaped patterns to ease the handling of escapes, keeping escape
1376  // and following character as one sequence in backward direction.
1377  if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
1379  {
1380  OUStringBuffer aPatternBuf( rPattern);
1381  sal_Int32 nIndex = 0;
1382  while (nIndex < nPatternLen)
1383  {
1384  const sal_Int32 nOld = nIndex;
1385  const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
1386  if (cu == mcWildcardEscapeChar)
1387  {
1388  if (nIndex < nPatternLen)
1389  {
1390  if (nIndex - nOld == 1)
1391  {
1392  // Simply move code units, we already memorized the one
1393  // in 'cu'.
1394  const sal_Int32 nOld2 = nIndex;
1395  rPattern.iterateCodePoints( &nIndex);
1396  for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1397  aPatternBuf[nOld+i] = rPattern[nOld2+i];
1398  aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
1399  }
1400  else
1401  {
1402  // Copy the escape character code units first in the
1403  // unlikely case that it would not be of BMP.
1404  assert(nIndex - nOld == 2); // it's UTF-16, so...
1405  sal_Unicode buf[2];
1406  buf[0] = rPattern[nOld];
1407  buf[1] = rPattern[nOld+1];
1408  const sal_Int32 nOld2 = nIndex;
1409  rPattern.iterateCodePoints( &nIndex);
1410  for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1411  aPatternBuf[nOld+i] = rPattern[nOld2+i];
1412  aPatternBuf[nIndex-2] = buf[0];
1413  aPatternBuf[nIndex-1] = buf[1];
1414  }
1415  }
1416  else
1417  {
1418  // Trailing escape would become leading escape, do what?
1419  // Eliminate.
1420  aPatternBuf.remove( nOld, nIndex - nOld);
1421  }
1422  }
1423  }
1424  if (bUsePrimarySrchStr)
1425  maWildcardReversePattern = aPatternBuf.makeStringAndClear();
1426  else
1427  maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
1428  }
1429  const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
1430  nPatternLen = rReversePattern.getLength();
1431 
1432  bool bRewind = false;
1433  sal_uInt32 cPattern = 0;
1434  sal_Int32 nPattern = nPatternLen;
1435  sal_Int32 nAfterFakePattern = nPattern;
1437  {
1438  // Fake a trailing '*' wildcard.
1439  cPattern = '*';
1440  bRewind = true;
1441  // Assume a non-'*' pattern character follows. If it is a '*' instead
1442  // that will be handled in the loop by setting nPat.
1443  sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1444  if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
1445  rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1446  }
1447 
1448  sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1449  sal_uInt32 cPatternAfterAsterisk = 0;
1450  bool bEscaped = false, bEscapedAfterAsterisk = false;
1451 
1452  // The loop code tries to avoid costly calls to iterateCodePoints() when
1453  // possible.
1454 
1455  do
1456  {
1457  if (bRewind)
1458  {
1459  // Reuse cPattern after '*', nPattern was correspondingly
1460  // decremented to point before cPattern.
1461  bRewind = false;
1462  }
1463  else if (nPattern > 0)
1464  {
1465  // nPattern will be decremented by iterateCodePoints().
1466  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1467  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1468  {
1469  bEscaped = true;
1470  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1471  }
1472  }
1473  else
1474  {
1475  // A trailing '*' is handled below.
1477  {
1478  // If the pattern is consumed and substring match allowed we're good.
1479  setWildcardMatch( aRes, nStartOffset, nString);
1480  return aRes;
1481  }
1482  else if (nString > nEndPos && nLastAsterisk >= 0)
1483  {
1484  // If substring match is not allowed try a greedy '*' match.
1485  nPattern = nLastAsterisk;
1486  continue; // do
1487  }
1488  else
1489  return aRes;
1490  }
1491 
1492  if (cPattern == '*' && !bEscaped)
1493  {
1494  // '*' is one code unit, so not using iterateCodePoints() is ok.
1495  while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1496  --nPattern;
1497 
1498  if (nPattern <= 0)
1499  {
1500  // First pattern is '*', remaining string matches.
1501  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1502  return aRes;
1503  }
1504 
1505  nLastAsterisk = nPattern; // Remember last encountered '*'.
1506 
1507  // cPattern will be the previous non-'*' character, nPattern
1508  // decremented.
1509  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1510  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1511  {
1512  bEscaped = true;
1513  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1514  }
1515 
1516  cPatternAfterAsterisk = cPattern;
1517  bEscapedAfterAsterisk = bEscaped;
1518  nPat = nPattern; // Remember position of pattern before '*', already decremented.
1519  nStr = nString; // Remember the current string to be matched.
1520  }
1521 
1522  if (nString <= nEndPos)
1523  // Whatever leads in pattern, string will not match.
1524  return aRes;
1525 
1526  // nString will be decremented by iterateCodePoints().
1527  sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
1528 
1529  if ((cPattern != '?' || bEscaped) && cPattern != cString)
1530  {
1531  if (nPat == -1)
1532  // Non-match already without any '*' pattern.
1533  return aRes;
1534 
1535  bRewind = true;
1536  nPattern = nPat; // Rewind pattern to character before '*', already decremented.
1537  cPattern = cPatternAfterAsterisk;
1538  bEscaped = bEscapedAfterAsterisk;
1539  searchStr.iterateCodePoints( &nStr, -1);
1540  nString = nStr; // Restore decremented remembered string position.
1541  if (nPat == nAfterFakePattern)
1542  {
1543  // Next start offset will be this character (exclusive).
1544  nStartOffset = nString;
1545  }
1546  }
1547  else
1548  {
1549  // An unescaped '?' pattern matched any character, or characters
1550  // matched. Reset only escaped state.
1551  bEscaped = false;
1552  }
1553  }
1554  while (nString > nEndPos);
1555 
1556  if (bRewind)
1557  return aRes;
1558 
1559  // Eat leading '*' pattern that matches anything, including nothing.
1560  // '*' is one code unit, so not using iterateCodePoints() is ok.
1561  while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1562  --nPattern;
1563 
1564  if (nPattern == 0)
1565  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1566  return aRes;
1567 }
1568 
1569 
1570 OUString SAL_CALL
1572 {
1573  return "com.sun.star.util.TextSearch_i18n";
1574 }
1575 
1576 sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
1577 {
1578  return cppu::supportsService(this, rServiceName);
1579 }
1580 
1581 Sequence< OUString > SAL_CALL
1583 {
1584  return { "com.sun.star.util.TextSearch", "com.sun.star.util.TextSearch2" };
1585 }
1586 
1587 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
1589  css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
1590 {
1591  return cppu::acquire(new TextSearch(context));
1592 }
1593 
1594 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
std::unique_ptr< icu::RegexMatcher > pRegexMatcher
Definition: textsearch.hxx:94
sal_Int32 nIndex
bool bUsePrimarySrchStr
Definition: textsearch.hxx:78
virtual void SAL_CALL setOptions2(const css::util::SearchOptions2 &options) override
Definition: textsearch.cxx:117
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * i18npool_TextSearch_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
css::util::SearchOptions2 aSrchPara
Definition: textsearch.hxx:54
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit
Definition: textsearch.hxx:60
osl::Mutex m_aMutex
Definition: textsearch.hxx:51
sal_Int64 n
OUString sSrchStr2
Definition: textsearch.hxx:56
bool checkCTLEnd
Definition: textsearch.hxx:134
sal_uInt32 mcWildcardEscapeChar
Definition: textsearch.hxx:121
void MakeForwardTab()
Definition: textsearch.cxx:590
const TransliterationFlags COMPLEX_TRANS_MASK
Definition: textsearch.cxx:45
css::util::SearchResult SAL_CALL RESrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:999
virtual void SAL_CALL setOptions(const css::util::SearchOptions &options) override
Definition: textsearch.cxx:265
void MakeBackwardTab2()
Definition: textsearch.cxx:664
sal_Int32 GetDiff(const sal_Unicode) const
Definition: textsearch.cxx:687
void MakeBackwardTab()
Definition: textsearch.cxx:641
OUString maWildcardReversePattern2
Definition: textsearch.hxx:120
static sal_Int32 FindPosInSeq_Impl(const Sequence< sal_Int32 > &rOff, sal_Int32 nPos)
Definition: textsearch.cxx:303
sal_uInt16 sal_Unicode
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
virtual ~TextSearch() override
Definition: textsearch.cxx:109
bool checkCTLStart
Definition: textsearch.hxx:134
bool bSearchApostrophe
Definition: textsearch.hxx:72
TransliterationFlags
css::util::SearchResult SAL_CALL WildcardSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
css::util::SearchResult SAL_CALL ApproxSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
TextSearch(const css::uno::Reference< css::uno::XComponentContext > &rxContext)
virtual css::util::SearchResult SAL_CALL searchBackward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:452
void RESrchPrepare(const css::util::SearchOptions2 &)
Definition: textsearch.cxx:858
#define DIGIT
bool IsDelimiter(const OUString &rStr, sal_Int32 nPos) const
Definition: textsearch.cxx:571
css::uno::Reference< css::i18n::XCharacterClassification > xCharClass
Definition: textsearch.hxx:58
int i
css::util::SearchResult SAL_CALL NSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:777
static bool lcl_findRegex(std::unique_ptr< icu::RegexMatcher > const &pRegexMatcher, sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode &rIcuErr)
Definition: textsearch.cxx:926
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual css::util::SearchResult SAL_CALL searchForward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:317
FnSrch fnBackward
Definition: textsearch.hxx:69
float u
unsigned char sal_Bool
css::util::SearchResult SAL_CALL ApproxSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
bool bIsForwardTab
Definition: textsearch.hxx:77
OUString sSrchStr
Definition: textsearch.hxx:55
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:133
OUString maWildcardReversePattern
Definition: textsearch.hxx:119
U_ICU_NAMESPACE::UnicodeString IcuUniString
Definition: textsearch.hxx:40
float v
css::uno::Reference< css::i18n::XBreakIterator > xBreak
Definition: textsearch.hxx:108
css::util::SearchResult SAL_CALL RESrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:947
sal_Int32 startPos
Definition: textsearch.hxx:66
bool mbWildcardAllowSubstring
Definition: textsearch.hxx:122
bool isCellStart(const OUString &searchStr, sal_Int32 nPos)
Definition: textsearch.cxx:310
css::util::SearchResult SAL_CALL NSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:707
std::unique_ptr< TextSearchJumpTable > pJumpTable2
Definition: textsearch.hxx:76
#define SAL_INFO(area, stream)
std::unique_ptr< WLevDistance > pWLD
Definition: textsearch.hxx:107
std::unique_ptr< TextSearchJumpTable > pJumpTable
Definition: textsearch.hxx:75
FnSrch fnForward
Definition: textsearch.hxx:68
virtual OUString SAL_CALL getImplementationName() override
void MakeForwardTab2()
Definition: textsearch.cxx:616
sal_Int32 sal_Int32 endPos
Definition: textsearch.hxx:66
#define SAL_WARN(area, stream)
Reference< XComponentContext > m_xContext
tuple m
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit2
Definition: textsearch.hxx:61
::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable
Definition: textsearch.hxx:42
css::uno::Reference< css::uno::XComponentContext > m_xContext
Definition: textsearch.hxx:52
css::util::SearchResult SAL_CALL WildcardSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)