LibreOffice Module i18npool (master)  1
textsearch.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include "textsearch.hxx"
21 #include "levdis.hxx"
22 #include <com/sun/star/i18n/BreakIterator.hpp>
23 #include <com/sun/star/util/SearchAlgorithms2.hpp>
24 #include <com/sun/star/util/SearchFlags.hpp>
25 #include <com/sun/star/i18n/WordType.hpp>
26 #include <com/sun/star/i18n/ScriptType.hpp>
27 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
28 #include <com/sun/star/i18n/CharacterClassification.hpp>
29 #include <com/sun/star/i18n/KCharacterType.hpp>
30 #include <com/sun/star/i18n/Transliteration.hpp>
32 #include <cppuhelper/weak.hxx>
34 #include <rtl/ustrbuf.hxx>
35 #include <sal/log.hxx>
36 
37 #include <unicode/regex.h>
38 
39 using namespace ::com::sun::star::util;
40 using namespace ::com::sun::star::uno;
41 using namespace ::com::sun::star::lang;
42 using namespace ::com::sun::star::i18n;
43 using namespace ::com::sun::star;
44 
46  TransliterationFlags::ignoreBaFa_ja_JP |
47  TransliterationFlags::ignoreIterationMark_ja_JP |
48  TransliterationFlags::ignoreTiJi_ja_JP |
49  TransliterationFlags::ignoreHyuByu_ja_JP |
50  TransliterationFlags::ignoreSeZe_ja_JP |
51  TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
52  TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
53  TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
54 
55 namespace
56 {
58 {
59  // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
60  // in complex transliteration.
61  return
62  n & (COMPLEX_TRANS_MASK | // all set ignore bits
63  TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit
64  TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
65 }
66 
67 bool isComplexTrans( TransliterationFlags n )
68 {
69  return bool(n & COMPLEX_TRANS_MASK);
70 }
71 
72 TransliterationFlags maskSimpleTrans( TransliterationFlags n )
73 {
74  return n & ~COMPLEX_TRANS_MASK;
75 }
76 
77 bool isSimpleTrans( TransliterationFlags n )
78 {
79  return bool(maskSimpleTrans(n));
80 }
81 
82 // Regex patterns are case sensitive.
83 TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
84 {
85  TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
86  TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
87  if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
88  v = TransliterationFlags::NONE;
89  return (m | v) & ~COMPLEX_TRANS_MASK;
90 }
91 
92 bool isSimpleRegexTrans( TransliterationFlags n )
93 {
94  return bool(maskSimpleRegexTrans(n));
95 }
96 };
97 
99  : m_xContext( rxContext )
100 {
101  SearchOptions2 aOpt;
102  aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
103  aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
104  aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
105  //aOpt.Locale = ???;
106  setOptions( aOpt );
107 }
108 
110 {
111  pRegexMatcher.reset();
112  pWLD.reset();
113  pJumpTable.reset();
114  pJumpTable2.reset();
115 }
116 
117 void TextSearch::setOptions2( const SearchOptions2& rOptions )
118 {
119  std::unique_lock g(m_aMutex);
120 
121  aSrchPara = rOptions;
122 
123  pRegexMatcher.reset();
124  pWLD.reset();
125  pJumpTable.reset();
126  pJumpTable2.reset();
127  maWildcardReversePattern.clear();
129  TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
130  bSearchApostrophe = false;
131  bool bReplaceApostrophe = false;
132  if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
133  {
134  // RESrchPrepare will consider aSrchPara.transliterateFlags when
135  // picking the actual regex pattern
136  // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
137  // case-insensitivity. Create transliteration instance, if any, without
138  // ignore-case so later in TextSearch::searchForward() the string to
139  // match is not case-altered, leave case-(in)sensitive to regex engine.
140  transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
141  }
142  else if ( aSrchPara.searchString.indexOf('\'') > - 1 )
143  {
144  bSearchApostrophe = true;
145  bReplaceApostrophe = aSrchPara.searchString.indexOf(u'\u2019') > -1;
146  }
147 
148  // Create Transliteration class
149  if( isSimpleTrans( transliterateFlags) )
150  {
151  if( !xTranslit.is() )
152  xTranslit.set( Transliteration::create( m_xContext ) );
153  xTranslit->loadModule(
154  static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
155  aSrchPara.Locale);
156  }
157  else if( xTranslit.is() )
158  xTranslit = nullptr;
159 
160  // Create Transliteration for 2<->1, 2<->2 transliteration
161  if ( isComplexTrans( transliterateFlags) )
162  {
163  if( !xTranslit2.is() )
164  xTranslit2.set( Transliteration::create( m_xContext ) );
165  // Load transliteration module
166  xTranslit2->loadModule(
167  static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
168  aSrchPara.Locale);
169  }
170 
171  if ( !xBreak.is() )
172  xBreak = css::i18n::BreakIterator::create( m_xContext );
173 
174  sSrchStr = aSrchPara.searchString;
175 
176  // Transliterate search string.
177  if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
178  {
179  if (isSimpleRegexTrans(transliterateFlags))
180  {
181  if (maskSimpleRegexTrans(transliterateFlags) !=
182  maskSimpleTrans(transliterateFlags))
183  {
184  css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
185  Transliteration::create( m_xContext ));
186  if (xTranslitPattern.is())
187  {
188  xTranslitPattern->loadModule(
189  static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
190  aSrchPara.Locale);
191  sSrchStr = xTranslitPattern->transliterateString2String(
192  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
193  }
194  }
195  else
196  {
197  if (xTranslit.is())
198  sSrchStr = xTranslit->transliterateString2String(
199  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
200  }
201  // xTranslit2 complex transliterated sSrchStr2 is not used in
202  // regex, see TextSearch::searchForward() and
203  // TextSearch::searchBackward()
204  }
205  }
206  else
207  {
208  if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
209  sSrchStr = xTranslit->transliterateString2String(
210  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
211 
212  if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
213  sSrchStr2 = xTranslit2->transliterateString2String(
214  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
215  }
216 
217  // When start or end of search string is a complex script type, we need to
218  // make sure the result boundary is not located in the middle of cell.
219  checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
220  ScriptType::COMPLEX));
221  checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
222  sSrchStr.getLength()-1) == ScriptType::COMPLEX));
223 
224  if ( bReplaceApostrophe )
225  sSrchStr = sSrchStr.replace(u'\u2019', '\'');
226 
227  // Take the new SearchOptions2::AlgorithmType2 field and ignore
228  // SearchOptions::algorithmType
229  switch( aSrchPara.AlgorithmType2)
230  {
231  case SearchAlgorithms2::REGEXP:
235  break;
236 
237  case SearchAlgorithms2::APPROXIMATE:
240 
241  pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
242  aSrchPara.insertedChars, aSrchPara.deletedChars,
243  0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
244 
245  nLimit = pWLD->GetLimit();
246  break;
247 
248  case SearchAlgorithms2::WILDCARD:
249  mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
250  mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
253  break;
254 
255  default:
256  SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
257  [[fallthrough]];
258  case SearchAlgorithms2::ABSOLUTE:
261  break;
262  }
263 }
264 
265 void TextSearch::setOptions( const SearchOptions& rOptions )
266 {
267  sal_Int16 nAlgorithmType2;
268  switch (rOptions.algorithmType)
269  {
270  case SearchAlgorithms_REGEXP:
271  nAlgorithmType2 = SearchAlgorithms2::REGEXP;
272  break;
273  case SearchAlgorithms_APPROXIMATE:
274  nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
275  break;
276  default:
277  SAL_WARN("i18npool","TextSearch::setOptions - default what?");
278  [[fallthrough]];
279  case SearchAlgorithms_ABSOLUTE:
280  nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
281  break;
282  }
283  // It would be nice if an inherited struct had a ctor that takes an
284  // instance of the object the struct derived from...
285  SearchOptions2 aOptions2(
286  rOptions.algorithmType,
287  rOptions.searchFlag,
288  rOptions.searchString,
289  rOptions.replaceString,
290  rOptions.Locale,
291  rOptions.changedChars,
292  rOptions.deletedChars,
293  rOptions.insertedChars,
294  rOptions.transliterateFlags,
295  nAlgorithmType2,
296  0 // no wildcard search, no escape character...
297  );
298  setOptions2( aOptions2);
299 }
300 
301 static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
302 {
303  auto pOff = std::find_if(rOff.begin(), rOff.end(),
304  [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
305  return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
306 }
307 
308 bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
309 {
310  sal_Int32 nDone;
311  return nPos == xBreak->previousCharacters(searchStr, nPos+1,
312  aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
313 }
314 
315 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
316 {
317  std::unique_lock g(m_aMutex);
318 
319  SearchResult sres;
320 
321  OUString in_str(searchStr);
322 
323  // in non-regex mode, allow searching typographical apostrophe with the ASCII one
324  // to avoid regression after using automatic conversion to U+2019 during typing in Writer
325  bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
326 
327  bUsePrimarySrchStr = true;
328 
329  if ( xTranslit.is() )
330  {
331  // apply normal transliteration (1<->1, 1<->0)
332 
333  sal_Int32 nInStartPos = startPos;
334  if (pRegexMatcher && startPos > 0)
335  {
336  // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
337  // transliterate enough of the leading text to allow sensible look-behind assertions.
338  // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
339  // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
340  // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
341  // the middle of a surrogate pair, preceded by another surrogate pair.
342  const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
343  nInStartPos -= std::min(nMaxLeadingLen, startPos);
344  }
345  sal_Int32 nInEndPos = endPos;
346  if (pRegexMatcher && endPos < searchStr.getLength())
347  {
348  // tdf#65038: ditto for look-ahead assertions
349  const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
350  nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
351  }
352 
353  css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
354  in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
355 
356  if ( bReplaceApostrophe )
357  in_str = in_str.replace(u'\u2019', '\'');
358 
359  // JP 20.6.2001: also the start and end positions must be corrected!
360  sal_Int32 newStartPos =
361  (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
362 
363  sal_Int32 newEndPos = (endPos < searchStr.getLength())
364  ? FindPosInSeq_Impl( offset, endPos )
365  : in_str.getLength();
366 
367  sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
368 
369  // Map offsets back to untransliterated string.
370  const sal_Int32 nOffsets = offset.getLength();
371  if (nOffsets)
372  {
373  auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
374  auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
375  // For regex nGroups is the number of groups+1 with group 0 being
376  // the entire match.
377  const sal_Int32 nGroups = sres.startOffset.getLength();
378  for ( sal_Int32 k = 0; k < nGroups; k++ )
379  {
380  const sal_Int32 nStart = sres.startOffset[k];
381  // Result offsets are negative (-1) if a group expression was
382  // not matched.
383  if (nStart >= 0)
384  sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
385  // JP 20.6.2001: end is ever exclusive and then don't return
386  // the position of the next character - return the
387  // next position behind the last found character!
388  // "a b c" find "b" must return 2,3 and not 2,4!!!
389  const sal_Int32 nStop = sres.endOffset[k];
390  if (nStop >= 0)
391  {
392  if (nStop > 0)
393  sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
394  else
395  sres_endOffsetRange[k] = offset[0];
396  }
397  }
398  }
399  }
400  else
401  {
402  if ( bReplaceApostrophe )
403  in_str = in_str.replace(u'\u2019', '\'');
404 
405  sres = (this->*fnForward)( in_str, startPos, endPos );
406  }
407 
408  if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
409  {
410  SearchResult sres2;
411 
412  in_str = searchStr;
413  css::uno::Sequence <sal_Int32> offset( in_str.getLength());
414 
415  in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
416 
417  if( startPos )
418  startPos = FindPosInSeq_Impl( offset, startPos );
419 
420  if( endPos < searchStr.getLength() )
421  endPos = FindPosInSeq_Impl( offset, endPos );
422  else
423  endPos = in_str.getLength();
424 
425  bUsePrimarySrchStr = false;
426  sres2 = (this->*fnForward)( in_str, startPos, endPos );
427  auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
428  auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
429 
430  for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
431  {
432  if (sres2.startOffset[k])
433  sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
434  if (sres2.endOffset[k])
435  sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
436  }
437 
438  // pick first and long one
439  if ( sres.subRegExpressions == 0)
440  return sres2;
441  if ( sres2.subRegExpressions == 1)
442  {
443  if ( sres.startOffset[0] > sres2.startOffset[0])
444  return sres2;
445  else if ( sres.startOffset[0] == sres2.startOffset[0] &&
446  sres.endOffset[0] < sres2.endOffset[0])
447  return sres2;
448  }
449  }
450 
451  return sres;
452 }
453 
454 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
455 {
456  std::unique_lock g(m_aMutex);
457 
458  SearchResult sres;
459 
460  OUString in_str(searchStr);
461 
462  // in non-regex mode, allow searching typographical apostrophe with the ASCII one
463  // to avoid regression after using automatic conversion to U+2019 during typing in Writer
464  bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
465 
466  bUsePrimarySrchStr = true;
467 
468  if ( xTranslit.is() )
469  {
470  // apply only simple 1<->1 transliteration here
471  css::uno::Sequence<sal_Int32> offset(startPos - endPos);
472  in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
473 
474  if ( bReplaceApostrophe )
475  in_str = in_str.replace(u'\u2019', '\'');
476 
477  // JP 20.6.2001: also the start and end positions must be corrected!
478  sal_Int32 const newStartPos = (startPos < searchStr.getLength())
479  ? FindPosInSeq_Impl( offset, startPos )
480  : in_str.getLength();
481 
482  sal_Int32 const newEndPos =
483  (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
484 
485  // TODO: this would need nExtraOffset handling to avoid $ matching
486  // if (pRegexMatcher && startPos < searchStr.getLength())
487  // but that appears to be impossible with ICU regex
488 
489  sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
490 
491  // Map offsets back to untransliterated string.
492  const sal_Int32 nOffsets = offset.getLength();
493  if (nOffsets)
494  {
495  auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
496  auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
497  // For regex nGroups is the number of groups+1 with group 0 being
498  // the entire match.
499  const sal_Int32 nGroups = sres.startOffset.getLength();
500  for ( sal_Int32 k = 0; k < nGroups; k++ )
501  {
502  const sal_Int32 nStart = sres.startOffset[k];
503  // Result offsets are negative (-1) if a group expression was
504  // not matched.
505  if (nStart >= 0)
506  {
507  if (nStart > 0)
508  sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
509  else
510  sres_startOffsetRange[k] = offset[0];
511  }
512  // JP 20.6.2001: end is ever exclusive and then don't return
513  // the position of the next character - return the
514  // next position behind the last found character!
515  // "a b c" find "b" must return 2,3 and not 2,4!!!
516  const sal_Int32 nStop = sres.endOffset[k];
517  if (nStop >= 0)
518  sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
519  }
520  }
521  }
522  else
523  {
524  if ( bReplaceApostrophe )
525  in_str = in_str.replace(u'\u2019', '\'');
526 
527  sres = (this->*fnBackward)( in_str, startPos, endPos );
528  }
529 
530  if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
531  {
532  SearchResult sres2;
533 
534  in_str = searchStr;
535  css::uno::Sequence <sal_Int32> offset( in_str.getLength());
536 
537  in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
538 
539  if( startPos < searchStr.getLength() )
540  startPos = FindPosInSeq_Impl( offset, startPos );
541  else
542  startPos = in_str.getLength();
543 
544  if( endPos )
545  endPos = FindPosInSeq_Impl( offset, endPos );
546 
547  bUsePrimarySrchStr = false;
548  sres2 = (this->*fnBackward)( in_str, startPos, endPos );
549  auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
550  auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
551 
552  for( int k = 0; k < sres2.startOffset.getLength(); k++ )
553  {
554  if (sres2.startOffset[k])
555  sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
556  if (sres2.endOffset[k])
557  sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
558  }
559 
560  // pick last and long one
561  if ( sres.subRegExpressions == 0 )
562  return sres2;
563  if ( sres2.subRegExpressions == 1 )
564  {
565  if ( sres.startOffset[0] < sres2.startOffset[0] )
566  return sres2;
567  if ( sres.startOffset[0] == sres2.startOffset[0] &&
568  sres.endOffset[0] > sres2.endOffset[0] )
569  return sres2;
570  }
571  }
572 
573  return sres;
574 }
575 
576 
577 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
578 {
579  bool bRet = true;
580  if( '\x7f' != rStr[nPos])
581  {
582  if ( !xCharClass.is() )
583  xCharClass = CharacterClassification::create( m_xContext );
584  sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
585  aSrchPara.Locale );
586  if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
587  KCharacterType::LETTER ) & nCType ) )
588  bRet = false;
589  }
590  return bRet;
591 }
592 
593 // --------- helper methods for Boyer-Moore like text searching ----------
594 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
595 
597 {
598  // create the jumptable for the search text
599 
600  if( pJumpTable && bIsForwardTab )
601  {
602  return; // the jumpTable is ok
603  }
604  bIsForwardTab = true;
605 
606  sal_Int32 n, nLen = sSrchStr.getLength();
607  pJumpTable.reset( new TextSearchJumpTable );
608 
609  for( n = 0; n < nLen - 1; ++n )
610  {
611  sal_Unicode cCh = sSrchStr[n];
612  sal_Int32 nDiff = nLen - n - 1;
613  TextSearchJumpTable::value_type aEntry( cCh, nDiff );
614 
615  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
616  pJumpTable->insert( aEntry );
617  if ( !aPair.second )
618  (*(aPair.first)).second = nDiff;
619  }
620 }
621 
623 {
624  // create the jumptable for the search text
625  if( pJumpTable2 && bIsForwardTab )
626  {
627  return; // the jumpTable is ok
628  }
629  bIsForwardTab = true;
630 
631  sal_Int32 n, nLen = sSrchStr2.getLength();
632  pJumpTable2.reset( new TextSearchJumpTable );
633 
634  for( n = 0; n < nLen - 1; ++n )
635  {
636  sal_Unicode cCh = sSrchStr2[n];
637  sal_Int32 nDiff = nLen - n - 1;
638 
639  TextSearchJumpTable::value_type aEntry( cCh, nDiff );
640  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
641  pJumpTable2->insert( aEntry );
642  if ( !aPair.second )
643  (*(aPair.first)).second = nDiff;
644  }
645 }
646 
648 {
649  // create the jumptable for the search text
650  if( pJumpTable && !bIsForwardTab)
651  {
652  return; // the jumpTable is ok
653  }
654  bIsForwardTab = false;
655 
656  sal_Int32 n, nLen = sSrchStr.getLength();
657  pJumpTable.reset( new TextSearchJumpTable );
658 
659  for( n = nLen-1; n > 0; --n )
660  {
661  sal_Unicode cCh = sSrchStr[n];
662  TextSearchJumpTable::value_type aEntry( cCh, n );
663  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
664  pJumpTable->insert( aEntry );
665  if ( !aPair.second )
666  (*(aPair.first)).second = n;
667  }
668 }
669 
671 {
672  // create the jumptable for the search text
673  if( pJumpTable2 && !bIsForwardTab )
674  {
675  return; // the jumpTable is ok
676  }
677  bIsForwardTab = false;
678 
679  sal_Int32 n, nLen = sSrchStr2.getLength();
680  pJumpTable2.reset( new TextSearchJumpTable );
681 
682  for( n = nLen-1; n > 0; --n )
683  {
684  sal_Unicode cCh = sSrchStr2[n];
685  TextSearchJumpTable::value_type aEntry( cCh, n );
686  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
687  pJumpTable2->insert( aEntry );
688  if ( !aPair.second )
689  (*(aPair.first)).second = n;
690  }
691 }
692 
693 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
694 {
695  TextSearchJumpTable *pJump;
696  OUString sSearchKey;
697 
698  if ( bUsePrimarySrchStr ) {
699  pJump = pJumpTable.get();
700  sSearchKey = sSrchStr;
701  } else {
702  pJump = pJumpTable2.get();
703  sSearchKey = sSrchStr2;
704  }
705 
706  TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
707  if ( iLook == pJump->end() )
708  return sSearchKey.getLength();
709  return (*iLook).second;
710 }
711 
712 
713 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
714 {
715  SearchResult aRet;
716  aRet.subRegExpressions = 0;
717 
718  OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
719 
720  sal_Int32 nSuchIdx = searchStr.getLength();
721  sal_Int32 nEnd = endPos;
722  if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
723  return aRet;
724 
725 
726  if( nEnd < sSearchKey.getLength() ) // position inside the search region ?
727  return aRet;
728 
729  nEnd -= sSearchKey.getLength();
730 
731  if (bUsePrimarySrchStr)
732  MakeForwardTab(); // create the jumptable
733  else
734  MakeForwardTab2();
735 
736  for (sal_Int32 nCmpIdx = startPos; // start position for the search
737  nCmpIdx <= nEnd;
738  nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
739  {
740  // if the match would be the completed cells, skip it.
741  if ( (checkCTLStart && !isCellStart( searchStr, nCmpIdx )) || (checkCTLEnd
742  && !isCellStart( searchStr, nCmpIdx + sSearchKey.getLength())) )
743  continue;
744 
745  nSuchIdx = sSearchKey.getLength() - 1;
746  while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
747  {
748  if( nSuchIdx == 0 )
749  {
750  if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
751  {
752  sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
753  bool bAtStart = !nCmpIdx;
754  bool bAtEnd = nFndEnd == endPos;
755  bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
756  bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd );
757  // * 1 -> only one word in the paragraph
758  // * 2 -> at begin of paragraph
759  // * 3 -> at end of paragraph
760  // * 4 -> inside the paragraph
761  if( !( ( bAtStart && bAtEnd ) || // 1
762  ( bAtStart && bDelimBehind ) || // 2
763  ( bAtEnd && bDelimBefore ) || // 3
764  ( bDelimBefore && bDelimBehind ))) // 4
765  break;
766  }
767 
768  aRet.subRegExpressions = 1;
769  aRet.startOffset = { nCmpIdx };
770  aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };
771 
772  return aRet;
773  }
774  else
775  nSuchIdx--;
776  }
777  }
778  return aRet;
779 }
780 
781 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
782 {
783  SearchResult aRet;
784  aRet.subRegExpressions = 0;
785 
786  OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
787 
788  sal_Int32 nSuchIdx = searchStr.getLength();
789  sal_Int32 nEnd = endPos;
790  if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
791  return aRet;
792 
793  if (bUsePrimarySrchStr)
794  MakeBackwardTab(); // create the jumptable
795  else
797 
798  if( nEnd == nSuchIdx ) // end position for the search
799  nEnd = sSearchKey.getLength();
800  else
801  nEnd += sSearchKey.getLength();
802 
803  sal_Int32 nCmpIdx = startPos; // start position for the search
804 
805  while (nCmpIdx >= nEnd)
806  {
807  // if the match would be the completed cells, skip it.
808  if ( (!checkCTLStart || isCellStart( searchStr, nCmpIdx -
809  sSearchKey.getLength() )) && (!checkCTLEnd ||
810  isCellStart( searchStr, nCmpIdx)))
811  {
812  nSuchIdx = 0;
813  while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
814  searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
815  nSuchIdx++;
816  if( nSuchIdx >= sSearchKey.getLength() )
817  {
818  if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
819  {
820  sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
821  bool bAtStart = !nFndStt;
822  bool bAtEnd = nCmpIdx == startPos;
823  bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
824  bool bDelimBefore = bAtStart || // begin of paragraph
825  IsDelimiter( searchStr, nFndStt-1 );
826  // * 1 -> only one word in the paragraph
827  // * 2 -> at begin of paragraph
828  // * 3 -> at end of paragraph
829  // * 4 -> inside the paragraph
830  if( ( bAtStart && bAtEnd ) || // 1
831  ( bAtStart && bDelimBehind ) || // 2
832  ( bAtEnd && bDelimBefore ) || // 3
833  ( bDelimBefore && bDelimBehind )) // 4
834  {
835  aRet.subRegExpressions = 1;
836  aRet.startOffset = { nCmpIdx };
837  aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
838  return aRet;
839  }
840  }
841  else
842  {
843  aRet.subRegExpressions = 1;
844  aRet.startOffset = { nCmpIdx };
845  aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
846  return aRet;
847  }
848  }
849  }
850  nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
851  if( nCmpIdx < nSuchIdx )
852  return aRet;
853  nCmpIdx -= nSuchIdx;
854  }
855  return aRet;
856 }
857 
858 void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
859 {
860  TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
861  // select the transliterated pattern string
862  const OUString& rPatternStr =
863  (isSimpleTrans(transliterateFlags) ? sSrchStr
864  : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
865 
866  sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
867  // map css::util::SearchFlags to ICU uregex.h flags
868  // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
869  // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
870  // REG_NOSUB is not used anywhere => not implemented
871  // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
872  // LEV_RELAXED is only used for SearchAlgorithm==Approximate
873  // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
874  // probably because the transliteration flag IGNORE_CASE handles it as well.
875  if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
876  || (transliterateFlags & TransliterationFlags::IGNORE_CASE))
877  nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
878  UErrorCode nIcuErr = U_ZERO_ERROR;
879  // assumption: transliteration didn't mangle regexp control chars
880  icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
881 #ifndef DISABLE_WORDBOUND_EMULATION
882  // for convenience specific syntax elements of the old regex engine are emulated
883  // - by replacing < with "word-break followed by a look-ahead word-char"
884  static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
885  static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
886  static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
887  aChevronMatcherB.reset( aIcuSearchPatStr);
888  aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
889  aChevronMatcherB.reset();
890  // - by replacing > with "look-behind word-char followed by a word-break"
891  static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
892  static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
893  static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
894  aChevronMatcherE.reset( aIcuSearchPatStr);
895  aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
896  aChevronMatcherE.reset();
897 #endif
898  pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
899  if (nIcuErr)
900  {
901  SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
902  pRegexMatcher.reset();
903  }
904  else
905  {
906  // Pathological patterns may result in exponential run time making the
907  // application appear to be frozen. Limit that. Documentation for this
908  // call says
909  // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
910  // "The units of the limit are steps of the match engine.
911  // Correspondence with actual processor time will depend on the speed
912  // of the processor and the details of the specific pattern, but will
913  // typically be on the order of milliseconds."
914  // Just what is a good value? 42 is always an answer ... the 23 enigma
915  // as well... which on the dev's machine is roughly 50 seconds with the
916  // pattern of fdo#70627.
917  /* TODO: make this a configuration settable value and possibly take
918  * complexity of expression into account and maybe even length of text
919  * to be matched; currently (2013-11-25) that is at most one 64k
920  * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
921  pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
922  }
923 }
924 
925 
926 static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
927  sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
928 {
929  pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
930  pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
931  pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
932  // look-ahead/behind assertions
933 
934  if (!pRegexMatcher->find(rIcuErr))
935  {
936  /* TODO: future versions could pass the UErrorCode or translations
937  * thereof to the caller, for example to inform the user of
938  * U_REGEX_TIME_OUT. The strange thing though is that an error is set
939  * only after the second call that returns immediately and not if
940  * timeout occurred on the first call?!? */
941  SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
942  return false;
943  }
944  return true;
945 }
946 
947 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
948  sal_Int32 startPos, sal_Int32 endPos )
949 {
950  SearchResult aRet;
951  aRet.subRegExpressions = 0;
952  if( !pRegexMatcher)
953  return aRet;
954 
955  if( endPos > searchStr.getLength())
956  endPos = searchStr.getLength();
957 
958  // use the ICU RegexMatcher to find the matches
959  UErrorCode nIcuErr = U_ZERO_ERROR;
960  const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
961  searchStr.getLength());
962  pRegexMatcher->reset( aSearchTargetStr);
963  // search until there is a valid match
964  for(;;)
965  {
966  if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
967  return aRet;
968 
969  // #i118887# ignore zero-length matches e.g. "a*" in "bc"
970  int nStartOfs = pRegexMatcher->start( nIcuErr);
971  int nEndOfs = pRegexMatcher->end( nIcuErr);
972  if( nStartOfs < nEndOfs)
973  break;
974  // If the zero-length match is behind the string, do not match it again
975  // and again until startPos reaches there. A match behind the string is
976  // a "$" anchor.
977  if (nStartOfs == endPos)
978  break;
979  // try at next position if there was a zero-length match
980  if( ++startPos >= endPos)
981  return aRet;
982  }
983 
984  // extract the result of the search
985  const int nGroupCount = pRegexMatcher->groupCount();
986  aRet.subRegExpressions = nGroupCount + 1;
987  aRet.startOffset.realloc( aRet.subRegExpressions);
988  auto pstartOffset = aRet.startOffset.getArray();
989  aRet.endOffset.realloc( aRet.subRegExpressions);
990  auto pendOffset = aRet.endOffset.getArray();
991  pstartOffset[0] = pRegexMatcher->start( nIcuErr);
992  pendOffset[0] = pRegexMatcher->end( nIcuErr);
993  for( int i = 1; i <= nGroupCount; ++i) {
994  pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
995  pendOffset[i] = pRegexMatcher->end( i, nIcuErr);
996  }
997 
998  return aRet;
999 }
1000 
1001 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
1002  sal_Int32 startPos, sal_Int32 endPos )
1003 {
1004  // NOTE: for backwards search callers provide startPos/endPos inverted!
1005  SearchResult aRet;
1006  aRet.subRegExpressions = 0;
1007  if( !pRegexMatcher)
1008  return aRet;
1009 
1010  if( startPos > searchStr.getLength())
1011  startPos = searchStr.getLength();
1012 
1013  // use the ICU RegexMatcher to find the matches
1014  // TODO: use ICU's backward searching once it becomes available
1015  // as its replacement using forward search is not as good as the real thing
1016  UErrorCode nIcuErr = U_ZERO_ERROR;
1017  const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
1018  searchStr.getLength());
1019  pRegexMatcher->reset( aSearchTargetStr);
1020  if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
1021  return aRet;
1022 
1023  // find the last match
1024  int nLastPos = 0;
1025  int nFoundEnd = 0;
1026  int nGoodPos = 0, nGoodEnd = 0;
1027  bool bFirst = true;
1028  do {
1029  nLastPos = pRegexMatcher->start( nIcuErr);
1030  nFoundEnd = pRegexMatcher->end( nIcuErr);
1031  if (nLastPos < nFoundEnd)
1032  {
1033  // remember last non-zero-length match
1034  nGoodPos = nLastPos;
1035  nGoodEnd = nFoundEnd;
1036  }
1037  if( nFoundEnd >= startPos)
1038  break;
1039  bFirst = false;
1040  if( nFoundEnd == nLastPos)
1041  ++nFoundEnd;
1042  } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
1043 
1044  // Ignore all zero-length matches except "$" anchor on first match.
1045  if (nGoodPos == nGoodEnd)
1046  {
1047  if (bFirst && nLastPos == startPos)
1048  nGoodPos = nLastPos;
1049  else
1050  return aRet;
1051  }
1052 
1053  // find last match again to get its details
1054  lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
1055 
1056  // fill in the details of the last match
1057  const int nGroupCount = pRegexMatcher->groupCount();
1058  aRet.subRegExpressions = nGroupCount + 1;
1059  aRet.startOffset.realloc( aRet.subRegExpressions);
1060  auto pstartOffset = aRet.startOffset.getArray();
1061  aRet.endOffset.realloc( aRet.subRegExpressions);
1062  auto pendOffset = aRet.endOffset.getArray();
1063  // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1064  pstartOffset[0] = pRegexMatcher->end( nIcuErr);
1065  pendOffset[0] = pRegexMatcher->start( nIcuErr);
1066  for( int i = 1; i <= nGroupCount; ++i) {
1067  pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
1068  pendOffset[i] = pRegexMatcher->start( i, nIcuErr);
1069  }
1070 
1071  return aRet;
1072 }
1073 
1074 
1075 // search for words phonetically
1076 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
1077  sal_Int32 startPos, sal_Int32 endPos )
1078 {
1079  SearchResult aRet;
1080  aRet.subRegExpressions = 0;
1081 
1082  if( !xBreak.is() )
1083  return aRet;
1084 
1085  sal_Int32 nStt, nEnd;
1086 
1087  Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1088  aSrchPara.Locale,
1089  WordType::ANYWORD_IGNOREWHITESPACES, true );
1090 
1091  do
1092  {
1093  if( aWBnd.startPos >= endPos )
1094  break;
1095  nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
1096  nEnd = std::min(aWBnd.endPos, endPos);
1097 
1098  if( nStt < nEnd &&
1099  pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1100  {
1101  aRet.subRegExpressions = 1;
1102  aRet.startOffset = { nStt };
1103  aRet.endOffset = { nEnd };
1104  break;
1105  }
1106 
1107  nStt = nEnd - 1;
1108  aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
1109  WordType::ANYWORD_IGNOREWHITESPACES);
1110  } while( aWBnd.startPos != aWBnd.endPos ||
1111  (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
1112  // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1113  // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1114  // and nextWord() does also => don't loop forever.
1115  return aRet;
1116 }
1117 
1118 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
1119  sal_Int32 startPos, sal_Int32 endPos )
1120 {
1121  SearchResult aRet;
1122  aRet.subRegExpressions = 0;
1123 
1124  if( !xBreak.is() )
1125  return aRet;
1126 
1127  sal_Int32 nStt, nEnd;
1128 
1129  Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1130  aSrchPara.Locale,
1131  WordType::ANYWORD_IGNOREWHITESPACES, true );
1132 
1133  do
1134  {
1135  if( aWBnd.endPos <= endPos )
1136  break;
1137  nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
1138  nEnd = std::min(aWBnd.endPos, startPos);
1139 
1140  if( nStt < nEnd &&
1141  pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1142  {
1143  aRet.subRegExpressions = 1;
1144  aRet.startOffset = { nEnd };
1145  aRet.endOffset = { nStt };
1146  break;
1147  }
1148  if( !nStt )
1149  break;
1150 
1151  aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
1152  WordType::ANYWORD_IGNOREWHITESPACES);
1153  } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
1154  return aRet;
1155 }
1156 
1157 
1158 namespace {
1159 void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
1160 {
1161  rRes.subRegExpressions = 1;
1162  rRes.startOffset = { nStartOffset };
1163  rRes.endOffset = { nEndOffset };
1164 }
1165 }
1166 
1167 SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1168 {
1169  SearchResult aRes;
1170  aRes.subRegExpressions = 0; // no match
1171  sal_Int32 nStartOffset = nStartPos;
1172  sal_Int32 nEndOffset = nEndPos;
1173 
1174  const sal_Int32 nStringLen = searchStr.getLength();
1175 
1176  // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1177  // string match with [0,0).
1178  if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
1179  (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1180  return aRes;
1181 
1182  const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1183  const sal_Int32 nPatternLen = rPattern.getLength();
1184 
1185  // Handle special cases empty pattern and/or string outside of the loop to
1186  // not add performance penalties there and simplify.
1187  if (nStartPos == nEndPos)
1188  {
1189  sal_Int32 i = 0;
1190  while (i < nPatternLen && rPattern[i] == '*')
1191  ++i;
1192  if (i == nPatternLen)
1193  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1194  return aRes;
1195  }
1196 
1197  // Empty pattern does not match any non-empty string.
1198  if (!nPatternLen)
1199  return aRes;
1200 
1201  bool bRewind = false;
1202  sal_uInt32 cPattern = 0;
1203  sal_Int32 nPattern = 0;
1204  sal_Int32 nAfterFakePattern = nPattern;
1206  {
1207  // Fake a leading '*' wildcard.
1208  cPattern = '*';
1209  bRewind = true;
1210  // Assume a non-'*' pattern character follows. If it is a '*' instead
1211  // that will be handled in the loop by setting nPat.
1212  sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
1213  if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
1214  rPattern.iterateCodePoints( &nAfterFakePattern);
1215  }
1216 
1217  sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1218  sal_uInt32 cPatternAfterAsterisk = 0;
1219  bool bEscaped = false, bEscapedAfterAsterisk = false;
1220 
1221  // The loop code tries to avoid costly calls to iterateCodePoints() when
1222  // possible.
1223 
1224  do
1225  {
1226  if (bRewind)
1227  {
1228  // Reuse cPattern after '*', nPattern was correspondingly
1229  // incremented to point behind cPattern.
1230  bRewind = false;
1231  }
1232  else if (nPattern < nPatternLen)
1233  {
1234  // nPattern will be incremented by iterateCodePoints().
1235  cPattern = rPattern.iterateCodePoints( &nPattern);
1236  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1237  {
1238  bEscaped = true;
1239  cPattern = rPattern.iterateCodePoints( &nPattern);
1240  }
1241  }
1242  else
1243  {
1244  // A trailing '*' is handled below.
1246  {
1247  // If the pattern is consumed and substring match allowed we're good.
1248  setWildcardMatch( aRes, nStartOffset, nString);
1249  return aRes;
1250  }
1251  else if (nString < nEndPos && nLastAsterisk >= 0)
1252  {
1253  // If substring match is not allowed try a greedy '*' match.
1254  nPattern = nLastAsterisk;
1255  continue; // do
1256  }
1257  else
1258  return aRes;
1259  }
1260 
1261  if (cPattern == '*' && !bEscaped)
1262  {
1263  // '*' is one code unit, so not using iterateCodePoints() is ok.
1264  while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1265  ++nPattern;
1266 
1267  if (nPattern >= nPatternLen)
1268  {
1269  // Last pattern is '*', remaining string matches.
1270  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1271  return aRes;
1272  }
1273 
1274  nLastAsterisk = nPattern; // Remember last encountered '*'.
1275 
1276  // cPattern will be the next non-'*' character, nPattern
1277  // incremented.
1278  cPattern = rPattern.iterateCodePoints( &nPattern);
1279  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1280  {
1281  bEscaped = true;
1282  cPattern = rPattern.iterateCodePoints( &nPattern);
1283  }
1284 
1285  cPatternAfterAsterisk = cPattern;
1286  bEscapedAfterAsterisk = bEscaped;
1287  nPat = nPattern; // Remember position of pattern behind '*', already incremented.
1288  nStr = nString; // Remember the current string to be matched.
1289  }
1290 
1291  if (nString >= nEndPos)
1292  // Whatever follows in pattern, string will not match.
1293  return aRes;
1294 
1295  // nString will be incremented by iterateCodePoints().
1296  sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
1297 
1298  if ((cPattern != '?' || bEscaped) && cPattern != cString)
1299  {
1300  if (nPat == -1)
1301  // Non-match already without any '*' pattern.
1302  return aRes;
1303 
1304  bRewind = true;
1305  nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
1306  cPattern = cPatternAfterAsterisk;
1307  bEscaped = bEscapedAfterAsterisk;
1308  searchStr.iterateCodePoints( &nStr);
1309  nString = nStr; // Restore incremented remembered string position.
1310  if (nPat == nAfterFakePattern)
1311  {
1312  // Next start offset will be the next character.
1313  nStartOffset = nString;
1314  }
1315  }
1316  else
1317  {
1318  // An unescaped '?' pattern matched any character, or characters
1319  // matched. Reset only escaped state.
1320  bEscaped = false;
1321  }
1322  }
1323  while (nString < nEndPos);
1324 
1325  if (bRewind)
1326  return aRes;
1327 
1328  // Eat trailing '*' pattern that matches anything, including nothing.
1329  // '*' is one code unit, so not using iterateCodePoints() is ok.
1330  while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1331  ++nPattern;
1332 
1333  if (nPattern == nPatternLen)
1334  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1335  return aRes;
1336 }
1337 
1338 SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1339 {
1340  SearchResult aRes;
1341  aRes.subRegExpressions = 0; // no match
1342 
1343  sal_Int32 nStartOffset = nStartPos;
1344  sal_Int32 nEndOffset = nEndPos;
1345 
1346  const sal_Int32 nStringLen = searchStr.getLength();
1347 
1348  // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1349  // string match with (0,0].
1350  if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
1351  (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1352  return aRes;
1353 
1354  const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1355  sal_Int32 nPatternLen = rPattern.getLength();
1356 
1357  // Handle special cases empty pattern and/or string outside of the loop to
1358  // not add performance penalties there and simplify.
1359  if (nStartPos == nEndPos)
1360  {
1361  sal_Int32 i = 0;
1362  while (i < nPatternLen && rPattern[i] == '*')
1363  ++i;
1364  if (i == nPatternLen)
1365  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1366  return aRes;
1367  }
1368 
1369  // Empty pattern does not match any non-empty string.
1370  if (!nPatternLen)
1371  return aRes;
1372 
1373  // Reverse escaped patterns to ease the handling of escapes, keeping escape
1374  // and following character as one sequence in backward direction.
1375  if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
1377  {
1378  OUStringBuffer aPatternBuf( rPattern);
1379  sal_Int32 nIndex = 0;
1380  while (nIndex < nPatternLen)
1381  {
1382  const sal_Int32 nOld = nIndex;
1383  const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
1384  if (cu == mcWildcardEscapeChar)
1385  {
1386  if (nIndex < nPatternLen)
1387  {
1388  if (nIndex - nOld == 1)
1389  {
1390  // Simply move code units, we already memorized the one
1391  // in 'cu'.
1392  const sal_Int32 nOld2 = nIndex;
1393  rPattern.iterateCodePoints( &nIndex);
1394  for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1395  aPatternBuf[nOld+i] = rPattern[nOld2+i];
1396  aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
1397  }
1398  else
1399  {
1400  // Copy the escape character code units first in the
1401  // unlikely case that it would not be of BMP.
1402  assert(nIndex - nOld == 2); // it's UTF-16, so...
1403  sal_Unicode buf[2];
1404  buf[0] = rPattern[nOld];
1405  buf[1] = rPattern[nOld+1];
1406  const sal_Int32 nOld2 = nIndex;
1407  rPattern.iterateCodePoints( &nIndex);
1408  for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1409  aPatternBuf[nOld+i] = rPattern[nOld2+i];
1410  aPatternBuf[nIndex-2] = buf[0];
1411  aPatternBuf[nIndex-1] = buf[1];
1412  }
1413  }
1414  else
1415  {
1416  // Trailing escape would become leading escape, do what?
1417  // Eliminate.
1418  aPatternBuf.remove( nOld, nIndex - nOld);
1419  }
1420  }
1421  }
1422  if (bUsePrimarySrchStr)
1423  maWildcardReversePattern = aPatternBuf.makeStringAndClear();
1424  else
1425  maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
1426  }
1427  const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
1428  nPatternLen = rReversePattern.getLength();
1429 
1430  bool bRewind = false;
1431  sal_uInt32 cPattern = 0;
1432  sal_Int32 nPattern = nPatternLen;
1433  sal_Int32 nAfterFakePattern = nPattern;
1435  {
1436  // Fake a trailing '*' wildcard.
1437  cPattern = '*';
1438  bRewind = true;
1439  // Assume a non-'*' pattern character follows. If it is a '*' instead
1440  // that will be handled in the loop by setting nPat.
1441  sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1442  if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
1443  rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1444  }
1445 
1446  sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1447  sal_uInt32 cPatternAfterAsterisk = 0;
1448  bool bEscaped = false, bEscapedAfterAsterisk = false;
1449 
1450  // The loop code tries to avoid costly calls to iterateCodePoints() when
1451  // possible.
1452 
1453  do
1454  {
1455  if (bRewind)
1456  {
1457  // Reuse cPattern after '*', nPattern was correspondingly
1458  // decremented to point before cPattern.
1459  bRewind = false;
1460  }
1461  else if (nPattern > 0)
1462  {
1463  // nPattern will be decremented by iterateCodePoints().
1464  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1465  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1466  {
1467  bEscaped = true;
1468  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1469  }
1470  }
1471  else
1472  {
1473  // A trailing '*' is handled below.
1475  {
1476  // If the pattern is consumed and substring match allowed we're good.
1477  setWildcardMatch( aRes, nStartOffset, nString);
1478  return aRes;
1479  }
1480  else if (nString > nEndPos && nLastAsterisk >= 0)
1481  {
1482  // If substring match is not allowed try a greedy '*' match.
1483  nPattern = nLastAsterisk;
1484  continue; // do
1485  }
1486  else
1487  return aRes;
1488  }
1489 
1490  if (cPattern == '*' && !bEscaped)
1491  {
1492  // '*' is one code unit, so not using iterateCodePoints() is ok.
1493  while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1494  --nPattern;
1495 
1496  if (nPattern <= 0)
1497  {
1498  // First pattern is '*', remaining string matches.
1499  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1500  return aRes;
1501  }
1502 
1503  nLastAsterisk = nPattern; // Remember last encountered '*'.
1504 
1505  // cPattern will be the previous non-'*' character, nPattern
1506  // decremented.
1507  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1508  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1509  {
1510  bEscaped = true;
1511  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1512  }
1513 
1514  cPatternAfterAsterisk = cPattern;
1515  bEscapedAfterAsterisk = bEscaped;
1516  nPat = nPattern; // Remember position of pattern before '*', already decremented.
1517  nStr = nString; // Remember the current string to be matched.
1518  }
1519 
1520  if (nString <= nEndPos)
1521  // Whatever leads in pattern, string will not match.
1522  return aRes;
1523 
1524  // nString will be decremented by iterateCodePoints().
1525  sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
1526 
1527  if ((cPattern != '?' || bEscaped) && cPattern != cString)
1528  {
1529  if (nPat == -1)
1530  // Non-match already without any '*' pattern.
1531  return aRes;
1532 
1533  bRewind = true;
1534  nPattern = nPat; // Rewind pattern to character before '*', already decremented.
1535  cPattern = cPatternAfterAsterisk;
1536  bEscaped = bEscapedAfterAsterisk;
1537  searchStr.iterateCodePoints( &nStr, -1);
1538  nString = nStr; // Restore decremented remembered string position.
1539  if (nPat == nAfterFakePattern)
1540  {
1541  // Next start offset will be this character (exclusive).
1542  nStartOffset = nString;
1543  }
1544  }
1545  else
1546  {
1547  // An unescaped '?' pattern matched any character, or characters
1548  // matched. Reset only escaped state.
1549  bEscaped = false;
1550  }
1551  }
1552  while (nString > nEndPos);
1553 
1554  if (bRewind)
1555  return aRes;
1556 
1557  // Eat leading '*' pattern that matches anything, including nothing.
1558  // '*' is one code unit, so not using iterateCodePoints() is ok.
1559  while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1560  --nPattern;
1561 
1562  if (nPattern == 0)
1563  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1564  return aRes;
1565 }
1566 
1567 
1568 OUString SAL_CALL
1570 {
1571  return "com.sun.star.util.TextSearch_i18n";
1572 }
1573 
1574 sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
1575 {
1576  return cppu::supportsService(this, rServiceName);
1577 }
1578 
1579 Sequence< OUString > SAL_CALL
1581 {
1582  return { "com.sun.star.util.TextSearch", "com.sun.star.util.TextSearch2" };
1583 }
1584 
1585 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
1587  css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
1588 {
1589  return cppu::acquire(new TextSearch(context));
1590 }
1591 
1592 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
std::unique_ptr< icu::RegexMatcher > pRegexMatcher
Definition: textsearch.hxx:93
std::mutex m_aMutex
Definition: textsearch.hxx:50
sal_Int32 nIndex
bool bUsePrimarySrchStr
Definition: textsearch.hxx:77
virtual void SAL_CALL setOptions2(const css::util::SearchOptions2 &options) override
Definition: textsearch.cxx:117
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * i18npool_TextSearch_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
css::util::SearchOptions2 aSrchPara
Definition: textsearch.hxx:53
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit
Definition: textsearch.hxx:59
sal_Int64 n
OUString sSrchStr2
Definition: textsearch.hxx:55
bool checkCTLEnd
Definition: textsearch.hxx:133
sal_uInt32 mcWildcardEscapeChar
Definition: textsearch.hxx:120
void MakeForwardTab()
Definition: textsearch.cxx:596
const TransliterationFlags COMPLEX_TRANS_MASK
Definition: textsearch.cxx:45
css::util::SearchResult SAL_CALL RESrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
virtual void SAL_CALL setOptions(const css::util::SearchOptions &options) override
Definition: textsearch.cxx:265
void MakeBackwardTab2()
Definition: textsearch.cxx:670
sal_Int32 GetDiff(const sal_Unicode) const
Definition: textsearch.cxx:693
void MakeBackwardTab()
Definition: textsearch.cxx:647
OUString maWildcardReversePattern2
Definition: textsearch.hxx:119
static sal_Int32 FindPosInSeq_Impl(const Sequence< sal_Int32 > &rOff, sal_Int32 nPos)
Definition: textsearch.cxx:301
sal_uInt16 sal_Unicode
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
virtual ~TextSearch() override
Definition: textsearch.cxx:109
bool checkCTLStart
Definition: textsearch.hxx:133
bool bSearchApostrophe
Definition: textsearch.hxx:71
TransliterationFlags
css::util::SearchResult SAL_CALL WildcardSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
css::util::SearchResult SAL_CALL ApproxSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
TextSearch(const css::uno::Reference< css::uno::XComponentContext > &rxContext)
virtual css::util::SearchResult SAL_CALL searchBackward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:454
void RESrchPrepare(const css::util::SearchOptions2 &)
Definition: textsearch.cxx:858
#define DIGIT
bool IsDelimiter(const OUString &rStr, sal_Int32 nPos) const
Definition: textsearch.cxx:577
css::uno::Reference< css::i18n::XCharacterClassification > xCharClass
Definition: textsearch.hxx:57
int i
css::util::SearchResult SAL_CALL NSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:781
static bool lcl_findRegex(std::unique_ptr< icu::RegexMatcher > const &pRegexMatcher, sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode &rIcuErr)
Definition: textsearch.cxx:926
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual css::util::SearchResult SAL_CALL searchForward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:315
FnSrch fnBackward
Definition: textsearch.hxx:68
float u
unsigned char sal_Bool
css::util::SearchResult SAL_CALL ApproxSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
bool bIsForwardTab
Definition: textsearch.hxx:76
OUString sSrchStr
Definition: textsearch.hxx:54
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:133
OUString maWildcardReversePattern
Definition: textsearch.hxx:118
float v
css::uno::Reference< css::i18n::XBreakIterator > xBreak
Definition: textsearch.hxx:107
css::util::SearchResult SAL_CALL RESrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:947
sal_Int32 startPos
Definition: textsearch.hxx:65
bool mbWildcardAllowSubstring
Definition: textsearch.hxx:121
bool isCellStart(const OUString &searchStr, sal_Int32 nPos)
Definition: textsearch.cxx:308
css::util::SearchResult SAL_CALL NSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:713
std::unique_ptr< TextSearchJumpTable > pJumpTable2
Definition: textsearch.hxx:75
#define SAL_INFO(area, stream)
std::unique_ptr< WLevDistance > pWLD
Definition: textsearch.hxx:106
std::unique_ptr< TextSearchJumpTable > pJumpTable
Definition: textsearch.hxx:74
FnSrch fnForward
Definition: textsearch.hxx:67
virtual OUString SAL_CALL getImplementationName() override
void MakeForwardTab2()
Definition: textsearch.cxx:622
sal_Int32 sal_Int32 endPos
Definition: textsearch.hxx:65
#define SAL_WARN(area, stream)
Reference< XComponentContext > m_xContext
tuple m
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit2
Definition: textsearch.hxx:60
::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable
Definition: textsearch.hxx:41
css::uno::Reference< css::uno::XComponentContext > m_xContext
Definition: textsearch.hxx:51
css::util::SearchResult SAL_CALL WildcardSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)