LibreOffice Module i18npool (master)  1
textsearch.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include "textsearch.hxx"
21 #include "levdis.hxx"
22 #include <com/sun/star/i18n/BreakIterator.hpp>
23 #include <com/sun/star/util/SearchAlgorithms2.hpp>
24 #include <com/sun/star/util/SearchFlags.hpp>
25 #include <com/sun/star/i18n/WordType.hpp>
26 #include <com/sun/star/i18n/ScriptType.hpp>
27 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
28 #include <com/sun/star/i18n/CharacterClassification.hpp>
29 #include <com/sun/star/i18n/KCharacterType.hpp>
30 #include <com/sun/star/i18n/Transliteration.hpp>
32 #include <cppuhelper/weak.hxx>
34 #include <rtl/ustrbuf.hxx>
35 #include <sal/log.hxx>
36 
37 #include <unicode/regex.h>
38 
39 using namespace ::com::sun::star::util;
40 using namespace ::com::sun::star::uno;
41 using namespace ::com::sun::star::lang;
42 using namespace ::com::sun::star::i18n;
43 using namespace ::com::sun::star;
44 
46  TransliterationFlags::ignoreBaFa_ja_JP |
47  TransliterationFlags::ignoreIterationMark_ja_JP |
48  TransliterationFlags::ignoreTiJi_ja_JP |
49  TransliterationFlags::ignoreHyuByu_ja_JP |
50  TransliterationFlags::ignoreSeZe_ja_JP |
51  TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
52  TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
53  TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
54 
55 namespace
56 {
58 {
59  // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
60  // in complex transliteration.
61  return
62  n & (COMPLEX_TRANS_MASK | // all set ignore bits
63  TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit
64  TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
65 }
66 
67 bool isComplexTrans( TransliterationFlags n )
68 {
69  return bool(n & COMPLEX_TRANS_MASK);
70 }
71 
72 TransliterationFlags maskSimpleTrans( TransliterationFlags n )
73 {
74  return n & ~COMPLEX_TRANS_MASK;
75 }
76 
77 bool isSimpleTrans( TransliterationFlags n )
78 {
79  return bool(maskSimpleTrans(n));
80 }
81 
82 // Regex patterns are case sensitive.
83 TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
84 {
85  TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
86  TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
87  if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
88  v = TransliterationFlags::NONE;
89  return (m | v) & ~COMPLEX_TRANS_MASK;
90 }
91 
92 bool isSimpleRegexTrans( TransliterationFlags n )
93 {
94  return bool(maskSimpleRegexTrans(n));
95 }
96 };
97 
99  : m_xContext( rxContext )
100 {
101  SearchOptions2 aOpt;
102  aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
103  aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
104  aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
105  //aOpt.Locale = ???;
106  setOptions( aOpt );
107 }
108 
110 {
111  pRegexMatcher.reset();
112  pWLD.reset();
113  pJumpTable.reset();
114  pJumpTable2.reset();
115 }
116 
117 void TextSearch::setOptions2( const SearchOptions2& rOptions )
118 {
119  osl::MutexGuard g(m_aMutex);
120 
121  aSrchPara = rOptions;
122 
123  pRegexMatcher.reset();
124  pWLD.reset();
125  pJumpTable.reset();
126  pJumpTable2.reset();
127  maWildcardReversePattern.clear();
129  TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
130  bSearchApostrophe = false;
131  bool bReplaceApostrophe = false;
132  if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
133  {
134  // RESrchPrepare will consider aSrchPara.transliterateFlags when
135  // picking the actual regex pattern
136  // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
137  // case-insensitivity. Create transliteration instance, if any, without
138  // ignore-case so later in TextSearch::searchForward() the string to
139  // match is not case-altered, leave case-(in)sensitive to regex engine.
140  transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
141  }
142  else if ( aSrchPara.searchString.indexOf('\'') > - 1 )
143  {
144  bSearchApostrophe = true;
145  bReplaceApostrophe = aSrchPara.searchString.indexOf(u'\u2019') > -1;
146  }
147 
148  // Create Transliteration class
149  if( isSimpleTrans( transliterateFlags) )
150  {
151  if( !xTranslit.is() )
152  xTranslit.set( Transliteration::create( m_xContext ) );
153  xTranslit->loadModule(
154  static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
155  aSrchPara.Locale);
156  }
157  else if( xTranslit.is() )
158  xTranslit = nullptr;
159 
160  // Create Transliteration for 2<->1, 2<->2 transliteration
161  if ( isComplexTrans( transliterateFlags) )
162  {
163  if( !xTranslit2.is() )
164  xTranslit2.set( Transliteration::create( m_xContext ) );
165  // Load transliteration module
166  xTranslit2->loadModule(
167  static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
168  aSrchPara.Locale);
169  }
170 
171  if ( !xBreak.is() )
172  xBreak = css::i18n::BreakIterator::create( m_xContext );
173 
174  sSrchStr = aSrchPara.searchString;
175 
176  // Transliterate search string.
177  if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
178  {
179  if (isSimpleRegexTrans(transliterateFlags))
180  {
181  if (maskSimpleRegexTrans(transliterateFlags) !=
182  maskSimpleTrans(transliterateFlags))
183  {
184  css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
185  Transliteration::create( m_xContext ));
186  if (xTranslitPattern.is())
187  {
188  xTranslitPattern->loadModule(
189  static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
190  aSrchPara.Locale);
191  sSrchStr = xTranslitPattern->transliterateString2String(
192  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
193  }
194  }
195  else
196  {
197  if (xTranslit.is())
198  sSrchStr = xTranslit->transliterateString2String(
199  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
200  }
201  // xTranslit2 complex transliterated sSrchStr2 is not used in
202  // regex, see TextSearch::searchForward() and
203  // TextSearch::searchBackward()
204  }
205  }
206  else
207  {
208  if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
209  sSrchStr = xTranslit->transliterateString2String(
210  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
211 
212  if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
213  sSrchStr2 = xTranslit2->transliterateString2String(
214  aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
215  }
216 
217  // When start or end of search string is a complex script type, we need to
218  // make sure the result boundary is not located in the middle of cell.
219  checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
220  ScriptType::COMPLEX));
221  checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
222  sSrchStr.getLength()-1) == ScriptType::COMPLEX));
223 
224  if ( bReplaceApostrophe )
225  sSrchStr = sSrchStr.replace(u'\u2019', '\'');
226 
227  // Take the new SearchOptions2::AlgorithmType2 field and ignore
228  // SearchOptions::algorithmType
229  switch( aSrchPara.AlgorithmType2)
230  {
231  case SearchAlgorithms2::REGEXP:
235  break;
236 
237  case SearchAlgorithms2::APPROXIMATE:
240 
241  pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
242  aSrchPara.insertedChars, aSrchPara.deletedChars,
243  0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
244 
245  nLimit = pWLD->GetLimit();
246  break;
247 
248  case SearchAlgorithms2::WILDCARD:
249  mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
250  mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
253  break;
254 
255  default:
256  SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
257  [[fallthrough]];
258  case SearchAlgorithms2::ABSOLUTE:
261  break;
262  }
263 }
264 
265 void TextSearch::setOptions( const SearchOptions& rOptions )
266 {
267  osl::MutexGuard g(m_aMutex);
268 
269  sal_Int16 nAlgorithmType2;
270  switch (rOptions.algorithmType)
271  {
272  case SearchAlgorithms_REGEXP:
273  nAlgorithmType2 = SearchAlgorithms2::REGEXP;
274  break;
275  case SearchAlgorithms_APPROXIMATE:
276  nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
277  break;
278  default:
279  SAL_WARN("i18npool","TextSearch::setOptions - default what?");
280  [[fallthrough]];
281  case SearchAlgorithms_ABSOLUTE:
282  nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
283  break;
284  }
285  // It would be nice if an inherited struct had a ctor that takes an
286  // instance of the object the struct derived from...
287  SearchOptions2 aOptions2(
288  rOptions.algorithmType,
289  rOptions.searchFlag,
290  rOptions.searchString,
291  rOptions.replaceString,
292  rOptions.Locale,
293  rOptions.changedChars,
294  rOptions.deletedChars,
295  rOptions.insertedChars,
296  rOptions.transliterateFlags,
297  nAlgorithmType2,
298  0 // no wildcard search, no escape character...
299  );
300  setOptions2( aOptions2);
301 }
302 
303 static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
304 {
305  auto pOff = std::find_if(rOff.begin(), rOff.end(),
306  [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
307  return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
308 }
309 
310 bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
311 {
312  sal_Int32 nDone;
313  return nPos == xBreak->previousCharacters(searchStr, nPos+1,
314  aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
315 }
316 
317 SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
318 {
319  osl::MutexGuard g(m_aMutex);
320 
321  SearchResult sres;
322 
323  OUString in_str(searchStr);
324 
325  // in non-regex mode, allow searching typographical apostrophe with the ASCII one
326  // to avoid regression after using automatic conversion to U+2019 during typing in Writer
327  bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
328 
329  bUsePrimarySrchStr = true;
330 
331  if ( xTranslit.is() )
332  {
333  // apply normal transliteration (1<->1, 1<->0)
334 
335  sal_Int32 nInStartPos = startPos;
336  if (pRegexMatcher && startPos > 0)
337  {
338  // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
339  // transliterate enough of the leading text to allow sensible look-behind assertions.
340  // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
341  // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
342  // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
343  // the middle of a surrogate pair, preceded by another surrogate pair.
344  const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
345  nInStartPos -= std::min(nMaxLeadingLen, startPos);
346  }
347  sal_Int32 nInEndPos = endPos;
348  if (pRegexMatcher && endPos < searchStr.getLength())
349  {
350  // tdf#65038: ditto for look-ahead assertions
351  const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
352  nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
353  }
354 
355  css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
356  in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
357 
358  if ( bReplaceApostrophe )
359  in_str = in_str.replace(u'\u2019', '\'');
360 
361  // JP 20.6.2001: also the start and end positions must be corrected!
362  sal_Int32 newStartPos =
363  (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
364 
365  sal_Int32 newEndPos = (endPos < searchStr.getLength())
366  ? FindPosInSeq_Impl( offset, endPos )
367  : in_str.getLength();
368 
369  sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
370 
371  // Map offsets back to untransliterated string.
372  const sal_Int32 nOffsets = offset.getLength();
373  if (nOffsets)
374  {
375  auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
376  auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
377  // For regex nGroups is the number of groups+1 with group 0 being
378  // the entire match.
379  const sal_Int32 nGroups = sres.startOffset.getLength();
380  for ( sal_Int32 k = 0; k < nGroups; k++ )
381  {
382  const sal_Int32 nStart = sres.startOffset[k];
383  // Result offsets are negative (-1) if a group expression was
384  // not matched.
385  if (nStart >= 0)
386  sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
387  // JP 20.6.2001: end is ever exclusive and then don't return
388  // the position of the next character - return the
389  // next position behind the last found character!
390  // "a b c" find "b" must return 2,3 and not 2,4!!!
391  const sal_Int32 nStop = sres.endOffset[k];
392  if (nStop >= 0)
393  {
394  if (nStop > 0)
395  sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
396  else
397  sres_endOffsetRange[k] = offset[0];
398  }
399  }
400  }
401  }
402  else
403  {
404  if ( bReplaceApostrophe )
405  in_str = in_str.replace(u'\u2019', '\'');
406 
407  sres = (this->*fnForward)( in_str, startPos, endPos );
408  }
409 
410  if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
411  {
412  SearchResult sres2;
413 
414  in_str = searchStr;
415  css::uno::Sequence <sal_Int32> offset( in_str.getLength());
416 
417  in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
418 
419  if( startPos )
420  startPos = FindPosInSeq_Impl( offset, startPos );
421 
422  if( endPos < searchStr.getLength() )
423  endPos = FindPosInSeq_Impl( offset, endPos );
424  else
425  endPos = in_str.getLength();
426 
427  bUsePrimarySrchStr = false;
428  sres2 = (this->*fnForward)( in_str, startPos, endPos );
429  auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
430  auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
431 
432  for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
433  {
434  if (sres2.startOffset[k])
435  sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
436  if (sres2.endOffset[k])
437  sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
438  }
439 
440  // pick first and long one
441  if ( sres.subRegExpressions == 0)
442  return sres2;
443  if ( sres2.subRegExpressions == 1)
444  {
445  if ( sres.startOffset[0] > sres2.startOffset[0])
446  return sres2;
447  else if ( sres.startOffset[0] == sres2.startOffset[0] &&
448  sres.endOffset[0] < sres2.endOffset[0])
449  return sres2;
450  }
451  }
452 
453  return sres;
454 }
455 
456 SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
457 {
458  osl::MutexGuard g(m_aMutex);
459 
460  SearchResult sres;
461 
462  OUString in_str(searchStr);
463 
464  // in non-regex mode, allow searching typographical apostrophe with the ASCII one
465  // to avoid regression after using automatic conversion to U+2019 during typing in Writer
466  bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
467 
468  bUsePrimarySrchStr = true;
469 
470  if ( xTranslit.is() )
471  {
472  // apply only simple 1<->1 transliteration here
473  css::uno::Sequence<sal_Int32> offset(startPos - endPos);
474  in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
475 
476  if ( bReplaceApostrophe )
477  in_str = in_str.replace(u'\u2019', '\'');
478 
479  // JP 20.6.2001: also the start and end positions must be corrected!
480  sal_Int32 const newStartPos = (startPos < searchStr.getLength())
481  ? FindPosInSeq_Impl( offset, startPos )
482  : in_str.getLength();
483 
484  sal_Int32 const newEndPos =
485  (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
486 
487  // TODO: this would need nExtraOffset handling to avoid $ matching
488  // if (pRegexMatcher && startPos < searchStr.getLength())
489  // but that appears to be impossible with ICU regex
490 
491  sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
492 
493  // Map offsets back to untransliterated string.
494  const sal_Int32 nOffsets = offset.getLength();
495  if (nOffsets)
496  {
497  auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
498  auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
499  // For regex nGroups is the number of groups+1 with group 0 being
500  // the entire match.
501  const sal_Int32 nGroups = sres.startOffset.getLength();
502  for ( sal_Int32 k = 0; k < nGroups; k++ )
503  {
504  const sal_Int32 nStart = sres.startOffset[k];
505  // Result offsets are negative (-1) if a group expression was
506  // not matched.
507  if (nStart >= 0)
508  {
509  if (nStart > 0)
510  sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
511  else
512  sres_startOffsetRange[k] = offset[0];
513  }
514  // JP 20.6.2001: end is ever exclusive and then don't return
515  // the position of the next character - return the
516  // next position behind the last found character!
517  // "a b c" find "b" must return 2,3 and not 2,4!!!
518  const sal_Int32 nStop = sres.endOffset[k];
519  if (nStop >= 0)
520  sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
521  }
522  }
523  }
524  else
525  {
526  if ( bReplaceApostrophe )
527  in_str = in_str.replace(u'\u2019', '\'');
528 
529  sres = (this->*fnBackward)( in_str, startPos, endPos );
530  }
531 
532  if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
533  {
534  SearchResult sres2;
535 
536  in_str = searchStr;
537  css::uno::Sequence <sal_Int32> offset( in_str.getLength());
538 
539  in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
540 
541  if( startPos < searchStr.getLength() )
542  startPos = FindPosInSeq_Impl( offset, startPos );
543  else
544  startPos = in_str.getLength();
545 
546  if( endPos )
547  endPos = FindPosInSeq_Impl( offset, endPos );
548 
549  bUsePrimarySrchStr = false;
550  sres2 = (this->*fnBackward)( in_str, startPos, endPos );
551  auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
552  auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
553 
554  for( int k = 0; k < sres2.startOffset.getLength(); k++ )
555  {
556  if (sres2.startOffset[k])
557  sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
558  if (sres2.endOffset[k])
559  sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
560  }
561 
562  // pick last and long one
563  if ( sres.subRegExpressions == 0 )
564  return sres2;
565  if ( sres2.subRegExpressions == 1 )
566  {
567  if ( sres.startOffset[0] < sres2.startOffset[0] )
568  return sres2;
569  if ( sres.startOffset[0] == sres2.startOffset[0] &&
570  sres.endOffset[0] > sres2.endOffset[0] )
571  return sres2;
572  }
573  }
574 
575  return sres;
576 }
577 
578 
579 bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
580 {
581  bool bRet = true;
582  if( '\x7f' != rStr[nPos])
583  {
584  if ( !xCharClass.is() )
585  xCharClass = CharacterClassification::create( m_xContext );
586  sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
587  aSrchPara.Locale );
588  if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
589  KCharacterType::LETTER ) & nCType ) )
590  bRet = false;
591  }
592  return bRet;
593 }
594 
595 // --------- helper methods for Boyer-Moore like text searching ----------
596 // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
597 
599 {
600  // create the jumptable for the search text
601 
602  if( pJumpTable && bIsForwardTab )
603  {
604  return; // the jumpTable is ok
605  }
606  bIsForwardTab = true;
607 
608  sal_Int32 n, nLen = sSrchStr.getLength();
609  pJumpTable.reset( new TextSearchJumpTable );
610 
611  for( n = 0; n < nLen - 1; ++n )
612  {
613  sal_Unicode cCh = sSrchStr[n];
614  sal_Int32 nDiff = nLen - n - 1;
615  TextSearchJumpTable::value_type aEntry( cCh, nDiff );
616 
617  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
618  pJumpTable->insert( aEntry );
619  if ( !aPair.second )
620  (*(aPair.first)).second = nDiff;
621  }
622 }
623 
625 {
626  // create the jumptable for the search text
627  if( pJumpTable2 && bIsForwardTab )
628  {
629  return; // the jumpTable is ok
630  }
631  bIsForwardTab = true;
632 
633  sal_Int32 n, nLen = sSrchStr2.getLength();
634  pJumpTable2.reset( new TextSearchJumpTable );
635 
636  for( n = 0; n < nLen - 1; ++n )
637  {
638  sal_Unicode cCh = sSrchStr2[n];
639  sal_Int32 nDiff = nLen - n - 1;
640 
641  TextSearchJumpTable::value_type aEntry( cCh, nDiff );
642  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
643  pJumpTable2->insert( aEntry );
644  if ( !aPair.second )
645  (*(aPair.first)).second = nDiff;
646  }
647 }
648 
650 {
651  // create the jumptable for the search text
652  if( pJumpTable && !bIsForwardTab)
653  {
654  return; // the jumpTable is ok
655  }
656  bIsForwardTab = false;
657 
658  sal_Int32 n, nLen = sSrchStr.getLength();
659  pJumpTable.reset( new TextSearchJumpTable );
660 
661  for( n = nLen-1; n > 0; --n )
662  {
663  sal_Unicode cCh = sSrchStr[n];
664  TextSearchJumpTable::value_type aEntry( cCh, n );
665  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
666  pJumpTable->insert( aEntry );
667  if ( !aPair.second )
668  (*(aPair.first)).second = n;
669  }
670 }
671 
673 {
674  // create the jumptable for the search text
675  if( pJumpTable2 && !bIsForwardTab )
676  {
677  return; // the jumpTable is ok
678  }
679  bIsForwardTab = false;
680 
681  sal_Int32 n, nLen = sSrchStr2.getLength();
682  pJumpTable2.reset( new TextSearchJumpTable );
683 
684  for( n = nLen-1; n > 0; --n )
685  {
686  sal_Unicode cCh = sSrchStr2[n];
687  TextSearchJumpTable::value_type aEntry( cCh, n );
688  ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
689  pJumpTable2->insert( aEntry );
690  if ( !aPair.second )
691  (*(aPair.first)).second = n;
692  }
693 }
694 
695 sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
696 {
697  TextSearchJumpTable *pJump;
698  OUString sSearchKey;
699 
700  if ( bUsePrimarySrchStr ) {
701  pJump = pJumpTable.get();
702  sSearchKey = sSrchStr;
703  } else {
704  pJump = pJumpTable2.get();
705  sSearchKey = sSrchStr2;
706  }
707 
708  TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
709  if ( iLook == pJump->end() )
710  return sSearchKey.getLength();
711  return (*iLook).second;
712 }
713 
714 
715 SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
716 {
717  SearchResult aRet;
718  aRet.subRegExpressions = 0;
719 
720  OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
721 
722  sal_Int32 nSuchIdx = searchStr.getLength();
723  sal_Int32 nEnd = endPos;
724  if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
725  return aRet;
726 
727 
728  if( nEnd < sSearchKey.getLength() ) // position inside the search region ?
729  return aRet;
730 
731  nEnd -= sSearchKey.getLength();
732 
733  if (bUsePrimarySrchStr)
734  MakeForwardTab(); // create the jumptable
735  else
736  MakeForwardTab2();
737 
738  for (sal_Int32 nCmpIdx = startPos; // start position for the search
739  nCmpIdx <= nEnd;
740  nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
741  {
742  // if the match would be the completed cells, skip it.
743  if ( (checkCTLStart && !isCellStart( searchStr, nCmpIdx )) || (checkCTLEnd
744  && !isCellStart( searchStr, nCmpIdx + sSearchKey.getLength())) )
745  continue;
746 
747  nSuchIdx = sSearchKey.getLength() - 1;
748  while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
749  {
750  if( nSuchIdx == 0 )
751  {
752  if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
753  {
754  sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
755  bool bAtStart = !nCmpIdx;
756  bool bAtEnd = nFndEnd == endPos;
757  bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
758  bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd );
759  // * 1 -> only one word in the paragraph
760  // * 2 -> at begin of paragraph
761  // * 3 -> at end of paragraph
762  // * 4 -> inside the paragraph
763  if( !( ( bAtStart && bAtEnd ) || // 1
764  ( bAtStart && bDelimBehind ) || // 2
765  ( bAtEnd && bDelimBefore ) || // 3
766  ( bDelimBefore && bDelimBehind ))) // 4
767  break;
768  }
769 
770  aRet.subRegExpressions = 1;
771  aRet.startOffset = { nCmpIdx };
772  aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };
773 
774  return aRet;
775  }
776  else
777  nSuchIdx--;
778  }
779  }
780  return aRet;
781 }
782 
783 SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
784 {
785  SearchResult aRet;
786  aRet.subRegExpressions = 0;
787 
788  OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
789 
790  sal_Int32 nSuchIdx = searchStr.getLength();
791  sal_Int32 nEnd = endPos;
792  if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
793  return aRet;
794 
795  if (bUsePrimarySrchStr)
796  MakeBackwardTab(); // create the jumptable
797  else
799 
800  if( nEnd == nSuchIdx ) // end position for the search
801  nEnd = sSearchKey.getLength();
802  else
803  nEnd += sSearchKey.getLength();
804 
805  sal_Int32 nCmpIdx = startPos; // start position for the search
806 
807  while (nCmpIdx >= nEnd)
808  {
809  // if the match would be the completed cells, skip it.
810  if ( (!checkCTLStart || isCellStart( searchStr, nCmpIdx -
811  sSearchKey.getLength() )) && (!checkCTLEnd ||
812  isCellStart( searchStr, nCmpIdx)))
813  {
814  nSuchIdx = 0;
815  while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
816  searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
817  nSuchIdx++;
818  if( nSuchIdx >= sSearchKey.getLength() )
819  {
820  if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
821  {
822  sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
823  bool bAtStart = !nFndStt;
824  bool bAtEnd = nCmpIdx == startPos;
825  bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
826  bool bDelimBefore = bAtStart || // begin of paragraph
827  IsDelimiter( searchStr, nFndStt-1 );
828  // * 1 -> only one word in the paragraph
829  // * 2 -> at begin of paragraph
830  // * 3 -> at end of paragraph
831  // * 4 -> inside the paragraph
832  if( ( bAtStart && bAtEnd ) || // 1
833  ( bAtStart && bDelimBehind ) || // 2
834  ( bAtEnd && bDelimBefore ) || // 3
835  ( bDelimBefore && bDelimBehind )) // 4
836  {
837  aRet.subRegExpressions = 1;
838  aRet.startOffset = { nCmpIdx };
839  aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
840  return aRet;
841  }
842  }
843  else
844  {
845  aRet.subRegExpressions = 1;
846  aRet.startOffset = { nCmpIdx };
847  aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
848  return aRet;
849  }
850  }
851  }
852  nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
853  if( nCmpIdx < nSuchIdx )
854  return aRet;
855  nCmpIdx -= nSuchIdx;
856  }
857  return aRet;
858 }
859 
860 void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
861 {
862  TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
863  // select the transliterated pattern string
864  const OUString& rPatternStr =
865  (isSimpleTrans(transliterateFlags) ? sSrchStr
866  : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
867 
868  sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
869  // map css::util::SearchFlags to ICU uregex.h flags
870  // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
871  // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
872  // REG_NOSUB is not used anywhere => not implemented
873  // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
874  // LEV_RELAXED is only used for SearchAlgorithm==Approximate
875  // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
876  // probably because the transliteration flag IGNORE_CASE handles it as well.
877  if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
878  || (transliterateFlags & TransliterationFlags::IGNORE_CASE))
879  nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
880  UErrorCode nIcuErr = U_ZERO_ERROR;
881  // assumption: transliteration didn't mangle regexp control chars
882  IcuUniString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
883 #ifndef DISABLE_WORDBOUND_EMULATION
884  // for convenience specific syntax elements of the old regex engine are emulated
885  // - by replacing < with "word-break followed by a look-ahead word-char"
886  static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant);
887  static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant);
888  static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
889  aChevronMatcherB.reset( aIcuSearchPatStr);
890  aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
891  aChevronMatcherB.reset();
892  // - by replacing > with "look-behind word-char followed by a word-break"
893  static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant);
894  static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant);
895  static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
896  aChevronMatcherE.reset( aIcuSearchPatStr);
897  aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
898  aChevronMatcherE.reset();
899 #endif
900  pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
901  if (nIcuErr)
902  {
903  SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
904  pRegexMatcher.reset();
905  }
906  else
907  {
908  // Pathological patterns may result in exponential run time making the
909  // application appear to be frozen. Limit that. Documentation for this
910  // call says
911  // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
912  // "The units of the limit are steps of the match engine.
913  // Correspondence with actual processor time will depend on the speed
914  // of the processor and the details of the specific pattern, but will
915  // typically be on the order of milliseconds."
916  // Just what is a good value? 42 is always an answer ... the 23 enigma
917  // as well... which on the dev's machine is roughly 50 seconds with the
918  // pattern of fdo#70627.
919  /* TODO: make this a configuration settable value and possibly take
920  * complexity of expression into account and maybe even length of text
921  * to be matched; currently (2013-11-25) that is at most one 64k
922  * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
923  pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
924  }
925 }
926 
927 
928 static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
929  sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
930 {
931  pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
932  pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
933  pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
934  // look-ahead/behind assertions
935 
936  if (!pRegexMatcher->find(rIcuErr))
937  {
938  /* TODO: future versions could pass the UErrorCode or translations
939  * thereof to the caller, for example to inform the user of
940  * U_REGEX_TIME_OUT. The strange thing though is that an error is set
941  * only after the second call that returns immediately and not if
942  * timeout occurred on the first call?!? */
943  SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
944  return false;
945  }
946  return true;
947 }
948 
949 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
950  sal_Int32 startPos, sal_Int32 endPos )
951 {
952  SearchResult aRet;
953  aRet.subRegExpressions = 0;
954  if( !pRegexMatcher)
955  return aRet;
956 
957  if( endPos > searchStr.getLength())
958  endPos = searchStr.getLength();
959 
960  // use the ICU RegexMatcher to find the matches
961  UErrorCode nIcuErr = U_ZERO_ERROR;
962  const IcuUniString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
963  searchStr.getLength());
964  pRegexMatcher->reset( aSearchTargetStr);
965  // search until there is a valid match
966  for(;;)
967  {
968  if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
969  return aRet;
970 
971  // #i118887# ignore zero-length matches e.g. "a*" in "bc"
972  int nStartOfs = pRegexMatcher->start( nIcuErr);
973  int nEndOfs = pRegexMatcher->end( nIcuErr);
974  if( nStartOfs < nEndOfs)
975  break;
976  // If the zero-length match is behind the string, do not match it again
977  // and again until startPos reaches there. A match behind the string is
978  // a "$" anchor.
979  if (nStartOfs == endPos)
980  break;
981  // try at next position if there was a zero-length match
982  if( ++startPos >= endPos)
983  return aRet;
984  }
985 
986  // extract the result of the search
987  const int nGroupCount = pRegexMatcher->groupCount();
988  aRet.subRegExpressions = nGroupCount + 1;
989  aRet.startOffset.realloc( aRet.subRegExpressions);
990  auto pstartOffset = aRet.startOffset.getArray();
991  aRet.endOffset.realloc( aRet.subRegExpressions);
992  auto pendOffset = aRet.endOffset.getArray();
993  pstartOffset[0] = pRegexMatcher->start( nIcuErr);
994  pendOffset[0] = pRegexMatcher->end( nIcuErr);
995  for( int i = 1; i <= nGroupCount; ++i) {
996  pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
997  pendOffset[i] = pRegexMatcher->end( i, nIcuErr);
998  }
999 
1000  return aRet;
1001 }
1002 
1003 SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
1004  sal_Int32 startPos, sal_Int32 endPos )
1005 {
1006  // NOTE: for backwards search callers provide startPos/endPos inverted!
1007  SearchResult aRet;
1008  aRet.subRegExpressions = 0;
1009  if( !pRegexMatcher)
1010  return aRet;
1011 
1012  if( startPos > searchStr.getLength())
1013  startPos = searchStr.getLength();
1014 
1015  // use the ICU RegexMatcher to find the matches
1016  // TODO: use ICU's backward searching once it becomes available
1017  // as its replacement using forward search is not as good as the real thing
1018  UErrorCode nIcuErr = U_ZERO_ERROR;
1019  const IcuUniString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
1020  searchStr.getLength());
1021  pRegexMatcher->reset( aSearchTargetStr);
1022  if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
1023  return aRet;
1024 
1025  // find the last match
1026  int nLastPos = 0;
1027  int nFoundEnd = 0;
1028  int nGoodPos = 0, nGoodEnd = 0;
1029  bool bFirst = true;
1030  do {
1031  nLastPos = pRegexMatcher->start( nIcuErr);
1032  nFoundEnd = pRegexMatcher->end( nIcuErr);
1033  if (nLastPos < nFoundEnd)
1034  {
1035  // remember last non-zero-length match
1036  nGoodPos = nLastPos;
1037  nGoodEnd = nFoundEnd;
1038  }
1039  if( nFoundEnd >= startPos)
1040  break;
1041  bFirst = false;
1042  if( nFoundEnd == nLastPos)
1043  ++nFoundEnd;
1044  } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
1045 
1046  // Ignore all zero-length matches except "$" anchor on first match.
1047  if (nGoodPos == nGoodEnd)
1048  {
1049  if (bFirst && nLastPos == startPos)
1050  nGoodPos = nLastPos;
1051  else
1052  return aRet;
1053  }
1054 
1055  // find last match again to get its details
1056  lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
1057 
1058  // fill in the details of the last match
1059  const int nGroupCount = pRegexMatcher->groupCount();
1060  aRet.subRegExpressions = nGroupCount + 1;
1061  aRet.startOffset.realloc( aRet.subRegExpressions);
1062  auto pstartOffset = aRet.startOffset.getArray();
1063  aRet.endOffset.realloc( aRet.subRegExpressions);
1064  auto pendOffset = aRet.endOffset.getArray();
1065  // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
1066  pstartOffset[0] = pRegexMatcher->end( nIcuErr);
1067  pendOffset[0] = pRegexMatcher->start( nIcuErr);
1068  for( int i = 1; i <= nGroupCount; ++i) {
1069  pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
1070  pendOffset[i] = pRegexMatcher->start( i, nIcuErr);
1071  }
1072 
1073  return aRet;
1074 }
1075 
1076 
1077 // search for words phonetically
1078 SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
1079  sal_Int32 startPos, sal_Int32 endPos )
1080 {
1081  SearchResult aRet;
1082  aRet.subRegExpressions = 0;
1083 
1084  if( !xBreak.is() )
1085  return aRet;
1086 
1087  sal_Int32 nStt, nEnd;
1088 
1089  Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1090  aSrchPara.Locale,
1091  WordType::ANYWORD_IGNOREWHITESPACES, true );
1092 
1093  do
1094  {
1095  if( aWBnd.startPos >= endPos )
1096  break;
1097  nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
1098  nEnd = std::min(aWBnd.endPos, endPos);
1099 
1100  if( nStt < nEnd &&
1101  pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1102  {
1103  aRet.subRegExpressions = 1;
1104  aRet.startOffset = { nStt };
1105  aRet.endOffset = { nEnd };
1106  break;
1107  }
1108 
1109  nStt = nEnd - 1;
1110  aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
1111  WordType::ANYWORD_IGNOREWHITESPACES);
1112  } while( aWBnd.startPos != aWBnd.endPos ||
1113  (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
1114  // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
1115  // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
1116  // and nextWord() does also => don't loop forever.
1117  return aRet;
1118 }
1119 
1120 SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
1121  sal_Int32 startPos, sal_Int32 endPos )
1122 {
1123  SearchResult aRet;
1124  aRet.subRegExpressions = 0;
1125 
1126  if( !xBreak.is() )
1127  return aRet;
1128 
1129  sal_Int32 nStt, nEnd;
1130 
1131  Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
1132  aSrchPara.Locale,
1133  WordType::ANYWORD_IGNOREWHITESPACES, true );
1134 
1135  do
1136  {
1137  if( aWBnd.endPos <= endPos )
1138  break;
1139  nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
1140  nEnd = std::min(aWBnd.endPos, startPos);
1141 
1142  if( nStt < nEnd &&
1143  pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
1144  {
1145  aRet.subRegExpressions = 1;
1146  aRet.startOffset = { nEnd };
1147  aRet.endOffset = { nStt };
1148  break;
1149  }
1150  if( !nStt )
1151  break;
1152 
1153  aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
1154  WordType::ANYWORD_IGNOREWHITESPACES);
1155  } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
1156  return aRet;
1157 }
1158 
1159 
1160 namespace {
1161 void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
1162 {
1163  rRes.subRegExpressions = 1;
1164  rRes.startOffset = { nStartOffset };
1165  rRes.endOffset = { nEndOffset };
1166 }
1167 }
1168 
1169 SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1170 {
1171  SearchResult aRes;
1172  aRes.subRegExpressions = 0; // no match
1173  sal_Int32 nStartOffset = nStartPos;
1174  sal_Int32 nEndOffset = nEndPos;
1175 
1176  const sal_Int32 nStringLen = searchStr.getLength();
1177 
1178  // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
1179  // string match with [0,0).
1180  if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
1181  (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1182  return aRes;
1183 
1184  const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1185  const sal_Int32 nPatternLen = rPattern.getLength();
1186 
1187  // Handle special cases empty pattern and/or string outside of the loop to
1188  // not add performance penalties there and simplify.
1189  if (nStartPos == nEndPos)
1190  {
1191  sal_Int32 i = 0;
1192  while (i < nPatternLen && rPattern[i] == '*')
1193  ++i;
1194  if (i == nPatternLen)
1195  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1196  return aRes;
1197  }
1198 
1199  // Empty pattern does not match any non-empty string.
1200  if (!nPatternLen)
1201  return aRes;
1202 
1203  bool bRewind = false;
1204  sal_uInt32 cPattern = 0;
1205  sal_Int32 nPattern = 0;
1206  sal_Int32 nAfterFakePattern = nPattern;
1208  {
1209  // Fake a leading '*' wildcard.
1210  cPattern = '*';
1211  bRewind = true;
1212  // Assume a non-'*' pattern character follows. If it is a '*' instead
1213  // that will be handled in the loop by setting nPat.
1214  sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
1215  if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
1216  rPattern.iterateCodePoints( &nAfterFakePattern);
1217  }
1218 
1219  sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1220  sal_uInt32 cPatternAfterAsterisk = 0;
1221  bool bEscaped = false, bEscapedAfterAsterisk = false;
1222 
1223  // The loop code tries to avoid costly calls to iterateCodePoints() when
1224  // possible.
1225 
1226  do
1227  {
1228  if (bRewind)
1229  {
1230  // Reuse cPattern after '*', nPattern was correspondingly
1231  // incremented to point behind cPattern.
1232  bRewind = false;
1233  }
1234  else if (nPattern < nPatternLen)
1235  {
1236  // nPattern will be incremented by iterateCodePoints().
1237  cPattern = rPattern.iterateCodePoints( &nPattern);
1238  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1239  {
1240  bEscaped = true;
1241  cPattern = rPattern.iterateCodePoints( &nPattern);
1242  }
1243  }
1244  else
1245  {
1246  // A trailing '*' is handled below.
1248  {
1249  // If the pattern is consumed and substring match allowed we're good.
1250  setWildcardMatch( aRes, nStartOffset, nString);
1251  return aRes;
1252  }
1253  else if (nString < nEndPos && nLastAsterisk >= 0)
1254  {
1255  // If substring match is not allowed try a greedy '*' match.
1256  nPattern = nLastAsterisk;
1257  continue; // do
1258  }
1259  else
1260  return aRes;
1261  }
1262 
1263  if (cPattern == '*' && !bEscaped)
1264  {
1265  // '*' is one code unit, so not using iterateCodePoints() is ok.
1266  while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1267  ++nPattern;
1268 
1269  if (nPattern >= nPatternLen)
1270  {
1271  // Last pattern is '*', remaining string matches.
1272  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1273  return aRes;
1274  }
1275 
1276  nLastAsterisk = nPattern; // Remember last encountered '*'.
1277 
1278  // cPattern will be the next non-'*' character, nPattern
1279  // incremented.
1280  cPattern = rPattern.iterateCodePoints( &nPattern);
1281  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
1282  {
1283  bEscaped = true;
1284  cPattern = rPattern.iterateCodePoints( &nPattern);
1285  }
1286 
1287  cPatternAfterAsterisk = cPattern;
1288  bEscapedAfterAsterisk = bEscaped;
1289  nPat = nPattern; // Remember position of pattern behind '*', already incremented.
1290  nStr = nString; // Remember the current string to be matched.
1291  }
1292 
1293  if (nString >= nEndPos)
1294  // Whatever follows in pattern, string will not match.
1295  return aRes;
1296 
1297  // nString will be incremented by iterateCodePoints().
1298  sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
1299 
1300  if ((cPattern != '?' || bEscaped) && cPattern != cString)
1301  {
1302  if (nPat == -1)
1303  // Non-match already without any '*' pattern.
1304  return aRes;
1305 
1306  bRewind = true;
1307  nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
1308  cPattern = cPatternAfterAsterisk;
1309  bEscaped = bEscapedAfterAsterisk;
1310  searchStr.iterateCodePoints( &nStr);
1311  nString = nStr; // Restore incremented remembered string position.
1312  if (nPat == nAfterFakePattern)
1313  {
1314  // Next start offset will be the next character.
1315  nStartOffset = nString;
1316  }
1317  }
1318  else
1319  {
1320  // An unescaped '?' pattern matched any character, or characters
1321  // matched. Reset only escaped state.
1322  bEscaped = false;
1323  }
1324  }
1325  while (nString < nEndPos);
1326 
1327  if (bRewind)
1328  return aRes;
1329 
1330  // Eat trailing '*' pattern that matches anything, including nothing.
1331  // '*' is one code unit, so not using iterateCodePoints() is ok.
1332  while (nPattern < nPatternLen && rPattern[nPattern] == '*')
1333  ++nPattern;
1334 
1335  if (nPattern == nPatternLen)
1336  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1337  return aRes;
1338 }
1339 
1340 SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
1341 {
1342  SearchResult aRes;
1343  aRes.subRegExpressions = 0; // no match
1344 
1345  sal_Int32 nStartOffset = nStartPos;
1346  sal_Int32 nEndOffset = nEndPos;
1347 
1348  const sal_Int32 nStringLen = searchStr.getLength();
1349 
1350  // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
1351  // string match with (0,0].
1352  if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
1353  (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
1354  return aRes;
1355 
1356  const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
1357  sal_Int32 nPatternLen = rPattern.getLength();
1358 
1359  // Handle special cases empty pattern and/or string outside of the loop to
1360  // not add performance penalties there and simplify.
1361  if (nStartPos == nEndPos)
1362  {
1363  sal_Int32 i = 0;
1364  while (i < nPatternLen && rPattern[i] == '*')
1365  ++i;
1366  if (i == nPatternLen)
1367  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1368  return aRes;
1369  }
1370 
1371  // Empty pattern does not match any non-empty string.
1372  if (!nPatternLen)
1373  return aRes;
1374 
1375  // Reverse escaped patterns to ease the handling of escapes, keeping escape
1376  // and following character as one sequence in backward direction.
1377  if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
1379  {
1380  OUStringBuffer aPatternBuf( rPattern);
1381  sal_Int32 nIndex = 0;
1382  while (nIndex < nPatternLen)
1383  {
1384  const sal_Int32 nOld = nIndex;
1385  const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
1386  if (cu == mcWildcardEscapeChar)
1387  {
1388  if (nIndex < nPatternLen)
1389  {
1390  if (nIndex - nOld == 1)
1391  {
1392  // Simply move code units, we already memorized the one
1393  // in 'cu'.
1394  const sal_Int32 nOld2 = nIndex;
1395  rPattern.iterateCodePoints( &nIndex);
1396  for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1397  aPatternBuf[nOld+i] = rPattern[nOld2+i];
1398  aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
1399  }
1400  else
1401  {
1402  // Copy the escape character code units first in the
1403  // unlikely case that it would not be of BMP.
1404  assert(nIndex - nOld == 2); // it's UTF-16, so...
1405  sal_Unicode buf[2];
1406  buf[0] = rPattern[nOld];
1407  buf[1] = rPattern[nOld+1];
1408  const sal_Int32 nOld2 = nIndex;
1409  rPattern.iterateCodePoints( &nIndex);
1410  for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
1411  aPatternBuf[nOld+i] = rPattern[nOld2+i];
1412  aPatternBuf[nIndex-2] = buf[0];
1413  aPatternBuf[nIndex-1] = buf[1];
1414  }
1415  }
1416  else
1417  {
1418  // Trailing escape would become leading escape, do what?
1419  // Eliminate.
1420  aPatternBuf.remove( nOld, nIndex - nOld);
1421  }
1422  }
1423  }
1424  if (bUsePrimarySrchStr)
1425  maWildcardReversePattern = aPatternBuf.makeStringAndClear();
1426  else
1427  maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
1428  }
1429  const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
1430  nPatternLen = rReversePattern.getLength();
1431 
1432  bool bRewind = false;
1433  sal_uInt32 cPattern = 0;
1434  sal_Int32 nPattern = nPatternLen;
1435  sal_Int32 nAfterFakePattern = nPattern;
1437  {
1438  // Fake a trailing '*' wildcard.
1439  cPattern = '*';
1440  bRewind = true;
1441  // Assume a non-'*' pattern character follows. If it is a '*' instead
1442  // that will be handled in the loop by setting nPat.
1443  sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1444  if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
1445  rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
1446  }
1447 
1448  sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
1449  sal_uInt32 cPatternAfterAsterisk = 0;
1450  bool bEscaped = false, bEscapedAfterAsterisk = false;
1451 
1452  // The loop code tries to avoid costly calls to iterateCodePoints() when
1453  // possible.
1454 
1455  do
1456  {
1457  if (bRewind)
1458  {
1459  // Reuse cPattern after '*', nPattern was correspondingly
1460  // decremented to point before cPattern.
1461  bRewind = false;
1462  }
1463  else if (nPattern > 0)
1464  {
1465  // nPattern will be decremented by iterateCodePoints().
1466  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1467  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1468  {
1469  bEscaped = true;
1470  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1471  }
1472  }
1473  else
1474  {
1475  // A trailing '*' is handled below.
1477  {
1478  // If the pattern is consumed and substring match allowed we're good.
1479  setWildcardMatch( aRes, nStartOffset, nString);
1480  return aRes;
1481  }
1482  else if (nString > nEndPos && nLastAsterisk >= 0)
1483  {
1484  // If substring match is not allowed try a greedy '*' match.
1485  nPattern = nLastAsterisk;
1486  continue; // do
1487  }
1488  else
1489  return aRes;
1490  }
1491 
1492  if (cPattern == '*' && !bEscaped)
1493  {
1494  // '*' is one code unit, so not using iterateCodePoints() is ok.
1495  while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1496  --nPattern;
1497 
1498  if (nPattern <= 0)
1499  {
1500  // First pattern is '*', remaining string matches.
1501  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1502  return aRes;
1503  }
1504 
1505  nLastAsterisk = nPattern; // Remember last encountered '*'.
1506 
1507  // cPattern will be the previous non-'*' character, nPattern
1508  // decremented.
1509  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1510  if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
1511  {
1512  bEscaped = true;
1513  cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
1514  }
1515 
1516  cPatternAfterAsterisk = cPattern;
1517  bEscapedAfterAsterisk = bEscaped;
1518  nPat = nPattern; // Remember position of pattern before '*', already decremented.
1519  nStr = nString; // Remember the current string to be matched.
1520  }
1521 
1522  if (nString <= nEndPos)
1523  // Whatever leads in pattern, string will not match.
1524  return aRes;
1525 
1526  // nString will be decremented by iterateCodePoints().
1527  sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
1528 
1529  if ((cPattern != '?' || bEscaped) && cPattern != cString)
1530  {
1531  if (nPat == -1)
1532  // Non-match already without any '*' pattern.
1533  return aRes;
1534 
1535  bRewind = true;
1536  nPattern = nPat; // Rewind pattern to character before '*', already decremented.
1537  cPattern = cPatternAfterAsterisk;
1538  bEscaped = bEscapedAfterAsterisk;
1539  searchStr.iterateCodePoints( &nStr, -1);
1540  nString = nStr; // Restore decremented remembered string position.
1541  if (nPat == nAfterFakePattern)
1542  {
1543  // Next start offset will be this character (exclusive).
1544  nStartOffset = nString;
1545  }
1546  }
1547  else
1548  {
1549  // An unescaped '?' pattern matched any character, or characters
1550  // matched. Reset only escaped state.
1551  bEscaped = false;
1552  }
1553  }
1554  while (nString > nEndPos);
1555 
1556  if (bRewind)
1557  return aRes;
1558 
1559  // Eat leading '*' pattern that matches anything, including nothing.
1560  // '*' is one code unit, so not using iterateCodePoints() is ok.
1561  while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
1562  --nPattern;
1563 
1564  if (nPattern == 0)
1565  setWildcardMatch( aRes, nStartOffset, nEndOffset);
1566  return aRes;
1567 }
1568 
1569 
1570 OUString SAL_CALL
1572 {
1573  return "com.sun.star.util.TextSearch_i18n";
1574 }
1575 
1576 sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
1577 {
1578  return cppu::supportsService(this, rServiceName);
1579 }
1580 
1581 Sequence< OUString > SAL_CALL
1583 {
1584  return { "com.sun.star.util.TextSearch", "com.sun.star.util.TextSearch2" };
1585 }
1586 
1587 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
1589  css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
1590 {
1591  return cppu::acquire(new TextSearch(context));
1592 }
1593 
1594 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
std::unique_ptr< icu::RegexMatcher > pRegexMatcher
Definition: textsearch.hxx:94
sal_Int32 nIndex
bool bUsePrimarySrchStr
Definition: textsearch.hxx:78
virtual void SAL_CALL setOptions2(const css::util::SearchOptions2 &options) override
Definition: textsearch.cxx:117
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * i18npool_TextSearch_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
css::util::SearchOptions2 aSrchPara
Definition: textsearch.hxx:54
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit
Definition: textsearch.hxx:60
osl::Mutex m_aMutex
Definition: textsearch.hxx:51
sal_Int64 n
OUString sSrchStr2
Definition: textsearch.hxx:56
bool checkCTLEnd
Definition: textsearch.hxx:134
sal_uInt32 mcWildcardEscapeChar
Definition: textsearch.hxx:121
void MakeForwardTab()
Definition: textsearch.cxx:598
const TransliterationFlags COMPLEX_TRANS_MASK
Definition: textsearch.cxx:45
css::util::SearchResult SAL_CALL RESrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
virtual void SAL_CALL setOptions(const css::util::SearchOptions &options) override
Definition: textsearch.cxx:265
void MakeBackwardTab2()
Definition: textsearch.cxx:672
sal_Int32 GetDiff(const sal_Unicode) const
Definition: textsearch.cxx:695
void MakeBackwardTab()
Definition: textsearch.cxx:649
OUString maWildcardReversePattern2
Definition: textsearch.hxx:120
static sal_Int32 FindPosInSeq_Impl(const Sequence< sal_Int32 > &rOff, sal_Int32 nPos)
Definition: textsearch.cxx:303
sal_uInt16 sal_Unicode
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
virtual ~TextSearch() override
Definition: textsearch.cxx:109
bool checkCTLStart
Definition: textsearch.hxx:134
bool bSearchApostrophe
Definition: textsearch.hxx:72
TransliterationFlags
css::util::SearchResult SAL_CALL WildcardSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
css::util::SearchResult SAL_CALL ApproxSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
TextSearch(const css::uno::Reference< css::uno::XComponentContext > &rxContext)
virtual css::util::SearchResult SAL_CALL searchBackward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:456
void RESrchPrepare(const css::util::SearchOptions2 &)
Definition: textsearch.cxx:860
#define DIGIT
bool IsDelimiter(const OUString &rStr, sal_Int32 nPos) const
Definition: textsearch.cxx:579
css::uno::Reference< css::i18n::XCharacterClassification > xCharClass
Definition: textsearch.hxx:58
int i
css::util::SearchResult SAL_CALL NSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:783
static bool lcl_findRegex(std::unique_ptr< icu::RegexMatcher > const &pRegexMatcher, sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode &rIcuErr)
Definition: textsearch.cxx:928
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual css::util::SearchResult SAL_CALL searchForward(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos) override
Definition: textsearch.cxx:317
FnSrch fnBackward
Definition: textsearch.hxx:69
float u
unsigned char sal_Bool
css::util::SearchResult SAL_CALL ApproxSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
bool bIsForwardTab
Definition: textsearch.hxx:77
OUString sSrchStr
Definition: textsearch.hxx:55
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:133
OUString maWildcardReversePattern
Definition: textsearch.hxx:119
U_ICU_NAMESPACE::UnicodeString IcuUniString
Definition: textsearch.hxx:40
float v
css::uno::Reference< css::i18n::XBreakIterator > xBreak
Definition: textsearch.hxx:108
css::util::SearchResult SAL_CALL RESrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:949
sal_Int32 startPos
Definition: textsearch.hxx:66
bool mbWildcardAllowSubstring
Definition: textsearch.hxx:122
bool isCellStart(const OUString &searchStr, sal_Int32 nPos)
Definition: textsearch.cxx:310
css::util::SearchResult SAL_CALL NSrchFrwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)
Definition: textsearch.cxx:715
std::unique_ptr< TextSearchJumpTable > pJumpTable2
Definition: textsearch.hxx:76
#define SAL_INFO(area, stream)
std::unique_ptr< WLevDistance > pWLD
Definition: textsearch.hxx:107
std::unique_ptr< TextSearchJumpTable > pJumpTable
Definition: textsearch.hxx:75
FnSrch fnForward
Definition: textsearch.hxx:68
virtual OUString SAL_CALL getImplementationName() override
void MakeForwardTab2()
Definition: textsearch.cxx:624
sal_Int32 sal_Int32 endPos
Definition: textsearch.hxx:66
#define SAL_WARN(area, stream)
Reference< XComponentContext > m_xContext
tuple m
css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit2
Definition: textsearch.hxx:61
::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable
Definition: textsearch.hxx:42
css::uno::Reference< css::uno::XComponentContext > m_xContext
Definition: textsearch.hxx:52
css::util::SearchResult SAL_CALL WildcardSrchBkwrd(const OUString &searchStr, sal_Int32 startPos, sal_Int32 endPos)