LibreOffice Module i18npool (master)  1
breakiteratorImpl.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 #include <config_locales.h>
20 
21 #include <breakiteratorImpl.hxx>
23 #include <unicode/uchar.h>
24 #include <i18nutil/unicode.hxx>
25 
26 #include <com/sun/star/i18n/CharType.hpp>
27 #include <com/sun/star/i18n/ScriptType.hpp>
28 #include <com/sun/star/i18n/WordType.hpp>
29 #include <com/sun/star/uno/XComponentContext.hpp>
30 
31 using namespace ::com::sun::star;
32 using namespace ::com::sun::star::uno;
33 using namespace ::com::sun::star::i18n;
34 using namespace ::com::sun::star::lang;
35 
36 namespace i18npool {
37 
39 {
40 }
41 
43 {
44 }
45 
47 {
48 }
49 
// Shorthand for the locale-specific break iterator. The expansion refers to a variable
// literally named rLocale, so it can only be used where such a variable is in scope.
50 #define LBI getLocaleSpecificBreakIterator(rLocale)
51 
52 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
53  const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
54 {
55  if (nCount < 0)
56  throw RuntimeException("BreakIteratorImpl::nextCharacters: expected nCount >=0, got "
57  + OUString::number(nCount));
58 
59  return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
60 }
61 
62 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
63  const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
64 {
65  if (nCount < 0)
66  throw RuntimeException("BreakIteratorImpl::previousCharacters: expected nCount >=0, got "
67  + OUString::number(nCount));
68 
69  return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
70 }
71 
// True when the given code point is U+200B ZERO WIDTH SPACE.
// The argument is parenthesised so that any expression can be passed safely.
#define isZWSP(c) ((c) == 0x200B)
73 
74 static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
75 {
76  sal_uInt32 ch=0;
77  sal_Int32 pos=nPos;
78  switch (rWordType) {
79  case WordType::ANYWORD_IGNOREWHITESPACES:
80  if (bDirection)
81  while (nPos < len)
82  {
83  ch = Text.iterateCodePoints(&pos);
84  if (!u_isWhitespace(ch) && !isZWSP(ch))
85  break;
86  nPos = pos;
87  }
88  else
89  while (nPos > 0)
90  {
91  ch = Text.iterateCodePoints(&pos, -1);
92  if (!u_isWhitespace(ch) && !isZWSP(ch))
93  break;
94  nPos = pos;
95  }
96  break;
97  case WordType::DICTIONARY_WORD:
98  if (bDirection)
99  while (nPos < len)
100  {
101  ch = Text.iterateCodePoints(&pos);
102  if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
103  break;
104  nPos = pos;
105  }
106  else
107  while (nPos > 0)
108  {
109  ch = Text.iterateCodePoints(&pos, -1);
110  if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
111  break;
112  nPos = pos;
113  }
114  break;
115  case WordType::WORD_COUNT:
116  if (bDirection)
117  while (nPos < len)
118  {
119  ch = Text.iterateCodePoints(&pos);
120  if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
121  break;
122  nPos = pos;
123  }
124  else
125  while (nPos > 0)
126  {
127  ch = Text.iterateCodePoints(&pos, -1);
128  if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
129  break;
130  nPos = pos;
131  }
132  break;
133  }
134  return nPos;
135 }
136 
137 Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
138  const Locale& rLocale, sal_Int16 rWordType )
139 {
140  sal_Int32 len = Text.getLength();
141  if( nStartPos < 0 || len == 0 )
142  result.endPos = result.startPos = 0;
143  else if (nStartPos >= len)
144  result.endPos = result.startPos = len;
145  else {
146  result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
147 
148  nStartPos = skipSpace(Text, result.startPos, len, rWordType, true);
149 
150  if ( nStartPos != result.startPos) {
151  if( nStartPos >= len )
152  result.startPos = result.endPos = len;
153  else {
154  result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, true);
155  // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
156  if (result.startPos < nStartPos) result.startPos = nStartPos;
157  }
158  }
159  }
160  return result;
161 }
162 
163 static bool isCJK( const Locale& rLocale ) {
164  return rLocale.Language == "zh" || rLocale.Language == "ja" || rLocale.Language == "ko";
165 }
166 
167 Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
168  const Locale& rLocale, sal_Int16 rWordType)
169 {
170  sal_Int32 len = Text.getLength();
171  if( nStartPos <= 0 || len == 0 ) {
172  result.endPos = result.startPos = 0;
173  return result;
174  } else if (nStartPos > len) {
175  result.endPos = result.startPos = len;
176  return result;
177  }
178 
179  sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, false);
180 
181  // if some spaces are skipped, and the script type is Asian with no CJK rLocale, we have to return
182  // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
183  result.startPos = nPos;
184  if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
185  result.endPos = -1;
186  return result;
187  }
188 
189  return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
190 }
191 
192 
193 Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
194  sal_Int16 rWordType, sal_Bool bDirection )
195 {
196  sal_Int32 len = Text.getLength();
197  if( nPos < 0 || len == 0 )
198  result.endPos = result.startPos = 0;
199  else if (nPos > len)
200  result.endPos = result.startPos = len;
201  else {
202  sal_Int32 next, prev;
203  next = skipSpace(Text, nPos, len, rWordType, true);
204  prev = skipSpace(Text, nPos, len, rWordType, false);
205  if (prev == 0 && next == len) {
206  result.endPos = result.startPos = nPos;
207  } else if (prev == 0 && ! bDirection) {
208  result.endPos = result.startPos = 0;
209  } else if (next == len && bDirection) {
210  result.endPos = result.startPos = len;
211  } else {
212  if (next != prev) {
213  if (next == nPos && next != len)
214  bDirection = true;
215  else if (prev == nPos && prev != 0)
216  bDirection = false;
217  else
218  nPos = bDirection ? next : prev;
219  }
220  result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
221  }
222  }
223  return result;
224 }
225 
226 sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
227  const Locale& rLocale, sal_Int16 rWordType )
228 {
229  sal_Int32 len = Text.getLength();
230 
231  if (nPos < 0 || nPos >= len) return false;
232 
233  sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, true);
234 
235  if (tmp != nPos) return false;
236 
237  result = getWordBoundary(Text, nPos, rLocale, rWordType, true);
238 
239  return result.startPos == nPos;
240 }
241 
242 sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
243  const Locale& rLocale, sal_Int16 rWordType )
244 {
245  sal_Int32 len = Text.getLength();
246 
247  if (nPos <= 0 || nPos > len) return false;
248 
249  sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, false);
250 
251  if (tmp != nPos) return false;
252 
253  result = getWordBoundary(Text, nPos, rLocale, rWordType, false);
254 
255  return result.endPos == nPos;
256 }
257 
258 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
259  const Locale &rLocale )
260 {
261  if (nStartPos < 0 || nStartPos > Text.getLength())
262  return -1;
263  if (Text.isEmpty()) return 0;
264  return LBI->beginOfSentence(Text, nStartPos, rLocale);
265 }
266 
267 sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
268  const Locale &rLocale )
269 {
270  if (nStartPos < 0 || nStartPos > Text.getLength())
271  return -1;
272  if (Text.isEmpty()) return 0;
273  return LBI->endOfSentence(Text, nStartPos, rLocale);
274 }
275 
276 LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
277  const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
278  const LineBreakUserOptions& bOptions )
279 {
280  return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
281 }
282 
283 sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
284 {
285  return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
286  getScriptClass(Text.iterateCodePoints(&nPos, 0));
287 }
288 
289 
293 static sal_Int32 iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
294  sal_Int32 nLen = Text.getLength();
295  if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
296  ch = 0;
297  nStartPos = nStartPos + inc < 0 ? -1 : nLen;
298  } else {
299  ch = Text.iterateCodePoints(&nStartPos, inc);
300  // Fix for #i80436#.
301  // erAck: 2009-06-30T21:52+0200 This logic looks somewhat
302  // suspicious as if it cures a symptom... anyway, had to add
303  // nStartPos < Text.getLength() to silence the (correct) assertion
304  // in rtl_uString_iterateCodePoints() if Text was one character
305  // (codepoint) only, made up of a surrogate pair.
306  //if (inc > 0 && nStartPos < Text.getLength())
307  // ch = Text.iterateCodePoints(&nStartPos, 0);
308  // With surrogates, nStartPos may actually point behind string
309  // now, even if inc is only +1
310  if (inc > 0)
311  ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
312  }
313  return nStartPos;
314 }
315 
316 
317 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
318  sal_Int32 nStartPos, sal_Int16 ScriptType )
319 {
320  if (nStartPos < 0 || nStartPos >= Text.getLength())
321  return -1;
322 
323  if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
324  return -1;
325 
326  if (nStartPos == 0) return 0;
327  sal_uInt32 ch=0;
328  while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
329  if (nStartPos == 0) return 0;
330  }
331 
332  return iterateCodePoints(Text, nStartPos, 1, ch);
333 }
334 
335 sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
336  sal_Int32 nStartPos, sal_Int16 ScriptType )
337 {
338  if (nStartPos < 0 || nStartPos >= Text.getLength())
339  return -1;
340 
341  if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
342  return -1;
343 
344  sal_Int32 strLen = Text.getLength();
345  sal_uInt32 ch=0;
346  while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
347  sal_Int16 currentCharScriptType = getScriptClass(ch);
348  if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
349  break;
350  }
351  return nStartPos;
352 }
353 
354 sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
355  sal_Int32 nStartPos, sal_Int16 ScriptType )
356 {
357  if (nStartPos < 0)
358  return -1;
359  if (nStartPos > Text.getLength())
360  nStartPos = Text.getLength();
361 
362  sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
363 
364  sal_uInt32 ch=0;
365  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
366  if (((numberOfChange % 2) == 0) != (ScriptType != getScriptClass(ch)))
367  numberOfChange--;
368  else if (nStartPos == 0) {
369  return -1;
370  }
371  }
372  return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
373 }
374 
375 sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
376  sal_Int16 ScriptType )
377 
378 {
379  if (nStartPos < 0)
380  nStartPos = 0;
381  sal_Int32 strLen = Text.getLength();
382  if (nStartPos >= strLen)
383  return -1;
384 
385  sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
386 
387  sal_uInt32 ch=0;
388  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
389  sal_Int16 currentCharScriptType = getScriptClass(ch);
390  if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
391  (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
392  numberOfChange--;
393  }
394  return numberOfChange == 0 ? nStartPos : -1;
395 }
396 
397 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
398  const Locale& /*rLocale*/, sal_Int16 CharType )
399 {
400  if (CharType == CharType::ANY_CHAR) return 0;
401  if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
402  if (CharType != static_cast<sal_Int16>(u_charType( Text.iterateCodePoints(&nStartPos, 0)))) return -1;
403 
404  sal_Int32 nPos=nStartPos;
405  while(nStartPos > 0 && CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nPos, -1)))) { nStartPos=nPos; }
406  return nStartPos; // begin of char block is inclusive
407 }
408 
409 sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
410  const Locale& /*rLocale*/, sal_Int16 CharType )
411 {
412  sal_Int32 strLen = Text.getLength();
413 
414  if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
415  if (nStartPos < 0 || nStartPos >= strLen) return -1;
416  if (CharType != static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) return -1;
417 
418  sal_uInt32 ch=0;
419  while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == static_cast<sal_Int16>(u_charType(ch))) {}
420  return nStartPos; // end of char block is exclusive
421 }
422 
423 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
424  const Locale& /*rLocale*/, sal_Int16 CharType )
425 {
426  if (CharType == CharType::ANY_CHAR) return -1;
427  if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
428 
429  sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 2 : 1;
430  sal_Int32 strLen = Text.getLength();
431 
432  sal_uInt32 ch=0;
433  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
434  if ((CharType != static_cast<sal_Int16>(u_charType(ch))) != (numberOfChange == 1))
435  numberOfChange--;
436  }
437  return numberOfChange == 0 ? nStartPos : -1;
438 }
439 
440 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
441  const Locale& /*rLocale*/, sal_Int16 CharType )
442 {
443  if(CharType == CharType::ANY_CHAR) return -1;
444  if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
445 
446  sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 3 : 2;
447 
448  sal_uInt32 ch=0;
449  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
450  if (((numberOfChange % 2) == 0) != (CharType != static_cast<sal_Int16>(u_charType(ch))))
451  numberOfChange--;
452  if (nStartPos == 0 && numberOfChange > 0) {
453  numberOfChange--;
454  if (numberOfChange == 0) return nStartPos;
455  }
456  }
457  return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
458 }
459 
460 
// Word-type query. This implementation ignores all three arguments (their names
// are commented out) and always returns 0.
461 sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
462  sal_Int32 /*nPos*/, const Locale& /*rLocale*/ )
463 {
464  return 0;
465 }
466 
467 namespace
468 {
469 sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
470 {
471  int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
472  return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
473 }
474 
// One table row: an inclusive range of ICU Unicode block codes [from, to] and the
// ScriptType value assigned to every block in that range.
475 struct UBlock2Script
476 {
477  UBlockCode from;
478  UBlockCode to;
479  sal_Int16 script;
480 };
481 
// Block-range table used by getCompatibilityScriptClassByBlock. That lookup stops
// at the first row whose `to` is not below the queried block code, so it only
// works while the rows stay sorted in ascending block-code order.
482 const UBlock2Script scriptList[] =
483 {
484  {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
485  {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN},
486  {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN},
487  {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
488  {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
489  {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
490  {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
491  {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
492  {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
493  {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
494  {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
495  {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
496  {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
497  {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
498  {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
499  {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
500  {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
501  {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
502  {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
503  {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
504 };
505 
// Number of rows in scriptList.
506 #define scriptListCount SAL_N_ELEMENTS(scriptList)
507 
508 //always sets rScriptType
509 
510 //returns true for characters historically explicitly assigned to
511 //latin/weak/asian
512 
513 //returns false for characters that historically implicitly assigned to
514 //weak as unknown
515 bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
516 {
517  bool bKnown = true;
518  //handle specific characters always as weak:
519  // 0x01 - this breaks a word
520  // 0x02 - this can be inside a word
521  // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
522  if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
523  rScriptType = ScriptType::WEAK;
524  // Few Spacing Modifier Letters that can be Bopomofo tonal marks.
525  else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar || 0x2D9 == currentChar )
526  rScriptType = ScriptType::WEAK;
527  // workaround for Coptic
528  else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
529  rScriptType = ScriptType::LATIN;
530  else
531  {
532  UBlockCode block=ublock_getCode(currentChar);
533  size_t i = 0;
534  while (i < scriptListCount)
535  {
536  if (block <= scriptList[i].to)
537  break;
538  ++i;
539  }
540  if (i < scriptListCount && block >= scriptList[i].from)
541  rScriptType = scriptList[i].script;
542  else
543  {
544  rScriptType = ScriptType::WEAK;
545  bKnown = false;
546  }
547  }
548  return bKnown;
549 }
550 }
551 
552 sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
553 {
554  static sal_uInt32 lastChar = 0;
555  static sal_Int16 nRet = ScriptType::WEAK;
556 
557  if (currentChar != lastChar)
558  {
559  lastChar = currentChar;
560 
561  if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
562  nRet = getScriptClassByUAX24Script(currentChar);
563  }
564 
565  return nRet;
566 }
567 
569 {
570  // to share service between same Language but different Country code, like zh_CN and zh_TW
571  for (const lookupTableItem& listItem : lookupTable) {
572  if (aLocaleName == listItem.aLocale.Language) {
573  xBI = listItem.xBI;
574  return true;
575  }
576  }
577 
578 #if !WITH_LOCALE_ALL && !WITH_LOCALE_ja
579  if (aLocaleName == "ja")
580  return false;
581 #endif
582 #if !WITH_LOCALE_ALL && !WITH_LOCALE_zh
583  if (aLocaleName == "zh" || aLocaleName == "zh_TW")
584  return false;
585 #endif
586 #if !WITH_LOCALE_ALL && !WITH_LOCALE_ko
587  if (aLocaleName == "ko")
588  return false;
589 #endif
590 #if !WITH_LOCALE_ALL && !WITH_LOCALE_th
591  if (aLocaleName == "th")
592  return false;
593 #endif
594 
595  Reference < uno::XInterface > xI = m_xContext->getServiceManager()->createInstanceWithContext(
596  "com.sun.star.i18n.BreakIterator_" + aLocaleName, m_xContext);
597 
598  if ( xI.is() ) {
599  xBI.set(xI, UNO_QUERY);
600  if (xBI.is()) {
601  lookupTable.emplace_back(Locale(aLocaleName, aLocaleName, aLocaleName), xBI);
602  return true;
603  }
604  }
605  return false;
606 }
607 
608 const Reference < XBreakIterator > &
610 {
611  if (xBI.is() && rLocale == aLocale)
612  return xBI;
613  else if (m_xContext.is()) {
614  aLocale = rLocale;
615 
616  for (const lookupTableItem& listItem : lookupTable) {
617  if (rLocale == listItem.aLocale)
618  {
619  xBI = listItem.xBI;
620  return xBI;
621  }
622  }
623 
624  OUStringLiteral under(u"_");
625 
626  sal_Int32 l = rLocale.Language.getLength();
627  sal_Int32 c = rLocale.Country.getLength();
628  sal_Int32 v = rLocale.Variant.getLength();
629 
630  if ((l > 0 && c > 0 && v > 0 &&
631  // load service with name <base>_<lang>_<country>_<variant>
632  createLocaleSpecificBreakIterator(rLocale.Language + under +
633  rLocale.Country + under + rLocale.Variant)) ||
634  (l > 0 && c > 0 &&
635  // load service with name <base>_<lang>_<country>
636  createLocaleSpecificBreakIterator(rLocale.Language + under +
637  rLocale.Country)) ||
638  (l > 0 && c > 0 && rLocale.Language == "zh" &&
639  (rLocale.Country == "HK" ||
640  rLocale.Country == "MO" ) &&
641  // if the country code is HK or MO, one more step to try TW.
642  createLocaleSpecificBreakIterator(rLocale.Language + under +
643  "TW")) ||
644  (l > 0 &&
645  // load service with name <base>_<lang>
646  createLocaleSpecificBreakIterator(rLocale.Language)) ||
647  // load default service with name <base>_Unicode
649  lookupTable.emplace_back( aLocale, xBI );
650  return xBI;
651  }
652  }
653  throw RuntimeException("getLocaleSpecificBreakIterator: iterator not found");
654 }
655 
656 OUString SAL_CALL
658 {
659  return "com.sun.star.i18n.BreakIterator";
660 }
661 
// Service-support query: hands this object and the requested service name to
// cppu::supportsService and returns its answer unchanged.
662 sal_Bool SAL_CALL
663 BreakIteratorImpl::supportsService(const OUString& rServiceName)
664 {
665  return cppu::supportsService(this, rServiceName);
666 }
667 
668 Sequence< OUString > SAL_CALL
670 {
671  return { "com.sun.star.i18n.BreakIterator" };
672 }
673 
674 }
675 
676 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
678  css::uno::XComponentContext *context,
679  css::uno::Sequence<css::uno::Any> const &)
680 {
681  return cppu::acquire(new i18npool::BreakIteratorImpl(context));
682 }
683 
684 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
exports com.sun.star. script
static sal_Int16 getScriptClass(sal_uInt32 currentChar)
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
virtual sal_Int32 SAL_CALL previousCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
virtual sal_Int32 SAL_CALL nextCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
const css::uno::Reference< XBreakIterator > & getLocaleSpecificBreakIterator(const css::lang::Locale &rLocale)
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
virtual sal_Int32 SAL_CALL previousScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
static sal_Int32 skipSpace(const OUString &Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
std::vector< lookupTableItem > lookupTable
size_t pos
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
UBlockCode from
virtual sal_Int32 SAL_CALL endOfScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
virtual sal_Int16 SAL_CALL getWordType(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale) override
UBlockCode to
#define isZWSP(c)
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Int16 SAL_CALL getScriptType(const OUString &Text, sal_Int32 nPos) override
virtual sal_Int32 SAL_CALL beginOfCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
const UBlockScript scriptList[]
virtual sal_Int32 SAL_CALL endOfCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
int i
#define LBI
virtual sal_Int32 SAL_CALL beginOfScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
float u
unsigned char sal_Bool
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
Constant values shared between i18npool and, for example, the number formatter.
bool createLocaleSpecificBreakIterator(const OUString &aLocaleName)
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual sal_Bool SAL_CALL isEndWord(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual ~BreakIteratorImpl() override
float v
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
css::uno::Reference< XBreakIterator > xBI
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
css::uno::Reference< css::uno::XComponentContext > m_xContext
#define scriptListCount
static sal_Int32 iterateCodePoints(const OUString &Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32 &ch)
Increments/decrements position first, then obtains character.
virtual sal_Bool SAL_CALL isBeginWord(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
ScriptType
Reference< XComponentContext > m_xContext
virtual OUString SAL_CALL getImplementationName() override
static bool isCJK(const Locale &rLocale)
virtual sal_Int32 SAL_CALL nextScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
sal_uInt16 nPos