LibreOffice Module i18npool (master)  1
breakiteratorImpl.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <config_locales.h>
21 
22 #include <breakiteratorImpl.hxx>
24 #include <unicode/uchar.h>
25 #include <i18nutil/unicode.hxx>
26 
27 #include <com/sun/star/i18n/CharType.hpp>
28 #include <com/sun/star/i18n/ScriptType.hpp>
29 #include <com/sun/star/i18n/WordType.hpp>
30 #include <com/sun/star/uno/XComponentContext.hpp>
31 
32 using namespace ::com::sun::star;
33 using namespace ::com::sun::star::uno;
34 using namespace ::com::sun::star::i18n;
35 using namespace ::com::sun::star::lang;
36 
37 namespace i18npool {
38 
40 {
41 }
42 
44 {
45 }
46 
48 {
49 }
50 
// Shorthand used by the methods below: expands to a call of
// getLocaleSpecificBreakIterator() on whatever variable named `rLocale` is in
// scope at the point of use, so it only works inside functions that have one.
51 #define LBI getLocaleSpecificBreakIterator(rLocale)
52 
53 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
54  const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
55 {
56  if (nCount < 0)
57  throw RuntimeException("BreakIteratorImpl::nextCharacters: expected nCount >=0, got "
58  + OUString::number(nCount));
59 
60  return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
61 }
62 
63 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
64  const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
65 {
66  if (nCount < 0)
67  throw RuntimeException("BreakIteratorImpl::previousCharacters: expected nCount >=0, got "
68  + OUString::number(nCount));
69 
70  return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
71 }
72 
73 #define isZWSP(c) (ch == 0x200B)
74 
75 static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
76 {
77  sal_uInt32 ch=0;
78  sal_Int32 pos=nPos;
79  switch (rWordType) {
80  case WordType::ANYWORD_IGNOREWHITESPACES:
81  if (bDirection)
82  while (nPos < len)
83  {
84  ch = Text.iterateCodePoints(&pos);
85  if (!u_isWhitespace(ch) && !isZWSP(ch))
86  break;
87  nPos = pos;
88  }
89  else
90  while (nPos > 0)
91  {
92  ch = Text.iterateCodePoints(&pos, -1);
93  if (!u_isWhitespace(ch) && !isZWSP(ch))
94  break;
95  nPos = pos;
96  }
97  break;
98  case WordType::DICTIONARY_WORD:
99  if (bDirection)
100  while (nPos < len)
101  {
102  ch = Text.iterateCodePoints(&pos);
103  if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
104  break;
105  nPos = pos;
106  }
107  else
108  while (nPos > 0)
109  {
110  ch = Text.iterateCodePoints(&pos, -1);
111  if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
112  break;
113  nPos = pos;
114  }
115  break;
116  case WordType::WORD_COUNT:
117  if (bDirection)
118  while (nPos < len)
119  {
120  ch = Text.iterateCodePoints(&pos);
121  if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
122  break;
123  nPos = pos;
124  }
125  else
126  while (nPos > 0)
127  {
128  ch = Text.iterateCodePoints(&pos, -1);
129  if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
130  break;
131  nPos = pos;
132  }
133  break;
134  }
135  return nPos;
136 }
137 
138 Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
139  const Locale& rLocale, sal_Int16 rWordType )
140 {
141  sal_Int32 len = Text.getLength();
142  if( nStartPos < 0 || len == 0 )
143  result.endPos = result.startPos = 0;
144  else if (nStartPos >= len)
145  result.endPos = result.startPos = len;
146  else {
147  result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
148 
149  nStartPos = skipSpace(Text, result.startPos, len, rWordType, true);
150 
151  if ( nStartPos != result.startPos) {
152  if( nStartPos >= len )
153  result.startPos = result.endPos = len;
154  else {
155  result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, true);
156  // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
157  if (result.startPos < nStartPos) result.startPos = nStartPos;
158  }
159  }
160  }
161  return result;
162 }
163 
164 static bool isCJK( const Locale& rLocale ) {
165  return rLocale.Language == "zh" || rLocale.Language == "ja" || rLocale.Language == "ko";
166 }
167 
168 Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
169  const Locale& rLocale, sal_Int16 rWordType)
170 {
171  sal_Int32 len = Text.getLength();
172  if( nStartPos <= 0 || len == 0 ) {
173  result.endPos = result.startPos = 0;
174  return result;
175  } else if (nStartPos > len) {
176  result.endPos = result.startPos = len;
177  return result;
178  }
179 
180  sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, false);
181 
182  // if some spaces are skipped, and the script type is Asian with no CJK rLocale, we have to return
183  // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
184  result.startPos = nPos;
185  if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
186  result.endPos = -1;
187  return result;
188  }
189 
190  return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
191 }
192 
193 
194 Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
195  sal_Int16 rWordType, sal_Bool bDirection )
196 {
197  sal_Int32 len = Text.getLength();
198  if( nPos < 0 || len == 0 )
199  result.endPos = result.startPos = 0;
200  else if (nPos > len)
201  result.endPos = result.startPos = len;
202  else {
203  sal_Int32 next, prev;
204  next = skipSpace(Text, nPos, len, rWordType, true);
205  prev = skipSpace(Text, nPos, len, rWordType, false);
206  if (prev == 0 && next == len) {
207  result.endPos = result.startPos = nPos;
208  } else if (prev == 0 && ! bDirection) {
209  result.endPos = result.startPos = 0;
210  } else if (next == len && bDirection) {
211  result.endPos = result.startPos = len;
212  } else {
213  if (next != prev) {
214  if (next == nPos && next != len)
215  bDirection = true;
216  else if (prev == nPos && prev != 0)
217  bDirection = false;
218  else
219  nPos = bDirection ? next : prev;
220  }
221  result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
222  }
223  }
224  return result;
225 }
226 
227 sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
228  const Locale& rLocale, sal_Int16 rWordType )
229 {
230  sal_Int32 len = Text.getLength();
231 
232  if (nPos < 0 || nPos >= len) return false;
233 
234  sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, true);
235 
236  if (tmp != nPos) return false;
237 
238  result = getWordBoundary(Text, nPos, rLocale, rWordType, true);
239 
240  return result.startPos == nPos;
241 }
242 
243 sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
244  const Locale& rLocale, sal_Int16 rWordType )
245 {
246  sal_Int32 len = Text.getLength();
247 
248  if (nPos <= 0 || nPos > len) return false;
249 
250  sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, false);
251 
252  if (tmp != nPos) return false;
253 
254  result = getWordBoundary(Text, nPos, rLocale, rWordType, false);
255 
256  return result.endPos == nPos;
257 }
258 
259 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
260  const Locale &rLocale )
261 {
262  if (nStartPos < 0 || nStartPos > Text.getLength())
263  return -1;
264  if (Text.isEmpty()) return 0;
265  return LBI->beginOfSentence(Text, nStartPos, rLocale);
266 }
267 
268 sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
269  const Locale &rLocale )
270 {
271  if (nStartPos < 0 || nStartPos > Text.getLength())
272  return -1;
273  if (Text.isEmpty()) return 0;
274  return LBI->endOfSentence(Text, nStartPos, rLocale);
275 }
276 
277 LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
278  const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
279  const LineBreakUserOptions& bOptions )
280 {
281  return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
282 }
283 
284 sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
285 {
286  return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
287  getScriptClass(Text.iterateCodePoints(&nPos, 0));
288 }
289 
290 
294 static sal_Int32 iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
295  sal_Int32 nLen = Text.getLength();
296  if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
297  ch = 0;
298  nStartPos = nStartPos + inc < 0 ? -1 : nLen;
299  } else {
300  ch = Text.iterateCodePoints(&nStartPos, inc);
301  // Fix for #i80436#.
302  // erAck: 2009-06-30T21:52+0200 This logic looks somewhat
303  // suspicious as if it cures a symptom... anyway, had to add
304  // nStartPos < Text.getLength() to silence the (correct) assertion
305  // in rtl_uString_iterateCodePoints() if Text was one character
306  // (codepoint) only, made up of a surrogate pair.
307  //if (inc > 0 && nStartPos < Text.getLength())
308  // ch = Text.iterateCodePoints(&nStartPos, 0);
309  // With surrogates, nStartPos may actually point behind string
310  // now, even if inc is only +1
311  if (inc > 0)
312  ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
313  }
314  return nStartPos;
315 }
316 
317 
318 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
319  sal_Int32 nStartPos, sal_Int16 ScriptType )
320 {
321  if (nStartPos < 0 || nStartPos >= Text.getLength())
322  return -1;
323 
324  if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
325  return -1;
326 
327  if (nStartPos == 0) return 0;
328  sal_uInt32 ch=0;
329  while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
330  if (nStartPos == 0) return 0;
331  }
332 
333  return iterateCodePoints(Text, nStartPos, 1, ch);
334 }
335 
336 sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
337  sal_Int32 nStartPos, sal_Int16 ScriptType )
338 {
339  if (nStartPos < 0 || nStartPos >= Text.getLength())
340  return -1;
341 
342  if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
343  return -1;
344 
345  sal_Int32 strLen = Text.getLength();
346  sal_uInt32 ch=0;
347  while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
348  sal_Int16 currentCharScriptType = getScriptClass(ch);
349  if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
350  break;
351  }
352  return nStartPos;
353 }
354 
355 sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
356  sal_Int32 nStartPos, sal_Int16 ScriptType )
357 {
358  if (nStartPos < 0)
359  return -1;
360  if (nStartPos > Text.getLength())
361  nStartPos = Text.getLength();
362 
363  sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
364 
365  sal_uInt32 ch=0;
366  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
367  if (((numberOfChange % 2) == 0) != (ScriptType != getScriptClass(ch)))
368  numberOfChange--;
369  else if (nStartPos == 0) {
370  return -1;
371  }
372  }
373  return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
374 }
375 
376 sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
377  sal_Int16 ScriptType )
378 
379 {
380  if (nStartPos < 0)
381  nStartPos = 0;
382  sal_Int32 strLen = Text.getLength();
383  if (nStartPos >= strLen)
384  return -1;
385 
386  sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
387 
388  sal_uInt32 ch=0;
389  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
390  sal_Int16 currentCharScriptType = getScriptClass(ch);
391  if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
392  (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
393  numberOfChange--;
394  }
395  return numberOfChange == 0 ? nStartPos : -1;
396 }
397 
398 sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
399  const Locale& /*rLocale*/, sal_Int16 CharType )
400 {
401  if (CharType == CharType::ANY_CHAR) return 0;
402  if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
403  if (CharType != static_cast<sal_Int16>(u_charType( Text.iterateCodePoints(&nStartPos, 0)))) return -1;
404 
405  sal_Int32 nPos=nStartPos;
406  while(nStartPos > 0 && CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nPos, -1)))) { nStartPos=nPos; }
407  return nStartPos; // begin of char block is inclusive
408 }
409 
410 sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
411  const Locale& /*rLocale*/, sal_Int16 CharType )
412 {
413  sal_Int32 strLen = Text.getLength();
414 
415  if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
416  if (nStartPos < 0 || nStartPos >= strLen) return -1;
417  if (CharType != static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) return -1;
418 
419  sal_uInt32 ch=0;
420  while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == static_cast<sal_Int16>(u_charType(ch))) {}
421  return nStartPos; // end of char block is exclusive
422 }
423 
424 sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
425  const Locale& /*rLocale*/, sal_Int16 CharType )
426 {
427  if (CharType == CharType::ANY_CHAR) return -1;
428  if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
429 
430  sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 2 : 1;
431  sal_Int32 strLen = Text.getLength();
432 
433  sal_uInt32 ch=0;
434  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
435  if ((CharType != static_cast<sal_Int16>(u_charType(ch))) != (numberOfChange == 1))
436  numberOfChange--;
437  }
438  return numberOfChange == 0 ? nStartPos : -1;
439 }
440 
441 sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
442  const Locale& /*rLocale*/, sal_Int16 CharType )
443 {
444  if(CharType == CharType::ANY_CHAR) return -1;
445  if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
446 
447  sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 3 : 2;
448 
449  sal_uInt32 ch=0;
450  while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
451  if (((numberOfChange % 2) == 0) != (CharType != static_cast<sal_Int16>(u_charType(ch))))
452  numberOfChange--;
453  if (nStartPos == 0 && numberOfChange > 0) {
454  numberOfChange--;
455  if (numberOfChange == 0) return nStartPos;
456  }
457  }
458  return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
459 }
460 
461 
462 sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
463  sal_Int32 /*nPos*/, const Locale& /*rLocale*/ )
464 {
465  return 0;
466 }
467 
468 namespace
469 {
470 sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
471 {
472  int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
473  return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
474 }
475 
476 struct UBlock2Script
477 {
478  UBlockCode from;
479  UBlockCode to;
480  sal_Int16 script;
481 };
482 
483 const UBlock2Script scriptList[] =
484 {
485  {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
486  {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN},
487  {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN},
488  {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
489  {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
490  {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
491  {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
492  {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
493  {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
494  {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
495  {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
496  {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
497  {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
498  {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
499  {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
500  {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
501  {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
502  {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
503  {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
504  {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
505 };
506 
507 #define scriptListCount SAL_N_ELEMENTS(scriptList)
508 
509 //always sets rScriptType
510 
511 //returns true for characters historically explicitly assigned to
512 //latin/weak/asian
513 
514 //returns false for characters that historically implicitly assigned to
515 //weak as unknown
516 bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
517 {
518  bool bKnown = true;
519  //handle specific characters always as weak:
520  // 0x01 - this breaks a word
521  // 0x02 - this can be inside a word
522  // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
523  if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
524  rScriptType = ScriptType::WEAK;
525  // Few Spacing Modifier Letters that can be Bopomofo tonal marks.
526  else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar || 0x2D9 == currentChar )
527  rScriptType = ScriptType::WEAK;
528  // workaround for Coptic
529  else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
530  rScriptType = ScriptType::LATIN;
531  else
532  {
533  UBlockCode block=ublock_getCode(currentChar);
534  size_t i = 0;
535  while (i < scriptListCount)
536  {
537  if (block <= scriptList[i].to)
538  break;
539  ++i;
540  }
541  if (i < scriptListCount && block >= scriptList[i].from)
542  rScriptType = scriptList[i].script;
543  else
544  {
545  rScriptType = ScriptType::WEAK;
546  bKnown = false;
547  }
548  }
549  return bKnown;
550 }
551 }
552 
553 sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
554 {
555  static sal_uInt32 lastChar = 0;
556  static sal_Int16 nRet = ScriptType::WEAK;
557 
558  if (currentChar != lastChar)
559  {
560  lastChar = currentChar;
561 
562  if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
563  nRet = getScriptClassByUAX24Script(currentChar);
564  }
565 
566  return nRet;
567 }
568 
// NOTE(review): the signature line of this function is missing from this
// listing; the body uses a string named aLocaleName and returns bool.
//
// Tries to make xBI refer to a break iterator for aLocaleName.
// Returns true when xBI was set (from lookupTable or by creating a service),
// false when no iterator could be obtained.
570 {
571  // to share service between same Language but different Country code, like zh_CN and zh_TW
572  for (const lookupTableItem& listItem : lookupTable) {
573  if (aLocaleName == listItem.aLocale.Language) {
574  xBI = listItem.xBI;
575  return true;
576  }
577  }
578 
 // Builds configured without a given locale (WITH_LOCALE_* from
 // config_locales.h) refuse that locale up front instead of asking the
 // service manager for it.
579 #if !WITH_LOCALE_ALL && !WITH_LOCALE_ja
580  if (aLocaleName == "ja")
581  return false;
582 #endif
583 #if !WITH_LOCALE_ALL && !WITH_LOCALE_zh
584  if (aLocaleName == "zh" || aLocaleName == "zh_TW")
585  return false;
586 #endif
587 #if !WITH_LOCALE_ALL && !WITH_LOCALE_ko
588  if (aLocaleName == "ko")
589  return false;
590 #endif
591 #if !WITH_LOCALE_ALL && !WITH_LOCALE_th
592  if (aLocaleName == "th")
593  return false;
594 #endif
595 
 // Ask the service manager for "com.sun.star.i18n.BreakIterator_<aLocaleName>".
596  Reference < uno::XInterface > xI = m_xContext->getServiceManager()->createInstanceWithContext(
597  "com.sun.star.i18n.BreakIterator_" + aLocaleName, m_xContext);
598 
599  if ( xI.is() ) {
600  xBI.set(xI, UNO_QUERY);
601  if (xBI.is()) {
 // Remember the iterator; aLocaleName is stored in all three Locale
 // fields, so the Language comparison at the top finds it again.
602  lookupTable.emplace_back(Locale(aLocaleName, aLocaleName, aLocaleName), xBI);
603  return true;
604  }
605  }
606  return false;
607 }
608 
// NOTE(review): the line carrying the function name and parameter list
// (listing line 610) and listing line 649 inside the fallback chain are
// missing from this listing; the body uses a locale named rLocale.
//
// Returns the break iterator to use for rLocale.
// The most recently used iterator (xBI for aLocale) is reused when the locale
// matches. Otherwise, if m_xContext is set, lookupTable is searched for an
// exact locale match and then service names are tried from most to least
// specific. Throws RuntimeException when no iterator could be obtained.
609 const Reference < XBreakIterator > &
611 {
612  if (xBI.is() && rLocale == aLocale)
613  return xBI;
614  else if (m_xContext.is()) {
615  aLocale = rLocale;
616 
617  for (const lookupTableItem& listItem : lookupTable) {
618  if (rLocale == listItem.aLocale)
619  {
620  xBI = listItem.xBI;
621  return xBI;
622  }
623  }
624 
625  OUStringLiteral under(u"_");
626 
 // Lengths of the locale parts; a part only takes part in a service name
 // when it is non-empty.
627  sal_Int32 l = rLocale.Language.getLength();
628  sal_Int32 c = rLocale.Country.getLength();
629  sal_Int32 v = rLocale.Variant.getLength();
630 
 // The || chain stops at the first createLocaleSpecificBreakIterator()
 // call that succeeds; that call has already stored the iterator in xBI.
631  if ((l > 0 && c > 0 && v > 0 &&
632  // load service with name <base>_<lang>_<country>_<variant>
633  createLocaleSpecificBreakIterator(rLocale.Language + under +
634  rLocale.Country + under + rLocale.Variant)) ||
635  (l > 0 && c > 0 &&
636  // load service with name <base>_<lang>_<country>
637  createLocaleSpecificBreakIterator(rLocale.Language + under +
638  rLocale.Country)) ||
639  (l > 0 && c > 0 && rLocale.Language == "zh" &&
640  (rLocale.Country == "HK" ||
641  rLocale.Country == "MO" ) &&
642  // if the country code is HK or MO, one more step to try TW.
643  createLocaleSpecificBreakIterator(rLocale.Language + under +
644  "TW")) ||
645  (l > 0 &&
646  // load service with name <base>_<lang>
647  createLocaleSpecificBreakIterator(rLocale.Language)) ||
648  // load default service with name <base>_Unicode
 // NOTE(review): the final alternative of the chain (listing line 649)
 // is absent here; presumably it creates the "Unicode" default iterator
 // and closes the condition - verify against the real file.
650  lookupTable.emplace_back( aLocale, xBI );
651  return xBI;
652  }
653  }
654  throw RuntimeException("getLocaleSpecificBreakIterator: iterator not found");
655 }
656 
// NOTE(review): the line with the function name (listing line 658) is missing
// from this listing; presumably this is BreakIteratorImpl::getImplementationName().
// Returns the fixed string "com.sun.star.i18n.BreakIterator".
657 OUString SAL_CALL
659 {
660  return "com.sun.star.i18n.BreakIterator";
661 }
662 
663 sal_Bool SAL_CALL
664 BreakIteratorImpl::supportsService(const OUString& rServiceName)
665 {
666  return cppu::supportsService(this, rServiceName);
667 }
668 
// NOTE(review): the line with the function name (listing line 670) is missing
// from this listing; presumably this is BreakIteratorImpl::getSupportedServiceNames().
// Returns a one-element sequence holding "com.sun.star.i18n.BreakIterator".
669 Sequence< OUString > SAL_CALL
671 {
672  return { "com.sun.star.i18n.BreakIterator" };
673 }
674 
675 }
676 
// NOTE(review): the line with the function name (listing line 678) is missing
// from this listing; presumably this is the exported component factory
// com_sun_star_i18n_BreakIterator_get_implementation().
// Creates a new i18npool::BreakIteratorImpl for the given component context,
// passes it through cppu::acquire() and returns the result; the argument
// sequence is unnamed and unused.
677 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
679  css::uno::XComponentContext *context,
680  css::uno::Sequence<css::uno::Any> const &)
681 {
682  return cppu::acquire(new i18npool::BreakIteratorImpl(context));
683 }
684 
685 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
exports com.sun.star. script
static sal_Int16 getScriptClass(sal_uInt32 currentChar)
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
virtual sal_Int32 SAL_CALL previousCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
virtual sal_Int32 SAL_CALL nextCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
const css::uno::Reference< XBreakIterator > & getLocaleSpecificBreakIterator(const css::lang::Locale &rLocale)
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
virtual sal_Int32 SAL_CALL previousScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
static sal_Int32 skipSpace(const OUString &Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
std::vector< lookupTableItem > lookupTable
size_t pos
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
UBlockCode from
virtual sal_Int32 SAL_CALL endOfScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
virtual sal_Int16 SAL_CALL getWordType(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale) override
UBlockCode to
#define isZWSP(c)
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Int16 SAL_CALL getScriptType(const OUString &Text, sal_Int32 nPos) override
virtual sal_Int32 SAL_CALL beginOfCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
const UBlockScript scriptList[]
virtual sal_Int32 SAL_CALL endOfCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
int i
#define LBI
virtual sal_Int32 SAL_CALL beginOfScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
float u
unsigned char sal_Bool
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
Constant values shared between i18npool and, for example, the number formatter.
bool createLocaleSpecificBreakIterator(const OUString &aLocaleName)
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual sal_Bool SAL_CALL isEndWord(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual ~BreakIteratorImpl() override
float v
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
css::uno::Reference< XBreakIterator > xBI
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
css::uno::Reference< css::uno::XComponentContext > m_xContext
#define scriptListCount
static sal_Int32 iterateCodePoints(const OUString &Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32 &ch)
Increments/decrements position first, then obtains character.
virtual sal_Bool SAL_CALL isBeginWord(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
ScriptType
Reference< XComponentContext > m_xContext
virtual OUString SAL_CALL getImplementationName() override
static bool isCJK(const Locale &rLocale)
virtual sal_Int32 SAL_CALL nextScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
sal_uInt16 nPos