LibreOffice Module i18npool (master) 1
breakiteratorImpl.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <config_locales.h>
21
22#include <breakiteratorImpl.hxx>
24#include <unicode/uchar.h>
25#include <i18nutil/unicode.hxx>
26#include <o3tl/string_view.hxx>
27
28#include <com/sun/star/i18n/CharType.hpp>
29#include <com/sun/star/i18n/ScriptType.hpp>
30#include <com/sun/star/i18n/WordType.hpp>
31#include <com/sun/star/uno/XComponentContext.hpp>
32
33using namespace ::com::sun::star;
34using namespace ::com::sun::star::uno;
35using namespace ::com::sun::star::i18n;
36using namespace ::com::sun::star::lang;
37
38namespace i18npool {
39
40BreakIteratorImpl::BreakIteratorImpl( const Reference < XComponentContext >& rxContext ) : m_xContext( rxContext )
41{
42}
43
44BreakIteratorImpl::BreakIteratorImpl()
45{
46}
47
48BreakIteratorImpl::~BreakIteratorImpl()
49{
50}
51
52#define LBI getLocaleSpecificBreakIterator(rLocale)
53
54sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
55 const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
56{
57 if (nCount < 0)
58 throw RuntimeException("BreakIteratorImpl::nextCharacters: expected nCount >=0, got "
59 + OUString::number(nCount));
60
61 return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
62}
63
64sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
65 const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
66{
67 if (nCount < 0)
68 throw RuntimeException("BreakIteratorImpl::previousCharacters: expected nCount >=0, got "
69 + OUString::number(nCount));
70
71 return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
72}
73
74#define isZWSP(c) (ch == 0x200B)
75
76static sal_Int32 skipSpace(std::u16string_view Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
77{
78 sal_uInt32 ch=0;
79 sal_Int32 pos=nPos;
80 switch (rWordType) {
81 case WordType::ANYWORD_IGNOREWHITESPACES:
82 if (bDirection)
83 while (nPos < len)
84 {
86 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
87 break;
88 nPos = pos;
89 }
90 else
91 while (nPos > 0)
92 {
94 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
95 break;
96 nPos = pos;
97 }
98 break;
99 case WordType::DICTIONARY_WORD:
100 if (bDirection)
101 while (nPos < len)
102 {
104 if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
105 break;
106 nPos = pos;
107 }
108 else
109 while (nPos > 0)
110 {
112 if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
113 break;
114 nPos = pos;
115 }
116 break;
117 case WordType::WORD_COUNT:
118 if (bDirection)
119 while (nPos < len)
120 {
122 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
123 break;
124 nPos = pos;
125 }
126 else
127 while (nPos > 0)
128 {
130 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
131 break;
132 nPos = pos;
133 }
134 break;
135 }
136 return nPos;
137}
138
139Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
140 const Locale& rLocale, sal_Int16 rWordType )
141{
142 sal_Int32 len = Text.getLength();
143 if( nStartPos < 0 || len == 0 )
144 result.endPos = result.startPos = 0;
145 else if (nStartPos >= len)
146 result.endPos = result.startPos = len;
147 else {
148 result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
149
150 nStartPos = skipSpace(Text, result.startPos, len, rWordType, true);
151
152 if ( nStartPos != result.startPos) {
153 if( nStartPos >= len )
154 result.startPos = result.endPos = len;
155 else {
156 result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, true);
157 // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
158 if (result.startPos < nStartPos) result.startPos = nStartPos;
159 }
160 }
161 }
162 return result;
163}
164
165static bool isCJK( const Locale& rLocale ) {
166 return rLocale.Language == "zh" || rLocale.Language == "ja" || rLocale.Language == "ko";
167}
168
169Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
170 const Locale& rLocale, sal_Int16 rWordType)
171{
172 sal_Int32 len = Text.getLength();
173 if( nStartPos <= 0 || len == 0 ) {
174 result.endPos = result.startPos = 0;
175 return result;
176 } else if (nStartPos > len) {
177 result.endPos = result.startPos = len;
178 return result;
179 }
180
181 sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, false);
182
183 // if some spaces are skipped, and the script type is Asian with no CJK rLocale, we have to return
184 // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
185 result.startPos = nPos;
186 if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
187 result.endPos = -1;
188 return result;
189 }
190
191 return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
192}
193
194
195Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
196 sal_Int16 rWordType, sal_Bool bDirection )
197{
198 sal_Int32 len = Text.getLength();
199 if( nPos < 0 || len == 0 )
200 result.endPos = result.startPos = 0;
201 else if (nPos > len)
202 result.endPos = result.startPos = len;
203 else {
204 sal_Int32 next, prev;
205 next = skipSpace(Text, nPos, len, rWordType, true);
206 prev = skipSpace(Text, nPos, len, rWordType, false);
207 if (prev == 0 && next == len) {
208 result.endPos = result.startPos = nPos;
209 } else if (prev == 0 && ! bDirection) {
210 result.endPos = result.startPos = 0;
211 } else if (next == len && bDirection) {
212 result.endPos = result.startPos = len;
213 } else {
214 if (next != prev) {
215 if (next == nPos && next != len)
216 bDirection = true;
217 else if (prev == nPos && prev != 0)
218 bDirection = false;
219 else
220 nPos = bDirection ? next : prev;
221 }
222 result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
223 }
224 }
225 return result;
226}
227
228sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
229 const Locale& rLocale, sal_Int16 rWordType )
230{
231 sal_Int32 len = Text.getLength();
232
233 if (nPos < 0 || nPos >= len) return false;
234
235 sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, true);
236
237 if (tmp != nPos) return false;
238
239 result = getWordBoundary(Text, nPos, rLocale, rWordType, true);
240
241 return result.startPos == nPos;
242}
243
244sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
245 const Locale& rLocale, sal_Int16 rWordType )
246{
247 sal_Int32 len = Text.getLength();
248
249 if (nPos <= 0 || nPos > len) return false;
250
251 sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, false);
252
253 if (tmp != nPos) return false;
254
255 result = getWordBoundary(Text, nPos, rLocale, rWordType, false);
256
257 return result.endPos == nPos;
258}
259
260sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
261 const Locale &rLocale )
262{
263 if (nStartPos < 0 || nStartPos > Text.getLength())
264 return -1;
265 if (Text.isEmpty()) return 0;
266 return LBI->beginOfSentence(Text, nStartPos, rLocale);
267}
268
269sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
270 const Locale &rLocale )
271{
272 if (nStartPos < 0 || nStartPos > Text.getLength())
273 return -1;
274 if (Text.isEmpty()) return 0;
275 return LBI->endOfSentence(Text, nStartPos, rLocale);
276}
277
278LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
279 const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
280 const LineBreakUserOptions& bOptions )
281{
282 return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
283}
284
285sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
286{
287 return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
288 getScriptClass(Text.iterateCodePoints(&nPos, 0));
289}
290
291
295static sal_Int32 iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
296 sal_Int32 nLen = Text.getLength();
297 if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
298 ch = 0;
299 nStartPos = nStartPos + inc < 0 ? -1 : nLen;
300 } else {
301 ch = Text.iterateCodePoints(&nStartPos, inc);
302 // Fix for #i80436#.
303 // erAck: 2009-06-30T21:52+0200 This logic looks somewhat
304 // suspicious as if it cures a symptom... anyway, had to add
305 // nStartPos < Text.getLength() to silence the (correct) assertion
306 // in rtl_uString_iterateCodePoints() if Text was one character
307 // (codepoint) only, made up of a surrogate pair.
308 //if (inc > 0 && nStartPos < Text.getLength())
309 // ch = Text.iterateCodePoints(&nStartPos, 0);
310 // With surrogates, nStartPos may actually point behind string
311 // now, even if inc is only +1
312 if (inc > 0)
313 ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
314 }
315 return nStartPos;
316}
317
318
319sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
320 sal_Int32 nStartPos, sal_Int16 ScriptType )
321{
322 if (nStartPos < 0 || nStartPos >= Text.getLength())
323 return -1;
324
325 if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
326 return -1;
327
328 if (nStartPos == 0) return 0;
329 sal_uInt32 ch=0;
330 while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
331 if (nStartPos == 0) return 0;
332 }
333
334 return iterateCodePoints(Text, nStartPos, 1, ch);
335}
336
337sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
338 sal_Int32 nStartPos, sal_Int16 ScriptType )
339{
340 if (nStartPos < 0 || nStartPos >= Text.getLength())
341 return -1;
342
343 if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
344 return -1;
345
346 sal_Int32 strLen = Text.getLength();
347 sal_uInt32 ch=0;
348 while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
349 sal_Int16 currentCharScriptType = getScriptClass(ch);
350 if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
351 break;
352 }
353 return nStartPos;
354}
355
356sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
357 sal_Int32 nStartPos, sal_Int16 ScriptType )
358{
359 if (nStartPos < 0)
360 return -1;
361 if (nStartPos > Text.getLength())
362 nStartPos = Text.getLength();
363
364 sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
365
366 sal_uInt32 ch=0;
367 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
368 if (((numberOfChange % 2) == 0) != (ScriptType != getScriptClass(ch)))
369 numberOfChange--;
370 else if (nStartPos == 0) {
371 return -1;
372 }
373 }
374 return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
375}
376
377sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
378 sal_Int16 ScriptType )
379
380{
381 if (nStartPos < 0)
382 nStartPos = 0;
383 sal_Int32 strLen = Text.getLength();
384 if (nStartPos >= strLen)
385 return -1;
386
387 sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
388
389 sal_uInt32 ch=0;
390 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
391 sal_Int16 currentCharScriptType = getScriptClass(ch);
392 if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
393 (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
394 numberOfChange--;
395 }
396 return numberOfChange == 0 ? nStartPos : -1;
397}
398
399sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
400 const Locale& /*rLocale*/, sal_Int16 CharType )
401{
402 if (CharType == CharType::ANY_CHAR) return 0;
403 if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
404 if (CharType != static_cast<sal_Int16>(u_charType( Text.iterateCodePoints(&nStartPos, 0)))) return -1;
405
406 sal_Int32 nPos=nStartPos;
407 while(nStartPos > 0 && CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nPos, -1)))) { nStartPos=nPos; }
408 return nStartPos; // begin of char block is inclusive
409}
410
411sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
412 const Locale& /*rLocale*/, sal_Int16 CharType )
413{
414 sal_Int32 strLen = Text.getLength();
415
416 if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
417 if (nStartPos < 0 || nStartPos >= strLen) return -1;
418 if (CharType != static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) return -1;
419
420 sal_uInt32 ch=0;
421 while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == static_cast<sal_Int16>(u_charType(ch))) {}
422 return nStartPos; // end of char block is exclusive
423}
424
425sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
426 const Locale& /*rLocale*/, sal_Int16 CharType )
427{
428 if (CharType == CharType::ANY_CHAR) return -1;
429 if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
430
431 sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 2 : 1;
432 sal_Int32 strLen = Text.getLength();
433
434 sal_uInt32 ch=0;
435 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
436 if ((CharType != static_cast<sal_Int16>(u_charType(ch))) != (numberOfChange == 1))
437 numberOfChange--;
438 }
439 return numberOfChange == 0 ? nStartPos : -1;
440}
441
442sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
443 const Locale& /*rLocale*/, sal_Int16 CharType )
444{
445 if(CharType == CharType::ANY_CHAR) return -1;
446 if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
447
448 sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 3 : 2;
449
450 sal_uInt32 ch=0;
451 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
452 if (((numberOfChange % 2) == 0) != (CharType != static_cast<sal_Int16>(u_charType(ch))))
453 numberOfChange--;
454 if (nStartPos == 0 && numberOfChange > 0) {
455 numberOfChange--;
456 if (numberOfChange == 0) return nStartPos;
457 }
458 }
459 return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
460}
461
462
463sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
464 sal_Int32 /*nPos*/, const Locale& /*rLocale*/ )
465{
466 return 0;
467}
468
469namespace
470{
471sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
472{
473 int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
474 return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
475}
476
477struct UBlock2Script
478{
479 UBlockCode from;
480 UBlockCode to;
481 sal_Int16 script;
482};
483
484const UBlock2Script scriptList[] =
485{
486 {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
487 {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN},
488 {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN},
489 {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
490 {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
491 {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
492 {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
493 {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
494 {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
495 {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
496 {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
497 {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
498 {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
499 {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
500 {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
501 {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
502 {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
503 {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
504 {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
505 {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
506};
507
508#define scriptListCount SAL_N_ELEMENTS(scriptList)
509
510//always sets rScriptType
511
512//returns true for characters historically explicitly assigned to
513//latin/weak/asian
514
515//returns false for characters that historically implicitly assigned to
516//weak as unknown
517bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
518{
519 bool bKnown = true;
520 //handle specific characters always as weak:
521 // 0x01 - this breaks a word
522 // 0x02 - this can be inside a word
523 // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
524 if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
525 rScriptType = ScriptType::WEAK;
526 // Few Spacing Modifier Letters that can be Bopomofo tonal marks.
527 else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar || 0x2D9 == currentChar )
528 rScriptType = ScriptType::WEAK;
529 // tdf#52577 superscript numbers should be we weak.
530 else if ( 0xB2 == currentChar || 0xB3 == currentChar || 0xB9 == currentChar )
531 rScriptType = ScriptType::WEAK;
532 // workaround for Coptic
533 else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
534 rScriptType = ScriptType::LATIN;
535 else
536 {
537 UBlockCode block=ublock_getCode(currentChar);
538 size_t i = 0;
539 while (i < scriptListCount)
540 {
541 if (block <= scriptList[i].to)
542 break;
543 ++i;
544 }
545 if (i < scriptListCount && block >= scriptList[i].from)
546 rScriptType = scriptList[i].script;
547 else
548 {
549 rScriptType = ScriptType::WEAK;
550 bKnown = false;
551 }
552 }
553 return bKnown;
554}
555}
556
557sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
558{
559 static sal_uInt32 lastChar = 0;
560 static sal_Int16 nRet = ScriptType::WEAK;
561
562 if (currentChar != lastChar)
563 {
564 lastChar = currentChar;
565
566 if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
567 nRet = getScriptClassByUAX24Script(currentChar);
568 }
569
570 return nRet;
571}
572
573bool BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName)
574{
575 // to share service between same Language but different Country code, like zh_CN and zh_TW
576 for (const lookupTableItem& listItem : lookupTable) {
577 if (aLocaleName == listItem.aLocale.Language) {
578 xBI = listItem.xBI;
579 return true;
580 }
581 }
582
583#if !WITH_LOCALE_ALL && !WITH_LOCALE_ja
584 if (aLocaleName == "ja")
585 return false;
586#endif
587#if !WITH_LOCALE_ALL && !WITH_LOCALE_zh
588 if (aLocaleName == "zh" || aLocaleName == "zh_TW")
589 return false;
590#endif
591#if !WITH_LOCALE_ALL && !WITH_LOCALE_ko
592 if (aLocaleName == "ko")
593 return false;
594#endif
595#if !WITH_LOCALE_ALL && !WITH_LOCALE_th
596 if (aLocaleName == "th")
597 return false;
598#endif
599
600 Reference < uno::XInterface > xI = m_xContext->getServiceManager()->createInstanceWithContext(
601 "com.sun.star.i18n.BreakIterator_" + aLocaleName, m_xContext);
602
603 if ( xI.is() ) {
604 xBI.set(xI, UNO_QUERY);
605 if (xBI.is()) {
606 lookupTable.emplace_back(Locale(aLocaleName, aLocaleName, aLocaleName), xBI);
607 return true;
608 }
609 }
610 return false;
611}
612
613const Reference < XBreakIterator > &
614BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale)
615{
616 if (xBI.is() && rLocale == aLocale)
617 return xBI;
618 else if (m_xContext.is()) {
619 aLocale = rLocale;
620
621 for (const lookupTableItem& listItem : lookupTable) {
622 if (rLocale == listItem.aLocale)
623 {
624 xBI = listItem.xBI;
625 return xBI;
626 }
627 }
628
629 static constexpr OUStringLiteral under(u"_");
630
631 sal_Int32 l = rLocale.Language.getLength();
632 sal_Int32 c = rLocale.Country.getLength();
633 sal_Int32 v = rLocale.Variant.getLength();
634
635 if ((l > 0 && c > 0 && v > 0 &&
636 // load service with name <base>_<lang>_<country>_<variant>
637 createLocaleSpecificBreakIterator(rLocale.Language + under +
638 rLocale.Country + under + rLocale.Variant)) ||
639 (l > 0 && c > 0 &&
640 // load service with name <base>_<lang>_<country>
641 createLocaleSpecificBreakIterator(rLocale.Language + under +
642 rLocale.Country)) ||
643 (l > 0 && c > 0 && rLocale.Language == "zh" &&
644 (rLocale.Country == "HK" ||
645 rLocale.Country == "MO" ) &&
646 // if the country code is HK or MO, one more step to try TW.
647 createLocaleSpecificBreakIterator(rLocale.Language + under +
648 "TW")) ||
649 (l > 0 &&
650 // load service with name <base>_<lang>
651 createLocaleSpecificBreakIterator(rLocale.Language)) ||
652 // load default service with name <base>_Unicode
653 createLocaleSpecificBreakIterator("Unicode")) {
654 lookupTable.emplace_back( aLocale, xBI );
655 return xBI;
656 }
657 }
658 throw RuntimeException("getLocaleSpecificBreakIterator: iterator not found");
659}
660
661OUString SAL_CALL
662BreakIteratorImpl::getImplementationName()
663{
664 return "com.sun.star.i18n.BreakIterator";
665}
666
667sal_Bool SAL_CALL
668BreakIteratorImpl::supportsService(const OUString& rServiceName)
669{
670 return cppu::supportsService(this, rServiceName);
671}
672
673Sequence< OUString > SAL_CALL
674BreakIteratorImpl::getSupportedServiceNames()
675{
676 return { "com.sun.star.i18n.BreakIterator" };
677}
678
679}
680
681extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
683 css::uno::XComponentContext *context,
684 css::uno::Sequence<css::uno::Any> const &)
685{
686 return cppu::acquire(new i18npool::BreakIteratorImpl(context));
687}
688
689/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Reference< XComponentContext > m_xContext
UBlockCode from
#define isZWSP(c)
#define LBI
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
sal_Int16 script
#define scriptListCount
UBlockCode to
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
int nCount
float v
float u
const UBlockScript scriptList[]
sal_uInt16 nPos
ScriptType
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
int i
Constant values shared between i18npool and, for example, the number formatter.
static sal_Int32 skipSpace(std::u16string_view Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
static bool isCJK(const Locale &rLocale)
sal_uInt32 iterateCodePoints(std::u16string_view string, sal_Int32 *indexUtf16, sal_Int32 incrementCodePoints=1)
unsigned char sal_Bool
Any result
size_t pos