LibreOffice Module i18npool (master) 1
breakiteratorImpl.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <config_locales.h>
21
22#include <breakiteratorImpl.hxx>
24#include <unicode/uchar.h>
25#include <i18nutil/unicode.hxx>
26
27#include <com/sun/star/i18n/CharType.hpp>
28#include <com/sun/star/i18n/ScriptType.hpp>
29#include <com/sun/star/i18n/WordType.hpp>
30#include <com/sun/star/uno/XComponentContext.hpp>
31
32using namespace ::com::sun::star;
33using namespace ::com::sun::star::uno;
34using namespace ::com::sun::star::i18n;
35using namespace ::com::sun::star::lang;
36
37namespace i18npool {
38
39BreakIteratorImpl::BreakIteratorImpl( const Reference < XComponentContext >& rxContext ) : m_xContext( rxContext )
40{
41}
42
43BreakIteratorImpl::BreakIteratorImpl()
44{
45}
46
47BreakIteratorImpl::~BreakIteratorImpl()
48{
49}
50
// Shorthand for the locale-specific break iterator that matches the variable
// `rLocale` visible at the point of use (the lookup throws if none is found).
#define LBI (getLocaleSpecificBreakIterator(rLocale))
52
53sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
54 const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
55{
56 if (nCount < 0)
57 throw RuntimeException("BreakIteratorImpl::nextCharacters: expected nCount >=0, got "
58 + OUString::number(nCount));
59
60 return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
61}
62
63sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
64 const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
65{
66 if (nCount < 0)
67 throw RuntimeException("BreakIteratorImpl::previousCharacters: expected nCount >=0, got "
68 + OUString::number(nCount));
69
70 return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
71}
72
// True if the code point `c` is U+200B ZERO WIDTH SPACE.
// The argument itself is tested (and parenthesized), so the macro no longer
// depends on a variable that happens to be named `ch` at the call site.
#define isZWSP(c) ((c) == 0x200B)
74
75static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
76{
77 sal_uInt32 ch=0;
78 sal_Int32 pos=nPos;
79 switch (rWordType) {
80 case WordType::ANYWORD_IGNOREWHITESPACES:
81 if (bDirection)
82 while (nPos < len)
83 {
84 ch = Text.iterateCodePoints(&pos);
85 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
86 break;
87 nPos = pos;
88 }
89 else
90 while (nPos > 0)
91 {
92 ch = Text.iterateCodePoints(&pos, -1);
93 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
94 break;
95 nPos = pos;
96 }
97 break;
98 case WordType::DICTIONARY_WORD:
99 if (bDirection)
100 while (nPos < len)
101 {
102 ch = Text.iterateCodePoints(&pos);
103 if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
104 break;
105 nPos = pos;
106 }
107 else
108 while (nPos > 0)
109 {
110 ch = Text.iterateCodePoints(&pos, -1);
111 if (!u_isWhitespace(ch) && !isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
112 break;
113 nPos = pos;
114 }
115 break;
116 case WordType::WORD_COUNT:
117 if (bDirection)
118 while (nPos < len)
119 {
120 ch = Text.iterateCodePoints(&pos);
121 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
122 break;
123 nPos = pos;
124 }
125 else
126 while (nPos > 0)
127 {
128 ch = Text.iterateCodePoints(&pos, -1);
129 if (!u_isUWhiteSpace(ch) && !isZWSP(ch))
130 break;
131 nPos = pos;
132 }
133 break;
134 }
135 return nPos;
136}
137
138Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
139 const Locale& rLocale, sal_Int16 rWordType )
140{
141 sal_Int32 len = Text.getLength();
142 if( nStartPos < 0 || len == 0 )
143 result.endPos = result.startPos = 0;
144 else if (nStartPos >= len)
145 result.endPos = result.startPos = len;
146 else {
147 result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
148
149 nStartPos = skipSpace(Text, result.startPos, len, rWordType, true);
150
151 if ( nStartPos != result.startPos) {
152 if( nStartPos >= len )
153 result.startPos = result.endPos = len;
154 else {
155 result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, true);
156 // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
157 if (result.startPos < nStartPos) result.startPos = nStartPos;
158 }
159 }
160 }
161 return result;
162}
163
164static bool isCJK( const Locale& rLocale ) {
165 return rLocale.Language == "zh" || rLocale.Language == "ja" || rLocale.Language == "ko";
166}
167
168Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
169 const Locale& rLocale, sal_Int16 rWordType)
170{
171 sal_Int32 len = Text.getLength();
172 if( nStartPos <= 0 || len == 0 ) {
173 result.endPos = result.startPos = 0;
174 return result;
175 } else if (nStartPos > len) {
176 result.endPos = result.startPos = len;
177 return result;
178 }
179
180 sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, false);
181
182 // if some spaces are skipped, and the script type is Asian with no CJK rLocale, we have to return
183 // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
184 result.startPos = nPos;
185 if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
186 result.endPos = -1;
187 return result;
188 }
189
190 return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
191}
192
193
194Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
195 sal_Int16 rWordType, sal_Bool bDirection )
196{
197 sal_Int32 len = Text.getLength();
198 if( nPos < 0 || len == 0 )
199 result.endPos = result.startPos = 0;
200 else if (nPos > len)
201 result.endPos = result.startPos = len;
202 else {
203 sal_Int32 next, prev;
204 next = skipSpace(Text, nPos, len, rWordType, true);
205 prev = skipSpace(Text, nPos, len, rWordType, false);
206 if (prev == 0 && next == len) {
207 result.endPos = result.startPos = nPos;
208 } else if (prev == 0 && ! bDirection) {
209 result.endPos = result.startPos = 0;
210 } else if (next == len && bDirection) {
211 result.endPos = result.startPos = len;
212 } else {
213 if (next != prev) {
214 if (next == nPos && next != len)
215 bDirection = true;
216 else if (prev == nPos && prev != 0)
217 bDirection = false;
218 else
219 nPos = bDirection ? next : prev;
220 }
221 result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
222 }
223 }
224 return result;
225}
226
227sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
228 const Locale& rLocale, sal_Int16 rWordType )
229{
230 sal_Int32 len = Text.getLength();
231
232 if (nPos < 0 || nPos >= len) return false;
233
234 sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, true);
235
236 if (tmp != nPos) return false;
237
238 result = getWordBoundary(Text, nPos, rLocale, rWordType, true);
239
240 return result.startPos == nPos;
241}
242
243sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
244 const Locale& rLocale, sal_Int16 rWordType )
245{
246 sal_Int32 len = Text.getLength();
247
248 if (nPos <= 0 || nPos > len) return false;
249
250 sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, false);
251
252 if (tmp != nPos) return false;
253
254 result = getWordBoundary(Text, nPos, rLocale, rWordType, false);
255
256 return result.endPos == nPos;
257}
258
259sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
260 const Locale &rLocale )
261{
262 if (nStartPos < 0 || nStartPos > Text.getLength())
263 return -1;
264 if (Text.isEmpty()) return 0;
265 return LBI->beginOfSentence(Text, nStartPos, rLocale);
266}
267
268sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
269 const Locale &rLocale )
270{
271 if (nStartPos < 0 || nStartPos > Text.getLength())
272 return -1;
273 if (Text.isEmpty()) return 0;
274 return LBI->endOfSentence(Text, nStartPos, rLocale);
275}
276
277LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
278 const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
279 const LineBreakUserOptions& bOptions )
280{
281 return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
282}
283
284sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
285{
286 return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
287 getScriptClass(Text.iterateCodePoints(&nPos, 0));
288}
289
290
294static sal_Int32 iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
295 sal_Int32 nLen = Text.getLength();
296 if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
297 ch = 0;
298 nStartPos = nStartPos + inc < 0 ? -1 : nLen;
299 } else {
300 ch = Text.iterateCodePoints(&nStartPos, inc);
301 // Fix for #i80436#.
302 // erAck: 2009-06-30T21:52+0200 This logic looks somewhat
303 // suspicious as if it cures a symptom... anyway, had to add
304 // nStartPos < Text.getLength() to silence the (correct) assertion
305 // in rtl_uString_iterateCodePoints() if Text was one character
306 // (codepoint) only, made up of a surrogate pair.
307 //if (inc > 0 && nStartPos < Text.getLength())
308 // ch = Text.iterateCodePoints(&nStartPos, 0);
309 // With surrogates, nStartPos may actually point behind string
310 // now, even if inc is only +1
311 if (inc > 0)
312 ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
313 }
314 return nStartPos;
315}
316
317
318sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
319 sal_Int32 nStartPos, sal_Int16 ScriptType )
320{
321 if (nStartPos < 0 || nStartPos >= Text.getLength())
322 return -1;
323
324 if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
325 return -1;
326
327 if (nStartPos == 0) return 0;
328 sal_uInt32 ch=0;
329 while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
330 if (nStartPos == 0) return 0;
331 }
332
333 return iterateCodePoints(Text, nStartPos, 1, ch);
334}
335
336sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
337 sal_Int32 nStartPos, sal_Int16 ScriptType )
338{
339 if (nStartPos < 0 || nStartPos >= Text.getLength())
340 return -1;
341
342 if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
343 return -1;
344
345 sal_Int32 strLen = Text.getLength();
346 sal_uInt32 ch=0;
347 while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
348 sal_Int16 currentCharScriptType = getScriptClass(ch);
349 if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
350 break;
351 }
352 return nStartPos;
353}
354
355sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
356 sal_Int32 nStartPos, sal_Int16 ScriptType )
357{
358 if (nStartPos < 0)
359 return -1;
360 if (nStartPos > Text.getLength())
361 nStartPos = Text.getLength();
362
363 sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
364
365 sal_uInt32 ch=0;
366 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
367 if (((numberOfChange % 2) == 0) != (ScriptType != getScriptClass(ch)))
368 numberOfChange--;
369 else if (nStartPos == 0) {
370 return -1;
371 }
372 }
373 return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
374}
375
376sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
377 sal_Int16 ScriptType )
378
379{
380 if (nStartPos < 0)
381 nStartPos = 0;
382 sal_Int32 strLen = Text.getLength();
383 if (nStartPos >= strLen)
384 return -1;
385
386 sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
387
388 sal_uInt32 ch=0;
389 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
390 sal_Int16 currentCharScriptType = getScriptClass(ch);
391 if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
392 (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
393 numberOfChange--;
394 }
395 return numberOfChange == 0 ? nStartPos : -1;
396}
397
398sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
399 const Locale& /*rLocale*/, sal_Int16 CharType )
400{
401 if (CharType == CharType::ANY_CHAR) return 0;
402 if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
403 if (CharType != static_cast<sal_Int16>(u_charType( Text.iterateCodePoints(&nStartPos, 0)))) return -1;
404
405 sal_Int32 nPos=nStartPos;
406 while(nStartPos > 0 && CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nPos, -1)))) { nStartPos=nPos; }
407 return nStartPos; // begin of char block is inclusive
408}
409
410sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
411 const Locale& /*rLocale*/, sal_Int16 CharType )
412{
413 sal_Int32 strLen = Text.getLength();
414
415 if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
416 if (nStartPos < 0 || nStartPos >= strLen) return -1;
417 if (CharType != static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) return -1;
418
419 sal_uInt32 ch=0;
420 while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == static_cast<sal_Int16>(u_charType(ch))) {}
421 return nStartPos; // end of char block is exclusive
422}
423
424sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
425 const Locale& /*rLocale*/, sal_Int16 CharType )
426{
427 if (CharType == CharType::ANY_CHAR) return -1;
428 if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
429
430 sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 2 : 1;
431 sal_Int32 strLen = Text.getLength();
432
433 sal_uInt32 ch=0;
434 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
435 if ((CharType != static_cast<sal_Int16>(u_charType(ch))) != (numberOfChange == 1))
436 numberOfChange--;
437 }
438 return numberOfChange == 0 ? nStartPos : -1;
439}
440
441sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
442 const Locale& /*rLocale*/, sal_Int16 CharType )
443{
444 if(CharType == CharType::ANY_CHAR) return -1;
445 if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
446
447 sal_Int16 numberOfChange = (CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 3 : 2;
448
449 sal_uInt32 ch=0;
450 while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
451 if (((numberOfChange % 2) == 0) != (CharType != static_cast<sal_Int16>(u_charType(ch))))
452 numberOfChange--;
453 if (nStartPos == 0 && numberOfChange > 0) {
454 numberOfChange--;
455 if (numberOfChange == 0) return nStartPos;
456 }
457 }
458 return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
459}
460
461
462sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
463 sal_Int32 /*nPos*/, const Locale& /*rLocale*/ )
464{
465 return 0;
466}
467
468namespace
469{
470sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
471{
472 int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
473 return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
474}
475
476struct UBlock2Script
477{
478 UBlockCode from;
479 UBlockCode to;
480 sal_Int16 script;
481};
482
483const UBlock2Script scriptList[] =
484{
485 {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
486 {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN},
487 {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN},
488 {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
489 {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
490 {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
491 {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
492 {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
493 {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
494 {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
495 {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
496 {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
497 {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
498 {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
499 {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
500 {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
501 {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
502 {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
503 {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
504 {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
505};
506
507#define scriptListCount SAL_N_ELEMENTS(scriptList)
508
509//always sets rScriptType
510
511//returns true for characters historically explicitly assigned to
512//latin/weak/asian
513
514//returns false for characters that historically implicitly assigned to
515//weak as unknown
516bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
517{
518 bool bKnown = true;
519 //handle specific characters always as weak:
520 // 0x01 - this breaks a word
521 // 0x02 - this can be inside a word
522 // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
523 if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
524 rScriptType = ScriptType::WEAK;
525 // Few Spacing Modifier Letters that can be Bopomofo tonal marks.
526 else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar || 0x2D9 == currentChar )
527 rScriptType = ScriptType::WEAK;
528 // workaround for Coptic
529 else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
530 rScriptType = ScriptType::LATIN;
531 else
532 {
533 UBlockCode block=ublock_getCode(currentChar);
534 size_t i = 0;
535 while (i < scriptListCount)
536 {
537 if (block <= scriptList[i].to)
538 break;
539 ++i;
540 }
541 if (i < scriptListCount && block >= scriptList[i].from)
542 rScriptType = scriptList[i].script;
543 else
544 {
545 rScriptType = ScriptType::WEAK;
546 bKnown = false;
547 }
548 }
549 return bKnown;
550}
551}
552
553sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
554{
555 static sal_uInt32 lastChar = 0;
556 static sal_Int16 nRet = ScriptType::WEAK;
557
558 if (currentChar != lastChar)
559 {
560 lastChar = currentChar;
561
562 if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
563 nRet = getScriptClassByUAX24Script(currentChar);
564 }
565
566 return nRet;
567}
568
569bool BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName)
570{
571 // to share service between same Language but different Country code, like zh_CN and zh_TW
572 for (const lookupTableItem& listItem : lookupTable) {
573 if (aLocaleName == listItem.aLocale.Language) {
574 xBI = listItem.xBI;
575 return true;
576 }
577 }
578
579#if !WITH_LOCALE_ALL && !WITH_LOCALE_ja
580 if (aLocaleName == "ja")
581 return false;
582#endif
583#if !WITH_LOCALE_ALL && !WITH_LOCALE_zh
584 if (aLocaleName == "zh" || aLocaleName == "zh_TW")
585 return false;
586#endif
587#if !WITH_LOCALE_ALL && !WITH_LOCALE_ko
588 if (aLocaleName == "ko")
589 return false;
590#endif
591#if !WITH_LOCALE_ALL && !WITH_LOCALE_th
592 if (aLocaleName == "th")
593 return false;
594#endif
595
596 Reference < uno::XInterface > xI = m_xContext->getServiceManager()->createInstanceWithContext(
597 "com.sun.star.i18n.BreakIterator_" + aLocaleName, m_xContext);
598
599 if ( xI.is() ) {
600 xBI.set(xI, UNO_QUERY);
601 if (xBI.is()) {
602 lookupTable.emplace_back(Locale(aLocaleName, aLocaleName, aLocaleName), xBI);
603 return true;
604 }
605 }
606 return false;
607}
608
609const Reference < XBreakIterator > &
610BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale)
611{
612 if (xBI.is() && rLocale == aLocale)
613 return xBI;
614 else if (m_xContext.is()) {
615 aLocale = rLocale;
616
617 for (const lookupTableItem& listItem : lookupTable) {
618 if (rLocale == listItem.aLocale)
619 {
620 xBI = listItem.xBI;
621 return xBI;
622 }
623 }
624
625 OUStringLiteral under(u"_");
626
627 sal_Int32 l = rLocale.Language.getLength();
628 sal_Int32 c = rLocale.Country.getLength();
629 sal_Int32 v = rLocale.Variant.getLength();
630
631 if ((l > 0 && c > 0 && v > 0 &&
632 // load service with name <base>_<lang>_<country>_<variant>
633 createLocaleSpecificBreakIterator(rLocale.Language + under +
634 rLocale.Country + under + rLocale.Variant)) ||
635 (l > 0 && c > 0 &&
636 // load service with name <base>_<lang>_<country>
637 createLocaleSpecificBreakIterator(rLocale.Language + under +
638 rLocale.Country)) ||
639 (l > 0 && c > 0 && rLocale.Language == "zh" &&
640 (rLocale.Country == "HK" ||
641 rLocale.Country == "MO" ) &&
642 // if the country code is HK or MO, one more step to try TW.
643 createLocaleSpecificBreakIterator(rLocale.Language + under +
644 "TW")) ||
645 (l > 0 &&
646 // load service with name <base>_<lang>
647 createLocaleSpecificBreakIterator(rLocale.Language)) ||
648 // load default service with name <base>_Unicode
649 createLocaleSpecificBreakIterator("Unicode")) {
650 lookupTable.emplace_back( aLocale, xBI );
651 return xBI;
652 }
653 }
654 throw RuntimeException("getLocaleSpecificBreakIterator: iterator not found");
655}
656
657OUString SAL_CALL
658BreakIteratorImpl::getImplementationName()
659{
660 return "com.sun.star.i18n.BreakIterator";
661}
662
663sal_Bool SAL_CALL
664BreakIteratorImpl::supportsService(const OUString& rServiceName)
665{
666 return cppu::supportsService(this, rServiceName);
667}
668
669Sequence< OUString > SAL_CALL
670BreakIteratorImpl::getSupportedServiceNames()
671{
672 return { "com.sun.star.i18n.BreakIterator" };
673}
674
675}
676
677extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
679 css::uno::XComponentContext *context,
680 css::uno::Sequence<css::uno::Any> const &)
681{
682 return cppu::acquire(new i18npool::BreakIteratorImpl(context));
683}
684
685/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Reference< XComponentContext > m_xContext
UBlockCode from
#define isZWSP(c)
#define LBI
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
sal_Int16 script
#define scriptListCount
UBlockCode to
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
int nCount
float v
float u
const UBlockScript scriptList[]
sal_uInt16 nPos
ScriptType
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
int i
Constant values shared between i18npool and, for example, the number formatter.
static sal_Int32 skipSpace(const OUString &Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
static sal_Int32 iterateCodePoints(const OUString &Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32 &ch)
Increments/decrements position first, then obtains character.
static bool isCJK(const Locale &rLocale)
unsigned char sal_Bool
Any result
size_t pos