20 #include <config_locales.h>
24 #include <unicode/uchar.h>
27 #include <com/sun/star/i18n/CharType.hpp>
28 #include <com/sun/star/i18n/ScriptType.hpp>
29 #include <com/sun/star/i18n/WordType.hpp>
30 #include <com/sun/star/uno/XComponentContext.hpp>
// Shorthand for the locale-specific break iterator that the public methods
// delegate to. The expansion names the identifier `rLocale` literally, so the
// macro can only be used where a variable called `rLocale` is in scope.
// Call sites dereference the result with `->`.
51 #define LBI getLocaleSpecificBreakIterator(rLocale)
54 const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
57 throw RuntimeException(
"BreakIteratorImpl::nextCharacters: expected nCount >=0, got "
58 + OUString::number(nCount));
60 return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
64 const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
67 throw RuntimeException(
"BreakIteratorImpl::previousCharacters: expected nCount >=0, got "
68 + OUString::number(nCount));
70 return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
// True when code point `c` is U+200B ZERO WIDTH SPACE.
// The test is applied to the macro argument itself (parenthesized so that any
// expression may be passed) rather than to whatever variable named `ch`
// happens to be in scope at the point of use.
#define isZWSP(c) ((c) == 0x200B)
75 static sal_Int32
skipSpace(
const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType,
bool bDirection)
80 case WordType::ANYWORD_IGNOREWHITESPACES:
84 ch = Text.iterateCodePoints(&pos);
85 if (!u_isWhitespace(ch) && !
isZWSP(ch))
92 ch = Text.iterateCodePoints(&pos, -1);
93 if (!u_isWhitespace(ch) && !
isZWSP(ch))
98 case WordType::DICTIONARY_WORD:
102 ch = Text.iterateCodePoints(&pos);
103 if (!u_isWhitespace(ch) && !
isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
110 ch = Text.iterateCodePoints(&pos, -1);
111 if (!u_isWhitespace(ch) && !
isZWSP(ch) && (ch == 0x002E || u_isalnum(ch)))
116 case WordType::WORD_COUNT:
120 ch = Text.iterateCodePoints(&pos);
121 if (!u_isUWhiteSpace(ch) && !
isZWSP(ch))
128 ch = Text.iterateCodePoints(&pos, -1);
129 if (!u_isUWhiteSpace(ch) && !
isZWSP(ch))
139 const Locale& rLocale, sal_Int16 rWordType )
141 sal_Int32 len = Text.getLength();
142 if( nStartPos < 0 || len == 0 )
144 else if (nStartPos >= len)
147 result =
LBI->nextWord(Text, nStartPos, rLocale, rWordType);
151 if ( nStartPos !=
result.startPos) {
152 if( nStartPos >= len )
155 result =
LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType,
true);
157 if (
result.startPos < nStartPos)
result.startPos = nStartPos;
164 static bool isCJK(
const Locale& rLocale ) {
165 return rLocale.Language ==
"zh" || rLocale.Language ==
"ja" || rLocale.Language ==
"ko";
169 const Locale& rLocale, sal_Int16 rWordType)
171 sal_Int32 len = Text.getLength();
172 if( nStartPos <= 0 || len == 0 ) {
175 }
else if (nStartPos > len) {
180 sal_Int32
nPos =
skipSpace(Text, nStartPos, len, rWordType,
false);
185 if (nPos != nStartPos && nPos > 0 && !
isCJK(rLocale) &&
getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
190 return LBI->previousWord(Text,
result.startPos, rLocale, rWordType);
195 sal_Int16 rWordType,
sal_Bool bDirection )
197 sal_Int32 len = Text.getLength();
198 if( nPos < 0 || len == 0 )
203 sal_Int32 next, prev;
204 next =
skipSpace(Text, nPos, len, rWordType,
true);
205 prev =
skipSpace(Text, nPos, len, rWordType,
false);
206 if (prev == 0 && next == len) {
208 }
else if (prev == 0 && ! bDirection) {
210 }
else if (next == len && bDirection) {
214 if (next == nPos && next != len)
216 else if (prev == nPos && prev != 0)
219 nPos = bDirection ? next : prev;
221 result =
LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
228 const Locale& rLocale, sal_Int16 rWordType )
230 sal_Int32 len = Text.getLength();
232 if (nPos < 0 || nPos >= len)
return false;
234 sal_Int32 tmp =
skipSpace(Text, nPos, len, rWordType,
true);
236 if (tmp != nPos)
return false;
240 return result.startPos == nPos;
244 const Locale& rLocale, sal_Int16 rWordType )
246 sal_Int32 len = Text.getLength();
248 if (nPos <= 0 || nPos > len)
return false;
250 sal_Int32 tmp =
skipSpace(Text, nPos, len, rWordType,
false);
252 if (tmp != nPos)
return false;
256 return result.endPos == nPos;
260 const Locale &rLocale )
262 if (nStartPos < 0 || nStartPos > Text.getLength())
264 if (Text.isEmpty())
return 0;
265 return LBI->beginOfSentence(Text, nStartPos, rLocale);
269 const Locale &rLocale )
271 if (nStartPos < 0 || nStartPos > Text.getLength())
273 if (Text.isEmpty())
return 0;
274 return LBI->endOfSentence(Text, nStartPos, rLocale);
278 const Locale& rLocale, sal_Int32 nMinBreakPos,
const LineBreakHyphenationOptions& hOptions,
279 const LineBreakUserOptions& bOptions )
281 return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
286 return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
294 static sal_Int32
iterateCodePoints(
const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
295 sal_Int32 nLen = Text.getLength();
296 if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
298 nStartPos = nStartPos + inc < 0 ? -1 : nLen;
300 ch = Text.iterateCodePoints(&nStartPos, inc);
312 ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
321 if (nStartPos < 0 || nStartPos >= Text.getLength())
324 if(ScriptType !=
getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
327 if (nStartPos == 0)
return 0;
330 if (nStartPos == 0)
return 0;
339 if (nStartPos < 0 || nStartPos >= Text.getLength())
342 if(ScriptType !=
getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
345 sal_Int32 strLen = Text.getLength();
349 if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
360 if (nStartPos > Text.getLength())
361 nStartPos = Text.getLength();
363 sal_Int16 numberOfChange = (ScriptType ==
getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
367 if (((numberOfChange % 2) == 0) != (ScriptType !=
getScriptClass(ch)))
369 else if (nStartPos == 0) {
382 sal_Int32 strLen = Text.getLength();
383 if (nStartPos >= strLen)
386 sal_Int16 numberOfChange = (ScriptType ==
getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
389 while (numberOfChange > 0 &&
iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
391 if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
392 (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
395 return numberOfChange == 0 ? nStartPos : -1;
399 const Locale& , sal_Int16 CharType )
401 if (CharType == CharType::ANY_CHAR)
return 0;
402 if (nStartPos < 0 || nStartPos >= Text.getLength())
return -1;
403 if (CharType != static_cast<sal_Int16>(u_charType( Text.iterateCodePoints(&nStartPos, 0))))
return -1;
405 sal_Int32
nPos=nStartPos;
406 while(nStartPos > 0 && CharType == static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nPos, -1)))) { nStartPos=nPos; }
411 const Locale& , sal_Int16 CharType )
413 sal_Int32 strLen = Text.getLength();
415 if (CharType == CharType::ANY_CHAR)
return strLen;
416 if (nStartPos < 0 || nStartPos >= strLen)
return -1;
417 if (CharType != static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nStartPos, 0))))
return -1;
420 while(
iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType ==
static_cast<sal_Int16
>(u_charType(ch))) {}
425 const Locale& , sal_Int16 CharType )
427 if (CharType == CharType::ANY_CHAR)
return -1;
428 if (nStartPos < 0 || nStartPos >= Text.getLength())
return -1;
430 sal_Int16 numberOfChange = (CharType ==
static_cast<sal_Int16
>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 2 : 1;
431 sal_Int32 strLen = Text.getLength();
434 while (numberOfChange > 0 &&
iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
435 if ((CharType != static_cast<sal_Int16>(u_charType(ch))) != (numberOfChange == 1))
438 return numberOfChange == 0 ? nStartPos : -1;
442 const Locale& , sal_Int16 CharType )
444 if(CharType == CharType::ANY_CHAR)
return -1;
445 if (nStartPos < 0 || nStartPos >= Text.getLength())
return -1;
447 sal_Int16 numberOfChange = (CharType ==
static_cast<sal_Int16
>(u_charType(Text.iterateCodePoints(&nStartPos, 0)))) ? 3 : 2;
451 if (((numberOfChange % 2) == 0) != (CharType != static_cast<sal_Int16>(u_charType(ch))))
453 if (nStartPos == 0 && numberOfChange > 0) {
455 if (numberOfChange == 0)
return nStartPos;
463 sal_Int32 ,
const Locale& )
470 sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
472 int32_t
script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
485 {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
486 {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN},
487 {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN},
488 {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
489 {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
490 {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
491 {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
492 {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
493 {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
494 {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
495 {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
496 {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
497 {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
498 {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
499 {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
500 {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
501 {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
502 {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
503 {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
504 {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
// Size of the scriptList table, used as the upper bound for the index when
// the table is searched by Unicode block.
// NOTE(review): SAL_N_ELEMENTS presumably yields the array element count -
// confirm against its definition.
507 #define scriptListCount SAL_N_ELEMENTS(scriptList)
516 bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
523 if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
524 rScriptType = ScriptType::WEAK;
526 else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar || 0x2D9 == currentChar )
527 rScriptType = ScriptType::WEAK;
529 else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
530 rScriptType = ScriptType::LATIN;
533 UBlockCode block=ublock_getCode(currentChar);
537 if (block <= scriptList[i].
to)
541 if (i < scriptListCount && block >= scriptList[i].
from)
542 rScriptType = scriptList[i].script;
545 rScriptType = ScriptType::WEAK;
555 static sal_uInt32 lastChar = 0;
556 static sal_Int16 nRet = ScriptType::WEAK;
558 if (currentChar != lastChar)
560 lastChar = currentChar;
562 if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
563 nRet = getScriptClassByUAX24Script(currentChar);
573 if (aLocaleName == listItem.aLocale.Language) {
579 #if !WITH_LOCALE_ALL && !WITH_LOCALE_ja
580 if (aLocaleName ==
"ja")
583 #if !WITH_LOCALE_ALL && !WITH_LOCALE_zh
584 if (aLocaleName ==
"zh" || aLocaleName ==
"zh_TW")
587 #if !WITH_LOCALE_ALL && !WITH_LOCALE_ko
588 if (aLocaleName ==
"ko")
591 #if !WITH_LOCALE_ALL && !WITH_LOCALE_th
592 if (aLocaleName ==
"th")
596 Reference < uno::XInterface > xI =
m_xContext->getServiceManager()->createInstanceWithContext(
597 "com.sun.star.i18n.BreakIterator_" + aLocaleName,
m_xContext);
600 xBI.set(xI, UNO_QUERY);
602 lookupTable.emplace_back(
Locale(aLocaleName, aLocaleName, aLocaleName),
xBI);
609 const Reference < XBreakIterator > &
618 if (rLocale == listItem.aLocale)
625 OUStringLiteral under(
u"_");
627 sal_Int32 l = rLocale.Language.getLength();
628 sal_Int32 c = rLocale.Country.getLength();
629 sal_Int32
v = rLocale.Variant.getLength();
631 if ((l > 0 && c > 0 && v > 0 &&
634 rLocale.Country + under + rLocale.Variant)) ||
639 (l > 0 && c > 0 && rLocale.Language ==
"zh" &&
640 (rLocale.Country ==
"HK" ||
641 rLocale.Country ==
"MO" ) &&
654 throw RuntimeException(
"getLocaleSpecificBreakIterator: iterator not found");
660 return "com.sun.star.i18n.BreakIterator";
672 return {
"com.sun.star.i18n.BreakIterator" };
677 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
679 css::uno::XComponentContext *context,
680 css::uno::Sequence<css::uno::Any>
const &)
exports com.sun.star. script
static sal_Int16 getScriptClass(sal_uInt32 currentChar)
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
css::i18n::Boundary result
virtual sal_Int32 SAL_CALL previousCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
virtual sal_Int32 SAL_CALL nextCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
css::lang::Locale aLocale
const css::uno::Reference< XBreakIterator > & getLocaleSpecificBreakIterator(const css::lang::Locale &rLocale)
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
virtual sal_Int32 SAL_CALL previousScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
static sal_Int32 skipSpace(const OUString &Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
std::vector< lookupTableItem > lookupTable
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
virtual sal_Int32 SAL_CALL endOfScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
virtual sal_Int16 SAL_CALL getWordType(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale) override
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Int16 SAL_CALL getScriptType(const OUString &Text, sal_Int32 nPos) override
virtual sal_Int32 SAL_CALL beginOfCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
const UBlockScript scriptList[]
virtual sal_Int32 SAL_CALL endOfCharBlock(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 CharType) override
virtual sal_Int32 SAL_CALL beginOfScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
Constant values shared between i18npool and, for example, the number formatter.
bool createLocaleSpecificBreakIterator(const OUString &aLocaleName)
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual sal_Bool SAL_CALL isEndWord(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
virtual ~BreakIteratorImpl() override
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
css::uno::Reference< XBreakIterator > xBI
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
css::uno::Reference< css::uno::XComponentContext > m_xContext
static sal_Int32 iterateCodePoints(const OUString &Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32 &ch)
Increments/decrements position first, then obtains character.
virtual sal_Bool SAL_CALL isBeginWord(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
Reference< XComponentContext > m_xContext
virtual OUString SAL_CALL getImplementationName() override
static bool isCJK(const Locale &rLocale)
virtual sal_Int32 SAL_CALL nextScript(const OUString &Text, sal_Int32 nStartPos, sal_Int16 ScriptType) override