20#include <config_locales.h>
24#include <unicode/uchar.h>
28#include <com/sun/star/i18n/CharType.hpp>
29#include <com/sun/star/i18n/ScriptType.hpp>
30#include <com/sun/star/i18n/WordType.hpp>
31#include <com/sun/star/uno/XComponentContext.hpp>
44BreakIteratorImpl::BreakIteratorImpl()
48BreakIteratorImpl::~BreakIteratorImpl()
// Shorthand for fetching the break iterator that matches the current locale.
// The expansion names `rLocale` literally, so the macro is only usable inside
// functions that have a variable of exactly that name in scope.
52#define LBI getLocaleSpecificBreakIterator(rLocale)
54sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters(
const OUString& Text, sal_Int32 nStartPos,
55 const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
58 throw RuntimeException(
"BreakIteratorImpl::nextCharacters: expected nCount >=0, got "
59 + OUString::number(
nCount));
61 return LBI->nextCharacters(
Text, nStartPos, rLocale, nCharacterIteratorMode,
nCount, nDone);
64sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters(
const OUString& Text, sal_Int32 nStartPos,
65 const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
68 throw RuntimeException(
"BreakIteratorImpl::previousCharacters: expected nCount >=0, got "
69 + OUString::number(
nCount));
71 return LBI->previousCharacters(
Text, nStartPos, rLocale, nCharacterIteratorMode,
nCount, nDone);
// True when the code point `c` is U+200B (ZERO WIDTH SPACE).
// The test is applied to the macro argument itself (parenthesized, evaluated
// once) rather than to a variable that happens to be called `ch` at the call
// site, so the macro is correct for any expression passed in.
#define isZWSP(c) ((c) == 0x200B)
76static sal_Int32
skipSpace(std::u16string_view Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType,
bool bDirection)
81 case WordType::ANYWORD_IGNOREWHITESPACES:
99 case WordType::DICTIONARY_WORD:
104 if (!u_isWhitespace(
ch) && !
isZWSP(
ch) && (
ch == 0x002E || u_isalnum(
ch)))
112 if (!u_isWhitespace(
ch) && !
isZWSP(
ch) && (
ch == 0x002E || u_isalnum(
ch)))
117 case WordType::WORD_COUNT:
139Boundary SAL_CALL BreakIteratorImpl::nextWord(
const OUString& Text, sal_Int32 nStartPos,
140 const Locale& rLocale, sal_Int16 rWordType )
142 sal_Int32 len =
Text.getLength();
143 if( nStartPos < 0 || len == 0 )
145 else if (nStartPos >= len)
152 if ( nStartPos !=
result.startPos) {
153 if( nStartPos >= len )
156 result =
LBI->getWordBoundary(
Text, nStartPos, rLocale, rWordType,
true);
158 if (
result.startPos < nStartPos)
result.startPos = nStartPos;
// Returns true when the locale's Language field is exactly "zh", "ja" or "ko".
// Only Language is compared; Country and Variant are not consulted.
165static bool isCJK(
const Locale& rLocale ) {
166 return rLocale.Language ==
"zh" || rLocale.Language ==
"ja" || rLocale.Language ==
"ko";
169Boundary SAL_CALL BreakIteratorImpl::previousWord(
const OUString& Text, sal_Int32 nStartPos,
170 const Locale& rLocale, sal_Int16 rWordType)
172 sal_Int32 len =
Text.getLength();
173 if( nStartPos <= 0 || len == 0 ) {
176 }
else if (nStartPos > len) {
186 if (
nPos != nStartPos &&
nPos > 0 && !
isCJK(rLocale) && getScriptClass(
Text.iterateCodePoints(&
nPos, -1)) == ScriptType::ASIAN) {
191 return LBI->previousWord(
Text,
result.startPos, rLocale, rWordType);
195Boundary SAL_CALL BreakIteratorImpl::getWordBoundary(
const OUString& Text, sal_Int32 nPos,
const Locale& rLocale,
196 sal_Int16 rWordType,
sal_Bool bDirection )
198 sal_Int32 len =
Text.getLength();
199 if(
nPos < 0 || len == 0 )
204 sal_Int32 next, prev;
207 if (prev == 0 && next == len) {
209 }
else if (prev == 0 && ! bDirection) {
211 }
else if (next == len && bDirection) {
215 if (next ==
nPos && next != len)
217 else if (prev ==
nPos && prev != 0)
220 nPos = bDirection ? next : prev;
228sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord(
const OUString& Text, sal_Int32 nPos,
229 const Locale& rLocale, sal_Int16 rWordType )
231 sal_Int32 len =
Text.getLength();
233 if (nPos < 0 || nPos >= len)
return false;
237 if (tmp !=
nPos)
return false;
244sal_Bool SAL_CALL BreakIteratorImpl::isEndWord(
const OUString& Text, sal_Int32 nPos,
245 const Locale& rLocale, sal_Int16 rWordType )
247 sal_Int32 len =
Text.getLength();
249 if (nPos <= 0 || nPos > len)
return false;
253 if (tmp !=
nPos)
return false;
260sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence(
const OUString& Text, sal_Int32 nStartPos,
261 const Locale &rLocale )
263 if (nStartPos < 0 || nStartPos >
Text.getLength())
265 if (
Text.isEmpty())
return 0;
266 return LBI->beginOfSentence(
Text, nStartPos, rLocale);
269sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence(
const OUString& Text, sal_Int32 nStartPos,
270 const Locale &rLocale )
272 if (nStartPos < 0 || nStartPos >
Text.getLength())
274 if (
Text.isEmpty())
return 0;
275 return LBI->endOfSentence(
Text, nStartPos, rLocale);
// Computes a line break for Text by forwarding every argument, unchanged and
// in the same order, to getLineBreak() on the break iterator that LBI selects
// for rLocale. No argument checking is visible in this function itself.
278LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak(
const OUString& Text, sal_Int32 nStartPos,
279 const Locale& rLocale, sal_Int32 nMinBreakPos,
const LineBreakHyphenationOptions& hOptions,
280 const LineBreakUserOptions& bOptions )
282 return LBI->getLineBreak(
Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
// Returns the script class of the code point found at nPos in Text.
// An index outside [0, Text.getLength()) yields ScriptType::WEAK instead of
// touching the string. nPos is a by-value parameter, so passing its address
// to iterateCodePoints (with an increment of 0) does not affect the caller.
285sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType(
const OUString& Text, sal_Int32 nPos )
287 return (nPos < 0 || nPos >=
Text.getLength()) ? ScriptType::WEAK :
288 getScriptClass(
Text.iterateCodePoints(&
nPos, 0));
295static sal_Int32
iterateCodePoints(
const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
296 sal_Int32 nLen =
Text.getLength();
297 if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
299 nStartPos = nStartPos + inc < 0 ? -1 : nLen;
301 ch =
Text.iterateCodePoints(&nStartPos, inc);
313 ch = (nStartPos < nLen ?
Text.iterateCodePoints(&nStartPos, 0) : 0);
319sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript(
const OUString& Text,
322 if (nStartPos < 0 || nStartPos >=
Text.getLength())
325 if(
ScriptType != getScriptClass(
Text.iterateCodePoints(&nStartPos, 0)))
328 if (nStartPos == 0)
return 0;
331 if (nStartPos == 0)
return 0;
337sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript(
const OUString& Text,
340 if (nStartPos < 0 || nStartPos >=
Text.getLength())
343 if(
ScriptType != getScriptClass(
Text.iterateCodePoints(&nStartPos, 0)))
346 sal_Int32 strLen =
Text.getLength();
349 sal_Int16 currentCharScriptType = getScriptClass(
ch);
350 if(
ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
356sal_Int32 SAL_CALL BreakIteratorImpl::previousScript(
const OUString& Text,
361 if (nStartPos >
Text.getLength())
362 nStartPos =
Text.getLength();
364 sal_Int16 numberOfChange = (
ScriptType == getScriptClass(
Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
368 if (((numberOfChange % 2) == 0) != (
ScriptType != getScriptClass(
ch)))
370 else if (nStartPos == 0) {
377sal_Int32 SAL_CALL BreakIteratorImpl::nextScript(
const OUString& Text, sal_Int32 nStartPos,
383 sal_Int32 strLen =
Text.getLength();
384 if (nStartPos >= strLen)
387 sal_Int16 numberOfChange = (
ScriptType == getScriptClass(
Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
391 sal_Int16 currentCharScriptType = getScriptClass(
ch);
392 if ((numberOfChange == 1) ? (
ScriptType == currentCharScriptType) :
393 (
ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
396 return numberOfChange == 0 ? nStartPos : -1;
399sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock(
const OUString& Text, sal_Int32 nStartPos,
400 const Locale& , sal_Int16 CharType )
402 if (CharType == CharType::ANY_CHAR)
return 0;
403 if (nStartPos < 0 || nStartPos >=
Text.getLength())
return -1;
404 if (CharType !=
static_cast<sal_Int16
>(u_charType(
Text.iterateCodePoints(&nStartPos, 0))))
return -1;
406 sal_Int32
nPos=nStartPos;
407 while(nStartPos > 0 && CharType ==
static_cast<sal_Int16
>(u_charType(
Text.iterateCodePoints(&
nPos, -1)))) { nStartPos=
nPos; }
411sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock(
const OUString& Text, sal_Int32 nStartPos,
412 const Locale& , sal_Int16 CharType )
414 sal_Int32 strLen =
Text.getLength();
416 if (CharType == CharType::ANY_CHAR)
return strLen;
417 if (nStartPos < 0 || nStartPos >= strLen)
return -1;
418 if (CharType !=
static_cast<sal_Int16
>(u_charType(
Text.iterateCodePoints(&nStartPos, 0))))
return -1;
425sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock(
const OUString& Text, sal_Int32 nStartPos,
426 const Locale& , sal_Int16 CharType )
428 if (CharType == CharType::ANY_CHAR)
return -1;
429 if (nStartPos < 0 || nStartPos >=
Text.getLength())
return -1;
431 sal_Int16 numberOfChange = (CharType ==
static_cast<sal_Int16
>(u_charType(
Text.iterateCodePoints(&nStartPos, 0)))) ? 2 : 1;
432 sal_Int32 strLen =
Text.getLength();
436 if ((CharType !=
static_cast<sal_Int16
>(u_charType(
ch))) != (numberOfChange == 1))
439 return numberOfChange == 0 ? nStartPos : -1;
442sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock(
const OUString& Text, sal_Int32 nStartPos,
443 const Locale& , sal_Int16 CharType )
445 if(CharType == CharType::ANY_CHAR)
return -1;
446 if (nStartPos < 0 || nStartPos >=
Text.getLength())
return -1;
448 sal_Int16 numberOfChange = (CharType ==
static_cast<sal_Int16
>(u_charType(
Text.iterateCodePoints(&nStartPos, 0)))) ? 3 : 2;
452 if (((numberOfChange % 2) == 0) != (CharType !=
static_cast<sal_Int16
>(u_charType(
ch))))
454 if (nStartPos == 0 && numberOfChange > 0) {
456 if (numberOfChange == 0)
return nStartPos;
463sal_Int16 SAL_CALL BreakIteratorImpl::getWordType(
const OUString& ,
464 sal_Int32 ,
const Locale& )
471sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
473 int32_t
script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
486 {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
487 {UBLOCK_BASIC_LATIN, UBLOCK_SPACING_MODIFIER_LETTERS, ScriptType::LATIN},
488 {UBLOCK_GREEK, UBLOCK_ARMENIAN, ScriptType::LATIN},
489 {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
490 {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
491 {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
492 {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
493 {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
494 {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
495 {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
496 {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
497 {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
498 {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
499 {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
500 {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
501 {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
502 {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
503 {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
504 {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
505 {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
// Size of the scriptList table, obtained by applying SAL_N_ELEMENTS to it.
// NOTE(review): SAL_N_ELEMENTS is defined elsewhere; presumably it yields the
// array's element count at compile time - confirm against its definition.
508#define scriptListCount SAL_N_ELEMENTS(scriptList)
517bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
524 if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
525 rScriptType = ScriptType::WEAK;
527 else if ( 0x2CA == currentChar || 0x2CB == currentChar || 0x2C7 == currentChar || 0x2D9 == currentChar )
528 rScriptType = ScriptType::WEAK;
530 else if ( 0xB2 == currentChar || 0xB3 == currentChar || 0xB9 == currentChar )
531 rScriptType = ScriptType::WEAK;
533 else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
534 rScriptType = ScriptType::LATIN;
537 UBlockCode block=ublock_getCode(currentChar);
549 rScriptType = ScriptType::WEAK;
557sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
559 static sal_uInt32 lastChar = 0;
560 static sal_Int16 nRet = ScriptType::WEAK;
562 if (currentChar != lastChar)
564 lastChar = currentChar;
566 if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
567 nRet = getScriptClassByUAX24Script(currentChar);
573bool BreakIteratorImpl::createLocaleSpecificBreakIterator(
const OUString& aLocaleName)
577 if (aLocaleName == listItem.aLocale.Language) {
583#if !WITH_LOCALE_ALL && !WITH_LOCALE_ja
584 if (aLocaleName ==
"ja")
587#if !WITH_LOCALE_ALL && !WITH_LOCALE_zh
588 if (aLocaleName ==
"zh" || aLocaleName ==
"zh_TW")
591#if !WITH_LOCALE_ALL && !WITH_LOCALE_ko
592 if (aLocaleName ==
"ko")
595#if !WITH_LOCALE_ALL && !WITH_LOCALE_th
596 if (aLocaleName ==
"th")
600 Reference < uno::XInterface > xI =
m_xContext->getServiceManager()->createInstanceWithContext(
601 "com.sun.star.i18n.BreakIterator_" + aLocaleName,
m_xContext);
604 xBI.set(xI, UNO_QUERY);
606 lookupTable.emplace_back(
Locale(aLocaleName, aLocaleName, aLocaleName), xBI);
613const Reference < XBreakIterator > &
614BreakIteratorImpl::getLocaleSpecificBreakIterator(
const Locale& rLocale)
616 if (xBI.is() && rLocale == aLocale)
622 if (rLocale == listItem.aLocale)
629 static constexpr OUStringLiteral under(
u"_");
631 sal_Int32 l = rLocale.Language.getLength();
632 sal_Int32 c = rLocale.Country.getLength();
633 sal_Int32
v = rLocale.Variant.getLength();
635 if ((l > 0 && c > 0 &&
v > 0 &&
637 createLocaleSpecificBreakIterator(rLocale.Language + under +
638 rLocale.Country + under + rLocale.Variant)) ||
641 createLocaleSpecificBreakIterator(rLocale.Language + under +
643 (l > 0 && c > 0 && rLocale.Language ==
"zh" &&
644 (rLocale.Country ==
"HK" ||
645 rLocale.Country ==
"MO" ) &&
647 createLocaleSpecificBreakIterator(rLocale.Language + under +
651 createLocaleSpecificBreakIterator(rLocale.Language)) ||
653 createLocaleSpecificBreakIterator(
"Unicode")) {
654 lookupTable.emplace_back( aLocale, xBI );
658 throw RuntimeException(
"getLocaleSpecificBreakIterator: iterator not found");
662BreakIteratorImpl::getImplementationName()
664 return "com.sun.star.i18n.BreakIterator";
668BreakIteratorImpl::supportsService(
const OUString& rServiceName)
673Sequence< OUString > SAL_CALL
674BreakIteratorImpl::getSupportedServiceNames()
676 return {
"com.sun.star.i18n.BreakIterator" };
681extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
683 css::uno::XComponentContext *context,
684 css::uno::Sequence<css::uno::Any>
const &)
Reference< XComponentContext > m_xContext
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_get_implementation(css::uno::XComponentContext *context, css::uno::Sequence< css::uno::Any > const &)
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
const UBlockScript scriptList[]
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
Constant values shared between i18npool and, for example, the number formatter.
static sal_Int32 skipSpace(std::u16string_view Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
static bool isCJK(const Locale &rLocale)
sal_uInt32 iterateCodePoints(std::u16string_view string, sal_Int32 *indexUtf16, sal_Int32 incrementCodePoints=1)