25#include <unicode/uchar.h>
26#include <unicode/locid.h>
27#include <unicode/rbbi.h>
28#include <unicode/udata.h>
29#include <rtl/strbuf.hxx>
30#include <rtl/ustring.hxx>
32#include <com/sun/star/i18n/BreakType.hpp>
33#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34#include <com/sun/star/i18n/WordType.hpp>
51 : cBreakIterator(
"com.sun.star.i18n.BreakIterator_Unicode" )
67class OOoRuleBasedBreakIterator :
public icu::RuleBasedBreakIterator
70#if (U_ICU_VERSION_MAJOR_NUM < 58)
72 void publicSetBreakType(int32_t type)
77 OOoRuleBasedBreakIterator(UDataMemory* image,
79 : icu::RuleBasedBreakIterator(image, status)
88 sal_Int16 rBreakType, sal_Int16 nWordType,
const char *rule,
const OUString& rText)
90 bool bNewBreak =
false;
91 UErrorCode status = U_ZERO_ERROR;
92 sal_Int16 breakType = 0;
96 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
99 case WordType::ANY_WORD:
break;
100 case WordType::ANYWORD_IGNOREWHITESPACES:
101 breakType = 0; rule =
"edit_word";
break;
102 case WordType::DICTIONARY_WORD:
103 breakType = 1; rule =
"dict_word";
break;
105 case WordType::WORD_COUNT:
106 breakType = 2; rule =
"count_word";
break;
127 assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
129 OStringBuffer aKeyBuf(64);
130 aKeyBuf.append( aLangtagStr +
";" );
132 aKeyBuf.append(rule);
133 aKeyBuf.append(
";" + OStringChar(
static_cast<char>(
'0'+breakType)) +
";"
134 + OStringChar(
static_cast<char>(
'0'+rBreakType)) +
";"
135 + OStringChar(
static_cast<char>(
'0'+nWordType)));
137 const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
142 auto aMapIt(
theBIMap.find( aBIMapGlobalKey));
143 bool bInMap = (aMapIt !=
theBIMap.end());
152 const uno::Sequence< OUString > breakRules =
LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
154 status = U_ZERO_ERROR;
156 if ( !U_SUCCESS(status) )
throw uno::RuntimeException();
158 std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
160 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
163 const OString aBIMapRuleTypeKey( aLangtagStr +
";" + rule +
";" + OString::number(breakType));
164 aMapIt =
theBIMap.find( aBIMapRuleTypeKey);
165 bInMap = (aMapIt !=
theBIMap.end());
174 rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open(
"OpenOffice",
"brk",
175 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
177 if (U_SUCCESS(status))
189 else if (rLocale.Language !=
"th" && rLocale.Language !=
"lo" && rLocale.Language !=
"bo" && rLocale.Language !=
"dz" && rLocale.Language !=
"km")
192 OString aLanguage(
LanguageTag( rLocale).getLanguage().toUtf8());
193 const OString aBIMapRuleKey( aLanguage +
";" + rule);
194 aMapIt =
theBIMap.find( aBIMapRuleKey);
195 bInMap = (aMapIt !=
theBIMap.end());
204 status = U_ZERO_ERROR;
205 OString aUDName = OString::Concat(rule) +
"_" + aLanguage;
206 UDataMemory* pUData = udata_open(
"OpenOffice",
"brk", aUDName.getStr(), &status);
207 if( U_SUCCESS(status) )
208 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
209 if ( U_SUCCESS(status) )
220 const OString aBIMapRuleOnlyKey( OString::Concat(
";") + rule);
221 aMapIt =
theBIMap.find( aBIMapRuleOnlyKey);
222 bInMap = (aMapIt !=
theBIMap.end());
231 status = U_ZERO_ERROR;
232 pUData = udata_open(
"OpenOffice",
"brk", rule, &status);
233 if( U_SUCCESS(status) )
234 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
235 if ( U_SUCCESS(status) )
248 #if (U_ICU_VERSION_MAJOR_NUM < 58)
256 switch (rBreakType) {
270 const OString aBIMapLocaleTypeKey( aLangtagStr +
";;;" + OString::number(rBreakType));
271 aMapIt =
theBIMap.find( aBIMapLocaleTypeKey);
272 bInMap = (aMapIt !=
theBIMap.end());
282 std::shared_ptr< icu::BreakIterator > pBI;
284 status = U_ZERO_ERROR;
285 switch (rBreakType) {
287 pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
290 pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
293 pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
296 pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
299 if ( !U_SUCCESS(status) || !pBI ) {
300 throw uno::RuntimeException();
307 throw uno::RuntimeException();
315 if (!(bNewBreak ||
icuBI->
mpValue->maICUText.pData != rText.pData))
318 const UChar *pText =
reinterpret_cast<const UChar *
>(rText.getStr());
320 status = U_ZERO_ERROR;
323 if (!U_SUCCESS(status))
324 throw uno::RuntimeException();
328 if (!U_SUCCESS(status))
329 throw uno::RuntimeException();
335 sal_Int32 nStartPos,
const lang::Locale &rLocale,
336 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
338 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) {
341 for (nDone = 0; nDone <
nCount; nDone++) {
342 nStartPos = pBI->following(nStartPos);
343 if (nStartPos == icu::BreakIterator::DONE)
344 return Text.getLength();
347 for (nDone = 0; nDone <
nCount && nStartPos <
Text.getLength(); nDone++)
348 Text.iterateCodePoints(&nStartPos);
354 sal_Int32 nStartPos,
const lang::Locale& rLocale,
355 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
357 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) {
360 for (nDone = 0; nDone <
nCount; nDone++) {
361 nStartPos = pBI->preceding(nStartPos);
362 if (nStartPos == icu::BreakIterator::DONE)
366 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
367 Text.iterateCodePoints(&nStartPos, -1);
374 const lang::Locale& rLocale, sal_Int16 rWordType )
379 rv.startPos =
icuBI->
mpValue->mpBreakIterator->following(nStartPos);
380 if( rv.startPos >=
Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
381 rv.endPos =
result.startPos;
383 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
384 && u_isUWhiteSpace(
Text.iterateCodePoints(&rv.startPos, 0)))
385 || (rWordType == WordType::DICTIONARY_WORD
386 && u_isWhitespace(
Text.iterateCodePoints(&rv.startPos, 0))))
387 rv.startPos =
icuBI->
mpValue->mpBreakIterator->following(rv.startPos);
389 rv.endPos =
icuBI->
mpValue->mpBreakIterator->following(rv.startPos);
390 if(rv.endPos == icu::BreakIterator::DONE)
391 rv.endPos = rv.startPos;
398 const lang::Locale& rLocale, sal_Int16 rWordType)
403 rv.startPos =
icuBI->
mpValue->mpBreakIterator->preceding(nStartPos);
405 rv.endPos = rv.startPos;
408 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
409 && u_isUWhiteSpace(
Text.iterateCodePoints(&rv.startPos, 0)))
410 || (rWordType == WordType::DICTIONARY_WORD
411 && u_isWhitespace(
Text.iterateCodePoints(&rv.startPos, 0))))
412 rv.startPos =
icuBI->
mpValue->mpBreakIterator->preceding(rv.startPos);
414 rv.endPos =
icuBI->
mpValue->mpBreakIterator->following(rv.startPos);
415 if(rv.endPos == icu::BreakIterator::DONE)
416 rv.endPos = rv.startPos;
423 sal_Int16 rWordType,
sal_Bool bDirection )
426 sal_Int32 len =
Text.getLength();
430 rv.startPos = rv.endPos =
nPos;
431 if((bDirection ||
nPos == 0) &&
nPos < len)
438 rv.endPos = len ?
icuBI->
mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
439 }
else if(
nPos >= len) {
440 rv.startPos =
icuBI->
mpValue->mpBreakIterator->preceding(len);
447 if (rv.startPos == icu::BreakIterator::DONE)
448 rv.startPos = rv.endPos;
449 else if (rv.endPos == icu::BreakIterator::DONE)
450 rv.endPos = rv.startPos;
457 const lang::Locale &rLocale )
461 sal_Int32 len =
Text.getLength();
462 if (len > 0 && nStartPos == len)
463 Text.iterateCodePoints(&nStartPos, -1);
468 sal_uInt32
ch =
Text.iterateCodePoints(&nStartPos);
469 while (nStartPos < len && u_isWhitespace(
ch))
ch =
Text.iterateCodePoints(&nStartPos);
470 Text.iterateCodePoints(&nStartPos, -1);
476 const lang::Locale &rLocale )
480 sal_Int32 len =
Text.getLength();
481 if (len > 0 && nStartPos == len)
482 Text.iterateCodePoints(&nStartPos, -1);
485 sal_Int32
nPos=nStartPos;
486 while (
nPos > 0 && u_isWhitespace(
Text.iterateCodePoints(&
nPos, -1))) nStartPos=
nPos;
492 const OUString& Text, sal_Int32 nStartPos,
493 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
494 const LineBreakHyphenationOptions& hOptions,
495 const LineBreakUserOptions& )
497 LineBreakResults lbr;
499 if (nStartPos >=
Text.getLength()) {
500 lbr.breakIndex =
Text.getLength();
501 lbr.breakType = BreakType::WORDBOUNDARY;
507 icu::BreakIterator* pLineBI =
line.
mpValue->mpBreakIterator.get();
511 if (pLineBI->preceding(nStartPos + 1) == nStartPos
512 && (nStartPos == 0 ||
Text[nStartPos - 1] !=
'/'))
514 lbr.breakIndex = nStartPos;
515 lbr.breakType = BreakType::WORDBOUNDARY;
516 }
else if (hOptions.rHyphenator.is()) {
517 sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
518 pLineBI->preceding(nStartPos + 1);
520 sal_Int32 nStartPosWordEnd = nStartPos;
521 while (pLineBI->current() < nStartPosWordEnd && u_ispunct(
static_cast<sal_uInt32
>(
Text[nStartPosWordEnd])))
525 WordType::DICTIONARY_WORD,
false);
527 nStartPosWordEnd = wBoundary.endPos;
528 while (nStartPosWordEnd <
Text.getLength() && (u_ispunct(
static_cast<sal_uInt32
>(
Text[nStartPosWordEnd]))))
530 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
531 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
533 while (boundary_with_punctuation > wBoundary.endPos &&
Text[--boundary_with_punctuation] ==
SPACE);
534 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(
Text.copy(wBoundary.startPos,
535 wBoundary.endPos - wBoundary.startPos), rLocale,
536 static_cast<sal_Int16
>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
537 if (aHyphenatedWord.is()) {
538 lbr.rHyphenatedWord = aHyphenatedWord;
539 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
542 lbr.breakIndex = wBoundary.startPos;
543 lbr.breakType = BreakType::HYPHENATION;
546 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
547 lbr.breakIndex = pLineBI->current();
548 lbr.breakType = BreakType::WORDBOUNDARY;
552 lbr.breakIndex = pLineBI->preceding(nStartPos);
553 lbr.breakType = BreakType::WORDBOUNDARY;
556 lbr.breakIndex = pLineBI->preceding(nStartPos);
557 lbr.breakType = BreakType::WORDBOUNDARY;
562 if (lbr.breakIndex > 0 &&
Text[lbr.breakIndex-1] ==
'/')
568 const sal_Int32 nOverlyLong = 66;
569 sal_Int32
nPos = lbr.breakIndex - 1;
570 while (
nPos > 0 && lbr.breakIndex -
nPos < nOverlyLong)
572 if (u_isWhitespace(
Text.iterateCodePoints( &
nPos, -1)))
574 lbr.breakIndex =
nPos + 1;
583 if (lbr.breakType == BreakType::WORDBOUNDARY) {
584 nStartPos = lbr.breakIndex;
585 if (nStartPos >= 0 &&
Text[nStartPos--] ==
WJ)
587 while (nStartPos >= 0 &&
588 (u_isWhitespace(
Text.iterateCodePoints(&nStartPos, 0)) ||
Text[nStartPos] ==
WJ)) {
589 if (
Text[nStartPos--] ==
WJ)
592 if (GlueSpace && nStartPos < 0) {
614uno::Sequence< OUString > SAL_CALL
617 uno::Sequence< OUString > aRet { OUString::createFromAscii(
cBreakIterator) };
623extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
625 css::uno::XComponentContext *,
626 css::uno::Sequence<css::uno::Any>
const &)
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_Unicode_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
U_CDECL_BEGIN const char OpenOffice_dat[]
#define LOAD_WORD_BREAKITERATOR
#define LOAD_SENTENCE_BREAKITERATOR
#define LOAD_LINE_BREAKITERATOR
#define LOAD_CHARACTER_BREAKITERATOR
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
static OUString convertToBcp47(LanguageType nLangID)
css::i18n::Boundary result
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
void loadICUBreakIterator(const css::lang::Locale &rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const char *name, const OUString &rText)
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual ~BreakIterator_Unicode() override
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
struct i18npool::BreakIterator_Unicode::BI_Data * icuBI
struct i18npool::BreakIterator_Unicode::BI_Data sentence
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
struct i18npool::BreakIterator_Unicode::BI_Data line
virtual OUString SAL_CALL getImplementationName() override
const char * cBreakIterator
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
std::unordered_map< OString, std::shared_ptr< BI_ValueData > > BIMap
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
struct i18npool::BreakIterator_Unicode::BI_Data character
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
static rtl::Reference< LocaleDataImpl > get()
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
Constant values shared between i18npool and, for example, the number formatter.
static thread_local BreakIterator_Unicode::BIMap theBIMap
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
std::shared_ptr< BI_ValueData > mpValue