25#include <unicode/uchar.h>
26#include <unicode/locid.h>
27#include <unicode/rbbi.h>
28#include <unicode/udata.h>
29#include <rtl/strbuf.hxx>
30#include <rtl/ustring.hxx>
32#include <com/sun/star/i18n/BreakType.hpp>
33#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34#include <com/sun/star/i18n/WordType.hpp>
// Constructor member-initializer fragment (the constructor's own header line
// is not visible in this excerpt): initializes cBreakIterator with the string
// literal below.
51 : cBreakIterator(
"com.sun.star.i18n.BreakIterator_Unicode" )
// Subclass of icu::RuleBasedBreakIterator. Only part of it is visible here:
// a constructor that takes a UDataMemory* image and forwards it, together
// with `status` (declared on a line not shown), to the base-class constructor.
67class OOoRuleBasedBreakIterator :
public icu::RuleBasedBreakIterator
70 OOoRuleBasedBreakIterator(UDataMemory* image,
72 : icu::RuleBasedBreakIterator(image, status)
// Parameter-list tail and body fragments of what appears to be
// loadICUBreakIterator (the line carrying the function name is not visible in
// this excerpt -- confirm against the full file).
// Visible steps, in textual order: derive a rule name / breakType from
// nWordType, build a cache key, probe theBIMap with several differently
// scoped keys, try to build a rule-based iterator from "OpenOffice" brk data,
// create an ICU iterator via the icu::BreakIterator factories, and throw
// uno::RuntimeException when an ICU status check fails.
// NOTE(review): braces and several lines are missing from this excerpt, so
// the nesting of these steps cannot be confirmed from here.
81 sal_Int16 rBreakType, sal_Int16 nWordType,
const char *rule,
const OUString& rText)
83 bool bNewBreak =
false;
84 UErrorCode status = U_ZERO_ERROR;
85 sal_Int16 breakType = 0;
// Sanity check on the word type before it is mapped by the cases below.
89 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
// Each word type selects a breakType index and a rule name; the ANY_WORD case
// assigns neither.
92 case WordType::ANY_WORD:
break;
93 case WordType::ANYWORD_IGNOREWHITESPACES:
94 breakType = 0; rule =
"edit_word";
break;
95 case WordType::DICTIONARY_WORD:
96 breakType = 1; rule =
"dict_word";
break;
98 case WordType::WORD_COUNT:
99 breakType = 2; rule =
"count_word";
break;
// All three values are encoded below as a single character ('0' + value) in
// the cache key, hence the 0..9 bounds.
120 assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
// Cache key layout: <aLangtagStr>;<rule>;<breakType>;<rBreakType>;<nWordType>
// (aLangtagStr is defined on a line not shown).
122 OStringBuffer aKeyBuf(64);
123 aKeyBuf.append( aLangtagStr +
";" );
125 aKeyBuf.append(rule);
126 aKeyBuf.append(
";" + OStringChar(
static_cast<char>(
'0'+breakType)) +
";"
127 + OStringChar(
static_cast<char>(
'0'+rBreakType)) +
";"
128 + OStringChar(
static_cast<char>(
'0'+nWordType)));
130 const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
// Probe the map with the full key.
135 auto aMapIt(
theBIMap.find( aBIMapGlobalKey));
136 bool bInMap = (aMapIt !=
theBIMap.end());
// Break-iterator rules for the locale; entries are used below as the item
// name passed to udata_open.
145 const uno::Sequence< OUString > breakRules =
LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
// NOTE(review): the call between the reset and the check (original line 148)
// is not visible; presumably it is what may set `status` -- confirm.
147 status = U_ZERO_ERROR;
149 if ( !U_SUCCESS(status) )
throw uno::RuntimeException();
151 std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
// Taken when the locale data supplies a non-empty rule entry for breakType.
153 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
// Key layout: <aLangtagStr>;<rule>;<breakType>
156 const OString aBIMapRuleTypeKey( aLangtagStr +
";" + rule +
";" + OString::number(breakType));
157 aMapIt =
theBIMap.find( aBIMapRuleTypeKey);
158 bInMap = (aMapIt !=
theBIMap.end());
// Open the item named by breakRules[breakType] in the "OpenOffice" brk data
// and wrap it in the rule-based iterator.
167 rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open(
"OpenOffice",
"brk",
168 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
170 if (U_SUCCESS(status))
// Branch for every language except th, lo, bo, dz and km.
// NOTE(review): presumably those languages are left to ICU's own breaking
// -- the reason is not stated in the visible code; confirm.
182 else if (rLocale.Language !=
"th" && rLocale.Language !=
"lo" && rLocale.Language !=
"bo" && rLocale.Language !=
"dz" && rLocale.Language !=
"km")
// Key layout: <language>;<rule>
185 OString aLanguage(
LanguageTag( rLocale).getLanguage().toUtf8());
186 const OString aBIMapRuleKey( aLanguage +
";" + rule);
187 aMapIt =
theBIMap.find( aBIMapRuleKey);
188 bInMap = (aMapIt !=
theBIMap.end());
// Language-specific data item, named <rule>_<language>.
197 status = U_ZERO_ERROR;
198 OString aUDName = OString::Concat(rule) +
"_" + aLanguage;
199 UDataMemory* pUData = udata_open(
"OpenOffice",
"brk", aUDName.getStr(), &status);
200 if( U_SUCCESS(status) )
201 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
202 if ( U_SUCCESS(status) )
// Key layout: ;<rule>
213 const OString aBIMapRuleOnlyKey( OString::Concat(
";") + rule);
214 aMapIt =
theBIMap.find( aBIMapRuleOnlyKey);
215 bInMap = (aMapIt !=
theBIMap.end());
// Language-independent data item, named by <rule> alone.
224 status = U_ZERO_ERROR;
225 pUData = udata_open(
"OpenOffice",
"brk", rule, &status);
226 if( U_SUCCESS(status) )
227 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
228 if ( U_SUCCESS(status) )
// Key layout: <aLangtagStr>;;;<rBreakType>
246 const OString aBIMapLocaleTypeKey( aLangtagStr +
";;;" + OString::number(rBreakType));
247 aMapIt =
theBIMap.find( aBIMapLocaleTypeKey);
248 bInMap = (aMapIt !=
theBIMap.end());
// Create an iterator through the icu::BreakIterator factory functions; the
// case labels selecting between them are on lines not shown.
258 std::shared_ptr< icu::BreakIterator > pBI;
260 status = U_ZERO_ERROR;
261 switch (rBreakType) {
263 pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
266 pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
269 pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
272 pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
// A failed status or a null iterator is reported as uno::RuntimeException.
275 if ( !U_SUCCESS(status) || !pBI ) {
276 throw uno::RuntimeException();
283 throw uno::RuntimeException();
// True only when bNewBreak is false and rText shares its pData buffer with
// the cached maICUText; the statement guarded by this test is not visible.
291 if (!(bNewBreak ||
icuBI->
mpValue->maICUText.pData != rText.pData))
// View the UTF-16 buffer of rText as ICU UChar. The ICU calls whose status is
// checked below are on lines not shown.
294 const UChar *pText =
reinterpret_cast<const UChar *
>(rText.getStr());
296 status = U_ZERO_ERROR;
299 if (!U_SUCCESS(status))
300 throw uno::RuntimeException();
304 if (!U_SUCCESS(status))
305 throw uno::RuntimeException();
// Parameter-list tail and body fragments of what appears to be nextCharacters
// (the function-name line is not visible in this excerpt -- confirm).
// nDone is an out-parameter that ends up holding the number of loop
// iterations performed, at most nCount.
311 sal_Int32 nStartPos,
const lang::Locale &rLocale,
312 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
// SKIPCELL mode: advance with the ICU iterator pBI (declared on a line not
// shown), one following() boundary per step.
314 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) {
317 for (nDone = 0; nDone <
nCount; nDone++) {
318 nStartPos = pBI->following(nStartPos);
// No further boundary: report the end of the text.
319 if (nStartPos == icu::BreakIterator::DONE)
320 return Text.getLength();
// Code-point stepping loop; the branch line that selects it is not visible
// (presumably the non-SKIPCELL case). Stops early at the end of the text.
323 for (nDone = 0; nDone <
nCount && nStartPos <
Text.getLength(); nDone++)
324 Text.iterateCodePoints(&nStartPos);
// Parameter-list tail and body fragments of what appears to be
// previousCharacters (the function-name line is not visible -- confirm).
// nDone is an out-parameter that ends up holding the number of loop
// iterations performed, at most nCount.
330 sal_Int32 nStartPos,
const lang::Locale& rLocale,
331 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
// SKIPCELL mode: step backwards with the ICU iterator pBI (declared on a line
// not shown), one preceding() boundary per step.
333 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) {
336 for (nDone = 0; nDone <
nCount; nDone++) {
337 nStartPos = pBI->preceding(nStartPos);
// No earlier boundary; the statement guarded by this test is not visible.
338 if (nStartPos == icu::BreakIterator::DONE)
// Code-point stepping loop; the branch line that selects it is not visible
// (presumably the non-SKIPCELL case). Stops early at the start of the text.
342 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
343 Text.iterateCodePoints(&nStartPos, -1);
// Parameter-list tail and body fragments of what appears to be nextWord (the
// function-name line is not visible in this excerpt -- confirm).
350 const lang::Locale& rLocale, sal_Int16 rWordType )
// First boundary after nStartPos according to the loaded word iterator.
355 rv.startPos =
icuBI->
mpValue->mpBreakIterator->following(nStartPos);
// No further boundary inside the text.
356 if( rv.startPos >=
Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
// NOTE(review): this reads `result`, not the local `rv` that every other
// assignment here uses; `result` is not defined in the visible fragment.
// Confirm that this is intended and not a typo for rv.startPos.
357 rv.endPos =
result.startPos;
// If the code point at startPos (increment 0: read without advancing) is
// whitespace -- u_isUWhiteSpace for ANYWORD_IGNOREWHITESPACES, u_isWhitespace
// for DICTIONARY_WORD -- move startPos on to the next boundary.
359 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
360 && u_isUWhiteSpace(
Text.iterateCodePoints(&rv.startPos, 0)))
361 || (rWordType == WordType::DICTIONARY_WORD
362 && u_isWhitespace(
Text.iterateCodePoints(&rv.startPos, 0))))
363 rv.startPos =
icuBI->
mpValue->mpBreakIterator->following(rv.startPos);
// The word ends at the boundary after startPos; if there is none, collapse
// the range to startPos.
365 rv.endPos =
icuBI->
mpValue->mpBreakIterator->following(rv.startPos);
366 if(rv.endPos == icu::BreakIterator::DONE)
367 rv.endPos = rv.startPos;
// Parameter-list tail and body fragments of what appears to be previousWord
// (the function-name line is not visible in this excerpt -- confirm).
374 const lang::Locale& rLocale, sal_Int16 rWordType)
// Last boundary before nStartPos according to the loaded word iterator.
379 rv.startPos =
icuBI->
mpValue->mpBreakIterator->preceding(nStartPos);
// Collapses the range to startPos; the condition guarding this assignment
// (original line 380) is not visible.
381 rv.endPos = rv.startPos;
// If the code point at startPos (increment 0: read without advancing) is
// whitespace -- u_isUWhiteSpace for ANYWORD_IGNOREWHITESPACES, u_isWhitespace
// for DICTIONARY_WORD -- move startPos back to the previous boundary.
384 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
385 && u_isUWhiteSpace(
Text.iterateCodePoints(&rv.startPos, 0)))
386 || (rWordType == WordType::DICTIONARY_WORD
387 && u_isWhitespace(
Text.iterateCodePoints(&rv.startPos, 0))))
388 rv.startPos =
icuBI->
mpValue->mpBreakIterator->preceding(rv.startPos);
// The word ends at the boundary after startPos; if there is none, collapse
// the range to startPos.
390 rv.endPos =
icuBI->
mpValue->mpBreakIterator->following(rv.startPos);
391 if(rv.endPos == icu::BreakIterator::DONE)
392 rv.endPos = rv.startPos;
// Parameter-list tail and body fragments of what appears to be
// getWordBoundary (the function-name line is not visible -- confirm).
// NOTE(review): several branch lines are missing from this excerpt; the
// comments below describe only the statements that are visible.
399 sal_Int16 rWordType,
sal_Bool bDirection )
402 sal_Int32 len =
Text.getLength();
// Start from an empty range at nPos.
406 rv.startPos = rv.endPos =
nPos;
// Forward case: direction flag set (or nPos at the very start) and nPos still
// inside the text. The statement it guards is not visible.
407 if((bDirection ||
nPos == 0) &&
nPos < len)
// For non-empty text the end is the first boundary after offset 0; for empty
// text it is 0.
414 rv.endPos = len ?
icuBI->
mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
415 }
else if(
nPos >= len) {
// nPos at or past the end: start at the last boundary before len.
416 rv.startPos =
icuBI->
mpValue->mpBreakIterator->preceding(len);
// If one side came back as DONE, collapse the range onto the other side.
423 if (rv.startPos == icu::BreakIterator::DONE)
424 rv.startPos = rv.endPos;
425 else if (rv.endPos == icu::BreakIterator::DONE)
426 rv.endPos = rv.startPos;
// Parameter-list tail and body fragments of what appears to be
// beginOfSentence (the function-name line is not visible -- confirm).
433 const lang::Locale &rLocale )
437 sal_Int32 len =
Text.getLength();
// A start position exactly at the end of non-empty text is moved back by one
// code point before the lines that follow (not all visible) run.
438 if (len > 0 && nStartPos == len)
439 Text.iterateCodePoints(&nStartPos, -1);
// Read code points forward while they are whitespace and the text has not
// ended, then step back one code point, undoing the last advance.
444 sal_uInt32
ch =
Text.iterateCodePoints(&nStartPos);
445 while (nStartPos < len && u_isWhitespace(
ch))
ch =
Text.iterateCodePoints(&nStartPos);
446 Text.iterateCodePoints(&nStartPos, -1);
// Parameter-list tail and body fragments of what appears to be endOfSentence
// (the function-name line is not visible in this excerpt -- confirm).
452 const lang::Locale &rLocale )
456 sal_Int32 len =
Text.getLength();
// A start position exactly at the end of non-empty text is moved back by one
// code point before the lines that follow (not all visible) run.
457 if (len > 0 && nStartPos == len)
458 Text.iterateCodePoints(&nStartPos, -1);
// Walk backwards from nStartPos: while the code point just before the cursor
// is whitespace, pull nStartPos back onto it. The loop stops at the first
// non-whitespace code point or at offset 0.
461 sal_Int32
nPos=nStartPos;
462 while (
nPos > 0 && u_isWhitespace(
Text.iterateCodePoints(&
nPos, -1))) nStartPos=
nPos;
// Parameter list and body fragments of what appears to be getLineBreak (the
// function-name line is not visible in this excerpt -- confirm).
// Fills a LineBreakResults (lbr) with a break index and a break type.
// NOTE(review): braces, else-lines and several statements are missing from
// this excerpt; the comments describe only what the visible lines do.
468 const OUString& Text, sal_Int32 nStartPos,
469 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
470 const LineBreakHyphenationOptions& hOptions,
471 const LineBreakUserOptions& )
473 LineBreakResults lbr;
// Start position at or past the end of the text: break at the end.
475 if (nStartPos >=
Text.getLength()) {
476 lbr.breakIndex =
Text.getLength();
477 lbr.breakType = BreakType::WORDBOUNDARY;
// Raw pointer to the iterator held by `line`.
483 icu::BreakIterator* pLineBI =
line.
mpValue->mpBreakIterator.get();
// nStartPos is itself a break position (preceding(nStartPos + 1) lands on it)
// and is not directly after a '/': break right there.
487 if (pLineBI->preceding(nStartPos + 1) == nStartPos
488 && (nStartPos == 0 ||
Text[nStartPos - 1] !=
'/'))
490 lbr.breakIndex = nStartPos;
491 lbr.breakType = BreakType::WORDBOUNDARY;
492 }
else if (hOptions.rHyphenator.is()) {
// Remember the boundary reached by next() (0 if there is none), then move the
// iterator back to the last boundary at or before nStartPos.
493 sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
494 pLineBI->preceding(nStartPos + 1);
// Loop over punctuation at nStartPosWordEnd while it lies after the current
// boundary; the loop body is not visible.
496 sal_Int32 nStartPosWordEnd = nStartPos;
497 while (pLineBI->current() < nStartPosWordEnd && u_ispunct(
static_cast<sal_uInt32
>(
Text[nStartPosWordEnd])))
// Tail of a call that yields wBoundary (start of the call is not visible).
501 WordType::DICTIONARY_WORD,
false);
// Scan punctuation starting at the word end (loop body not visible), then
// convert nStartPosWordEnd into an offset relative to wBoundary.endPos and
// cap it at the distance from the word start to hyphenIndex.
503 nStartPosWordEnd = wBoundary.endPos;
504 while (nStartPosWordEnd <
Text.getLength() && (u_ispunct(
static_cast<sal_uInt32
>(
Text[nStartPosWordEnd]))))
506 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
507 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
// Empty-bodied loop: the pre-decrement in the condition moves
// boundary_with_punctuation back while it stays beyond the word end and the
// character it lands on equals SPACE.
509 while (boundary_with_punctuation > wBoundary.endPos &&
Text[--boundary_with_punctuation] ==
SPACE);
// Ask the hyphenator to hyphenate the word [startPos, endPos); the index
// passed is relative to the word start and is reduced by nStartPosWordEnd
// when hyphenIndex coincides with the word end.
510 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(
Text.copy(wBoundary.startPos,
511 wBoundary.endPos - wBoundary.startPos), rLocale,
512 static_cast<sal_Int16
>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
513 if (aHyphenatedWord.is()) {
514 lbr.rHyphenatedWord = aHyphenatedWord;
// Hyphenation point would fall before nMinBreakPos; the statement this test
// guards (and the else-line after it) is not visible.
515 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
518 lbr.breakIndex = wBoundary.startPos;
519 lbr.breakType = BreakType::HYPHENATION;
// If the hyphenation point lies before the iterator's current boundary, use
// that boundary as a plain word-boundary break instead.
522 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
523 lbr.breakIndex = pLineBI->current();
524 lbr.breakType = BreakType::WORDBOUNDARY;
// Fallbacks (their guarding else-lines are not visible): break at the last
// line boundary before nStartPos.
528 lbr.breakIndex = pLineBI->preceding(nStartPos);
529 lbr.breakType = BreakType::WORDBOUNDARY;
532 lbr.breakIndex = pLineBI->preceding(nStartPos);
533 lbr.breakType = BreakType::WORDBOUNDARY;
// Break landed right after a '/': look backwards, for fewer than nOverlyLong
// positions, for a whitespace code point and move the break to just after it.
538 if (lbr.breakIndex > 0 &&
Text[lbr.breakIndex-1] ==
'/')
544 const sal_Int32 nOverlyLong = 66;
545 sal_Int32
nPos = lbr.breakIndex - 1;
546 while (
nPos > 0 && lbr.breakIndex -
nPos < nOverlyLong)
548 if (u_isWhitespace(
Text.iterateCodePoints( &
nPos, -1)))
550 lbr.breakIndex =
nPos + 1;
// Word-boundary breaks only: inspect the character at the break and the
// whitespace / WJ characters before it. WJ and GlueSpace are defined on lines
// not shown, and the statements these tests guard are not visible.
559 if (lbr.breakType == BreakType::WORDBOUNDARY) {
560 nStartPos = lbr.breakIndex;
561 if (nStartPos >= 0 &&
Text[nStartPos--] ==
WJ)
563 while (nStartPos >= 0 &&
564 (u_isWhitespace(
Text.iterateCodePoints(&nStartPos, 0)) ||
Text[nStartPos] ==
WJ)) {
565 if (
Text[nStartPos--] ==
WJ)
568 if (GlueSpace && nStartPos < 0) {
// Return-type line and body fragment of what appears to be
// getSupportedServiceNames (the function-name line is not visible -- confirm).
// Builds a one-element sequence holding cBreakIterator converted from ASCII
// to OUString.
590uno::Sequence< OUString > SAL_CALL
593 uno::Sequence< OUString > aRet { OUString::createFromAscii(
cBreakIterator) };
// Signature fragment of an exported, C-linkage factory function returning a
// css::uno::XInterface pointer. The function-name line (original line 600)
// and the body are not visible in this excerpt. Both parameters are unnamed.
599extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
601 css::uno::XComponentContext *,
602 css::uno::Sequence<css::uno::Any>
const &)
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_Unicode_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
U_CDECL_BEGIN const char OpenOffice_dat[]
#define LOAD_WORD_BREAKITERATOR
#define LOAD_SENTENCE_BREAKITERATOR
#define LOAD_LINE_BREAKITERATOR
#define LOAD_CHARACTER_BREAKITERATOR
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
static OUString convertToBcp47(LanguageType nLangID)
css::i18n::Boundary result
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
void loadICUBreakIterator(const css::lang::Locale &rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const char *name, const OUString &rText)
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual ~BreakIterator_Unicode() override
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
struct i18npool::BreakIterator_Unicode::BI_Data * icuBI
struct i18npool::BreakIterator_Unicode::BI_Data sentence
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
struct i18npool::BreakIterator_Unicode::BI_Data line
virtual OUString SAL_CALL getImplementationName() override
const char * cBreakIterator
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
std::unordered_map< OString, std::shared_ptr< BI_ValueData > > BIMap
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
struct i18npool::BreakIterator_Unicode::BI_Data character
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
static rtl::Reference< LocaleDataImpl > get()
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
Constant values shared between i18npool and, for example, the number formatter.
static thread_local BreakIterator_Unicode::BIMap theBIMap
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
std::shared_ptr< BI_ValueData > mpValue