LibreOffice Module i18npool (master) 1
breakiterator_unicode.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
22#include <localedata.hxx>
25#include <unicode/uchar.h>
26#include <unicode/locid.h>
27#include <unicode/rbbi.h>
28#include <unicode/udata.h>
29#include <rtl/strbuf.hxx>
30#include <rtl/ustring.hxx>
31
32#include <com/sun/star/i18n/BreakType.hpp>
33#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34#include <com/sun/star/i18n/WordType.hpp>
35
36U_CDECL_BEGIN
37extern const char OpenOffice_dat[];
38U_CDECL_END
39
40using namespace ::com::sun::star;
41using namespace ::com::sun::star::i18n;
42using namespace ::com::sun::star::lang;
43
44namespace i18npool {
45
46// Cache map of breakiterators, stores state information so has to be
47// thread_local.
49
51 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
52 , lineRule( "line" )
53 , icuBI( nullptr )
54{
55}
56
58{
59}
60
61namespace {
62
63/*
64 Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
65 setbreakType method.
66*/
67class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
68{
69 public:
70#if (U_ICU_VERSION_MAJOR_NUM < 58)
71 // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
72 void publicSetBreakType(int32_t type)
73 {
74 setBreakType(type);
75 };
76#endif
77 OOoRuleBasedBreakIterator(UDataMemory* image,
78 UErrorCode &status)
79 : icu::RuleBasedBreakIterator(image, status)
80 { };
81
82};
83
84}
85
86// loading ICU breakiterator on demand.
87void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
88 sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText)
89{
90 bool bNewBreak = false;
91 UErrorCode status = U_ZERO_ERROR;
92 sal_Int16 breakType = 0;
93 switch (rBreakType) {
94 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
96 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
97 icuBI=&words[nWordType];
98 switch (nWordType) {
99 case WordType::ANY_WORD: break; // odd but previous behavior
100 case WordType::ANYWORD_IGNOREWHITESPACES:
101 breakType = 0; rule = "edit_word"; break;
102 case WordType::DICTIONARY_WORD:
103 breakType = 1; rule = "dict_word"; break;
104 default:
105 case WordType::WORD_COUNT:
106 breakType = 2; rule = "count_word"; break;
107 }
108 break;
109 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
110 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
111 }
112
113 // Using the cache map prevents accessing the file system for each
114 // udata_open() where ICU tries first files then data objects. And that for
115 // two fallbacks worst case... for each new allocated EditEngine, layout
116 // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
117 // This also speeds up loading iterators for alternating or generally more
118 // than one language/locale in that iterators are not constructed and
119 // destroyed en masse.
120 // Four possible keys, locale rule based with break type, locale rule based
121 // only, rule based only, locale based with break type. A fifth global key
122 // for the initial lookup.
123 // Multiple global keys may map to identical value data.
124 // All enums used here should be in the range 0..9 so assert that and avoid
125 // expensive numeric conversion in append() for faster construction of the
126 // always used global key.
127 assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
128 const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
129 OStringBuffer aKeyBuf(64);
130 aKeyBuf.append( aLangtagStr + ";" );
131 if (rule)
132 aKeyBuf.append(rule);
133 aKeyBuf.append(";" + OStringChar(static_cast<char>('0'+breakType)) + ";"
134 + OStringChar(static_cast<char>('0'+rBreakType)) + ";"
135 + OStringChar( static_cast<char>('0'+nWordType)));
136 // langtag;rule;breakType;rBreakType;nWordType
137 const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
138
139 if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
140 {
141
142 auto aMapIt( theBIMap.find( aBIMapGlobalKey));
143 bool bInMap = (aMapIt != theBIMap.end());
144 if (bInMap)
145 icuBI->mpValue = aMapIt->second;
146 else
147 icuBI->mpValue.reset();
148
149 if (!bInMap && rule)
150 do
151 {
152 const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
153
154 status = U_ZERO_ERROR;
155 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
156 if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
157
158 std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
159
160 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
161 {
162 // langtag;rule;breakType
163 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
164 aMapIt = theBIMap.find( aBIMapRuleTypeKey);
165 bInMap = (aMapIt != theBIMap.end());
166 if (bInMap)
167 {
168 icuBI->mpValue = aMapIt->second;
169 icuBI->maBIMapKey = aBIMapGlobalKey;
170 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
171 break; // do
172 }
173
174 rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk",
175 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
176
177 if (U_SUCCESS(status))
178 {
179 icuBI->mpValue = std::make_shared<BI_ValueData>();
180 icuBI->mpValue->mpBreakIterator = rbi;
181 theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
182 }
183 else
184 {
185 rbi.reset();
186 }
187 }
188 //use icu's breakiterator for Thai, Tibetan and Dzongkha
189 else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
190 {
191 // language;rule (not langtag, unless we'd actually load such)
192 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
193 const OString aBIMapRuleKey( aLanguage + ";" + rule);
194 aMapIt = theBIMap.find( aBIMapRuleKey);
195 bInMap = (aMapIt != theBIMap.end());
196 if (bInMap)
197 {
198 icuBI->mpValue = aMapIt->second;
199 icuBI->maBIMapKey = aBIMapGlobalKey;
200 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
201 break; // do
202 }
203
204 status = U_ZERO_ERROR;
205 OString aUDName = OString::Concat(rule) + "_" + aLanguage;
206 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
207 if( U_SUCCESS(status) )
208 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
209 if ( U_SUCCESS(status) )
210 {
211 icuBI->mpValue = std::make_shared<BI_ValueData>();
212 icuBI->mpValue->mpBreakIterator = rbi;
213 theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
214 }
215 else
216 {
217 rbi.reset();
218
219 // ;rule (only)
220 const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule);
221 aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
222 bInMap = (aMapIt != theBIMap.end());
223 if (bInMap)
224 {
225 icuBI->mpValue = aMapIt->second;
226 icuBI->maBIMapKey = aBIMapGlobalKey;
227 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
228 break; // do
229 }
230
231 status = U_ZERO_ERROR;
232 pUData = udata_open("OpenOffice", "brk", rule, &status);
233 if( U_SUCCESS(status) )
234 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
235 if ( U_SUCCESS(status) )
236 {
237 icuBI->mpValue = std::make_shared<BI_ValueData>();
238 icuBI->mpValue->mpBreakIterator = rbi;
239 theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
240 }
241 else
242 {
243 rbi.reset();
244 }
245 }
246 }
247 if (rbi) {
248 #if (U_ICU_VERSION_MAJOR_NUM < 58)
249 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
250 // instead of protected, so the old workaround of
251 // https://ssl.icu-project.org/trac/ticket/5498
252 // doesn't work anymore. However, they also claim to have fixed
253 // the cause that an initial fBreakType==-1 would lead to an
254 // endless loop under some circumstances.
255 // Let's see ...
256 switch (rBreakType) {
257 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
258 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
259 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
260 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
261 }
262 #endif
263 }
264 } while (false);
265
266 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
267 do
268 {
269 // langtag;;;rBreakType (empty rule; empty breakType)
270 const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
271 aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
272 bInMap = (aMapIt != theBIMap.end());
273 if (bInMap)
274 {
275 icuBI->mpValue = aMapIt->second;
276 icuBI->maBIMapKey = aBIMapGlobalKey;
277 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
278 break; // do
279 }
280
281 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
282 std::shared_ptr< icu::BreakIterator > pBI;
283
284 status = U_ZERO_ERROR;
285 switch (rBreakType) {
287 pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
288 break;
290 pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
291 break;
293 pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
294 break;
296 pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
297 break;
298 }
299 if ( !U_SUCCESS(status) || !pBI ) {
300 throw uno::RuntimeException();
301 }
302 icuBI->mpValue = std::make_shared<BI_ValueData>();
303 icuBI->mpValue->mpBreakIterator = pBI;
304 theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
305 } while (false);
306 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
307 throw uno::RuntimeException();
308 }
309 icuBI->maBIMapKey = aBIMapGlobalKey;
310 if (!bInMap)
311 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
312 bNewBreak=true;
313 }
314
315 if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData))
316 return;
317
318 const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
319
320 status = U_ZERO_ERROR;
321 icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
322
323 if (!U_SUCCESS(status))
324 throw uno::RuntimeException();
325
326 icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
327
328 if (!U_SUCCESS(status))
329 throw uno::RuntimeException();
330
331 icuBI->mpValue->maICUText = rText;
332}
333
334sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
335 sal_Int32 nStartPos, const lang::Locale &rLocale,
336 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
337{
338 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
340 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
341 for (nDone = 0; nDone < nCount; nDone++) {
342 nStartPos = pBI->following(nStartPos);
343 if (nStartPos == icu::BreakIterator::DONE)
344 return Text.getLength();
345 }
346 } else { // for CHARACTER mode
347 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
348 Text.iterateCodePoints(&nStartPos);
349 }
350 return nStartPos;
351}
352
353sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
354 sal_Int32 nStartPos, const lang::Locale& rLocale,
355 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
356{
357 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
359 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
360 for (nDone = 0; nDone < nCount; nDone++) {
361 nStartPos = pBI->preceding(nStartPos);
362 if (nStartPos == icu::BreakIterator::DONE)
363 return 0;
364 }
365 } else { // for BS to delete one char and CHARACTER mode.
366 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
367 Text.iterateCodePoints(&nStartPos, -1);
368 }
369 return nStartPos;
370}
371
372
373Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
374 const lang::Locale& rLocale, sal_Int16 rWordType )
375{
376 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
377
378 Boundary rv;
379 rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
380 if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
381 rv.endPos = result.startPos;
382 else {
383 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
384 && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
385 || (rWordType == WordType::DICTIONARY_WORD
386 && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
387 rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
388
389 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
390 if(rv.endPos == icu::BreakIterator::DONE)
391 rv.endPos = rv.startPos;
392 }
393 return rv;
394}
395
396
397Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
398 const lang::Locale& rLocale, sal_Int16 rWordType)
399{
400 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
401
402 Boundary rv;
403 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
404 if( rv.startPos < 0)
405 rv.endPos = rv.startPos;
406 else {
407
408 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
409 && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
410 || (rWordType == WordType::DICTIONARY_WORD
411 && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
412 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
413
414 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
415 if(rv.endPos == icu::BreakIterator::DONE)
416 rv.endPos = rv.startPos;
417 }
418 return rv;
419}
420
421
422Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
423 sal_Int16 rWordType, sal_Bool bDirection )
424{
425 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
426 sal_Int32 len = Text.getLength();
427
428 Boundary rv;
429 if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
430 rv.startPos = rv.endPos = nPos;
431 if((bDirection || nPos == 0) && nPos < len) //forward
432 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
433 else
434 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
435 } else {
436 if(nPos <= 0) {
437 rv.startPos = 0;
438 rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
439 } else if(nPos >= len) {
440 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
441 rv.endPos = len;
442 } else {
443 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
444 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
445 }
446 }
447 if (rv.startPos == icu::BreakIterator::DONE)
448 rv.startPos = rv.endPos;
449 else if (rv.endPos == icu::BreakIterator::DONE)
450 rv.endPos = rv.startPos;
451
452 return rv;
453}
454
455
456sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
457 const lang::Locale &rLocale )
458{
460
461 sal_Int32 len = Text.getLength();
462 if (len > 0 && nStartPos == len)
463 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
464 if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
465 nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
466
467 // skip preceding space.
468 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
469 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
470 Text.iterateCodePoints(&nStartPos, -1);
471
472 return nStartPos;
473}
474
475sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
476 const lang::Locale &rLocale )
477{
479
480 sal_Int32 len = Text.getLength();
481 if (len > 0 && nStartPos == len)
482 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
483 nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
484
485 sal_Int32 nPos=nStartPos;
486 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
487
488 return nStartPos;
489}
490
491LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
492 const OUString& Text, sal_Int32 nStartPos,
493 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
494 const LineBreakHyphenationOptions& hOptions,
495 const LineBreakUserOptions& /*rOptions*/ )
496{
497 LineBreakResults lbr;
498
499 if (nStartPos >= Text.getLength()) {
500 lbr.breakIndex = Text.getLength();
501 lbr.breakType = BreakType::WORDBOUNDARY;
502 return lbr;
503 }
504
506
507 icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
508 bool GlueSpace=true;
509 while (GlueSpace) {
510 // don't break with Slash U+002F SOLIDUS at end of line; see "else" below!
511 if (pLineBI->preceding(nStartPos + 1) == nStartPos
512 && (nStartPos == 0 || Text[nStartPos - 1] != '/'))
513 { //Line boundary break
514 lbr.breakIndex = nStartPos;
515 lbr.breakType = BreakType::WORDBOUNDARY;
516 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
517 sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
518 pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
519
520 sal_Int32 nStartPosWordEnd = nStartPos;
521 while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
522 nStartPosWordEnd --;
523
524 Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
525 WordType::DICTIONARY_WORD, false);
526
527 nStartPosWordEnd = wBoundary.endPos;
528 while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
529 nStartPosWordEnd ++;
530 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
531 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
532#define SPACE 0x0020
533 while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
534 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
535 wBoundary.endPos - wBoundary.startPos), rLocale,
536 static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
537 if (aHyphenatedWord.is()) {
538 lbr.rHyphenatedWord = aHyphenatedWord;
539 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
540 lbr.breakIndex = -1;
541 else
542 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
543 lbr.breakType = BreakType::HYPHENATION;
544
545 // check not optimal hyphenation of "word-word" (word with hyphens)
546 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
547 lbr.breakIndex = pLineBI->current();
548 lbr.breakType = BreakType::WORDBOUNDARY;
549 }
550
551 } else {
552 lbr.breakIndex = pLineBI->preceding(nStartPos);
553 lbr.breakType = BreakType::WORDBOUNDARY;
554 }
555 } else { //word boundary break
556 lbr.breakIndex = pLineBI->preceding(nStartPos);
557 lbr.breakType = BreakType::WORDBOUNDARY;
558
559 // Special case for Slash U+002F SOLIDUS in URI and path names.
560 // TR14 defines that as SY: Symbols Allowing Break After (A).
561 // This is unwanted in paths, see also i#17155
562 if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
563 {
564 // Look backward and take any whitespace before as a break
565 // opportunity. This also glues something like "w/o".
566 // Avoid an overly long path and break it as was indicated.
567 // Overly long here is arbitrarily defined.
568 const sal_Int32 nOverlyLong = 66;
569 sal_Int32 nPos = lbr.breakIndex - 1;
570 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
571 {
572 if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
573 {
574 lbr.breakIndex = nPos + 1;
575 break;
576 }
577 }
578 }
579 }
580
581#define WJ 0x2060 // Word Joiner
582 GlueSpace=false;
583 if (lbr.breakType == BreakType::WORDBOUNDARY) {
584 nStartPos = lbr.breakIndex;
585 if (nStartPos >= 0 && Text[nStartPos--] == WJ)
586 GlueSpace=true;
587 while (nStartPos >= 0 &&
588 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
589 if (Text[nStartPos--] == WJ)
590 GlueSpace=true;
591 }
592 if (GlueSpace && nStartPos < 0) {
593 lbr.breakIndex = 0;
594 break;
595 }
596 }
597 }
598
599 return lbr;
600}
601
602OUString SAL_CALL
604{
605 return OUString::createFromAscii(cBreakIterator);
606}
607
608sal_Bool SAL_CALL
609BreakIterator_Unicode::supportsService(const OUString& rServiceName)
610{
611 return cppu::supportsService(this, rServiceName);
612}
613
614uno::Sequence< OUString > SAL_CALL
616{
617 uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
618 return aRet;
619}
620
621}
622
623extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
625 css::uno::XComponentContext *,
626 css::uno::Sequence<css::uno::Any> const &)
627{
628 return cppu::acquire(new i18npool::BreakIterator_Unicode());
629}
630
631/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_Unicode_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
#define SPACE
#define WJ
U_CDECL_BEGIN const char OpenOffice_dat[]
#define LOAD_WORD_BREAKITERATOR
#define LOAD_SENTENCE_BREAKITERATOR
#define LOAD_LINE_BREAKITERATOR
#define LOAD_CHARACTER_BREAKITERATOR
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
static OUString convertToBcp47(LanguageType nLangID)
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
void loadICUBreakIterator(const css::lang::Locale &rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const char *name, const OUString &rText)
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
struct i18npool::BreakIterator_Unicode::BI_Data * icuBI
struct i18npool::BreakIterator_Unicode::BI_Data sentence
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
struct i18npool::BreakIterator_Unicode::BI_Data line
virtual OUString SAL_CALL getImplementationName() override
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
std::unordered_map< OString, std::shared_ptr< BI_ValueData > > BIMap
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
struct i18npool::BreakIterator_Unicode::BI_Data character
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
static rtl::Reference< LocaleDataImpl > get()
Definition: localedata.hxx:77
int nCount
sal_uInt16 nPos
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
Constant values shared between i18npool and, for example, the number formatter.
static thread_local BreakIterator_Unicode::BIMap theBIMap
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
unsigned char sal_Bool