LibreOffice Module i18npool (master) 1
breakiterator_unicode.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
22#include <localedata.hxx>
25#include <unicode/uchar.h>
26#include <unicode/locid.h>
27#include <unicode/rbbi.h>
28#include <unicode/udata.h>
29#include <rtl/strbuf.hxx>
30#include <rtl/ustring.hxx>
31
32#include <com/sun/star/i18n/BreakType.hpp>
33#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34#include <com/sun/star/i18n/WordType.hpp>
35
36U_CDECL_BEGIN
37extern const char OpenOffice_dat[];
38U_CDECL_END
39
40using namespace ::com::sun::star;
41using namespace ::com::sun::star::i18n;
42using namespace ::com::sun::star::lang;
43
44namespace i18npool {
45
46// Cache map of breakiterators, stores state information so has to be
47// thread_local.
49
// Constructor initializer list and (empty) body.
// NOTE(review): the line carrying the constructor signature is not present in
// this view; confirm against the original file.
// Initializes cBreakIterator with the implementation name string, lineRule
// with "line" (presumably the rule name used when loading the line iterator —
// confirm) and icuBI with nullptr, i.e. no iterator slot is selected yet.
51 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
52 , lineRule( "line" )
53 , icuBI( nullptr )
54{
55}
56
58{
59}
60
61namespace {
62
63/*
64 Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
65 setbreakType method.
66*/
67class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
68{
69 public:
70#if (U_ICU_VERSION_MAJOR_NUM < 58)
71 // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
72 void publicSetBreakType(int32_t type)
73 {
74 setBreakType(type);
75 };
76#endif
77 OOoRuleBasedBreakIterator(UDataMemory* image,
78 UErrorCode &status)
79 : icu::RuleBasedBreakIterator(image, status)
80 { };
81
82};
83
84}
85
86// loading ICU breakiterator on demand.
// Selects the iterator slot for rBreakType (character, words[nWordType],
// sentence or line), makes icuBI point at it, ensures the slot holds a break
// iterator matching rLocale/rule, and finally (re)binds that iterator to rText.
//
// Parameters:
//   rLocale    - locale the iterator is wanted for
//   rBreakType - one of the LOAD_*_BREAKITERATOR constants
//   nWordType  - WordType constant; indexes words[] (see the note on the
//                first switch below)
//   rule       - rule name, may be null; it can be replaced by
//                "edit_word"/"dict_word"/"count_word" in the nested switch
//   rText      - text the iterator is set to
//
// Lookup order when the slot is not already keyed with the global key
// "langtag;rule;breakType;rBreakType;nWordType":
//   1. theBIMap under the global key
//   2. if a rule is given:
//        a. locale data supplies a rule name for breakType:
//           key "langtag;rule;breakType", else load it with udata_open()
//        b. otherwise, unless the language is th/lo/bo/dz/km:
//           key "language;rule" / data "<rule>_<language>", then
//           key ";rule" / data "<rule>"
//   3. ICU's locale based factory functions, key "langtag;;;rBreakType"
// Every hit on a secondary key is also registered under the global key.
//
// Throws uno::RuntimeException when udata_setAppData() fails, when step 3
// cannot create an iterator, when no iterator is available at the end, or
// when opening the UText / setting the text fails.
87void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
88 sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText)
89{
90 bool bNewBreak = false;
91 UErrorCode status = U_ZERO_ERROR;
92 sal_Int16 breakType = 0;
// NOTE(review): source line 95 is not present in this view. The statements
// following the LOAD_CHARACTER_BREAKITERATOR case presumably sit under a
// `case LOAD_WORD_BREAKITERATOR:` label — confirm against the original file.
93 switch (rBreakType) {
94 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
96 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
97 icuBI=&words[nWordType];
98 switch (nWordType) {
99 case WordType::ANY_WORD: break; // odd but previous behavior
100 case WordType::ANYWORD_IGNOREWHITESPACES:
101 breakType = 0; rule = "edit_word"; break;
102 case WordType::DICTIONARY_WORD:
103 breakType = 1; rule = "dict_word"; break;
104 default:
105 case WordType::WORD_COUNT:
106 breakType = 2; rule = "count_word"; break;
107 }
108 break;
109 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
110 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
111 }
112
113 // Using the cache map prevents accessing the file system for each
114 // udata_open() where ICU tries first files then data objects. And that for
115 // two fallbacks worst case... for each new allocated EditEngine, layout
116 // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
117 // This also speeds up loading iterators for alternating or generally more
118 // than one language/locale in that iterators are not constructed and
119 // destroyed en masse.
120 // Four possible keys, locale rule based with break type, locale rule based
121 // only, rule based only, locale based with break type. A fifth global key
122 // for the initial lookup.
123 // Multiple global keys may map to identical value data.
124 // All enums used here should be in the range 0..9 so assert that and avoid
125 // expensive numeric conversion in append() for faster construction of the
126 // always used global key.
127 assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
128 const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
129 OStringBuffer aKeyBuf(64);
130 aKeyBuf.append( aLangtagStr).append(';');
131 if (rule)
132 aKeyBuf.append(rule);
// Each value is appended as a single digit character ('0' + value), which
// is why the assert above restricts them to 0..9.
133 aKeyBuf.append(';').append( static_cast<char>('0'+breakType)).append(';').
134 append( static_cast<char>('0'+rBreakType)).append(';').append( static_cast<char>('0'+nWordType));
135 // langtag;rule;breakType;rBreakType;nWordType
136 const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
137
// Only do the lookup/creation work when the slot is keyed differently or
// holds no iterator yet.
138 if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
139 {
140
141 auto aMapIt( theBIMap.find( aBIMapGlobalKey));
142 bool bInMap = (aMapIt != theBIMap.end());
143 if (bInMap)
144 icuBI->mpValue = aMapIt->second;
145 else
146 icuBI->mpValue.reset();
147
// Rule based iterators. The do { } while (false) is a single pass so that
// `break` can leave early on a cache hit.
148 if (!bInMap && rule)
149 do
150 {
151 const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
152
153 status = U_ZERO_ERROR;
154 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
155 if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
156
157 std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
158
// The locale data names a rule for this breakType: use that name as the
// data item to open.
159 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
160 {
161 // langtag;rule;breakType
162 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
163 aMapIt = theBIMap.find( aBIMapRuleTypeKey);
164 bInMap = (aMapIt != theBIMap.end());
165 if (bInMap)
166 {
167 icuBI->mpValue = aMapIt->second;
168 icuBI->maBIMapKey = aBIMapGlobalKey;
169 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
170 break; // do
171 }
172
173 rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk",
174 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
175
176 if (U_SUCCESS(status))
177 {
178 icuBI->mpValue = std::make_shared<BI_ValueData>();
179 icuBI->mpValue->mpBreakIterator = rbi;
180 theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
181 }
182 else
183 {
184 rbi.reset();
185 }
186 }
187 //use icu's breakiterator for Thai, Tibetan and Dzongkha
// (the condition below also excludes "lo" and "km")
188 else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
189 {
190 // language;rule (not langtag, unless we'd actually load such)
191 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
192 const OString aBIMapRuleKey( aLanguage + ";" + rule);
193 aMapIt = theBIMap.find( aBIMapRuleKey);
194 bInMap = (aMapIt != theBIMap.end());
195 if (bInMap)
196 {
197 icuBI->mpValue = aMapIt->second;
198 icuBI->maBIMapKey = aBIMapGlobalKey;
199 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
200 break; // do
201 }
202
// First try the language specific data item "<rule>_<language>".
203 status = U_ZERO_ERROR;
204 OString aUDName = OString::Concat(rule) + "_" + aLanguage;
205 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
206 if( U_SUCCESS(status) )
207 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
208 if ( U_SUCCESS(status) )
209 {
210 icuBI->mpValue = std::make_shared<BI_ValueData>();
211 icuBI->mpValue->mpBreakIterator = rbi;
212 theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
213 }
214 else
215 {
216 rbi.reset();
217
// Fall back to the language independent data item "<rule>".
218 // ;rule (only)
219 const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule);
220 aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
221 bInMap = (aMapIt != theBIMap.end());
222 if (bInMap)
223 {
224 icuBI->mpValue = aMapIt->second;
225 icuBI->maBIMapKey = aBIMapGlobalKey;
226 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
227 break; // do
228 }
229
230 status = U_ZERO_ERROR;
231 pUData = udata_open("OpenOffice", "brk", rule, &status);
232 if( U_SUCCESS(status) )
233 rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
234 if ( U_SUCCESS(status) )
235 {
236 icuBI->mpValue = std::make_shared<BI_ValueData>();
237 icuBI->mpValue->mpBreakIterator = rbi;
238 theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
239 }
240 else
241 {
242 rbi.reset();
243 }
244 }
245 }
246 if (rbi) {
247 #if (U_ICU_VERSION_MAJOR_NUM < 58)
248 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
249 // instead of protected, so the old workaround of
250 // https://ssl.icu-project.org/trac/ticket/5498
251 // doesn't work anymore. However, they also claim to have fixed
252 // the cause that an initial fBreakType==-1 would lead to an
253 // endless loop under some circumstances.
254 // Let's see ...
255 switch (rBreakType) {
256 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
257 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
258 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
259 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
260 }
261 #endif
262 }
263 } while (false);
264
// Still nothing usable: fall back to ICU's locale based iterators.
265 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
266 do
267 {
268 // langtag;;;rBreakType (empty rule; empty breakType)
269 const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
270 aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
271 bInMap = (aMapIt != theBIMap.end());
272 if (bInMap)
273 {
274 icuBI->mpValue = aMapIt->second;
275 icuBI->maBIMapKey = aBIMapGlobalKey;
276 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
277 break; // do
278 }
279
280 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
281 std::shared_ptr< icu::BreakIterator > pBI;
282
283 status = U_ZERO_ERROR;
// NOTE(review): source lines 285, 288, 291 and 294 are not present in this
// view; presumably they carry the `case LOAD_*_BREAKITERATOR:` labels for
// the four factory calls below — confirm against the original file.
284 switch (rBreakType) {
286 pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
287 break;
289 pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
290 break;
292 pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
293 break;
295 pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
296 break;
297 }
298 if ( !U_SUCCESS(status) || !pBI ) {
299 throw uno::RuntimeException();
300 }
301 icuBI->mpValue = std::make_shared<BI_ValueData>();
302 icuBI->mpValue->mpBreakIterator = pBI;
303 theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
304 } while (false);
305 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
306 throw uno::RuntimeException();
307 }
// Remember the key on the slot and register the value under the global key
// unless the last map lookup was a hit.
308 icuBI->maBIMapKey = aBIMapGlobalKey;
309 if (!bInMap)
310 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
311 bNewBreak=true;
312 }
313
// Nothing more to do when the iterator was not (re)selected above and rText
// has the same pData pointer as the text stored with the iterator.
314 if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData))
315 return;
316
317 const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
318
// Passes the existing mpUt to utext_openUChars() and stores the returned
// UText back, then points the break iterator at it.
319 status = U_ZERO_ERROR;
320 icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
321
322 if (!U_SUCCESS(status))
323 throw uno::RuntimeException();
324
325 icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
326
327 if (!U_SUCCESS(status))
328 throw uno::RuntimeException();
329
// Stored for the pData comparison above on the next call.
330 icuBI->mpValue->maICUText = rText;
331}
332
// Moves forward from nStartPos by up to nCount steps and returns the
// resulting index; nDone receives the number of steps completed.
//  - SKIPCELL mode: each step is following() on the `character` break
//    iterator. If the iterator reports DONE the function returns
//    Text.getLength() at once and nDone does not count that step.
//  - any other mode: each step advances one code point through
//    iterateCodePoints() and stops at the end of Text.
// NOTE(review): source line 338 is not present in this view. rLocale is
// unused in the visible code and nothing here loads `character`; presumably
// the missing line does that — confirm against the original file.
333sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
334 sal_Int32 nStartPos, const lang::Locale &rLocale,
335 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
336{
337 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
339 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
340 for (nDone = 0; nDone < nCount; nDone++) {
341 nStartPos = pBI->following(nStartPos);
342 if (nStartPos == icu::BreakIterator::DONE)
343 return Text.getLength();
344 }
345 } else { // for CHARACTER mode
346 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
347 Text.iterateCodePoints(&nStartPos);
348 }
349 return nStartPos;
350}
351
// Moves backward from nStartPos by up to nCount steps and returns the
// resulting index; nDone receives the number of steps completed.
//  - SKIPCELL mode: each step is preceding() on the `character` break
//    iterator. If the iterator reports DONE the function returns 0 at once
//    and nDone does not count that step.
//  - any other mode: each step moves back one code point through
//    iterateCodePoints(..., -1) and stops at index 0.
// NOTE(review): source line 357 is not present in this view. rLocale is
// unused in the visible code and nothing here loads `character`; presumably
// the missing line does that — confirm against the original file.
352sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
353 sal_Int32 nStartPos, const lang::Locale& rLocale,
354 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
355{
356 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
358 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
359 for (nDone = 0; nDone < nCount; nDone++) {
360 nStartPos = pBI->preceding(nStartPos);
361 if (nStartPos == icu::BreakIterator::DONE)
362 return 0;
363 }
364 } else { // for BS to delete one char and CHARACTER mode.
365 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
366 Text.iterateCodePoints(&nStartPos, -1);
367 }
368 return nStartPos;
369}
370
371
372Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
373 const lang::Locale& rLocale, sal_Int16 rWordType )
374{
375 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
376
377 Boundary rv;
378 rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
379 if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
380 rv.endPos = result.startPos;
381 else {
382 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
383 && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
384 || (rWordType == WordType::DICTIONARY_WORD
385 && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
386 rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
387
388 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
389 if(rv.endPos == icu::BreakIterator::DONE)
390 rv.endPos = rv.startPos;
391 }
392 return rv;
393}
394
395
396Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
397 const lang::Locale& rLocale, sal_Int16 rWordType)
398{
399 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
400
401 Boundary rv;
402 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
403 if( rv.startPos < 0)
404 rv.endPos = rv.startPos;
405 else {
406
407 if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
408 && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
409 || (rWordType == WordType::DICTIONARY_WORD
410 && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
411 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
412
413 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
414 if(rv.endPos == icu::BreakIterator::DONE)
415 rv.endPos = rv.startPos;
416 }
417 return rv;
418}
419
420
421Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
422 sal_Int16 rWordType, sal_Bool bDirection )
423{
424 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
425 sal_Int32 len = Text.getLength();
426
427 Boundary rv;
428 if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
429 rv.startPos = rv.endPos = nPos;
430 if((bDirection || nPos == 0) && nPos < len) //forward
431 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
432 else
433 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
434 } else {
435 if(nPos <= 0) {
436 rv.startPos = 0;
437 rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
438 } else if(nPos >= len) {
439 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
440 rv.endPos = len;
441 } else {
442 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
443 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
444 }
445 }
446 if (rv.startPos == icu::BreakIterator::DONE)
447 rv.startPos = rv.endPos;
448 else if (rv.endPos == icu::BreakIterator::DONE)
449 rv.endPos = rv.startPos;
450
451 return rv;
452}
453
454
// Returns the index at which the sentence around nStartPos begins.
// An nStartPos equal to the text length is first moved back one code point.
// If the position is not a boundary of the `sentence` iterator, it is moved
// to the preceding boundary. White space after that boundary is then
// skipped, so the result points at the first non-white-space code point
// found (or the last code point examined when the end of Text is reached).
// NOTE(review): source line 458 is not present in this view. rLocale is
// unused in the visible code and nothing here loads `sentence`; presumably
// the missing line does that — confirm against the original file.
455sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
456 const lang::Locale &rLocale )
457{
459
460 sal_Int32 len = Text.getLength();
461 if (len > 0 && nStartPos == len)
462 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
463 if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
464 nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
465
// Read code points forward while they are white space; the loop leaves
// nStartPos one code point past the last one read, so step back once.
466 // skip preceding space.
467 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
468 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
469 Text.iterateCodePoints(&nStartPos, -1);
470
471 return nStartPos;
472}
473
// Returns the index at which the sentence around nStartPos ends.
// An nStartPos equal to the text length is first moved back one code point.
// The position is then moved to the following boundary of the `sentence`
// iterator, and white space directly in front of that boundary is excluded
// by walking backward. No special handling of icu::BreakIterator::DONE is
// visible here.
// NOTE(review): source line 477 is not present in this view. rLocale is
// unused in the visible code and nothing here loads `sentence`; presumably
// the missing line does that — confirm against the original file.
474sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
475 const lang::Locale &rLocale )
476{
478
479 sal_Int32 len = Text.getLength();
480 if (len > 0 && nStartPos == len)
481 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
482 nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
483
// nPos is stepped back one code point per iteration; nStartPos follows it
// for as long as the code point just stepped over is white space.
484 sal_Int32 nPos=nStartPos;
485 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
486
487 return nStartPos;
488}
489
// Determines where a line ending at or before nStartPos can be broken.
//
// Parameters:
//   Text         - text of the line
//   nStartPos    - index at which a break is wanted
//   rLocale      - locale, passed on to getWordBoundary() and the hyphenator
//   nMinBreakPos - a hyphenation position below this index gives breakIndex -1
//   hOptions     - hyphenator reference, hyphenIndex and hyphenation options
//   (last parameter is unused)
//
// Returns a LineBreakResults with breakIndex and breakType set, plus
// rHyphenatedWord when the hyphenator delivered a result.
//
// Each pass of the loop picks one of three cases:
//   1. nStartPos is itself a line boundary (and not preceded by '/')
//   2. a hyphenator is set: try to hyphenate the word at nStartPos
//   3. otherwise break at the line boundary preceding nStartPos
// The loop repeats while the chosen word-boundary break is adjacent to a
// Word Joiner (U+2060).
490LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
491 const OUString& Text, sal_Int32 nStartPos,
492 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
493 const LineBreakHyphenationOptions& hOptions,
494 const LineBreakUserOptions& /*rOptions*/ )
495{
496 LineBreakResults lbr;
497
// At or past the end of the text: break at the end.
498 if (nStartPos >= Text.getLength()) {
499 lbr.breakIndex = Text.getLength();
500 lbr.breakType = BreakType::WORDBOUNDARY;
501 return lbr;
502 }
503
// NOTE(review): source line 504 is not present in this view. Nothing visible
// here loads `line`; presumably the missing line does that — confirm against
// the original file.
505
506 icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
507 bool GlueSpace=true;
508 while (GlueSpace) {
509 // don't break with Slash U+002F SOLIDUS at end of line; see "else" below!
// preceding(nStartPos + 1) == nStartPos holds when nStartPos is itself a
// boundary of the line iterator.
510 if (pLineBI->preceding(nStartPos + 1) == nStartPos
511 && (nStartPos == 0 || Text[nStartPos - 1] != '/'))
512 { //Line boundary break
513 lbr.breakIndex = nStartPos;
514 lbr.breakType = BreakType::WORDBOUNDARY;
515 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
// Next line boundary after the current iterator position, or 0 if none.
516 sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
517 pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
518
// Step back over punctuation so the word lookup starts inside the word.
519 sal_Int32 nStartPosWordEnd = nStartPos;
520 while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
521 nStartPosWordEnd --;
522
523 Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
524 WordType::DICTIONARY_WORD, false);
525
// From here on nStartPosWordEnd is reused as a count: the number of
// punctuation characters following the word, capped by the distance from
// the word start to hOptions.hyphenIndex.
526 nStartPosWordEnd = wBoundary.endPos;
527 while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
528 nStartPosWordEnd ++;
529 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
530 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
531#define SPACE 0x0020
// Moves boundary_with_punctuation back over spaces; the variable is not read
// again in the visible code.
532 while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
// Ask the hyphenator for the word; the maximum position is relative to the
// word start and reduced by the punctuation count when hyphenIndex equals
// the word end.
533 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
534 wBoundary.endPos - wBoundary.startPos), rLocale,
535 static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
536 if (aHyphenatedWord.is()) {
537 lbr.rHyphenatedWord = aHyphenatedWord;
538 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
539 lbr.breakIndex = -1;
540 else
541 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
542 lbr.breakType = BreakType::HYPHENATION;
543
544 // check not optimal hyphenation of "word-word" (word with hyphens)
545 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
546 lbr.breakIndex = pLineBI->current();
547 lbr.breakType = BreakType::WORDBOUNDARY;
548 }
549
550 } else {
// No hyphenation offered: break at the preceding line boundary.
551 lbr.breakIndex = pLineBI->preceding(nStartPos);
552 lbr.breakType = BreakType::WORDBOUNDARY;
553 }
554 } else { //word boundary break
555 lbr.breakIndex = pLineBI->preceding(nStartPos);
556 lbr.breakType = BreakType::WORDBOUNDARY;
557
558 // Special case for Slash U+002F SOLIDUS in URI and path names.
559 // TR14 defines that as SY: Symbols Allowing Break After (A).
560 // This is unwanted in paths, see also i#17155
561 if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
562 {
563 // Look backward and take any whitespace before as a break
564 // opportunity. This also glues something like "w/o".
565 // Avoid an overly long path and break it as was indicated.
566 // Overly long here is arbitrarily defined.
567 const sal_Int32 nOverlyLong = 66;
568 sal_Int32 nPos = lbr.breakIndex - 1;
569 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
570 {
571 if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
572 {
573 lbr.breakIndex = nPos + 1;
574 break;
575 }
576 }
577 }
578 }
579
580#define WJ 0x2060 // Word Joiner
// For a word-boundary break, look at the break position and the white space
// / Word Joiner characters in front of it. If a Word Joiner is among them,
// loop again from the position in front of that run, or give up with
// breakIndex 0 when the start of the text is reached.
581 GlueSpace=false;
582 if (lbr.breakType == BreakType::WORDBOUNDARY) {
583 nStartPos = lbr.breakIndex;
584 if (nStartPos >= 0 && Text[nStartPos--] == WJ)
585 GlueSpace=true;
586 while (nStartPos >= 0 &&
587 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
588 if (Text[nStartPos--] == WJ)
589 GlueSpace=true;
590 }
591 if (GlueSpace && nStartPos < 0) {
592 lbr.breakIndex = 0;
593 break;
594 }
595 }
596 }
597
598 return lbr;
599}
600
// Returns the string held in cBreakIterator converted to an OUString.
// NOTE(review): the line carrying the qualified function name (source line
// 602) is not present in this view; confirm against the original file.
601OUString SAL_CALL
603{
604 return OUString::createFromAscii(cBreakIterator);
605}
606
607sal_Bool SAL_CALL
608BreakIterator_Unicode::supportsService(const OUString& rServiceName)
609{
610 return cppu::supportsService(this, rServiceName);
611}
612
// Returns a one-element sequence containing the string held in
// cBreakIterator converted to an OUString.
// NOTE(review): the line carrying the qualified function name (source line
// 614) is not present in this view; confirm against the original file.
613uno::Sequence< OUString > SAL_CALL
615{
616 uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
617 return aRet;
618}
619
620}
621
// Exported C entry point that creates a new i18npool::BreakIterator_Unicode
// and returns it through cppu::acquire(). Both parameters are unnamed and
// unused.
// NOTE(review): the line carrying the function name (source line 623) is not
// present in this view; confirm against the original file.
622extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
624 css::uno::XComponentContext *,
625 css::uno::Sequence<css::uno::Any> const &)
626{
627 return cppu::acquire(new i18npool::BreakIterator_Unicode());
628}
629
630/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_Unicode_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
#define SPACE
#define WJ
U_CDECL_BEGIN const char OpenOffice_dat[]
#define LOAD_WORD_BREAKITERATOR
#define LOAD_SENTENCE_BREAKITERATOR
#define LOAD_LINE_BREAKITERATOR
#define LOAD_CHARACTER_BREAKITERATOR
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
static OUString convertToBcp47(LanguageType nLangID)
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
void loadICUBreakIterator(const css::lang::Locale &rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const char *name, const OUString &rText)
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
virtual css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
struct i18npool::BreakIterator_Unicode::BI_Data * icuBI
struct i18npool::BreakIterator_Unicode::BI_Data sentence
virtual sal_Int32 SAL_CALL endOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
struct i18npool::BreakIterator_Unicode::BI_Data line
virtual OUString SAL_CALL getImplementationName() override
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
std::unordered_map< OString, std::shared_ptr< BI_ValueData > > BIMap
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
struct i18npool::BreakIterator_Unicode::BI_Data character
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Int32 SAL_CALL beginOfSentence(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale) override
static rtl::Reference< LocaleDataImpl > get()
Definition: localedata.hxx:77
int nCount
sal_uInt16 nPos
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
Constant values shared between i18npool and, for example, the number formatter.
static thread_local BreakIterator_Unicode::BIMap theBIMap
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
unsigned char sal_Bool