LibreOffice Module i18npool (master)  1
breakiterator_unicode.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
22 #include <localedata.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
31 
32 #include <com/sun/star/i18n/BreakType.hpp>
33 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
35 
36 U_CDECL_BEGIN
37 extern const char OpenOffice_dat[];
38 U_CDECL_END
39 
40 using namespace ::com::sun::star;
41 using namespace ::com::sun::star::i18n;
42 using namespace ::com::sun::star::lang;
43 
44 namespace i18npool {
45 
46 // Cache map of breakiterators, stores state information so has to be
47 // thread_local.
49 
51  : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
52  , lineRule( "line" )
53  , icuBI( nullptr )
54 {
55 }
56 
58 {
59 }
60 
61 namespace {
62 
63 /*
64  Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
65  setbreakType method.
66 */
67 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
68 {
69  public:
70 #if (U_ICU_VERSION_MAJOR_NUM < 58)
71  // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
72  void publicSetBreakType(int32_t type)
73  {
74  setBreakType(type);
75  };
76 #endif
77  OOoRuleBasedBreakIterator(UDataMemory* image,
78  UErrorCode &status)
79  : icu::RuleBasedBreakIterator(image, status)
80  { };
81 
82 };
83 
84 }
85 
86 // loading ICU breakiterator on demand.
87 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
88  sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText)
89 {
90  bool bNewBreak = false;
91  UErrorCode status = U_ZERO_ERROR;
92  sal_Int16 breakType = 0;
93  switch (rBreakType) {
94  case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
96  assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
97  icuBI=&words[nWordType];
98  switch (nWordType) {
99  case WordType::ANY_WORD: break; // odd but previous behavior
100  case WordType::ANYWORD_IGNOREWHITESPACES:
101  breakType = 0; rule = "edit_word"; break;
102  case WordType::DICTIONARY_WORD:
103  breakType = 1; rule = "dict_word"; break;
104  default:
105  case WordType::WORD_COUNT:
106  breakType = 2; rule = "count_word"; break;
107  }
108  break;
109  case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
110  case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
111  }
112 
113  // Using the cache map prevents accessing the file system for each
114  // udata_open() where ICU tries first files then data objects. And that for
115  // two fallbacks worst case... for each new allocated EditEngine, layout
116  // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
117  // This also speeds up loading iterators for alternating or generally more
118  // than one language/locale in that iterators are not constructed and
119  // destroyed en masse.
120  // Four possible keys, locale rule based with break type, locale rule based
121  // only, rule based only, locale based with break type. A fifth global key
122  // for the initial lookup.
123  // Multiple global keys may map to identical value data.
124  // All enums used here should be in the range 0..9 so assert that and avoid
125  // expensive numeric conversion in append() for faster construction of the
126  // always used global key.
127  assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
128  const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
129  OStringBuffer aKeyBuf(64);
130  aKeyBuf.append( aLangtagStr).append(';');
131  if (rule)
132  aKeyBuf.append(rule);
133  aKeyBuf.append(';').append( static_cast<char>('0'+breakType)).append(';').
134  append( static_cast<char>('0'+rBreakType)).append(';').append( static_cast<char>('0'+nWordType));
135  // langtag;rule;breakType;rBreakType;nWordType
136  const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
137 
138  if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
139  {
140 
141  auto aMapIt( theBIMap.find( aBIMapGlobalKey));
142  bool bInMap = (aMapIt != theBIMap.end());
143  if (bInMap)
144  icuBI->mpValue = aMapIt->second;
145  else
146  icuBI->mpValue.reset();
147 
148  if (!bInMap && rule)
149  do
150  {
151  const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
152 
153  status = U_ZERO_ERROR;
154  udata_setAppData("OpenOffice", OpenOffice_dat, &status);
155  if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
156 
157  std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
158 
159  if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
160  {
161  // langtag;rule;breakType
162  const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
163  aMapIt = theBIMap.find( aBIMapRuleTypeKey);
164  bInMap = (aMapIt != theBIMap.end());
165  if (bInMap)
166  {
167  icuBI->mpValue = aMapIt->second;
168  icuBI->maBIMapKey = aBIMapGlobalKey;
169  theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
170  break; // do
171  }
172 
173  rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk",
174  OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
175 
176  if (U_SUCCESS(status))
177  {
178  icuBI->mpValue = std::make_shared<BI_ValueData>();
179  icuBI->mpValue->mpBreakIterator = rbi;
180  theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
181  }
182  else
183  {
184  rbi.reset();
185  }
186  }
187  //use icu's breakiterator for Thai, Tibetan and Dzongkha
188  else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
189  {
190  // language;rule (not langtag, unless we'd actually load such)
191  OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
192  const OString aBIMapRuleKey( aLanguage + ";" + rule);
193  aMapIt = theBIMap.find( aBIMapRuleKey);
194  bInMap = (aMapIt != theBIMap.end());
195  if (bInMap)
196  {
197  icuBI->mpValue = aMapIt->second;
198  icuBI->maBIMapKey = aBIMapGlobalKey;
199  theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
200  break; // do
201  }
202 
203  status = U_ZERO_ERROR;
204  OString aUDName = OString::Concat(rule) + "_" + aLanguage;
205  UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
206  if( U_SUCCESS(status) )
207  rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
208  if ( U_SUCCESS(status) )
209  {
210  icuBI->mpValue = std::make_shared<BI_ValueData>();
211  icuBI->mpValue->mpBreakIterator = rbi;
212  theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
213  }
214  else
215  {
216  rbi.reset();
217 
218  // ;rule (only)
219  const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule);
220  aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
221  bInMap = (aMapIt != theBIMap.end());
222  if (bInMap)
223  {
224  icuBI->mpValue = aMapIt->second;
225  icuBI->maBIMapKey = aBIMapGlobalKey;
226  theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
227  break; // do
228  }
229 
230  status = U_ZERO_ERROR;
231  pUData = udata_open("OpenOffice", "brk", rule, &status);
232  if( U_SUCCESS(status) )
233  rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
234  if ( U_SUCCESS(status) )
235  {
236  icuBI->mpValue = std::make_shared<BI_ValueData>();
237  icuBI->mpValue->mpBreakIterator = rbi;
238  theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
239  }
240  else
241  {
242  rbi.reset();
243  }
244  }
245  }
246  if (rbi) {
247  #if (U_ICU_VERSION_MAJOR_NUM < 58)
248  // ICU 58 made RuleBasedBreakIterator::setBreakType() private
249  // instead of protected, so the old workaround of
250  // https://ssl.icu-project.org/trac/ticket/5498
251  // doesn't work anymore. However, they also claim to have fixed
252  // the cause that an initial fBreakType==-1 would lead to an
253  // endless loop under some circumstances.
254  // Let's see ...
255  switch (rBreakType) {
256  case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
257  case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
258  case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
259  case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
260  }
261  #endif
262  }
263  } while (false);
264 
265  if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
266  do
267  {
268  // langtag;;;rBreakType (empty rule; empty breakType)
269  const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
270  aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
271  bInMap = (aMapIt != theBIMap.end());
272  if (bInMap)
273  {
274  icuBI->mpValue = aMapIt->second;
275  icuBI->maBIMapKey = aBIMapGlobalKey;
276  theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
277  break; // do
278  }
279 
280  icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
281  std::shared_ptr< icu::BreakIterator > pBI;
282 
283  status = U_ZERO_ERROR;
284  switch (rBreakType) {
286  pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
287  break;
289  pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
290  break;
292  pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
293  break;
295  pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
296  break;
297  }
298  if ( !U_SUCCESS(status) || !pBI ) {
299  throw uno::RuntimeException();
300  }
301  icuBI->mpValue = std::make_shared<BI_ValueData>();
302  icuBI->mpValue->mpBreakIterator = pBI;
303  theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
304  } while (false);
305  if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
306  throw uno::RuntimeException();
307  }
308  icuBI->maBIMapKey = aBIMapGlobalKey;
309  if (!bInMap)
310  theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
311  bNewBreak=true;
312  }
313 
314  if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData))
315  return;
316 
317  const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
318 
319  status = U_ZERO_ERROR;
320  icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
321 
322  if (!U_SUCCESS(status))
323  throw uno::RuntimeException();
324 
325  icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
326 
327  if (!U_SUCCESS(status))
328  throw uno::RuntimeException();
329 
330  icuBI->mpValue->maICUText = rText;
331 }
332 
333 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
334  sal_Int32 nStartPos, const lang::Locale &rLocale,
335  sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
336 {
337  if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
338  loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
339  icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
340  for (nDone = 0; nDone < nCount; nDone++) {
341  nStartPos = pBI->following(nStartPos);
342  if (nStartPos == icu::BreakIterator::DONE)
343  return Text.getLength();
344  }
345  } else { // for CHARACTER mode
346  for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
347  Text.iterateCodePoints(&nStartPos);
348  }
349  return nStartPos;
350 }
351 
352 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
353  sal_Int32 nStartPos, const lang::Locale& rLocale,
354  sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
355 {
356  if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
357  loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
358  icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
359  for (nDone = 0; nDone < nCount; nDone++) {
360  nStartPos = pBI->preceding(nStartPos);
361  if (nStartPos == icu::BreakIterator::DONE)
362  return 0;
363  }
364  } else { // for BS to delete one char and CHARACTER mode.
365  for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
366  Text.iterateCodePoints(&nStartPos, -1);
367  }
368  return nStartPos;
369 }
370 
371 
372 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
373  const lang::Locale& rLocale, sal_Int16 rWordType )
374 {
375  loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
376 
377  Boundary rv;
378  rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
379  if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
380  rv.endPos = result.startPos;
381  else {
382  if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
383  rWordType == WordType::DICTIONARY_WORD ) &&
384  u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
385  rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
386 
387  rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
388  if(rv.endPos == icu::BreakIterator::DONE)
389  rv.endPos = rv.startPos;
390  }
391  return rv;
392 }
393 
394 
395 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
396  const lang::Locale& rLocale, sal_Int16 rWordType)
397 {
398  loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
399 
400  Boundary rv;
401  rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
402  if( rv.startPos < 0)
403  rv.endPos = rv.startPos;
404  else {
405  if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
406  rWordType == WordType::DICTIONARY_WORD) &&
407  u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
408  rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
409 
410  rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
411  if(rv.endPos == icu::BreakIterator::DONE)
412  rv.endPos = rv.startPos;
413  }
414  return rv;
415 }
416 
417 
418 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
419  sal_Int16 rWordType, sal_Bool bDirection )
420 {
421  loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
422  sal_Int32 len = Text.getLength();
423 
424  Boundary rv;
425  if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
426  rv.startPos = rv.endPos = nPos;
427  if((bDirection || nPos == 0) && nPos < len) //forward
428  rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
429  else
430  rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
431  } else {
432  if(nPos <= 0) {
433  rv.startPos = 0;
434  rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
435  } else if(nPos >= len) {
436  rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
437  rv.endPos = len;
438  } else {
439  rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
440  rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
441  }
442  }
443  if (rv.startPos == icu::BreakIterator::DONE)
444  rv.startPos = rv.endPos;
445  else if (rv.endPos == icu::BreakIterator::DONE)
446  rv.endPos = rv.startPos;
447 
448  return rv;
449 }
450 
451 
452 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
453  const lang::Locale &rLocale )
454 {
455  loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
456 
457  sal_Int32 len = Text.getLength();
458  if (len > 0 && nStartPos == len)
459  Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
460  if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
461  nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
462 
463  // skip preceding space.
464  sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
465  while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
466  Text.iterateCodePoints(&nStartPos, -1);
467 
468  return nStartPos;
469 }
470 
471 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
472  const lang::Locale &rLocale )
473 {
474  loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
475 
476  sal_Int32 len = Text.getLength();
477  if (len > 0 && nStartPos == len)
478  Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
479  nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
480 
481  sal_Int32 nPos=nStartPos;
482  while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
483 
484  return nStartPos;
485 }
486 
487 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
488  const OUString& Text, sal_Int32 nStartPos,
489  const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
490  const LineBreakHyphenationOptions& hOptions,
491  const LineBreakUserOptions& /*rOptions*/ )
492 {
493  LineBreakResults lbr;
494 
495  if (nStartPos >= Text.getLength()) {
496  lbr.breakIndex = Text.getLength();
497  lbr.breakType = BreakType::WORDBOUNDARY;
498  return lbr;
499  }
500 
501  loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
502 
503  icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
504  bool GlueSpace=true;
505  while (GlueSpace) {
506  // don't break with Slash U+002F SOLIDUS at end of line; see "else" below!
507  if (pLineBI->preceding(nStartPos + 1) == nStartPos
508  && (nStartPos == 0 || Text[nStartPos - 1] != '/'))
509  { //Line boundary break
510  lbr.breakIndex = nStartPos;
511  lbr.breakType = BreakType::WORDBOUNDARY;
512  } else if (hOptions.rHyphenator.is()) { //Hyphenation break
513  sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
514  pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
515 
516  sal_Int32 nStartPosWordEnd = nStartPos;
517  while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
518  nStartPosWordEnd --;
519 
520  Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
521  WordType::DICTIONARY_WORD, false);
522 
523  nStartPosWordEnd = wBoundary.endPos;
524  while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
525  nStartPosWordEnd ++;
526  nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
527  if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
528 #define SPACE 0x0020
529  while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
530  uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
531  wBoundary.endPos - wBoundary.startPos), rLocale,
532  static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
533  if (aHyphenatedWord.is()) {
534  lbr.rHyphenatedWord = aHyphenatedWord;
535  if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
536  lbr.breakIndex = -1;
537  else
538  lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
539  lbr.breakType = BreakType::HYPHENATION;
540 
541  // check not optimal hyphenation of "word-word" (word with hyphens)
542  if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
543  lbr.breakIndex = pLineBI->current();
544  lbr.breakType = BreakType::WORDBOUNDARY;
545  }
546 
547  } else {
548  lbr.breakIndex = pLineBI->preceding(nStartPos);
549  lbr.breakType = BreakType::WORDBOUNDARY;
550  }
551  } else { //word boundary break
552  lbr.breakIndex = pLineBI->preceding(nStartPos);
553  lbr.breakType = BreakType::WORDBOUNDARY;
554 
555  // Special case for Slash U+002F SOLIDUS in URI and path names.
556  // TR14 defines that as SY: Symbols Allowing Break After (A).
557  // This is unwanted in paths, see also i#17155
558  if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
559  {
560  // Look backward and take any whitespace before as a break
561  // opportunity. This also glues something like "w/o".
562  // Avoid an overly long path and break it as was indicated.
563  // Overly long here is arbitrarily defined.
564  const sal_Int32 nOverlyLong = 66;
565  sal_Int32 nPos = lbr.breakIndex - 1;
566  while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
567  {
568  if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
569  {
570  lbr.breakIndex = nPos + 1;
571  break;
572  }
573  }
574  }
575  }
576 
577 #define WJ 0x2060 // Word Joiner
578  GlueSpace=false;
579  if (lbr.breakType == BreakType::WORDBOUNDARY) {
580  nStartPos = lbr.breakIndex;
581  if (nStartPos >= 0 && Text[nStartPos--] == WJ)
582  GlueSpace=true;
583  while (nStartPos >= 0 &&
584  (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
585  if (Text[nStartPos--] == WJ)
586  GlueSpace=true;
587  }
588  if (GlueSpace && nStartPos < 0) {
589  lbr.breakIndex = 0;
590  break;
591  }
592  }
593  }
594 
595  return lbr;
596 }
597 
598 OUString SAL_CALL
599 BreakIterator_Unicode::getImplementationName()
600 {
601  return OUString::createFromAscii(cBreakIterator);
602 }
603 
604 sal_Bool SAL_CALL
605 BreakIterator_Unicode::supportsService(const OUString& rServiceName)
606 {
607  return cppu::supportsService(this, rServiceName);
608 }
609 
610 uno::Sequence< OUString > SAL_CALL
611 BreakIterator_Unicode::getSupportedServiceNames()
612 {
613  uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
614  return aRet;
615 }
616 
617 }
618 
619 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
621  css::uno::XComponentContext *,
622  css::uno::Sequence<css::uno::Any> const &)
623 {
624  return cppu::acquire(new i18npool::BreakIterator_Unicode());
625 }
626 
627 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
tuple line
exports com.sun.star.frame. status
virtual sal_Int32 SAL_CALL nextCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
#define LOAD_WORD_BREAKITERATOR
U_CDECL_BEGIN const char OpenOffice_dat[]
static OUString convertToBcp47(LanguageType nLangID)
struct i18npool::BreakIterator_Unicode::BI_Data * icuBI
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
static thread_local BreakIterator_Unicode::BIMap theBIMap
struct i18npool::BreakIterator_Unicode::BI_Data sentence
static rtl::Reference< LocaleDataImpl > get()
Definition: localedata.hxx:77
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
#define LOAD_LINE_BREAKITERATOR
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
void loadICUBreakIterator(const css::lang::Locale &rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const char *name, const OUString &rText)
#define SPACE
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_i18n_BreakIterator_Unicode_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
unsigned char sal_Bool
#define WJ
Constant values shared between i18npool and, for example, the number formatter.
struct i18npool::BreakIterator_Unicode::BI_Data character
std::unordered_map< OString, std::shared_ptr< BI_ValueData > > BIMap
if(aStr!=aBuf) UpdateName_Impl(m_xFollowLb.get()
virtual sal_Int32 SAL_CALL previousCharacters(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32 &nDone) override
#define LOAD_CHARACTER_BREAKITERATOR
struct i18npool::BreakIterator_Unicode::BI_Data line
#define LOAD_SENTENCE_BREAKITERATOR
sal_uInt16 nPos