LibreOffice Module i18npool (master)  1
breakiterator_cjk.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <breakiterator_cjk.hxx>
21 #include <localedata.hxx>
22 
23 #include <com/sun/star/i18n/BreakType.hpp>
24 #include <com/sun/star/i18n/ScriptType.hpp>
25 
26 using namespace ::com::sun::star::uno;
27 using namespace ::com::sun::star::i18n;
28 using namespace ::com::sun::star::lang;
29 
30 namespace i18npool {
31 
32 // ----------------------------------------------------
33 // class BreakIterator_CJK
34 // ----------------------------------------------------;
35 
37 {
38  cBreakIterator = "com.sun.star.i18n.BreakIterator_CJK";
39 }
40 
41 Boundary SAL_CALL
42 BreakIterator_CJK::previousWord(const OUString& text, sal_Int32 anyPos,
43  const css::lang::Locale& nLocale, sal_Int16 wordType)
44 {
45  if (m_xDict) {
46  result = m_xDict->previousWord(text, anyPos, wordType);
47  // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
48  if (result.endPos - result.startPos != 1 ||
49  getScriptType(text, result.startPos) == ScriptType::ASIAN)
50  return result;
51  result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
52  if (result.endPos < anyPos)
53  return result;
54  }
55  return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType);
56 }
57 
58 Boundary SAL_CALL
59 BreakIterator_CJK::nextWord(const OUString& text, sal_Int32 anyPos,
60  const css::lang::Locale& nLocale, sal_Int16 wordType)
61 {
62  if (m_xDict) {
63  result = m_xDict->nextWord(text, anyPos, wordType);
64  // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
65  if (result.endPos - result.startPos != 1 ||
66  getScriptType(text, result.startPos) == ScriptType::ASIAN)
67  return result;
68  result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
69  if (result.startPos > anyPos)
70  return result;
71  }
72  return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType);
73 }
74 
75 Boundary SAL_CALL
76 BreakIterator_CJK::getWordBoundary( const OUString& text, sal_Int32 anyPos,
77  const css::lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection )
78 {
79  if (m_xDict) {
80  result = m_xDict->getWordBoundary(text, anyPos, wordType, bDirection);
81  // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
82  if (result.endPos - result.startPos != 1 ||
83  getScriptType(text, result.startPos) == ScriptType::ASIAN)
84  return result;
85  }
86  return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
87 }
88 
89 namespace {
90 bool isHangul( sal_Unicode cCh )
91 {
92  return (cCh >= 0xAC00 && cCh <= 0xD7AF) || (cCh >= 0x1100 && cCh <= 0x11FF) ||
93  (cCh >= 0xA960 && cCh <= 0xA97F) || (cCh >= 0xD7B0 && cCh <= 0xD7FF) ||
94  (cCh >= 0x3130 && cCh <= 0x318F);
95 }
96 }
97 
98 LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
99  const OUString& Text, sal_Int32 nStartPos,
100  const css::lang::Locale& /*rLocale*/, sal_Int32 /*nMinBreakPos*/,
101  const LineBreakHyphenationOptions& /*hOptions*/,
102  const LineBreakUserOptions& bOptions )
103 {
104  LineBreakResults lbr;
105 
106  const sal_Int32 nOldStartPos = nStartPos;
107 
108  if (bOptions.allowPunctuationOutsideMargin &&
109  nStartPos != Text.getLength() &&
110  hangingCharacters.indexOf(Text[nStartPos]) != -1 &&
111  (Text.iterateCodePoints( &nStartPos ), nStartPos == Text.getLength())) {
112  ; // do nothing
113  } else if (bOptions.applyForbiddenRules && 0 < nStartPos && nStartPos < Text.getLength()) {
114 
115  while (nStartPos > 0 &&
116  (bOptions.forbiddenBeginCharacters.indexOf(Text[nStartPos]) != -1 ||
117  bOptions.forbiddenEndCharacters.indexOf(Text[nStartPos-1]) != -1))
118  Text.iterateCodePoints( &nStartPos, -1);
119  }
120 
121  // Prevent cutting Korean words in the middle.
122  if (nOldStartPos == nStartPos && nStartPos < Text.getLength()
123  && isHangul(Text[nStartPos]))
124  {
125  while ( nStartPos >= 0 && isHangul( Text[nStartPos] ) )
126  --nStartPos;
127 
128  // beginning of the last Korean word.
129  if ( nStartPos < nOldStartPos )
130  ++nStartPos;
131 
132  if ( nStartPos == 0 )
133  nStartPos = nOldStartPos;
134  }
135 
136  lbr.breakIndex = nStartPos;
137  lbr.breakType = BreakType::WORDBOUNDARY;
138  return lbr;
139 }
140 
// Shorthand for building a css::lang::Locale from a language and a country,
// passing an empty OUString as the third constructor argument.
#define LOCALE(language, country) css::lang::Locale(language, country, OUString())
142 // ----------------------------------------------------
143 // class BreakIterator_zh
144 // ----------------------------------------------------;
146 {
147  m_xDict = std::make_unique<xdictionary>("zh");
148  assert(hangingCharacters.pData);
149  hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "CN"));
150  cBreakIterator = "com.sun.star.i18n.BreakIterator_zh";
151 }
152 
153 // ----------------------------------------------------
154 // class BreakIterator_zh_TW
155 // ----------------------------------------------------;
157 {
158  m_xDict = std::make_unique<xdictionary>("zh");
159  assert(hangingCharacters.pData);
160  hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "TW"));
161  cBreakIterator = "com.sun.star.i18n.BreakIterator_zh_TW";
162 }
163 
164 // ----------------------------------------------------
165 // class BreakIterator_ja
166 // ----------------------------------------------------;
168 {
169  m_xDict = std::make_unique<xdictionary>("ja");
170  m_xDict->setJapaneseWordBreak();
171  assert(hangingCharacters.pData);
172  hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("ja", "JP"));
173  cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
174 }
175 
176 // ----------------------------------------------------
177 // class BreakIterator_ko
178 // ----------------------------------------------------;
180 {
181  assert(hangingCharacters.pData);
182  hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("ko", "KR"));
183  cBreakIterator = "com.sun.star.i18n.BreakIterator_ko";
184 }
185 
186 }
187 
188 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
sal_uInt16 sal_Unicode
const BorderLinePrimitive2D *pCandidateB assert(pCandidateA)
css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
css::i18n::LineBreakResults SAL_CALL getLineBreak(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int32 nMinBreakPos, const css::i18n::LineBreakHyphenationOptions &hOptions, const css::i18n::LineBreakUserOptions &bOptions) override
std::unique_ptr< xdictionary > m_xDict
virtual sal_Int16 SAL_CALL getScriptType(const OUString &Text, sal_Int32 nPos) override
css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
static rtl::Reference< LocaleDataImpl > get()
Definition: localedata.hxx:62
virtual css::i18n::Boundary SAL_CALL nextWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override
unsigned char sal_Bool
Constant values shared between i18npool and, for example, the number formatter.
#define LOCALE(language, country)
virtual css::i18n::Boundary SAL_CALL getWordBoundary(const OUString &Text, sal_Int32 nPos, const css::lang::Locale &nLocale, sal_Int16 WordType, sal_Bool bDirection) override
virtual css::i18n::Boundary SAL_CALL previousWord(const OUString &Text, sal_Int32 nStartPos, const css::lang::Locale &nLocale, sal_Int16 WordType) override