LibreOffice Module i18npool (master)  1
textconversion_zh.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 
21 #include <textconversion.hxx>
22 #include <com/sun/star/i18n/TextConversionType.hpp>
23 #include <com/sun/star/i18n/TextConversionOption.hpp>
24 #include <com/sun/star/lang/NoSupportException.hpp>
25 #include <com/sun/star/linguistic2/ConversionDirection.hpp>
26 #include <com/sun/star/linguistic2/ConversionDictionaryType.hpp>
27 #include <com/sun/star/linguistic2/ConversionDictionaryList.hpp>
28 #include <memory>
29 
30 using namespace com::sun::star::lang;
31 using namespace com::sun::star::i18n;
32 using namespace com::sun::star::linguistic2;
33 using namespace com::sun::star::uno;
34 
35 
36 namespace i18npool {
37 
38 TextConversion_zh::TextConversion_zh( const Reference < XComponentContext >& xContext )
39  : TextConversionService("com.sun.star.i18n.TextConversion_zh")
40 {
41  xCDL = ConversionDictionaryList::create(xContext);
42 }
43 
44 static sal_Unicode getOneCharConversion(sal_Unicode ch, const sal_Unicode* Data, const sal_uInt16* Index)
45 {
46  if (Data && Index) {
47  sal_Unicode address = Index[ch>>8];
48  if (address != 0xFFFF)
49  address = Data[address + (ch & 0xFF)];
50  return (address != 0xFFFF) ? address : ch;
51  } else {
52  return ch;
53  }
54 }
55 
#ifdef DISABLE_DYNLOADING

// When dynamic loading is disabled, the dictionary accessors are called
// directly instead of being resolved by symbol name, so they have to be
// declared here.
extern "C" {

// Character tables (data pages + page index), one pair per direction:
// T2S = to Simplified Chinese, S2T = to Traditional Chinese,
// S2V = to Traditional Chinese when USE_CHARACTER_VARIANTS is requested.
const sal_Unicode* getSTC_CharData_T2S();
const sal_uInt16*  getSTC_CharIndex_T2S();
const sal_Unicode* getSTC_CharData_S2V();
const sal_uInt16*  getSTC_CharIndex_S2V();
const sal_Unicode* getSTC_CharData_S2T();
const sal_uInt16*  getSTC_CharIndex_S2T();

// Word dictionary text; the reference argument is an output value.
const sal_Unicode* getSTC_WordData(sal_Int32&);

// Per-direction word index (reference argument is an output value)
// and word entry tables.
const sal_uInt16* getSTC_WordIndex_T2S(sal_Int32&);
const sal_uInt16* getSTC_WordEntry_T2S();
const sal_uInt16* getSTC_WordIndex_S2T(sal_Int32&);
const sal_uInt16* getSTC_WordEntry_S2T();

}

#endif
77 
78 OUString
79 TextConversion_zh::getCharConversion(const OUString& aText, sal_Int32 nStartPos, sal_Int32 nLength, bool toSChinese, sal_Int32 nConversionOptions)
80 {
81  const sal_Unicode *Data;
82  const sal_uInt16 *Index;
83 
84 #ifndef DISABLE_DYNLOADING
85  if (toSChinese) {
86  Data = reinterpret_cast<const sal_Unicode* (*)()>(getFunctionBySymbol("getSTC_CharData_T2S"))();
87  Index = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_CharIndex_T2S"))();
88  } else if (nConversionOptions & TextConversionOption::USE_CHARACTER_VARIANTS) {
89  Data = reinterpret_cast<const sal_Unicode* (*)()>(getFunctionBySymbol("getSTC_CharData_S2V"))();
90  Index = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_CharIndex_S2V"))();
91  } else {
92  Data = reinterpret_cast<const sal_Unicode* (*)()>(getFunctionBySymbol("getSTC_CharData_S2T"))();
93  Index = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_CharIndex_S2T"))();
94  }
95 #else
96  if (toSChinese) {
97  Data = getSTC_CharData_T2S();
98  Index = getSTC_CharIndex_T2S();
99  } else if (nConversionOptions & TextConversionOption::USE_CHARACTER_VARIANTS) {
100  Data = getSTC_CharData_S2V();
101  Index = getSTC_CharIndex_S2V();
102  } else {
103  Data = getSTC_CharData_S2T();
104  Index = getSTC_CharIndex_S2T();
105  }
106 #endif
107 
108  rtl_uString * newStr = rtl_uString_alloc(nLength);
109  for (sal_Int32 i = 0; i < nLength; i++)
110  newStr->buffer[i] =
111  getOneCharConversion(aText[nStartPos+i], Data, Index);
112  return OUString(newStr, SAL_NO_ACQUIRE); //take ownership
113 }
114 
115 OUString
116 TextConversion_zh::getWordConversion(const OUString& aText, sal_Int32 nStartPos, sal_Int32 nLength, bool toSChinese, sal_Int32 nConversionOptions, Sequence<sal_Int32>& offset)
117 {
118  sal_Int32 dictLen = 0;
119  sal_Int32 maxLen = 0;
120  const sal_uInt16 *index;
121  const sal_uInt16 *entry;
122  const sal_Unicode *charData;
123  const sal_uInt16 *charIndex;
124  bool one2one=true;
125 
126 #ifndef DISABLE_DYNLOADING
127  const sal_Unicode *wordData = reinterpret_cast<const sal_Unicode* (*)(sal_Int32&)>(getFunctionBySymbol("getSTC_WordData"))(dictLen);
128  if (toSChinese) {
129  index = reinterpret_cast<const sal_uInt16* (*)(sal_Int32&)>(getFunctionBySymbol("getSTC_WordIndex_T2S"))(maxLen);
130  entry = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_WordEntry_T2S"))();
131  charData = reinterpret_cast<const sal_Unicode* (*)()>(getFunctionBySymbol("getSTC_CharData_T2S"))();
132  charIndex = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_CharIndex_T2S"))();
133  } else {
134  index = reinterpret_cast<const sal_uInt16* (*)(sal_Int32&)>(getFunctionBySymbol("getSTC_WordIndex_S2T"))(maxLen);
135  entry = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_WordEntry_S2T"))();
136  if (nConversionOptions & TextConversionOption::USE_CHARACTER_VARIANTS) {
137  charData = reinterpret_cast<const sal_Unicode* (*)()>(getFunctionBySymbol("getSTC_CharData_S2V"))();
138  charIndex = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_CharIndex_S2V"))();
139  } else {
140  charData = reinterpret_cast<const sal_Unicode* (*)()>(getFunctionBySymbol("getSTC_CharData_S2T"))();
141  charIndex = reinterpret_cast<const sal_uInt16* (*)()>(getFunctionBySymbol("getSTC_CharIndex_S2T"))();
142  }
143  }
144 #else
145  const sal_Unicode *wordData = getSTC_WordData(dictLen);
146  if (toSChinese) {
147  index = getSTC_WordIndex_T2S(maxLen);
148  entry = getSTC_WordEntry_T2S();
149  charData = getSTC_CharData_T2S();
150  charIndex = getSTC_CharIndex_T2S();
151  } else {
152  index = getSTC_WordIndex_S2T(maxLen);
153  entry = getSTC_WordEntry_S2T();
154  if (nConversionOptions & TextConversionOption::USE_CHARACTER_VARIANTS) {
155  charData = getSTC_CharData_S2V();
156  charIndex = getSTC_CharIndex_S2V();
157  } else {
158  charData = getSTC_CharData_S2T();
159  charIndex = getSTC_CharIndex_S2T();
160  }
161  }
162 #endif
163 
164  if ((!wordData || !index || !entry) && !xCDL.is()) // no word mapping defined, do char2char conversion.
165  return getCharConversion(aText, nStartPos, nLength, toSChinese, nConversionOptions);
166 
167  std::unique_ptr<sal_Unicode[]> newStr(new sal_Unicode[nLength * 2 + 1]);
168  sal_Int32 currPos = 0, count = 0;
169  auto offsetRange = asNonConstRange(offset);
170  while (currPos < nLength) {
171  sal_Int32 len = nLength - currPos;
172  bool found = false;
173  if (len > maxLen)
174  len = maxLen;
175  for (; len > 0 && ! found; len--) {
176  OUString word = aText.copy(nStartPos + currPos, len);
177  sal_Int32 current = 0;
178  // user dictionary
179  if (xCDL.is()) {
180  Sequence < OUString > conversions;
181  try {
182  conversions = xCDL->queryConversions(word, 0, len,
183  aLocale, ConversionDictionaryType::SCHINESE_TCHINESE,
184  /*toSChinese ?*/ ConversionDirection_FROM_LEFT /*: ConversionDirection_FROM_RIGHT*/,
185  nConversionOptions);
186  }
187  catch ( NoSupportException & ) {
188  // clear reference (when there is no user dictionary) in order
189  // to not always have to catch this exception again
190  // in further calls. (save time)
191  xCDL = nullptr;
192  }
193  catch (...) {
194  // catch all other exceptions to allow
195  // querying the system dictionary in the next line
196  }
197  if (conversions.hasElements()) {
198  if (offset.hasElements()) {
199  if (word.getLength() != conversions[0].getLength())
200  one2one=false;
201  while (current < conversions[0].getLength()) {
202  offsetRange[count] = nStartPos + currPos + (current *
203  word.getLength() / conversions[0].getLength());
204  newStr[count++] = conversions[0][current++];
205  }
206  // offset[count-1] = nStartPos + currPos + word.getLength() - 1;
207  } else {
208  while (current < conversions[0].getLength())
209  newStr[count++] = conversions[0][current++];
210  }
211  currPos += word.getLength();
212  found = true;
213  }
214  }
215 
216  if (wordData && !found && index[len+1] - index[len] > 0) {
217  sal_Int32 bottom = static_cast<sal_Int32>(index[len]);
218  sal_Int32 top = static_cast<sal_Int32>(index[len+1]) - 1;
219 
220  while (bottom <= top && !found) {
221  current = (top + bottom) / 2;
222  const sal_Int32 result = rtl_ustr_compare(
223  word.getStr(), wordData + entry[current]);
224  if (result < 0)
225  top = current - 1;
226  else if (result > 0)
227  bottom = current + 1;
228  else {
229  if (toSChinese) // Traditionary/Simplified conversion,
230  for (current = entry[current]-1; current > 0 && wordData[current-1]; current--) ;
231  else // Simplified/Traditionary conversion, forwards search for next word
232  current = entry[current] + word.getLength() + 1;
233  sal_Int32 start=current;
234  if (offset.hasElements()) {
235  if (word.getLength() != OUString(&wordData[current]).getLength())
236  one2one=false;
237  sal_Int32 convertedLength=OUString(&wordData[current]).getLength();
238  while (wordData[current]) {
239  offsetRange[count]=nStartPos + currPos + ((current-start) *
240  word.getLength() / convertedLength);
241  newStr[count++] = wordData[current++];
242  }
243  // offset[count-1]=nStartPos + currPos + word.getLength() - 1;
244  } else {
245  while (wordData[current])
246  newStr[count++] = wordData[current++];
247  }
248  currPos += word.getLength();
249  found = true;
250  }
251  }
252  }
253  }
254  if (!found) {
255  if (offset.hasElements())
256  offsetRange[count]=nStartPos+currPos;
257  newStr[count++] =
258  getOneCharConversion(aText[nStartPos+currPos], charData, charIndex);
259  currPos++;
260  }
261  }
262  if (offset.hasElements())
263  offset.realloc(one2one ? 0 : count);
264  OUString aRet(newStr.get(), count);
265  return aRet;
266 }
267 
268 TextConversionResult SAL_CALL
269 TextConversion_zh::getConversions( const OUString& aText, sal_Int32 nStartPos, sal_Int32 nLength,
270  const Locale& rLocale, sal_Int16 nConversionType, sal_Int32 nConversionOptions)
271 {
272  TextConversionResult result;
273 
274  result.Candidates =
275  { getConversion( aText, nStartPos, nLength, rLocale, nConversionType, nConversionOptions) };
276  result.Boundary.startPos = nStartPos;
277  result.Boundary.endPos = nStartPos + nLength;
278 
279  return result;
280 }
281 
282 OUString SAL_CALL
283 TextConversion_zh::getConversion( const OUString& aText, sal_Int32 nStartPos, sal_Int32 nLength,
284  const Locale& rLocale, sal_Int16 nConversionType, sal_Int32 nConversionOptions)
285 {
286  if (rLocale.Language != "zh" || ( nConversionType != TextConversionType::TO_SCHINESE && nConversionType != TextConversionType::TO_TCHINESE) )
287  throw NoSupportException(); // Conversion type is not supported in this service.
288 
289  aLocale=rLocale;
290  bool toSChinese = nConversionType == TextConversionType::TO_SCHINESE;
291 
292  if (nConversionOptions & TextConversionOption::CHARACTER_BY_CHARACTER)
293  // char to char dictionary
294  return getCharConversion(aText, nStartPos, nLength, toSChinese, nConversionOptions);
295  else {
296  Sequence <sal_Int32> offset;
297  // word to word dictionary
298  return getWordConversion(aText, nStartPos, nLength, toSChinese, nConversionOptions, offset);
299  }
300 }
301 
302 OUString SAL_CALL
303 TextConversion_zh::getConversionWithOffset( const OUString& aText, sal_Int32 nStartPos, sal_Int32 nLength,
304  const Locale& rLocale, sal_Int16 nConversionType, sal_Int32 nConversionOptions, Sequence<sal_Int32>& offset)
305 {
306  if (rLocale.Language != "zh" || ( nConversionType != TextConversionType::TO_SCHINESE && nConversionType != TextConversionType::TO_TCHINESE) )
307  throw NoSupportException(); // Conversion type is not supported in this service.
308 
309  aLocale=rLocale;
310  bool toSChinese = nConversionType == TextConversionType::TO_SCHINESE;
311 
312  if (nConversionOptions & TextConversionOption::CHARACTER_BY_CHARACTER) {
313  offset.realloc(0);
314  // char to char dictionary
315  return getCharConversion(aText, nStartPos, nLength, toSChinese, nConversionOptions);
316  } else {
317  if (offset.getLength() < 2*nLength)
318  offset.realloc(2*nLength);
319  // word to word dictionary
320  return getWordConversion(aText, nStartPos, nLength, toSChinese, nConversionOptions, offset);
321  }
322 }
323 
324 sal_Bool SAL_CALL
325 TextConversion_zh::interactiveConversion( const Locale& /*rLocale*/, sal_Int16 /*nTextConversionType*/, sal_Int32 /*nTextConversionOptions*/ )
326 {
327  return false;
328 }
329 
330 }
331 
332 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
css::i18n::TextConversionResult SAL_CALL getConversions(const OUString &aText, sal_Int32 nStartPos, sal_Int32 nLength, const css::lang::Locale &aLocale, sal_Int16 nTextConversionType, sal_Int32 nTextConversionOptions) override
css::uno::Reference< css::linguistic2::XConversionDictionaryList > xCDL
OUString SAL_CALL getConversion(const OUString &aText, sal_Int32 nStartPos, sal_Int32 nLength, const css::lang::Locale &aLocale, sal_Int16 nTextConversionType, sal_Int32 nTextConversionOptions) override
sal_uInt16 sal_Unicode
OUString SAL_CALL getConversionWithOffset(const OUString &aText, sal_Int32 nStartPos, sal_Int32 nLength, const css::lang::Locale &aLocale, sal_Int16 nTextConversionType, sal_Int32 nTextConversionOptions, css::uno::Sequence< sal_Int32 > &offset) override
sal_Bool SAL_CALL interactiveConversion(const css::lang::Locale &aLocale, sal_Int16 nTextConversionType, sal_Int32 nTextConversionOptions) override
int i
unsigned char sal_Bool
rtl::OUString getCharConversion(const rtl::OUString &aText, sal_Int32 nStartPos, sal_Int32 nLength, bool toSChinese, sal_Int32 nConversionOptions)
Constant values shared between i18npool and, for example, the number formatter.
oslGenericFunction getFunctionBySymbol(const char *func)
tuple index
OUString getWordConversion(const OUString &aText, sal_Int32 nStartPos, sal_Int32 nLength, bool toSChinese, sal_Int32 nConversionOptions, css::uno::Sequence< sal_Int32 > &offset)
static sal_Unicode getOneCharConversion(sal_Unicode ch, const sal_Unicode *Data, const sal_uInt16 *Index)
double getLength(const B2DPolygon &rCandidate)
Any result