LibreOffice Module i18npool (master)  1
xdictionary.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <config_folders.h>
21 #include <o3tl/temporary.hxx>
22 #include <osl/file.h>
23 #include <osl/module.h>
24 #include <osl/mutex.hxx>
25 #include <rtl/bootstrap.hxx>
26 #include <com/sun/star/i18n/ScriptType.hpp>
27 #include <com/sun/star/i18n/WordType.hpp>
28 #include <xdictionary.hxx>
29 #include <unicode/uchar.h>
30 #include <string.h>
31 #include <breakiteratorImpl.hxx>
32 
33 using namespace com::sun::star::i18n;
34 
35 namespace i18npool {
36 
// The dictionary tables reach this file in one of three build configurations.
#if defined DICT_JA_ZH_IN_DATAFILE

// Tables are mapped from a data file by the constructor; nothing to declare.

#elif !defined DISABLE_DYNLOADING

// Empty function whose address is handed to osl_loadModuleRelativeAscii as
// the base for locating the dictionary modules.
extern "C" { static void thisModule() {} }

#else

// Dictionary tables are linked in directly: one set of getters per language.
extern "C" {

// Japanese tables
sal_uInt8* getExistMark_ja();
sal_Int16* getIndex1_ja();
sal_Int32* getIndex2_ja();
sal_Int32* getLenArray_ja();
sal_Unicode* getDataArea_ja();

// Chinese tables
sal_uInt8* getExistMark_zh();
sal_Int16* getIndex1_zh();
sal_Int32* getIndex2_zh();
sal_Int32* getLenArray_zh();
sal_Unicode* getDataArea_zh();

}

#endif
62 
63 xdictionary::xdictionary(const char *lang) :
64  japaneseWordBreak( false )
65 #ifdef DICT_JA_ZH_IN_DATAFILE
66  , m_aFileHandle(nullptr),
67  m_nFileSize(-1),
68  m_pMapping(nullptr)
69 #endif
70 {
71 
72 #ifdef DICT_JA_ZH_IN_DATAFILE
73 
74  if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
75  {
76  OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
77  rtl::Bootstrap::expandMacros(sUrl);
78 
79  if( strcmp( lang, "ja" ) == 0 )
80  sUrl += "ja.data";
81  else if( strcmp( lang, "zh" ) == 0 )
82  sUrl += "zh.data";
83 
84  if( osl_openFile( sUrl.pData, &m_aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
85  osl_getFileSize( m_aFileHandle, &m_nFileSize) == osl_File_E_None &&
86  osl_mapFile( m_aFileHandle, (void **) &m_pMapping, m_nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
87  {
88  // We have the offsets to the parts of the file at its end, see gendict.cxx
89  sal_Int64 *pEOF = (sal_Int64*)(m_pMapping + m_nFileSize);
90 
91  data.existMark = (sal_uInt8*) (m_pMapping + pEOF[-1]);
92  data.index2 = (sal_Int32*) (m_pMapping + pEOF[-2]);
93  data.index1 = (sal_Int16*) (m_pMapping + pEOF[-3]);
94  data.lenArray = (sal_Int32*) (m_pMapping + pEOF[-4]);
95  data.dataArea = (sal_Unicode*) (m_pMapping + pEOF[-5]);
96  }
97  }
98 
99 #elif !defined DISABLE_DYNLOADING
100 
101  initDictionaryData( lang );
102 
103 #else
104 
105  if( strcmp( lang, "ja" ) == 0 ) {
106  data.existMark = getExistMark_ja();
107  data.index1 = getIndex1_ja();
108  data.index2 = getIndex2_ja();
109  data.lenArray = getLenArray_ja();
110  data.dataArea = getDataArea_ja();
111  }
112  else if( strcmp( lang, "zh" ) == 0 ) {
113  data.existMark = getExistMark_zh();
114  data.index1 = getIndex1_zh();
115  data.index2 = getIndex2_zh();
116  data.lenArray = getLenArray_zh();
117  data.dataArea = getDataArea_zh();
118  }
119 
120 #endif
121 
122  for (WordBreakCache & i : cache)
123  i.size = 0;
124 
125  japaneseWordBreak = false;
126 }
127 
129 {
130  for (const WordBreakCache & i : cache) {
131  if (i.size > 0) {
132  delete [] i.contents;
133  delete [] i.wordboundary;
134  }
135  }
136 #ifdef DICT_JA_ZH_IN_DATAFILE
137  if (m_aFileHandle) {
138  if (m_pMapping) {
139  osl_unmapMappedFile(m_aFileHandle, m_pMapping, m_nFileSize);
140  }
141  osl_closeFile(m_aFileHandle);
142  }
143 #endif
144 }
145 
146 namespace {
147  struct datacache {
148  oslModule mhModule;
149  OString maLang;
150  xdictionarydata maData;
151  };
152 }
153 
154 #if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
155 
156 void xdictionary::initDictionaryData(const char *pLang)
157 {
158  // Global cache, never released for performance
159  static std::vector< datacache > aLoadedCache;
160 
161  osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
162  for(const datacache & i : aLoadedCache)
163  {
164  if( i.maLang == pLang )
165  {
166  data = i.maData;
167  return;
168  }
169  }
170 
171  // otherwise add to the cache, positive or negative.
172  datacache aEntry;
173  aEntry.maLang = OString( pLang, strlen( pLang ) );
174 
175 #ifdef SAL_DLLPREFIX
176  OString sModuleName = // mostly "lib*.so" (with * == dict_zh)
177  OString::Concat(SAL_DLLPREFIX "dict_") + pLang + SAL_DLLEXTENSION;
178 #else
179  OString sModuleName = // mostly "*.dll" (with * == dict_zh)
180  OString::Concat("dict_") + pLang + SAL_DLLEXTENSION;
181 #endif
182  aEntry.mhModule = osl_loadModuleRelativeAscii( &thisModule, sModuleName.getStr(), SAL_LOADMODULE_DEFAULT );
183  if( aEntry.mhModule ) {
184  oslGenericFunction func;
185  func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
186  aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
187  func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
188  aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
189  func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
190  aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
191  func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
192  aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
193  func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
194  aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
195  }
196 
197  data = aEntry.maData;
198  aLoadedCache.push_back( aEntry );
199 }
200 
201 #endif
202 
204 {
205  japaneseWordBreak = true;
206 }
207 
208 bool xdictionary::exists(const sal_uInt32 c) const
209 {
210  // 0x1FFF is the hardcoded limit in gendict for data.existMarks
211  bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
212  if (!exist && japaneseWordBreak)
213  return BreakIteratorImpl::getScriptClass(c) == css::i18n::ScriptType::ASIAN;
214  else
215  return exist;
216 }
217 
218 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) const
219 {
220  if ( !data.index1 ) return 0;
221 
222  sal_Int16 idx = data.index1[str[0] >> 8];
223 
224  if (idx == 0xFF) return 0;
225 
226  idx = (idx<<8) | (str[0]&0xff);
227 
228  sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
229 
230  if (begin == 0) return 0;
231 
232  str++; sLen--; // first character is not stored in the dictionary
233  for (sal_uInt32 i = end; i > begin; i--) {
234  sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
235  if (sLen >= len) {
236  const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
237  sal_Int32 pos = 0;
238 
239  while (pos < len && dstr[pos] == str[pos]) { pos++; }
240 
241  if (pos == len)
242  return len + 1;
243  }
244  }
245  return 0;
246 }
247 
248 
249 /*
250  * c-tor
251  */
252 
254  contents( nullptr ),
255  wordboundary( nullptr ),
256  length( 0 ),
257  size( 0 )
258 {
259 }
260 
261 /*
262  * Compare two unicode string,
263  */
264 
265 bool WordBreakCache::equals(const sal_Unicode* str, Boundary const & boundary) const
266 {
267  // Different length, different string.
268  if (length != boundary.endPos - boundary.startPos) return false;
269 
270  for (sal_Int32 i = 0; i < length; i++)
271  if (contents[i] != str[i + boundary.startPos]) return false;
272 
273  return true;
274 }
275 
276 
277 /*
278  * Retrieve the segment containing the character at pos.
279  * @param pos : Position of the given character.
280  * @return true if CJK.
281  */
282 bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
283  Boundary& segBoundary)
284 {
285  sal_Int32 indexUtf16;
286 
287  if (segmentCachedString.pData != rText.pData) {
288  // Cache the passed text so we can avoid regenerating the segment if it's the same
289  // (pData is refcounted and assigning the OUString references it, which ensures that
290  // the object is the same if we get the same pointer back later)
291  segmentCachedString = rText;
292  } else {
293  // If pos is within the cached boundary, use that boundary
294  if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
295  segBoundary.startPos = segmentCachedBoundary.startPos;
296  segBoundary.endPos = segmentCachedBoundary.endPos;
297  indexUtf16 = segmentCachedBoundary.startPos;
298  rText.iterateCodePoints(&indexUtf16);
299  return segmentCachedBoundary.endPos > indexUtf16;
300  }
301  }
302 
303  segBoundary.endPos = segBoundary.startPos = pos;
304 
305  indexUtf16 = pos;
306  while (indexUtf16 > 0)
307  {
308  sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
309  if (u_isWhitespace(ch) || exists(ch))
310  segBoundary.startPos = indexUtf16;
311  else
312  break;
313  }
314 
315  indexUtf16 = pos;
316  while (indexUtf16 < rText.getLength())
317  {
318  sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
319  if (u_isWhitespace(ch) || exists(ch))
320  segBoundary.endPos = indexUtf16;
321  else
322  break;
323  }
324 
325  // Cache the calculated boundary
326  segmentCachedBoundary.startPos = segBoundary.startPos;
327  segmentCachedBoundary.endPos = segBoundary.endPos;
328 
329  indexUtf16 = segBoundary.startPos;
330  rText.iterateCodePoints(&indexUtf16);
331  return segBoundary.endPos > indexUtf16;
332 }
333 
334 #define KANJA 1
335 #define KATAKANA 2
336 #define HIRAKANA 3
337 
338 static sal_Int16 JapaneseCharType(sal_Unicode c)
339 {
340  if (0x3041 <= c && c <= 0x309e)
341  return HIRAKANA;
342  if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
343  return KATAKANA;
344  return KANJA;
345 }
346 
347 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary const & wordBoundary)
348 {
349  WordBreakCache& rCache = cache[text[0] & 0x1f];
350 
351  if (rCache.size != 0 && rCache.equals(text, wordBoundary))
352  return rCache;
353 
354  sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
355 
356  if (rCache.size == 0 || len > rCache.size) {
357  if (rCache.size != 0) {
358  delete [] rCache.contents;
359  delete [] rCache.wordboundary;
360  rCache.size = len;
361  }
362  else
363  rCache.size = std::max<sal_Int32>(len, DEFAULT_SIZE);
364  rCache.contents = new sal_Unicode[rCache.size + 1];
365  rCache.wordboundary = new sal_Int32[rCache.size + 2];
366  }
367  rCache.length = len;
368  memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
369  *(rCache.contents + len) = 0x0000;
370  // reset the wordboundary in cache
371  memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
372 
373  sal_Int32 i = 0; // loop variable
374  while (rCache.wordboundary[i] < rCache.length) {
375  len = 0;
376  // look the continuous white space as one word and cache it
377  while (u_isWhitespace(static_cast<sal_uInt32>(text[wordBoundary.startPos + rCache.wordboundary[i] + len])))
378  len ++;
379 
380  if (len == 0) {
381  const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
382  sal_Int32 slen = rCache.length - rCache.wordboundary[i];
383  sal_Int16 type = 0, count = 0;
384  for (;len == 0 && slen > 0; str++, slen--) {
385  len = getLongestMatch(str, slen);
386  if (len == 0) {
387  if (!japaneseWordBreak) {
388  len = 1;
389  } else {
390  if (count == 0)
391  type = JapaneseCharType(*str);
392  else if (type != JapaneseCharType(*str))
393  break;
394  count++;
395  }
396  }
397  }
398  if (count)
399  {
400  rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
401  i++;
402  }
403  }
404 
405  if (len) {
406  rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
407  i++;
408  }
409  }
410  rCache.wordboundary[i + 1] = rCache.length + 1;
411 
412  return rCache;
413 }
414 
415 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
416 {
417  // looking for the first non-whitespace character from anyPos
418  sal_uInt32 ch = 0;
419  if (anyPos > 0)
420  rText.iterateCodePoints(&anyPos, -1);
421 
422  while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
423 
424  return getWordBoundary(rText, anyPos, wordType, true);
425 }
426 
427 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
428 {
429  boundary = getWordBoundary(rText, anyPos, wordType, true);
430  anyPos = boundary.endPos;
431  const sal_Int32 nLen = rText.getLength();
432  if (anyPos < nLen) {
433  // looking for the first non-whitespace character from anyPos
434  sal_uInt32 ch = rText.iterateCodePoints(&anyPos);
435  while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos);
436  if (anyPos > 0)
437  rText.iterateCodePoints(&anyPos, -1);
438  }
439 
440  return getWordBoundary(rText, anyPos, wordType, true);
441 }
442 
443 Boundary const & xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
444 {
445  const sal_Unicode *text=rText.getStr();
446  sal_Int32 len=rText.getLength();
447  if (anyPos >= len || anyPos < 0) {
448  boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
449  } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
450  WordBreakCache& aCache = getCache(text, boundary);
451  sal_Int32 i = 0;
452 
453  while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
454 
455  sal_Int32 startPos = aCache.wordboundary[i - 1];
456  // if bDirection is false
457  if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
458  {
459  sal_uInt32 ch = rText.iterateCodePoints(&o3tl::temporary(sal_Int32(anyPos-1)));
460  if (u_isWhitespace(ch))
461  i--;
462  }
463 
464  boundary.endPos = boundary.startPos;
465  boundary.endPos += aCache.wordboundary[i];
466  boundary.startPos += aCache.wordboundary[i-1];
467 
468  } else {
469  boundary.startPos = anyPos;
470  if (anyPos < len) rText.iterateCodePoints(&anyPos);
471  boundary.endPos = std::min(anyPos, len);
472  }
473  if (wordType == WordType::WORD_COUNT) {
474  // skip punctuation for word count.
475  while (boundary.endPos < len)
476  {
477  sal_Int32 indexUtf16 = boundary.endPos;
478  if (u_ispunct(rText.iterateCodePoints(&indexUtf16)))
479  boundary.endPos = indexUtf16;
480  else
481  break;
482  }
483  }
484 
485  return boundary;
486 }
487 
488 }
489 
490 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
xdictionarydata maData
xdictionarydata data
Definition: xdictionary.hxx:62
static sal_Int16 getScriptClass(sal_uInt32 currentChar)
#define SAL_DLLEXTENSION
#define KATAKANA
const sal_uInt8 * existMark
Definition: xdictionary.hxx:44
const sal_Int32 * lenArray
Definition: xdictionary.hxx:47
const sal_Int16 * index1
Definition: xdictionary.hxx:45
void initDictionaryData(const char *lang)
WordBreakCache & getCache(const sal_Unicode *text, css::i18n::Boundary const &boundary)
oslModule mhModule
const sal_Unicode * dataArea
Definition: xdictionary.hxx:48
#define SAL_DLLPREFIX
sal_uInt16 sal_Unicode
enumrange< T >::Iterator begin(enumrange< T >)
size_t pos
static sal_Int16 JapaneseCharType(sal_Unicode c)
bool exists(const sal_uInt32 u) const
exports com.sun.star. text
bool seekSegment(const OUString &rText, sal_Int32 pos, css::i18n::Boundary &boundary)
int i
#define KANJA
size
Constant values shared between i18npool and, for example, the number formatter.
css::i18n::Boundary nextWord(const OUString &rText, sal_Int32 nPos, sal_Int16 wordType)
css::i18n::Boundary boundary
Definition: xdictionary.hxx:65
css::i18n::Boundary const & getWordBoundary(const OUString &rText, sal_Int32 nPos, sal_Int16 wordType, bool bDirection)
#define DEFAULT_SIZE
Definition: xdictionary.hxx:29
enumrange< T >::Iterator end(enumrange< T >)
constexpr T & temporary(T &&x)
const sal_uInt16 idx[]
sal_Int32 getLongestMatch(const sal_Unicode *text, sal_Int32 len) const
const sal_Int32 * index2
Definition: xdictionary.hxx:46
unsigned char sal_uInt8
WordBreakCache cache[CACHE_MAX]
Definition: xdictionary.hxx:82
OUString segmentCachedString
Definition: xdictionary.hxx:83
ResultType type
#define HIRAKANA
OString maLang
css::i18n::Boundary segmentCachedBoundary
Definition: xdictionary.hxx:84
css::i18n::Boundary previousWord(const OUString &rText, sal_Int32 nPos, sal_Int16 wordType)
static void thisModule()
Definition: xdictionary.cxx:41
bool equals(const sal_Unicode *str, css::i18n::Boundary const &boundary) const
bool m_bDetectedRangeSegmentation false