LibreOffice Module i18npool (master) 1
xdictionary.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <config_folders.h>
21#include <o3tl/temporary.hxx>
22#include <osl/file.h>
23#include <osl/module.h>
24#include <osl/mutex.hxx>
25#include <rtl/bootstrap.hxx>
26#include <com/sun/star/i18n/ScriptType.hpp>
27#include <com/sun/star/i18n/WordType.hpp>
28#include <xdictionary.hxx>
29#include <unicode/uchar.h>
30#include <string.h>
31#include <breakiteratorImpl.hxx>
32
33using namespace com::sun::star::i18n;
34
35namespace i18npool {
36
37#ifdef DICT_JA_ZH_IN_DATAFILE
38
39#elif !defined DISABLE_DYNLOADING
40
41extern "C" { static void thisModule() {} }
42
43#else
44
45extern "C" {
46
47sal_uInt8* getExistMark_ja();
48sal_Int16* getIndex1_ja();
49sal_Int32* getIndex2_ja();
50sal_Int32* getLenArray_ja();
51sal_Unicode* getDataArea_ja();
52
53sal_uInt8* getExistMark_zh();
54sal_Int16* getIndex1_zh();
55sal_Int32* getIndex2_zh();
56sal_Int32* getLenArray_zh();
57sal_Unicode* getDataArea_zh();
58
59}
60
61#endif
62
63xdictionary::xdictionary(const char *lang) :
64 japaneseWordBreak( false )
65#ifdef DICT_JA_ZH_IN_DATAFILE
66 , m_aFileHandle(nullptr),
67 m_nFileSize(-1),
68 m_pMapping(nullptr)
69#endif
70{
71
72#ifdef DICT_JA_ZH_IN_DATAFILE
73
74 if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
75 {
76 OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
77 rtl::Bootstrap::expandMacros(sUrl);
78
79 if( strcmp( lang, "ja" ) == 0 )
80 sUrl += "ja.data";
81 else if( strcmp( lang, "zh" ) == 0 )
82 sUrl += "zh.data";
83
84 if( osl_openFile( sUrl.pData, &m_aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
85 osl_getFileSize( m_aFileHandle, &m_nFileSize) == osl_File_E_None &&
86 osl_mapFile( m_aFileHandle, (void **) &m_pMapping, m_nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
87 {
88 // We have the offsets to the parts of the file at its end, see gendict.cxx
89 sal_Int64 *pEOF = (sal_Int64*)(m_pMapping + m_nFileSize);
90
91 data.existMark = (sal_uInt8*) (m_pMapping + pEOF[-1]);
92 data.index2 = (sal_Int32*) (m_pMapping + pEOF[-2]);
93 data.index1 = (sal_Int16*) (m_pMapping + pEOF[-3]);
94 data.lenArray = (sal_Int32*) (m_pMapping + pEOF[-4]);
95 data.dataArea = (sal_Unicode*) (m_pMapping + pEOF[-5]);
96 }
97 }
98
99#elif !defined DISABLE_DYNLOADING
100
101 initDictionaryData( lang );
102
103#else
104
105 if( strcmp( lang, "ja" ) == 0 ) {
106 data.existMark = getExistMark_ja();
107 data.index1 = getIndex1_ja();
108 data.index2 = getIndex2_ja();
109 data.lenArray = getLenArray_ja();
110 data.dataArea = getDataArea_ja();
111 }
112 else if( strcmp( lang, "zh" ) == 0 ) {
113 data.existMark = getExistMark_zh();
114 data.index1 = getIndex1_zh();
115 data.index2 = getIndex2_zh();
116 data.lenArray = getLenArray_zh();
117 data.dataArea = getDataArea_zh();
118 }
119
120#endif
121
122 for (WordBreakCache & i : cache)
123 i.size = 0;
124
125 japaneseWordBreak = false;
126}
127
129{
130 for (const WordBreakCache & i : cache) {
131 if (i.size > 0) {
132 delete [] i.contents;
133 delete [] i.wordboundary;
134 }
135 }
136#ifdef DICT_JA_ZH_IN_DATAFILE
137 if (m_aFileHandle) {
138 if (m_pMapping) {
139 osl_unmapMappedFile(m_aFileHandle, m_pMapping, m_nFileSize);
140 }
141 osl_closeFile(m_aFileHandle);
142 }
143#endif
144}
145
146namespace {
    // One entry of the function-local cache kept by
    // xdictionary::initDictionaryData(); an entry is stored whether or not
    // the dictionary module could be loaded.
147 struct datacache {
    // Result of osl_loadModuleRelativeAscii(); tested for null before any
    // symbol is looked up.
148 oslModule mhModule;
    // Language code the entry was created for; this is the lookup key.
149 OString maLang;
    // Table pointers that are copied into xdictionary::data on a cache hit.
150 xdictionarydata maData;
151 };
152}
153
154#if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
155
156void xdictionary::initDictionaryData(const char *pLang)
157{
158 // Global cache, never released for performance
159 static std::vector< datacache > aLoadedCache;
160
161 osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
162 for(const datacache & i : aLoadedCache)
163 {
164 if( i.maLang == pLang )
165 {
166 data = i.maData;
167 return;
168 }
169 }
170
171 // otherwise add to the cache, positive or negative.
172 datacache aEntry;
173 aEntry.maLang = OString( pLang, strlen( pLang ) );
174
175#ifdef SAL_DLLPREFIX
176 OString sModuleName = // mostly "lib*.so" (with * == dict_zh)
177 OString::Concat(SAL_DLLPREFIX "dict_") + pLang + SAL_DLLEXTENSION;
178#else
179 OString sModuleName = // mostly "*.dll" (with * == dict_zh)
180 OString::Concat("dict_") + pLang + SAL_DLLEXTENSION;
181#endif
182 aEntry.mhModule = osl_loadModuleRelativeAscii( &thisModule, sModuleName.getStr(), SAL_LOADMODULE_DEFAULT );
183 if( aEntry.mhModule ) {
184 oslGenericFunction func;
185 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
186 aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
187 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
188 aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
189 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
190 aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
191 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
192 aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
193 func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
194 aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
195 }
196
197 data = aEntry.maData;
198 aLoadedCache.push_back( aEntry );
199}
200
201#endif
202
204{
205 japaneseWordBreak = true;
206}
207
208bool xdictionary::exists(const sal_uInt32 c) const
209{
210 // 0x1FFF is the hardcoded limit in gendict for data.existMarks
211 bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
212 if (!exist && japaneseWordBreak)
213 return BreakIteratorImpl::getScriptClass(c) == css::i18n::ScriptType::ASIAN;
214 else
215 return exist;
216}
217
218sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) const
219{
220 if ( !data.index1 ) return 0;
221
222 sal_Int16 idx = data.index1[str[0] >> 8];
223
224 if (idx == 0xFF) return 0;
225
226 idx = (idx<<8) | (str[0]&0xff);
227
228 sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
229
230 if (begin == 0) return 0;
231
232 str++; sLen--; // first character is not stored in the dictionary
233 for (sal_uInt32 i = end; i > begin; i--) {
234 sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
235 if (sLen >= len) {
236 const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
237 sal_Int32 pos = 0;
238
239 while (pos < len && dstr[pos] == str[pos]) { pos++; }
240
241 if (pos == len)
242 return len + 1;
243 }
244 }
245 return 0;
246}
247
248
249/*
250 * c-tor
251 */
252
254 contents( nullptr ),
255 wordboundary( nullptr ),
256 length( 0 ),
257 size( 0 )
258{
259}
260
261/*
262 * Compare two unicode string,
263 */
264
265bool WordBreakCache::equals(const sal_Unicode* str, Boundary const & boundary) const
266{
267 // Different length, different string.
268 if (length != boundary.endPos - boundary.startPos) return false;
269
270 for (sal_Int32 i = 0; i < length; i++)
271 if (contents[i] != str[i + boundary.startPos]) return false;
272
273 return true;
274}
275
276
277/*
278 * Retrieve the segment containing the character at pos.
279 * @param pos : Position of the given character.
280 * @return true if CJK.
281 */
282bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
283 Boundary& segBoundary)
284{
285 sal_Int32 indexUtf16;
286
287 if (segmentCachedString.pData != rText.pData) {
288 // Cache the passed text so we can avoid regenerating the segment if it's the same
289 // (pData is refcounted and assigning the OUString references it, which ensures that
290 // the object is the same if we get the same pointer back later)
291 segmentCachedString = rText;
292 } else {
293 // If pos is within the cached boundary, use that boundary
294 if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
295 segBoundary.startPos = segmentCachedBoundary.startPos;
296 segBoundary.endPos = segmentCachedBoundary.endPos;
297 indexUtf16 = segmentCachedBoundary.startPos;
298 rText.iterateCodePoints(&indexUtf16);
299 return segmentCachedBoundary.endPos > indexUtf16;
300 }
301 }
302
303 segBoundary.endPos = segBoundary.startPos = pos;
304
305 indexUtf16 = pos;
306 while (indexUtf16 > 0)
307 {
308 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
309 if (u_isWhitespace(ch) || exists(ch))
310 segBoundary.startPos = indexUtf16;
311 else
312 break;
313 }
314
315 indexUtf16 = pos;
316 while (indexUtf16 < rText.getLength())
317 {
318 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
319 if (u_isWhitespace(ch) || exists(ch))
320 segBoundary.endPos = indexUtf16;
321 else
322 break;
323 }
324
325 // Cache the calculated boundary
326 segmentCachedBoundary.startPos = segBoundary.startPos;
327 segmentCachedBoundary.endPos = segBoundary.endPos;
328
329 indexUtf16 = segBoundary.startPos;
330 rText.iterateCodePoints(&indexUtf16);
331 return segBoundary.endPos > indexUtf16;
332}
333
334#define KANJA 1
335#define KATAKANA 2
336#define HIRAKANA 3
337
338static sal_Int16 JapaneseCharType(sal_Unicode c)
339{
340 if (0x3041 <= c && c <= 0x309e)
341 return HIRAKANA;
342 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
343 return KATAKANA;
344 return KANJA;
345}
346
347WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary const & wordBoundary)
348{
349 WordBreakCache& rCache = cache[text[0] & 0x1f];
350
351 if (rCache.size != 0 && rCache.equals(text, wordBoundary))
352 return rCache;
353
354 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
355
356 if (rCache.size == 0 || len > rCache.size) {
357 if (rCache.size != 0) {
358 delete [] rCache.contents;
359 delete [] rCache.wordboundary;
360 rCache.size = len;
361 }
362 else
363 rCache.size = std::max<sal_Int32>(len, DEFAULT_SIZE);
364 rCache.contents = new sal_Unicode[rCache.size + 1];
365 rCache.wordboundary = new sal_Int32[rCache.size + 2];
366 }
367 rCache.length = len;
368 memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
369 *(rCache.contents + len) = 0x0000;
370 // reset the wordboundary in cache
371 memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
372
373 sal_Int32 i = 0; // loop variable
374 while (rCache.wordboundary[i] < rCache.length) {
375 len = 0;
376 // look the continuous white space as one word and cache it
377 while (u_isWhitespace(static_cast<sal_uInt32>(text[wordBoundary.startPos + rCache.wordboundary[i] + len])))
378 len ++;
379
380 if (len == 0) {
381 const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
382 sal_Int32 slen = rCache.length - rCache.wordboundary[i];
383 sal_Int16 type = 0, count = 0;
384 for (;len == 0 && slen > 0; str++, slen--) {
385 len = getLongestMatch(str, slen);
386 if (len == 0) {
387 if (!japaneseWordBreak) {
388 len = 1;
389 } else {
390 if (count == 0)
391 type = JapaneseCharType(*str);
392 else if (type != JapaneseCharType(*str))
393 break;
394 count++;
395 }
396 }
397 }
398 if (count)
399 {
400 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
401 i++;
402 }
403 }
404
405 if (len) {
406 rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
407 i++;
408 }
409 }
410 rCache.wordboundary[i + 1] = rCache.length + 1;
411
412 return rCache;
413}
414
415Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
416{
417 // looking for the first non-whitespace character from anyPos
418 sal_uInt32 ch = 0;
419 if (anyPos > 0)
420 rText.iterateCodePoints(&anyPos, -1);
421
422 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
423
424 return getWordBoundary(rText, anyPos, wordType, true);
425}
426
427Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
428{
429 boundary = getWordBoundary(rText, anyPos, wordType, true);
430 anyPos = boundary.endPos;
431 const sal_Int32 nLen = rText.getLength();
432 if (anyPos < nLen) {
433 // looking for the first non-whitespace character from anyPos
434 sal_uInt32 ch = rText.iterateCodePoints(&anyPos);
435 while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos);
436 if (anyPos > 0)
437 rText.iterateCodePoints(&anyPos, -1);
438 }
439
440 return getWordBoundary(rText, anyPos, wordType, true);
441}
442
443Boundary const & xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
444{
445 const sal_Unicode *text=rText.getStr();
446 sal_Int32 len=rText.getLength();
447 if (anyPos >= len || anyPos < 0) {
448 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
449 } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
451 sal_Int32 i = 0;
452
453 while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
454
455 sal_Int32 startPos = aCache.wordboundary[i - 1];
456 // if bDirection is false
457 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
458 {
459 sal_uInt32 ch = rText.iterateCodePoints(&o3tl::temporary(sal_Int32(anyPos-1)));
460 if (u_isWhitespace(ch))
461 i--;
462 }
463
464 boundary.endPos = boundary.startPos;
465 boundary.endPos += aCache.wordboundary[i];
466 boundary.startPos += aCache.wordboundary[i-1];
467
468 } else {
469 boundary.startPos = anyPos;
470 if (anyPos < len) rText.iterateCodePoints(&anyPos);
471 boundary.endPos = std::min(anyPos, len);
472 }
473 if (wordType == WordType::WORD_COUNT) {
474 // skip punctuation for word count.
475 while (boundary.endPos < len)
476 {
477 sal_Int32 indexUtf16 = boundary.endPos;
478 if (u_ispunct(rText.iterateCodePoints(&indexUtf16)))
479 boundary.endPos = indexUtf16;
480 else
481 break;
482 }
483 }
484
485 return boundary;
486}
487
488}
489
490/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
#define SAL_DLLPREFIX
static sal_Int16 getScriptClass(sal_uInt32 currentChar)
css::i18n::Boundary nextWord(const OUString &rText, sal_Int32 nPos, sal_Int16 wordType)
css::i18n::Boundary boundary
Definition: xdictionary.hxx:65
sal_Int32 getLongestMatch(const sal_Unicode *text, sal_Int32 len) const
void initDictionaryData(const char *lang)
css::i18n::Boundary previousWord(const OUString &rText, sal_Int32 nPos, sal_Int16 wordType)
WordBreakCache cache[CACHE_MAX]
Definition: xdictionary.hxx:82
css::i18n::Boundary const & getWordBoundary(const OUString &rText, sal_Int32 nPos, sal_Int16 wordType, bool bDirection)
bool seekSegment(const OUString &rText, sal_Int32 pos, css::i18n::Boundary &boundary)
WordBreakCache & getCache(const sal_Unicode *text, css::i18n::Boundary const &boundary)
css::i18n::Boundary segmentCachedBoundary
Definition: xdictionary.hxx:84
OUString segmentCachedString
Definition: xdictionary.hxx:83
xdictionarydata data
Definition: xdictionary.hxx:62
bool exists(const sal_uInt32 u) const
xdictionary(const char *lang)
Definition: xdictionary.cxx:63
#define SAL_DLLEXTENSION
const sal_uInt16 idx[]
def text(shape, orig_st)
size
int i
Constant values shared between i18npool and, for example, the number formatter.
static sal_Int16 JapaneseCharType(sal_Unicode c)
static void thisModule()
Definition: xdictionary.cxx:41
enumrange< T >::Iterator begin(enumrange< T >)
constexpr T & temporary(T &&x)
end
bool equals(const sal_Unicode *str, css::i18n::Boundary const &boundary) const
const sal_Unicode * dataArea
Definition: xdictionary.hxx:48
const sal_Int32 * index2
Definition: xdictionary.hxx:46
const sal_Int32 * lenArray
Definition: xdictionary.hxx:47
const sal_uInt8 * existMark
Definition: xdictionary.hxx:44
const sal_Int16 * index1
Definition: xdictionary.hxx:45
unsigned char sal_uInt8
sal_uInt16 sal_Unicode
ResultType type
size_t pos
OString maLang
oslModule mhModule
#define KANJA
#define KATAKANA
xdictionarydata maData
#define HIRAKANA
#define DEFAULT_SIZE
Definition: xdictionary.hxx:29