LibreOffice Module i18npool (master)  1
collator_unicode.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <config_locales.h>
21 
22 #include <sal/log.hxx>
23 
24 #include <lrl_include.hxx>
25 
26 #include <rtl/ustrbuf.hxx>
29 #include <collator_unicode.hxx>
30 #include <localedata.hxx>
31 #include <com/sun/star/i18n/CollatorOptions.hpp>
33 
34 using namespace ::com::sun::star;
35 using namespace ::com::sun::star::i18n;
36 using namespace ::com::sun::star::lang;
37 using namespace ::com::sun::star::uno;
38 
39 namespace i18npool {
40 
// Name returned by getImplementationName() and used as the single entry of getSupportedServiceNames().
41 constexpr OUStringLiteral implementationName = u"com.sun.star.i18n.Collator_Unicode";
42 
44 {
45  collator = nullptr;
46  uca_base = nullptr;
47 #ifndef DISABLE_DYNLOADING
48  hModule = nullptr;
49 #endif
50 }
51 
53 {
54  collator.reset();
55  uca_base.reset();
56 #ifndef DISABLE_DYNLOADING
57  if (hModule) osl_unloadModule(hModule);
58 #endif
59 }
60 
#ifdef DISABLE_DYNLOADING

extern "C" {

// In DISABLE_DYNLOADING builds the generated accessors carry a
// get_collator_data_ prefix, so that they cannot clash with the handful of
// functions in the generated libindex_data that are named plainly, e.g.
// get_zh_pinyin.
//
// Every rule set is declared as a pair: the accessor returning the data
// pointer, followed by the accessor returning its length.

const sal_uInt8* get_collator_data_ca_charset();
size_t get_collator_data_ca_charset_length();

const sal_uInt8* get_collator_data_cu_charset();
size_t get_collator_data_cu_charset_length();

const sal_uInt8* get_collator_data_dz_charset();
size_t get_collator_data_dz_charset_length();

const sal_uInt8* get_collator_data_hu_charset();
size_t get_collator_data_hu_charset_length();

const sal_uInt8* get_collator_data_ja_charset();
size_t get_collator_data_ja_charset_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_first();
size_t get_collator_data_ja_phonetic_alphanumeric_first_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_last();
size_t get_collator_data_ja_phonetic_alphanumeric_last_length();

const sal_uInt8* get_collator_data_ko_charset();
size_t get_collator_data_ko_charset_length();

const sal_uInt8* get_collator_data_ku_alphanumeric();
size_t get_collator_data_ku_alphanumeric_length();

const sal_uInt8* get_collator_data_ln_charset();
size_t get_collator_data_ln_charset_length();

const sal_uInt8* get_collator_data_my_dictionary();
size_t get_collator_data_my_dictionary_length();

const sal_uInt8* get_collator_data_ne_charset();
size_t get_collator_data_ne_charset_length();

const sal_uInt8* get_collator_data_sid_charset();
size_t get_collator_data_sid_charset_length();

const sal_uInt8* get_collator_data_vro_alphanumeric();
size_t get_collator_data_vro_alphanumeric_length();

const sal_uInt8* get_collator_data_zh_TW_charset();
size_t get_collator_data_zh_TW_charset_length();

const sal_uInt8* get_collator_data_zh_TW_radical();
size_t get_collator_data_zh_TW_radical_length();

const sal_uInt8* get_collator_data_zh_TW_stroke();
size_t get_collator_data_zh_TW_stroke_length();

const sal_uInt8* get_collator_data_zh_charset();
size_t get_collator_data_zh_charset_length();

const sal_uInt8* get_collator_data_zh_pinyin();
size_t get_collator_data_zh_pinyin_length();

const sal_uInt8* get_collator_data_zh_radical();
size_t get_collator_data_zh_radical_length();

const sal_uInt8* get_collator_data_zh_stroke();
size_t get_collator_data_zh_stroke_length();

const sal_uInt8* get_collator_data_zh_zhuyin();
size_t get_collator_data_zh_zhuyin_length();

}

#endif
119 
120 sal_Int32 SAL_CALL
121 Collator_Unicode::compareSubstring( const OUString& str1, sal_Int32 off1, sal_Int32 len1,
122  const OUString& str2, sal_Int32 off2, sal_Int32 len2)
123 {
124  return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()) + off1, len1, reinterpret_cast<const UChar *>(str2.getStr()) + off2, len2);
125 }
126 
127 sal_Int32 SAL_CALL
128 Collator_Unicode::compareString( const OUString& str1, const OUString& str2)
129 {
130  return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()), str1.getLength(),
131  reinterpret_cast<const UChar *>(str2.getStr()), str2.getLength());
132 }
133 
#ifndef DISABLE_DYNLOADING

extern "C" {

// Empty function whose address is passed to osl_loadModuleRelative() so that
// the collator data library is looked up relative to this module.
static void thisModule()
{
}

}

#endif
139 
140 sal_Int32 SAL_CALL
141 Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang::Locale& rLocale, sal_Int32 options)
142 {
143  if (!collator) {
144  UErrorCode status = U_ZERO_ERROR;
145  OUString rule = LocaleDataImpl::get()->getCollatorRuleByAlgorithm(rLocale, rAlgorithm);
146  if (!rule.isEmpty()) {
147  collator.reset( new icu::RuleBasedCollator(reinterpret_cast<const UChar *>(rule.getStr()), status) );
148  if (! U_SUCCESS(status)) {
149  OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
150  SAL_WARN("i18npool", message);
151  throw RuntimeException(message);
152  }
153  }
154  if (!collator && OUString(LOCAL_RULE_LANGS).indexOf(rLocale.Language) >= 0) {
155  const sal_uInt8* (*func)() = nullptr;
156  size_t (*funclen)() = nullptr;
157 
158 #ifndef DISABLE_DYNLOADING
159  OUStringBuffer aBuf;
160 #ifdef SAL_DLLPREFIX
161  aBuf.append(SAL_DLLPREFIX);
162 #endif
163  aBuf.append( "collator_data" SAL_DLLEXTENSION );
164  hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
165  if (hModule) {
166  aBuf.append("get_" + rLocale.Language + "_");
167  if ( rLocale.Language == "zh" ) {
168  OUString func_base = aBuf.makeStringAndClear();
169  if (OUString("TW HK MO").indexOf(rLocale.Country) >= 0)
170  {
171  func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule,
172  OUString(func_base + "TW_" + rAlgorithm).pData));
173  funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule,
174  OUString(func_base + "TW_" + rAlgorithm + "_length").pData));
175  }
176  if (!func)
177  {
178  func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(
179  hModule, OUString(func_base + rAlgorithm).pData));
180  funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(
181  hModule, OUString(func_base + rAlgorithm + "_length").pData));
182  }
183  } else {
184  if ( rLocale.Language == "ja" ) {
185  // replace algorithm name to implementation name.
186  if (rAlgorithm == "phonetic (alphanumeric first)")
187  aBuf.append("phonetic_alphanumeric_first");
188  else if (rAlgorithm == "phonetic (alphanumeric last)")
189  aBuf.append("phonetic_alphanumeric_last");
190  else
191  aBuf.append(rAlgorithm);
192  } else {
193  aBuf.append(rAlgorithm);
194  }
195  OUString func_base = aBuf.makeStringAndClear();
196  OUString funclen_base = func_base + "_length";
197  func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule, func_base.pData));
198  funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule, funclen_base.pData));
199  }
200  }
201 #else
202  if (false) {
203  ;
204 #if WITH_LOCALE_ALL || WITH_LOCALE_ca
205  } else if ( rLocale.Language == "ca" ) {
206  if ( rAlgorithm == "charset" )
207  {
208  func = get_collator_data_ca_charset;
209  funclen = get_collator_data_ca_charset_length;
210  }
211 #endif
212 #if WITH_LOCALE_ALL || WITH_LOCALE_cu
213  } else if ( rLocale.Language == "cu" ) {
214  if ( rAlgorithm == "charset" )
215  {
216  func = get_collator_data_cu_charset;
217  funclen = get_collator_data_cu_charset_length;
218  }
219 #endif
220 #if WITH_LOCALE_ALL || WITH_LOCALE_dz
221  } else if ( rLocale.Language == "dz" || rLocale.Language == "bo" ) {
222  // 'bo' Tibetan uses the same collation rules as 'dz' Dzongkha
223  if ( rAlgorithm == "charset" )
224  {
225  func = get_collator_data_dz_charset;
226  funclen = get_collator_data_dz_charset_length;
227  }
228 #endif
229 #if WITH_LOCALE_ALL || WITH_LOCALE_hu
230  } else if ( rLocale.Language == "hu" ) {
231  if ( rAlgorithm == "charset" )
232  {
233  func = get_collator_data_hu_charset;
234  funclen = get_collator_data_hu_charset_length;
235  }
236 #endif
237 #if WITH_LOCALE_ALL || WITH_LOCALE_ja
238  } else if ( rLocale.Language == "ja" ) {
239  if ( rAlgorithm == "charset" )
240  {
241  func = get_collator_data_ja_charset;
242  funclen = get_collator_data_ja_charset_length;
243  }
244  else if ( rAlgorithm == "phonetic (alphanumeric first)" )
245  {
246  func = get_collator_data_ja_phonetic_alphanumeric_first;
247  funclen = get_collator_data_ja_phonetic_alphanumeric_first_length;
248  }
249  else if ( rAlgorithm == "phonetic (alphanumeric last)" )
250  {
251  func = get_collator_data_ja_phonetic_alphanumeric_last;
252  funclen = get_collator_data_ja_phonetic_alphanumeric_last_length;
253  }
254 #endif
255 #if WITH_LOCALE_ALL || WITH_LOCALE_ko
256 #if (U_ICU_VERSION_MAJOR_NUM < 53)
257  } else if ( rLocale.Language == "ko" ) {
258  if ( rAlgorithm == "charset" )
259  {
260  func = get_collator_data_ko_charset;
261  funclen = get_collator_data_ko_charset_length;
262  }
263 #endif
264 #endif
265 #if WITH_LOCALE_ALL || WITH_LOCALE_ku
266  } else if ( rLocale.Language == "ku" ) {
267  if ( rAlgorithm == "alphanumeric" )
268  {
269  func = get_collator_data_ku_alphanumeric;
270  funclen = get_collator_data_ku_alphanumeric_length;
271  }
272 #endif
273 #if WITH_LOCALE_ALL || WITH_LOCALE_ln
274  } else if ( rLocale.Language == "ln" ) {
275  if ( rAlgorithm == "charset" )
276  {
277  func = get_collator_data_ln_charset;
278  funclen = get_collator_data_ln_charset_length;
279  }
280 #endif
281 #if WITH_LOCALE_ALL || WITH_LOCALE_my
282  } else if ( rLocale.Language == "my" ) {
283  if ( rAlgorithm == "dictionary" )
284  {
285  func = get_collator_data_my_dictionary;
286  funclen = get_collator_data_my_dictionary_length;
287  }
288 #endif
289 #if WITH_LOCALE_ALL || WITH_LOCALE_ne
290  } else if ( rLocale.Language == "ne" ) {
291  if ( rAlgorithm == "charset" )
292  {
293  func = get_collator_data_ne_charset;
294  funclen = get_collator_data_ne_charset_length;
295  }
296 #endif
297 #if WITH_LOCALE_ALL || WITH_LOCALE_sid
298  } else if ( rLocale.Language == "sid" ) {
299  if ( rAlgorithm == "charset" )
300  {
301  func = get_collator_data_sid_charset;
302  funclen = get_collator_data_sid_charset_length;
303  }
304 #endif
305 #if WITH_LOCALE_ALL || WITH_LOCALE_vro
306  } else if ( rLocale.Language == "vro" ) {
307  if ( rAlgorithm == "alphanumeric" )
308  {
309  func = get_collator_data_vro_alphanumeric;
310  funclen = get_collator_data_vro_alphanumeric_length;
311  }
312 #endif
313 #if WITH_LOCALE_ALL || WITH_LOCALE_zh
314  } else if ( rLocale.Language == "zh" && (rLocale.Country == "TW" || rLocale.Country == "HK" || rLocale.Country == "MO") ) {
315  if ( rAlgorithm == "charset" )
316  {
317  func = get_collator_data_zh_TW_charset;
318  funclen = get_collator_data_zh_TW_charset_length;
319  }
320  else if ( rAlgorithm == "radical" )
321  {
322  func = get_collator_data_zh_TW_radical;
323  funclen = get_collator_data_zh_TW_radical_length;
324  }
325  else if ( rAlgorithm == "stroke" )
326  {
327  func = get_collator_data_zh_TW_stroke;
328  funclen = get_collator_data_zh_TW_stroke_length;
329  }
330  } else if ( rLocale.Language == "zh" ) {
331  if ( rAlgorithm == "charset" )
332  {
333  func = get_collator_data_zh_charset;
334  funclen = get_collator_data_zh_charset_length;
335  }
336  else if ( rAlgorithm == "pinyin" )
337  {
338  func = get_collator_data_zh_pinyin;
339  funclen = get_collator_data_zh_pinyin_length;
340  }
341  else if ( rAlgorithm == "radical" )
342  {
343  func = get_collator_data_zh_radical;
344  funclen = get_collator_data_zh_radical_length;
345  }
346  else if ( rAlgorithm == "stroke" )
347  {
348  func = get_collator_data_zh_stroke;
349  funclen = get_collator_data_zh_stroke_length;
350  }
351  else if ( rAlgorithm == "zhuyin" )
352  {
353  func = get_collator_data_zh_zhuyin;
354  funclen = get_collator_data_zh_zhuyin_length;
355  }
356 #endif
357  }
358 #endif // DISABLE_DYNLOADING
359  if (func && funclen) {
360  const sal_uInt8* ruleImage=func();
361  size_t ruleImageSize = funclen();
362 
363  // Not only changed ICU 53.1 the API behavior that a negative
364  // length (ruleImageSize) now leads to failure, but also that
365  // the base RuleBasedCollator passed as uca_base here needs to
366  // have a base->tailoring == CollationRoot::getRoot() otherwise
367  // the init bails out as well, as it does for the previously
368  // used "empty" RuleBasedCollator.
369  // The default collator of the en-US locale would also fulfill
370  // the requirement. The collator of the actual locale or the
371  // NULL (default) locale does not.
372  uca_base.reset( static_cast<icu::RuleBasedCollator*>(icu::Collator::createInstance(
373  icu::Locale::getRoot(), status)) );
374  if (! U_SUCCESS(status)) {
375  OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
376  SAL_WARN("i18npool", message);
377  throw RuntimeException(message);
378  }
379  collator.reset( new icu::RuleBasedCollator(
380  reinterpret_cast<const uint8_t*>(ruleImage), ruleImageSize, uca_base.get(), status) );
381  if (! U_SUCCESS(status)) {
382  OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
383  SAL_WARN("i18npool", message);
384  throw RuntimeException(message);
385  }
386  }
387  }
388  if (!collator) {
401  icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale),
402  u"", rAlgorithm.isEmpty() ? OUString("") : "collation=" + rAlgorithm));
403 
404  // FIXME: apparently we get here in LOKit case only. When the language is Japanese, we pass "ja@collation=phonetic (alphanumeric first)" to ICU
405  // and ICU does not like this (U_ILLEGAL_ARGUMENT_ERROR). Subsequently LOKit crashes, because collator is nullptr.
406  if (!strcmp(icuLocale.getLanguage(), "ja"))
407  icuLocale = icu::Locale::getJapanese();
408 
409  // load ICU collator
410  collator.reset( static_cast<icu::RuleBasedCollator*>( icu::Collator::createInstance(icuLocale, status) ) );
411  if (! U_SUCCESS(status)) {
412  OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
413  SAL_WARN("i18npool", message);
414  throw RuntimeException(message);
415  }
416  }
417  }
418 
419  if (options & CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT)
420  collator->setStrength(icu::Collator::PRIMARY);
421  else if (options & CollatorOptions::CollatorOptions_IGNORE_CASE)
422  collator->setStrength(icu::Collator::SECONDARY);
423  else
424  collator->setStrength(icu::Collator::TERTIARY);
425 
426  return 0;
427 }
428 
429 
430 OUString SAL_CALL
432 {
433  return implementationName;
434 }
435 
436 sal_Bool SAL_CALL
437 Collator_Unicode::supportsService(const OUString& rServiceName)
438 {
439  return cppu::supportsService(this, rServiceName);
440 }
441 
442 Sequence< OUString > SAL_CALL
444 {
445  Sequence< OUString > aRet { OUString(implementationName) };
446  return aRet;
447 }
448 
449 }
450 
451 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
exports com.sun.star.frame. status
#define SAL_DLLEXTENSION
std::unique_ptr< ContentProperties > pData
aBuf
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
#define SAL_DLLPREFIX
std::unique_ptr< icu::RuleBasedCollator > uca_base
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
static rtl::Reference< LocaleDataImpl > get()
Definition: localedata.hxx:62
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
float u
unsigned char sal_Bool
Constant values shared between i18npool and, for example, the number formatter.
sal_Int32 SAL_CALL compareSubstring(const OUString &s1, sal_Int32 off1, sal_Int32 len1, const OUString &s2, sal_Int32 off2, sal_Int32 len2) override
unsigned char sal_uInt8
virtual OUString SAL_CALL getImplementationName() override
virtual ~Collator_Unicode() override
sal_Int32 SAL_CALL loadCollatorAlgorithm(const OUString &impl, const css::lang::Locale &rLocale, sal_Int32 collatorOptions) override
#define SAL_WARN(area, stream)
std::unique_ptr< icu::RuleBasedCollator > collator
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
sal_Int32 SAL_CALL compareString(const OUString &s1, const OUString &s2) override
static void thisModule()
Definition: xdictionary.cxx:41
constexpr OUStringLiteral implementationName