LibreOffice Module i18npool (master) 1
collator_unicode.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <config_locales.h>
21
22#include <sal/log.hxx>
23#include <rtl/ustrbuf.hxx>
24
25#include <lrl_include.hxx>
26
29#include <collator_unicode.hxx>
30#include <localedata.hxx>
31#include <com/sun/star/i18n/CollatorOptions.hpp>
33
34using namespace ::com::sun::star;
35using namespace ::com::sun::star::i18n;
36using namespace ::com::sun::star::lang;
37using namespace ::com::sun::star::uno;
38
39namespace i18npool {
40
// Name handed out by getImplementationName() and, wrapped in a sequence,
// by getSupportedServiceNames() further down in this file.
41constexpr OUStringLiteral implementationName = u"com.sun.star.i18n.Collator_Unicode";
42
44{
45 collator = nullptr;
46 uca_base = nullptr;
47#ifndef DISABLE_DYNLOADING
48 hModule = nullptr;
49#endif
50}
51
53{
54 collator.reset();
55 uca_base.reset();
56#ifndef DISABLE_DYNLOADING
57 if (hModule) osl_unloadModule(hModule);
58#endif
59}
60
#ifdef DISABLE_DYNLOADING

extern "C" {

// Without dynamic loading the generated collator tables are linked in
// directly. Their accessors carry a get_collator_data_ prefix so that they
// cannot collide with functions of the generated libindex_data, some of
// which are named plainly, e.g. get_zh_pinyin.
//
// Each table is declared as a pair: the accessor returning the rule image
// and the matching *_length accessor returning its size.

const sal_uInt8* get_collator_data_ca_charset();
size_t get_collator_data_ca_charset_length();

const sal_uInt8* get_collator_data_cu_charset();
size_t get_collator_data_cu_charset_length();

const sal_uInt8* get_collator_data_dz_charset();
size_t get_collator_data_dz_charset_length();

const sal_uInt8* get_collator_data_hu_charset();
size_t get_collator_data_hu_charset_length();

const sal_uInt8* get_collator_data_ja_charset();
size_t get_collator_data_ja_charset_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_first();
size_t get_collator_data_ja_phonetic_alphanumeric_first_length();

const sal_uInt8* get_collator_data_ja_phonetic_alphanumeric_last();
size_t get_collator_data_ja_phonetic_alphanumeric_last_length();

const sal_uInt8* get_collator_data_ko_charset();
size_t get_collator_data_ko_charset_length();

const sal_uInt8* get_collator_data_ku_alphanumeric();
size_t get_collator_data_ku_alphanumeric_length();

const sal_uInt8* get_collator_data_ln_charset();
size_t get_collator_data_ln_charset_length();

const sal_uInt8* get_collator_data_my_dictionary();
size_t get_collator_data_my_dictionary_length();

const sal_uInt8* get_collator_data_ne_charset();
size_t get_collator_data_ne_charset_length();

const sal_uInt8* get_collator_data_sid_charset();
size_t get_collator_data_sid_charset_length();

const sal_uInt8* get_collator_data_vro_alphanumeric();
size_t get_collator_data_vro_alphanumeric_length();

const sal_uInt8* get_collator_data_zh_TW_charset();
size_t get_collator_data_zh_TW_charset_length();

const sal_uInt8* get_collator_data_zh_TW_radical();
size_t get_collator_data_zh_TW_radical_length();

const sal_uInt8* get_collator_data_zh_TW_stroke();
size_t get_collator_data_zh_TW_stroke_length();

const sal_uInt8* get_collator_data_zh_charset();
size_t get_collator_data_zh_charset_length();

const sal_uInt8* get_collator_data_zh_pinyin();
size_t get_collator_data_zh_pinyin_length();

const sal_uInt8* get_collator_data_zh_radical();
size_t get_collator_data_zh_radical_length();

const sal_uInt8* get_collator_data_zh_stroke();
size_t get_collator_data_zh_stroke_length();

const sal_uInt8* get_collator_data_zh_zhuyin();
size_t get_collator_data_zh_zhuyin_length();

}

#endif
119
120sal_Int32 SAL_CALL
121Collator_Unicode::compareSubstring( const OUString& str1, sal_Int32 off1, sal_Int32 len1,
122 const OUString& str2, sal_Int32 off2, sal_Int32 len2)
123{
124 return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()) + off1, len1, reinterpret_cast<const UChar *>(str2.getStr()) + off2, len2);
125}
126
127sal_Int32 SAL_CALL
128Collator_Unicode::compareString( const OUString& str1, const OUString& str2)
129{
130 return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()), str1.getLength(),
131 reinterpret_cast<const UChar *>(str2.getStr()), str2.getLength());
132}
133
#ifndef DISABLE_DYNLOADING

// Empty function whose address is passed to osl_loadModuleRelative() in
// loadCollatorAlgorithm() as the reference point for locating the data library.
extern "C" {
static void thisModule()
{
}
}

#endif
139
140sal_Int32 SAL_CALL
141Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang::Locale& rLocale, sal_Int32 options)
142{
143 if (!collator) {
144 UErrorCode status = U_ZERO_ERROR;
145 OUString rule = LocaleDataImpl::get()->getCollatorRuleByAlgorithm(rLocale, rAlgorithm);
146 if (!rule.isEmpty()) {
147 collator.reset( new icu::RuleBasedCollator(reinterpret_cast<const UChar *>(rule.getStr()), status) );
148 if (! U_SUCCESS(status)) {
149 OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
150 SAL_WARN("i18npool", message);
151 throw RuntimeException(message);
152 }
153 }
154 if (!collator && OUString(LOCAL_RULE_LANGS).indexOf(rLocale.Language) >= 0) {
155 const sal_uInt8* (*func)() = nullptr;
156 size_t (*funclen)() = nullptr;
157
158#ifndef DISABLE_DYNLOADING
159 OUStringBuffer aBuf;
160#ifdef SAL_DLLPREFIX
161 aBuf.append(SAL_DLLPREFIX);
162#endif
163 aBuf.append( "collator_data" SAL_DLLEXTENSION );
164 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
165 if (hModule) {
166 aBuf.append("get_" + rLocale.Language + "_");
167 if ( rLocale.Language == "zh" ) {
168 OUString func_base = aBuf.makeStringAndClear();
169 if (OUString("TW HK MO").indexOf(rLocale.Country) >= 0)
170 {
171 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule,
172 OUString(func_base + "TW_" + rAlgorithm).pData));
173 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule,
174 OUString(func_base + "TW_" + rAlgorithm + "_length").pData));
175 }
176 if (!func)
177 {
178 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(
179 hModule, OUString(func_base + rAlgorithm).pData));
180 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(
181 hModule, OUString(func_base + rAlgorithm + "_length").pData));
182 }
183 } else {
184 if ( rLocale.Language == "ja" ) {
185 // replace algorithm name to implementation name.
186 if (rAlgorithm == "phonetic (alphanumeric first)")
187 aBuf.append("phonetic_alphanumeric_first");
188 else if (rAlgorithm == "phonetic (alphanumeric last)")
189 aBuf.append("phonetic_alphanumeric_last");
190 else
191 aBuf.append(rAlgorithm);
192 } else {
193 aBuf.append(rAlgorithm);
194 }
195 OUString func_base = aBuf.makeStringAndClear();
196 OUString funclen_base = func_base + "_length";
197 func = reinterpret_cast<const sal_uInt8* (*)()>(osl_getFunctionSymbol(hModule, func_base.pData));
198 funclen = reinterpret_cast<size_t (*)()>(osl_getFunctionSymbol(hModule, funclen_base.pData));
199 }
200 }
201#else
202 if (false) {
203 ;
204#if WITH_LOCALE_ALL || WITH_LOCALE_ca
205 } else if ( rLocale.Language == "ca" ) {
206 if ( rAlgorithm == "charset" )
207 {
208 func = get_collator_data_ca_charset;
209 funclen = get_collator_data_ca_charset_length;
210 }
211#endif
212#if WITH_LOCALE_ALL || WITH_LOCALE_cu
213 } else if ( rLocale.Language == "cu" ) {
214 if ( rAlgorithm == "charset" )
215 {
216 func = get_collator_data_cu_charset;
217 funclen = get_collator_data_cu_charset_length;
218 }
219#endif
220#if WITH_LOCALE_ALL || WITH_LOCALE_dz
221 } else if ( rLocale.Language == "dz" || rLocale.Language == "bo" ) {
222 // 'bo' Tibetan uses the same collation rules as 'dz' Dzongkha
223 if ( rAlgorithm == "charset" )
224 {
225 func = get_collator_data_dz_charset;
226 funclen = get_collator_data_dz_charset_length;
227 }
228#endif
229#if WITH_LOCALE_ALL || WITH_LOCALE_hu
230 } else if ( rLocale.Language == "hu" ) {
231 if ( rAlgorithm == "charset" )
232 {
233 func = get_collator_data_hu_charset;
234 funclen = get_collator_data_hu_charset_length;
235 }
236#endif
237#if WITH_LOCALE_ALL || WITH_LOCALE_ja
238 } else if ( rLocale.Language == "ja" ) {
239 if ( rAlgorithm == "charset" )
240 {
241 func = get_collator_data_ja_charset;
242 funclen = get_collator_data_ja_charset_length;
243 }
244 else if ( rAlgorithm == "phonetic (alphanumeric first)" )
245 {
246 func = get_collator_data_ja_phonetic_alphanumeric_first;
247 funclen = get_collator_data_ja_phonetic_alphanumeric_first_length;
248 }
249 else if ( rAlgorithm == "phonetic (alphanumeric last)" )
250 {
251 func = get_collator_data_ja_phonetic_alphanumeric_last;
252 funclen = get_collator_data_ja_phonetic_alphanumeric_last_length;
253 }
254#endif
255#if WITH_LOCALE_ALL || WITH_LOCALE_ku
256 } else if ( rLocale.Language == "ku" ) {
257 if ( rAlgorithm == "alphanumeric" )
258 {
259 func = get_collator_data_ku_alphanumeric;
260 funclen = get_collator_data_ku_alphanumeric_length;
261 }
262#endif
263#if WITH_LOCALE_ALL || WITH_LOCALE_ln
264 } else if ( rLocale.Language == "ln" ) {
265 if ( rAlgorithm == "charset" )
266 {
267 func = get_collator_data_ln_charset;
268 funclen = get_collator_data_ln_charset_length;
269 }
270#endif
271#if WITH_LOCALE_ALL || WITH_LOCALE_my
272 } else if ( rLocale.Language == "my" ) {
273 if ( rAlgorithm == "dictionary" )
274 {
275 func = get_collator_data_my_dictionary;
276 funclen = get_collator_data_my_dictionary_length;
277 }
278#endif
279#if WITH_LOCALE_ALL || WITH_LOCALE_ne
280 } else if ( rLocale.Language == "ne" ) {
281 if ( rAlgorithm == "charset" )
282 {
283 func = get_collator_data_ne_charset;
284 funclen = get_collator_data_ne_charset_length;
285 }
286#endif
287#if WITH_LOCALE_ALL || WITH_LOCALE_sid
288 } else if ( rLocale.Language == "sid" ) {
289 if ( rAlgorithm == "charset" )
290 {
291 func = get_collator_data_sid_charset;
292 funclen = get_collator_data_sid_charset_length;
293 }
294#endif
295#if WITH_LOCALE_ALL || WITH_LOCALE_vro
296 } else if ( rLocale.Language == "vro" ) {
297 if ( rAlgorithm == "alphanumeric" )
298 {
299 func = get_collator_data_vro_alphanumeric;
300 funclen = get_collator_data_vro_alphanumeric_length;
301 }
302#endif
303#if WITH_LOCALE_ALL || WITH_LOCALE_zh
304 } else if ( rLocale.Language == "zh" && (rLocale.Country == "TW" || rLocale.Country == "HK" || rLocale.Country == "MO") ) {
305 if ( rAlgorithm == "charset" )
306 {
307 func = get_collator_data_zh_TW_charset;
308 funclen = get_collator_data_zh_TW_charset_length;
309 }
310 else if ( rAlgorithm == "radical" )
311 {
312 func = get_collator_data_zh_TW_radical;
313 funclen = get_collator_data_zh_TW_radical_length;
314 }
315 else if ( rAlgorithm == "stroke" )
316 {
317 func = get_collator_data_zh_TW_stroke;
318 funclen = get_collator_data_zh_TW_stroke_length;
319 }
320 } else if ( rLocale.Language == "zh" ) {
321 if ( rAlgorithm == "charset" )
322 {
323 func = get_collator_data_zh_charset;
324 funclen = get_collator_data_zh_charset_length;
325 }
326 else if ( rAlgorithm == "pinyin" )
327 {
328 func = get_collator_data_zh_pinyin;
329 funclen = get_collator_data_zh_pinyin_length;
330 }
331 else if ( rAlgorithm == "radical" )
332 {
333 func = get_collator_data_zh_radical;
334 funclen = get_collator_data_zh_radical_length;
335 }
336 else if ( rAlgorithm == "stroke" )
337 {
338 func = get_collator_data_zh_stroke;
339 funclen = get_collator_data_zh_stroke_length;
340 }
341 else if ( rAlgorithm == "zhuyin" )
342 {
343 func = get_collator_data_zh_zhuyin;
344 funclen = get_collator_data_zh_zhuyin_length;
345 }
346#endif
347 }
348#endif // DISABLE_DYNLOADING
349 if (func && funclen) {
350 const sal_uInt8* ruleImage=func();
351 size_t ruleImageSize = funclen();
352
353 // Not only changed ICU 53.1 the API behavior that a negative
354 // length (ruleImageSize) now leads to failure, but also that
355 // the base RuleBasedCollator passed as uca_base here needs to
356 // have a base->tailoring == CollationRoot::getRoot() otherwise
357 // the init bails out as well, as it does for the previously
358 // used "empty" RuleBasedCollator.
359 // The default collator of the en-US locale would also fulfill
360 // the requirement. The collator of the actual locale or the
361 // NULL (default) locale does not.
362 uca_base.reset( static_cast<icu::RuleBasedCollator*>(icu::Collator::createInstance(
363 icu::Locale::getRoot(), status)) );
364 if (! U_SUCCESS(status)) {
365 OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
366 SAL_WARN("i18npool", message);
367 throw RuntimeException(message);
368 }
369 collator.reset( new icu::RuleBasedCollator(
370 reinterpret_cast<const uint8_t*>(ruleImage), ruleImageSize, uca_base.get(), status) );
371 if (! U_SUCCESS(status)) {
372 OUString message = "icu::RuleBasedCollator ctor failed: " + OUString::createFromAscii(u_errorName(status));
373 SAL_WARN("i18npool", message);
374 throw RuntimeException(message);
375 }
376 }
377 }
378 if (!collator) {
391 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale),
392 u"", rAlgorithm.isEmpty() ? OUString("") : "collation=" + rAlgorithm));
393
394 // FIXME: apparently we get here in LOKit case only. When the language is Japanese, we pass "ja@collation=phonetic (alphanumeric first)" to ICU
395 // and ICU does not like this (U_ILLEGAL_ARGUMENT_ERROR). Subsequently LOKit crashes, because collator is nullptr.
396 if (!strcmp(icuLocale.getLanguage(), "ja"))
397 icuLocale = icu::Locale::getJapanese();
398
399 // load ICU collator
400 collator.reset( static_cast<icu::RuleBasedCollator*>( icu::Collator::createInstance(icuLocale, status) ) );
401 if (! U_SUCCESS(status)) {
402 OUString message = "icu::Collator::createInstance() failed: " + OUString::createFromAscii(u_errorName(status));
403 SAL_WARN("i18npool", message);
404 throw RuntimeException(message);
405 }
406 }
407 }
408
409 if (options & CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT)
410 collator->setStrength(icu::Collator::PRIMARY);
411 else if (options & CollatorOptions::CollatorOptions_IGNORE_CASE)
412 collator->setStrength(icu::Collator::SECONDARY);
413 else
414 collator->setStrength(icu::Collator::TERTIARY);
415
416 return 0;
417}
418
419
420OUString SAL_CALL
422{
423 return implementationName;
424}
425
426sal_Bool SAL_CALL
427Collator_Unicode::supportsService(const OUString& rServiceName)
428{
429 return cppu::supportsService(this, rServiceName);
430}
431
432Sequence< OUString > SAL_CALL
434{
435 Sequence< OUString > aRet { OUString(implementationName) };
436 return aRet;
437}
438
439}
440
441/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
#define SAL_DLLPREFIX
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
virtual ~Collator_Unicode() override
std::unique_ptr< icu::RuleBasedCollator > uca_base
virtual OUString SAL_CALL getImplementationName() override
sal_Int32 SAL_CALL compareString(const OUString &s1, const OUString &s2) override
sal_Int32 SAL_CALL loadCollatorAlgorithm(const OUString &impl, const css::lang::Locale &rLocale, sal_Int32 collatorOptions) override
std::unique_ptr< icu::RuleBasedCollator > collator
sal_Int32 SAL_CALL compareSubstring(const OUString &s1, sal_Int32 off1, sal_Int32 len1, const OUString &s2, sal_Int32 off2, sal_Int32 len2) override
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
static rtl::Reference< LocaleDataImpl > get()
Definition: localedata.hxx:77
#define SAL_DLLEXTENSION
float u
#define SAL_WARN(area, stream)
aBuf
std::unique_ptr< sal_Int32[]> pData
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
Constant values shared between i18npool and, for example, the number formatter.
constexpr OUStringLiteral implementationName
static void thisModule()
Definition: xdictionary.cxx:41
unsigned char sal_uInt8
unsigned char sal_Bool