LibreOffice Module i18nutil (master) 1
casefolding.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
21#include "casefolding_data.h"
25#include <com/sun/star/lang/Locale.hpp>
26#include <com/sun/star/uno/RuntimeException.hpp>
27#include <rtl/character.hxx>
28
29#include <unicode/uchar.h>
30
31using namespace com::sun::star::lang;
32using namespace com::sun::star::uno;
33
34namespace i18nutil {
35
36const Mapping mapping_03a3[] = {{0, 1, {0x03c2, 0, 0}},{0, 1, {0x03c3, 0, 0}}};
37const Mapping mapping_0307[] = {{0, 0, {0, 0, 0}},{0, 1, {0x0307, 0, 0}}};
38const Mapping mapping_004a[] = {{0, 2, {0x006a, 0x0307, 0}},{0, 1, {0x006a, 0, 0}}};
39const Mapping mapping_012e[] = {{0, 2, {0x012f, 0x0307, 0}},{0, 1, {0x012f, 0, 0}}};
40const Mapping mapping_00cc[] = {{0, 3, {0x0069, 0x0307, 0x0300}},{0, 1, {0x00ec, 0, 0}}};
41const Mapping mapping_00cd[] = {{0, 3, {0x0069, 0x0307, 0x0301}},{0, 1, {0x00ed, 0, 0}}};
42const Mapping mapping_0128[] = {{0, 3, {0x0069, 0x0307, 0x0303}},{0, 1, {0x0129, 0, 0}}};
43const Mapping mapping_0049[] = {{0, 2, {0x0069, 0x0307, 0}},{0, 1, {0x0131, 0, 0}},{0, 1, {0x0069, 0, 0}}};
44const Mapping mapping_0069[] = {{0, 1, {0x0130, 0, 0}},{0, 1, {0x0049, 0, 0}}};
45const Mapping mapping_0130[] = {{0, 1, {0x0069, 0, 0}},{0, 1, {0x0130, 0, 0}}};
46
// True when the Locale named aLocale at the expansion site has the given language code.
#define langIs(lang) (aLocale.Language == lang)

// Simplified "preceded by i" test: only the plain letters i (U+0069) and j (U+006A)
// are recognised; the more complicated cases are not checked yet.
#define type_i(ch) ((ch) == 0x0069 || (ch) == 0x006a)
51
53{
54 int msb = ch >> 8;
55 int cmi = CaseMappingIndex[msb];
56 if (cmi < 0)
57 return false;
58 int cmv_idx = (cmi << 8) + (ch & 0xff);
59 return bool(static_cast<MappingType>(CaseMappingValue[cmv_idx].type) & MappingType::CasedLetterMask);
60}
61
// Lithuanian: when a capital I or J is lowercased and further accents above follow,
// the dot above has to be made explicit. This macro recognises the combining marks
// that are treated as such an accent above.
#define accent_above(ch) (((ch) >= 0x0300 && (ch) <= 0x0314) || ((ch) >= 0x033D && (ch) <= 0x0344) || (ch) == 0x0346 || ((ch) >= 0x034A && (ch) <= 0x034C))
65
66const Mapping& casefolding::getConditionalValue(const sal_Unicode* str, sal_Int32 pos, sal_Int32 len, Locale const & aLocale, MappingType nMappingType)
67{
68 switch(str[pos]) {
69 case 0x03a3:
70 // final_sigma (not followed by cased and preceded by cased character)
71 // DOES NOT check ignorable sequence yet (more complicated implementation).
72 return !(pos < len && cased_letter(str[pos+1])) && (pos > 0 && cased_letter(str[pos-1])) ?
74 case 0x0307:
75 return (((nMappingType == MappingType::LowerToUpper && langIs("lt")) ||
76 (nMappingType == MappingType::UpperToLower && (langIs("tr") || langIs("az")))) &&
77 (pos > 0 && type_i(str[pos-1]))) ? // after_i
79 case 0x0130:
80 return (langIs("tr") || langIs("az")) ? mapping_0130[0] : mapping_0130[1];
81 case 0x0069:
82 return (langIs("tr") || langIs("az")) ? mapping_0069[0] : mapping_0069[1];
83 case 0x0049: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_0049[0] :
84 (langIs("tr") || langIs("az")) ? mapping_0049[1] : mapping_0049[2];
85 case 0x004a: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_004a[0] : mapping_004a[1];
86 case 0x012e: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_012e[0] : mapping_012e[1];
87 case 0x00cc: return langIs("lt") ? mapping_00cc[0] : mapping_00cc[1];
88 case 0x00cd: return langIs("lt") ? mapping_00cd[0] : mapping_00cd[1];
89 case 0x0128: return langIs("lt") ? mapping_0128[0] : mapping_0128[1];
90 }
91 // Should not come here
92 throw RuntimeException();
93}
94
// Returns the case mapping for the character that starts at str[pos].
//
// str/pos/len  UTF-16 text, index of the code unit to map, and text length.
// aLocale      forwarded to getConditionalValue() for locale-dependent mappings.
// nMappingType requested mapping; a table entry is only used if its type has
//              a bit in common with it.
//
// The result is returned by value. If no table entry applies, ICU's
// per-code-point functions provide the mapping. Throws RuntimeException when
// the scan of CaseMappingExtra finds no entry.
//
// NOTE(review): this listing has lost some source lines (the continuation of
// the for-header and the case labels of the ICU switch); see the notes below.
95Mapping casefolding::getValue(const sal_Unicode* str, sal_Int32 pos, sal_Int32 len, Locale const & aLocale, MappingType nMappingType)
96{
// Low half of a surrogate pair: return an empty mapping (nmap == 0). The pair
// as a whole is looked up when pos is at its high surrogate (see below).
97 if (pos > 0 && rtl::isHighSurrogate(str[pos-1]) && rtl::isLowSurrogate(str[pos]))
98 return { 0, 0, { 0, 0, 0 } };
99
// Default result: the code unit mapped to itself.
100 Mapping dummy = { 0, 1, { str[pos], 0, 0 } };
101
// c is the full code point: a surrogate pair is combined, anything else is
// taken as the single code unit.
102 sal_uInt32 c;
103 if (pos + 1 < len && rtl::isHighSurrogate(str[pos]) && rtl::isLowSurrogate(str[pos + 1]))
104 c = rtl::combineSurrogates(str[pos], str[pos + 1]);
105 else
106 c = str[pos];
107
// Look up the 256-entry block for c, but only if c is inside the range the
// index table covers; address stays -1 otherwise.
108 sal_Int16 address = -1;
109 if (c < SAL_N_ELEMENTS(CaseMappingIndex) * 256)
110 address = CaseMappingIndex[c >> 8];
111
112 if (address >= 0) {
// Block number * 256 + low byte gives the entry in CaseMappingValue.
113 address = (address << 8) + (c & 0xFF);
114 if (static_cast<MappingType>(CaseMappingValue[address].type) & nMappingType) {
115 MappingType type = static_cast<MappingType>(CaseMappingValue[address].type);
// NotValue: .value is not the mapped character. 0 selects the conditional
// mapping; any other value is a start index into CaseMappingExtra.
116 if (type & MappingType::NotValue) {
117 if (CaseMappingValue[address].value == 0)
118 return getConditionalValue(str, pos, len, aLocale, nMappingType);
119 else {
// NOTE(review): the rest of this for-header (loop condition and increment)
// is missing from the listing.
120 for (int map = CaseMappingValue[address].value;
122 if (static_cast<MappingType>(CaseMappingExtra[map].type) & nMappingType) {
123 if (static_cast<MappingType>(CaseMappingExtra[map].type) & MappingType::NotValue)
124 return getConditionalValue(str, pos, len, aLocale, nMappingType);
125 else
126 return CaseMappingExtra[map];
127 }
128 }
129 // Should not come here
130 throw RuntimeException();
131 }
132 }
133 else
134 {
// Simple one-to-one mapping: .value is the mapped code unit.
135 dummy.map[0] = CaseMappingValue[address].value;
136 return dummy;
137 }
138 }
139 }
140
141 // If the code point is not supported by our case mapping tables,
142 // fallback to ICU functions.
143 // TODO: this does not handle special case mapping as these require
144 // using ustring.h APIs, which work on the whole string not character
145 // by character.
146 // TODO: what is the difference between ToLower and UpperToLower etc.?
147 sal_uInt32 value = c;
148 switch (nMappingType)
149 {
// NOTE(review): the case labels in front of each assignment below are
// missing from the listing.
152 value = u_tolower(c);
153 break;
156 value = u_toupper(c);
157 break;
159 value = u_totitle(c);
160 break;
163 value = u_foldCase(c, U_FOLD_CASE_DEFAULT);
164 break;
165 default: break;
166 }
167
// Store the resulting code point as UTF-16 in dummy.map; the return value of
// splitSurrogates becomes the number of mapped code units.
168 dummy.nmap = rtl::splitSurrogates(value, dummy.map);
169
170 return dummy;
171}
172
173static bool
175{
176 if (next != 0x3099 && next != 0x309a)
177 return false;
178 sal_Unicode c = widthfolding::getCompositionChar(current, next);
179 if (c != 0)
180 current = c;
181 return c != 0;
182}
183
184sal_Unicode casefolding::getNextChar(const sal_Unicode *str, sal_Int32& idx, sal_Int32 len, MappingElement& e, Locale const & aLocale, MappingType nMappingType, TransliterationFlags moduleLoaded)
185{
186 if( idx >= len )
187 {
188 e = MappingElement();
189 return 0;
190 }
191
192 sal_Unicode c;
193
194 if (moduleLoaded & TransliterationFlags::IGNORE_CASE) {
195 if( e.current >= e.element.nmap ) {
196 e.element = getValue(str, idx++, len, aLocale, nMappingType);
197 e.current = 0;
198 }
199 c = e.element.map[e.current++];
200 } else {
201 c = *(str + idx++);
202 }
203
204 if (moduleLoaded & TransliterationFlags::IGNORE_KANA) {
205 if ((0x3040 <= c && c <= 0x3094) || (0x309d <= c && c <= 0x309f))
206 c += 0x60;
207 }
208
209 // composition: KA + voice-mark --> GA. see halfwidthToFullwidth.cxx for detail
210 if (moduleLoaded & TransliterationFlags::IGNORE_WIDTH) {
211 static oneToOneMapping& half2fullTable = widthfolding::gethalf2fullTable();
212 c = half2fullTable[c];
213 if (0x3040 <= c && c <= 0x30ff && idx < len &&
214 is_ja_voice_sound_mark(c, half2fullTable[*(str + idx)]))
215 idx++;
216 }
217
218 return c;
219}
220
221}
222
223/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
#define langIs(lang)
Definition: casefolding.cxx:47
#define type_i(ch)
Definition: casefolding.cxx:50
#define accent_above(ch)
Definition: casefolding.cxx:64
MappingType
Definition: casefolding.hxx:31
#define MaxCaseMappingExtras
Any value
const sal_uInt16 idx[]
#define SAL_N_ELEMENTS(arr)
const Mapping mapping_03a3[]
Definition: casefolding.cxx:36
const Mapping mapping_0130[]
Definition: casefolding.cxx:45
const Mapping mapping_0069[]
Definition: casefolding.cxx:44
const Mapping mapping_0049[]
Definition: casefolding.cxx:43
const Mapping mapping_0307[]
Definition: casefolding.cxx:37
const Mapping mapping_00cd[]
Definition: casefolding.cxx:41
static bool cased_letter(sal_Unicode ch)
Definition: casefolding.cxx:52
Mapping const CaseMappingExtra[]
const sal_Int8 CaseMappingIndex[]
const Mapping mapping_004a[]
Definition: casefolding.cxx:38
const Mapping mapping_00cc[]
Definition: casefolding.cxx:40
const Mapping mapping_0128[]
Definition: casefolding.cxx:42
const Mapping mapping_012e[]
Definition: casefolding.cxx:39
const Value CaseMappingValue[]
static bool is_ja_voice_sound_mark(sal_Unicode &current, sal_Unicode next)
css::beans::Optional< css::uno::Any > getValue(std::u16string_view id)
std::map< OUString, rtl::Reference< Entity > > map
sal_uInt16 value
Definition: casefolding.hxx:53
sal_uInt8 type
Definition: casefolding.hxx:52
TransliterationFlags
This is a superset type of the com::sun::star::i18n::TransliterationModules and TransliterationModule...
@ IGNORE_WIDTH
Ignore full width and half width character when comparing strings by transliteration service.
@ IGNORE_KANA
Ignore Hiragana and Katakana when comparing strings by transliteration service.
@ IGNORE_CASE
Ignore case when comparing strings by transliteration service.
sal_uInt16 sal_Unicode
ResultType type
size_t pos