LibreOffice Module i18nutil (master)  1
casefolding.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <i18nutil/casefolding.hxx>
21 #include "casefolding_data.h"
25 #include <com/sun/star/lang/Locale.hpp>
26 #include <com/sun/star/uno/RuntimeException.hpp>
27 #include <rtl/character.hxx>
28 
29 using namespace com::sun::star::lang;
30 using namespace com::sun::star::uno;
31 
32 namespace i18nutil {
33 
34 const Mapping mapping_03a3[] = {{0, 1, {0x03c2, 0, 0}},{0, 1, {0x03c3, 0, 0}}};
35 const Mapping mapping_0307[] = {{0, 0, {0, 0, 0}},{0, 1, {0x0307, 0, 0}}};
36 const Mapping mapping_004a[] = {{0, 2, {0x006a, 0x0307, 0}},{0, 1, {0x006a, 0, 0}}};
37 const Mapping mapping_012e[] = {{0, 2, {0x012f, 0x0307, 0}},{0, 1, {0x012f, 0, 0}}};
38 const Mapping mapping_00cc[] = {{0, 3, {0x0069, 0x0307, 0x0300}},{0, 1, {0x00ec, 0, 0}}};
39 const Mapping mapping_00cd[] = {{0, 3, {0x0069, 0x0307, 0x0301}},{0, 1, {0x00ed, 0, 0}}};
40 const Mapping mapping_0128[] = {{0, 3, {0x0069, 0x0307, 0x0303}},{0, 1, {0x0129, 0, 0}}};
41 const Mapping mapping_0049[] = {{0, 2, {0x0069, 0x0307, 0}},{0, 1, {0x0131, 0, 0}},{0, 1, {0x0069, 0, 0}}};
42 const Mapping mapping_0069[] = {{0, 1, {0x0130, 0, 0}},{0, 1, {0x0049, 0, 0}}};
43 const Mapping mapping_0130[] = {{0, 1, {0x0069, 0, 0}},{0, 1, {0x0130, 0, 0}}};
44 
// True when the language of the `aLocale` variable in the expanding scope
// equals the given language code.
#define langIs(code) (aLocale.Language == code)

// Simplified "is an i-like letter" test: only plain 'i' (U+0069) and 'j'
// (U+006A) are recognised; more complicated cases are not checked yet.
#define type_i(ch) (0x0069 == (ch) || 0x006a == (ch))
49 
50 static bool cased_letter(sal_Unicode ch)
51 {
52  int msb = ch >> 8;
53  int cmi = CaseMappingIndex[msb];
54  if (cmi < 0)
55  return false;
56  int cmv_idx = (cmi << 8) + (ch & 0xff);
57  return bool(static_cast<MappingType>(CaseMappingValue[cmv_idx].type) & MappingType::CasedLetterMask);
58 }
59 
// Lithuanian: when a capital I or J is lowercased and further accents above
// follow, the dot above has to be made explicit. This macro recognises the
// combining marks that count as such an accent above.
#define accent_above(ch) \
    ( ((ch) >= 0x0300 && (ch) <= 0x0314) \
   || ((ch) >= 0x033D && (ch) <= 0x0344) \
   ||  (ch) == 0x0346 \
   || ((ch) >= 0x034A && (ch) <= 0x034C) )
63 
64 const Mapping& casefolding::getConditionalValue(const sal_Unicode* str, sal_Int32 pos, sal_Int32 len, Locale const & aLocale, MappingType nMappingType)
65 {
66  switch(str[pos]) {
67  case 0x03a3:
68  // final_sigma (not followed by cased and preceded by cased character)
69  // DOES NOT check ignorable sequence yet (more complicated implementation).
70  return !(pos < len && cased_letter(str[pos+1])) && (pos > 0 && cased_letter(str[pos-1])) ?
71  mapping_03a3[0] : mapping_03a3[1];
72  case 0x0307:
73  return (((nMappingType == MappingType::LowerToUpper && langIs("lt")) ||
74  (nMappingType == MappingType::UpperToLower && (langIs("tr") || langIs("az")))) &&
75  (pos > 0 && type_i(str[pos-1]))) ? // after_i
76  mapping_0307[0] : mapping_0307[1];
77  case 0x0130:
78  return (langIs("tr") || langIs("az")) ? mapping_0130[0] : mapping_0130[1];
79  case 0x0069:
80  return (langIs("tr") || langIs("az")) ? mapping_0069[0] : mapping_0069[1];
81  case 0x0049: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_0049[0] :
82  (langIs("tr") || langIs("az")) ? mapping_0049[1] : mapping_0049[2];
83  case 0x004a: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_004a[0] : mapping_004a[1];
84  case 0x012e: return langIs("lt") && pos > len && accent_above(str[pos+1]) ? mapping_012e[0] : mapping_012e[1];
85  case 0x00cc: return langIs("lt") ? mapping_00cc[0] : mapping_00cc[1];
86  case 0x00cd: return langIs("lt") ? mapping_00cd[0] : mapping_00cd[1];
87  case 0x0128: return langIs("lt") ? mapping_0128[0] : mapping_0128[1];
88  }
89  // Should not come here
90  throw RuntimeException();
91 }
92 
93 Mapping casefolding::getValue(const sal_Unicode* str, sal_Int32 pos, sal_Int32 len, Locale const & aLocale, MappingType nMappingType)
94 {
95  Mapping dummy = { 0, 1, { str[pos], 0, 0 } };
96 
97  sal_uInt32 c;
98  if (pos > 0 && rtl::isHighSurrogate(str[pos-1]) && rtl::isLowSurrogate(str[pos])) {
99  c = rtl::combineSurrogates(str[pos-1], str[pos]);
100  if (c >= SAL_N_ELEMENTS(CaseMappingIndex) * 256)
101  return dummy;
102  } else {
103  c = str[pos];
104  }
105 
106  sal_Int16 address = CaseMappingIndex[c >> 8];
107 
108  if (address >= 0) {
109  address = (address << 8) + (c & 0xFF);
110  if (static_cast<MappingType>(CaseMappingValue[address].type) & nMappingType) {
111  MappingType type = static_cast<MappingType>(CaseMappingValue[address].type);
112  if (type & MappingType::NotValue) {
113  if (CaseMappingValue[address].value == 0)
114  return getConditionalValue(str, pos, len, aLocale, nMappingType);
115  else {
116  for (int map = CaseMappingValue[address].value;
118  if (static_cast<MappingType>(CaseMappingExtra[map].type) & nMappingType) {
119  if (static_cast<MappingType>(CaseMappingExtra[map].type) & MappingType::NotValue)
120  return getConditionalValue(str, pos, len, aLocale, nMappingType);
121  else
122  return CaseMappingExtra[map];
123  }
124  }
125  // Should not come here
126  throw RuntimeException();
127  }
128  } else
129  dummy.map[0] = CaseMappingValue[address].value;
130  }
131  }
132  return dummy;
133 }
134 
135 static bool
137 {
138  if (next != 0x3099 && next != 0x309a)
139  return false;
140  sal_Unicode c = widthfolding::getCompositionChar(current, next);
141  if (c != 0)
142  current = c;
143  return c != 0;
144 }
145 
146 sal_Unicode casefolding::getNextChar(const sal_Unicode *str, sal_Int32& idx, sal_Int32 len, MappingElement& e, Locale const & aLocale, MappingType nMappingType, TransliterationFlags moduleLoaded)
147 {
148  if( idx >= len )
149  {
150  e = MappingElement();
151  return 0;
152  }
153 
154  sal_Unicode c;
155 
156  if (moduleLoaded & TransliterationFlags::IGNORE_CASE) {
157  if( e.current >= e.element.nmap ) {
158  e.element = getValue(str, idx++, len, aLocale, nMappingType);
159  e.current = 0;
160  }
161  c = e.element.map[e.current++];
162  } else {
163  c = *(str + idx++);
164  }
165 
166  if (moduleLoaded & TransliterationFlags::IGNORE_KANA) {
167  if ((0x3040 <= c && c <= 0x3094) || (0x309d <= c && c <= 0x309f))
168  c += 0x60;
169  }
170 
171  // composition: KA + voice-mark --> GA. see halfwidthToFullwidth.cxx for detail
172  if (moduleLoaded & TransliterationFlags::IGNORE_WIDTH) {
173  static oneToOneMapping& half2fullTable = widthfolding::gethalf2fullTable();
174  c = half2fullTable[c];
175  if (0x3040 <= c && c <= 0x30ff && idx < len &&
176  is_ja_voice_sound_mark(c, half2fullTable[*(str + idx)]))
177  idx++;
178  }
179 
180  return c;
181 }
182 
183 }
184 
185 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
#define accent_above(ch)
Definition: casefolding.cxx:62
const Mapping mapping_0069[]
Definition: casefolding.cxx:42
const Mapping mapping_0049[]
Definition: casefolding.cxx:41
sal_uInt8 type
Definition: casefolding.hxx:52
const Mapping mapping_0130[]
Definition: casefolding.cxx:43
Ignore full width and half width character when comparing strings by transliteration service...
const Mapping mapping_00cd[]
Definition: casefolding.cxx:39
const Mapping mapping_0307[]
Definition: casefolding.cxx:35
MappingType
Definition: casefolding.hxx:31
sal_uInt16 sal_Unicode
#define type_i(ch)
Definition: casefolding.cxx:48
const Mapping mapping_03a3[]
Definition: casefolding.cxx:34
#define MaxCaseMappingExtras
TransliterationFlags
This is a superset type of the com::sun::star::i18n::TransliterationModules and TransliterationModule...
#define SAL_N_ELEMENTS(arr)
const Mapping mapping_004a[]
Definition: casefolding.cxx:36
#define langIs(lang)
Definition: casefolding.cxx:45
Ignore case when comparing strings by transliteration service.
const Mapping mapping_00cc[]
Definition: casefolding.cxx:38
css::beans::Optional< css::uno::Any > getValue(std::u16string_view id)
static bool is_ja_voice_sound_mark(sal_Unicode &current, sal_Unicode next)
Ignore Hiragana and Katakana when comparing strings by transliteration service.
const sal_Int8 CaseMappingIndex[]
Mapping const CaseMappingExtra[]
std::map< OUString, rtl::Reference< Entity > > map
const Mapping mapping_012e[]
Definition: casefolding.cxx:37
ResultType type
const Value CaseMappingValue[]
sal_uInt16 value
Definition: casefolding.hxx:53
const Mapping mapping_0128[]
Definition: casefolding.cxx:40
static bool cased_letter(sal_Unicode ch)
Definition: casefolding.cxx:50