LibreOffice Module i18nutil (master)  1
unicode.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include "unicode_data.h"
28 #include <rtl/character.hxx>
29 #include <memory>
30 
31 // Workaround for glibc braindamage:
32 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
33 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
34 #undef CURRENCY_SYMBOL
35 
36 using namespace ::com::sun::star::i18n;
37 
38 template<class L, typename T>
39 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
40 
41  sal_Int16 i = 0;
42  css::i18n::UnicodeScript type = typeList[0].to;
43  while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
44  type = typeList[++i].to;
45  }
46 
47  return (type < UnicodeScript_kScriptCount &&
48  ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
49  typeList[i].value : unknownType;
50 }
51 
52 sal_Int16
53 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
54  return getScriptType(ch, typeList, unknownType);
55 }
56 
58 unicode::getUnicodeScriptStart( UnicodeScript type) {
59  return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
60 }
61 
63 unicode::getUnicodeScriptEnd( UnicodeScript type) {
64  return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
65 }
66 
67 sal_Int16
69  static sal_Unicode c = 0x00;
70  static sal_Int16 r = 0x00;
71 
72  if (ch == c) return r;
73  else c = ch;
74 
75  sal_Int16 address = UnicodeTypeIndex[ch >> 8];
76  r = static_cast<sal_Int16>(
77  (address < UnicodeTypeNumberBlock)
78  ? UnicodeTypeBlockValue[address]
79  : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
80  return r;
81 }
82 
85  static sal_Unicode c = 0x00;
86  static sal_uInt8 r = 0x00;
87 
88  if (ch == c) return r;
89  else c = ch;
90 
91  sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
92  r = (address < UnicodeDirectionNumberBlock)
94  : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
95  return r;
96 }
97 
// Single-bit mask for a small non-negative integer (a UnicodeType constant or
// a control character code). The argument is parenthesised so that compound
// expressions such as bit(a | b) shift the whole expression.
#define bit(name) (1U << (name))

#define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)

// The combined masks are wrapped in parentheses so they stay one operand
// wherever they are expanded.
#define ALPHAMASK (UPPERMASK|LOWERMASK|TITLEMASK|\
    bit(UnicodeType::MODIFIER_LETTER)|\
    bit(UnicodeType::OTHER_LETTER))

#define SPACEMASK (bit(UnicodeType::SPACE_SEPARATOR)|\
    bit(UnicodeType::LINE_SEPARATOR)|\
    bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
    bit(UnicodeType::FORMAT)|\
    bit(UnicodeType::LINE_SEPARATOR)|\
    bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Defines a predicate 'func' that is true when the type of ch (as returned by
// getUnicodeType) is one of the types whose bits are set in 'mask'.
#define IsType(func, mask) \
bool func( const sal_Unicode ch) {\
    return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
123 
125 IsType(unicode::isAlpha, ALPHAMASK)
126 IsType(unicode::isSpace, SPACEMASK)
127 
128 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
129  bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
130 
131 bool unicode::isWhiteSpace( const sal_Unicode ch) {
132  return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
133 }
134 
135 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
136 {
137  //See unicode/uscript.h
138  static const sal_Int16 scriptTypes[] =
139  {
140  ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
141  ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
142  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
143  // 15
144  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
145  ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
146  ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
147  // 30
148  ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
149  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
150  ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
151  // 45
152  ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
153  ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
154  ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
155  // 60
156  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
157  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
158  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
159  // 75
160  ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
161  ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
162  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
163  // 90
164  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
165  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
166  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
167  // 105
168  ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
169  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
170  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
171  // 120
172  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
173  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
174  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
175  // 135
176  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
177  ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
178  ScriptType::COMPLEX,
179  ScriptType::WEAK
180  };
181 
182  sal_Int16 nRet;
183  if (eScript < USCRIPT_COMMON)
184  nRet = ScriptType::WEAK;
185  else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
186  nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
187  else
188  nRet = scriptTypes[eScript];
189  return nRet;
190 }
191 
192 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
193 {
194  OString sRet;
195  switch (eScript)
196  {
197  case USCRIPT_CODE_LIMIT:
198  case USCRIPT_INVALID_CODE:
199  sRet = "zxx";
200  break;
201  case USCRIPT_COMMON:
202  case USCRIPT_INHERITED:
203  sRet = "und";
204  break;
205  case USCRIPT_MATHEMATICAL_NOTATION:
206  case USCRIPT_SYMBOLS:
207  sRet = "zxx";
208  break;
209  case USCRIPT_UNWRITTEN_LANGUAGES:
210  case USCRIPT_UNKNOWN:
211  sRet = "und";
212  break;
213  case USCRIPT_ARABIC:
214  sRet = "ar";
215  break;
216  case USCRIPT_ARMENIAN:
217  sRet = "hy";
218  break;
219  case USCRIPT_BENGALI:
220  sRet = "bn";
221  break;
222  case USCRIPT_BOPOMOFO:
223  sRet = "zh";
224  break;
225  case USCRIPT_CHEROKEE:
226  sRet = "chr";
227  break;
228  case USCRIPT_COPTIC:
229  sRet = "cop";
230  break;
231  case USCRIPT_CYRILLIC:
232  sRet = "ru";
233  break;
234  case USCRIPT_DESERET:
235  sRet = "en";
236  break;
237  case USCRIPT_DEVANAGARI:
238  sRet = "hi";
239  break;
240  case USCRIPT_ETHIOPIC:
241  sRet = "am";
242  break;
243  case USCRIPT_GEORGIAN:
244  sRet = "ka";
245  break;
246  case USCRIPT_GOTHIC:
247  sRet = "got";
248  break;
249  case USCRIPT_GREEK:
250  sRet = "el";
251  break;
252  case USCRIPT_GUJARATI:
253  sRet = "gu";
254  break;
255  case USCRIPT_GURMUKHI:
256  sRet = "pa";
257  break;
258  case USCRIPT_HAN:
259  sRet = "zh";
260  break;
261  case USCRIPT_HANGUL:
262  sRet = "ko";
263  break;
264  case USCRIPT_HEBREW:
265  sRet = "hr";
266  break;
267  case USCRIPT_HIRAGANA:
268  sRet = "ja";
269  break;
270  case USCRIPT_KANNADA:
271  sRet = "kn";
272  break;
273  case USCRIPT_KATAKANA:
274  sRet = "ja";
275  break;
276  case USCRIPT_KHMER:
277  sRet = "km";
278  break;
279  case USCRIPT_LAO:
280  sRet = "lo";
281  break;
282  case USCRIPT_LATIN:
283  sRet = "en";
284  break;
285  case USCRIPT_MALAYALAM:
286  sRet = "ml";
287  break;
288  case USCRIPT_MONGOLIAN:
289  sRet = "mn";
290  break;
291  case USCRIPT_MYANMAR:
292  sRet = "my";
293  break;
294  case USCRIPT_OGHAM:
295  sRet = "pgl";
296  break;
297  case USCRIPT_OLD_ITALIC:
298  sRet = "osc";
299  break;
300  case USCRIPT_ORIYA:
301  sRet = "or";
302  break;
303  case USCRIPT_RUNIC:
304  sRet = "ang";
305  break;
306  case USCRIPT_SINHALA:
307  sRet = "si";
308  break;
309  case USCRIPT_SYRIAC:
310  sRet = "syr";
311  break;
312  case USCRIPT_TAMIL:
313  sRet = "ta";
314  break;
315  case USCRIPT_TELUGU:
316  sRet = "te";
317  break;
318  case USCRIPT_THAANA:
319  sRet = "dv";
320  break;
321  case USCRIPT_THAI:
322  sRet = "th";
323  break;
324  case USCRIPT_TIBETAN:
325  sRet = "bo";
326  break;
327  case USCRIPT_CANADIAN_ABORIGINAL:
328  sRet = "iu";
329  break;
330  case USCRIPT_YI:
331  sRet = "ii";
332  break;
333  case USCRIPT_TAGALOG:
334  sRet = "tl";
335  break;
336  case USCRIPT_HANUNOO:
337  sRet = "hnn";
338  break;
339  case USCRIPT_BUHID:
340  sRet = "bku";
341  break;
342  case USCRIPT_TAGBANWA:
343  sRet = "tbw";
344  break;
345  case USCRIPT_BRAILLE:
346  sRet = "en";
347  break;
348  case USCRIPT_CYPRIOT:
349  sRet = "ecy";
350  break;
351  case USCRIPT_LIMBU:
352  sRet = "lif";
353  break;
354  case USCRIPT_LINEAR_B:
355  sRet = "gmy";
356  break;
357  case USCRIPT_OSMANYA:
358  sRet = "so";
359  break;
360  case USCRIPT_SHAVIAN:
361  sRet = "en";
362  break;
363  case USCRIPT_TAI_LE:
364  sRet = "tdd";
365  break;
366  case USCRIPT_UGARITIC:
367  sRet = "uga";
368  break;
369  case USCRIPT_KATAKANA_OR_HIRAGANA:
370  sRet = "ja";
371  break;
372  case USCRIPT_BUGINESE:
373  sRet = "bug";
374  break;
375  case USCRIPT_GLAGOLITIC:
376  sRet = "ch";
377  break;
378  case USCRIPT_KHAROSHTHI:
379  sRet = "pra";
380  break;
381  case USCRIPT_SYLOTI_NAGRI:
382  sRet = "syl";
383  break;
384  case USCRIPT_NEW_TAI_LUE:
385  sRet = "khb";
386  break;
387  case USCRIPT_TIFINAGH:
388  sRet = "tmh";
389  break;
390  case USCRIPT_OLD_PERSIAN:
391  sRet = "peo";
392  break;
393  case USCRIPT_BALINESE:
394  sRet = "ban";
395  break;
396  case USCRIPT_BATAK:
397  sRet = "btk";
398  break;
399  case USCRIPT_BLISSYMBOLS:
400  sRet = "en";
401  break;
402  case USCRIPT_BRAHMI:
403  sRet = "pra";
404  break;
405  case USCRIPT_CHAM:
406  sRet = "cja";
407  break;
408  case USCRIPT_CIRTH:
409  sRet = "sjn";
410  break;
411  case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
412  sRet = "cu";
413  break;
414  case USCRIPT_DEMOTIC_EGYPTIAN:
415  case USCRIPT_HIERATIC_EGYPTIAN:
416  case USCRIPT_EGYPTIAN_HIEROGLYPHS:
417  sRet = "egy";
418  break;
419  case USCRIPT_KHUTSURI:
420  sRet = "ka";
421  break;
422  case USCRIPT_SIMPLIFIED_HAN:
423  sRet = "zh";
424  break;
425  case USCRIPT_TRADITIONAL_HAN:
426  sRet = "zh";
427  break;
428  case USCRIPT_PAHAWH_HMONG:
429  sRet = "blu";
430  break;
431  case USCRIPT_OLD_HUNGARIAN:
432  sRet = "ohu";
433  break;
434  case USCRIPT_HARAPPAN_INDUS:
435  sRet = "xiv";
436  break;
437  case USCRIPT_JAVANESE:
438  sRet = "kaw";
439  break;
440  case USCRIPT_KAYAH_LI:
441  sRet = "eky";
442  break;
443  case USCRIPT_LATIN_FRAKTUR:
444  sRet = "de";
445  break;
446  case USCRIPT_LATIN_GAELIC:
447  sRet = "ga";
448  break;
449  case USCRIPT_LEPCHA:
450  sRet = "lep";
451  break;
452  case USCRIPT_LINEAR_A:
453  sRet = "ecr";
454  break;
455  case USCRIPT_MAYAN_HIEROGLYPHS:
456  sRet = "myn";
457  break;
458  case USCRIPT_MEROITIC:
459  sRet = "xmr";
460  break;
461  case USCRIPT_NKO:
462  sRet = "nqo";
463  break;
464  case USCRIPT_ORKHON:
465  sRet = "otk";
466  break;
467  case USCRIPT_OLD_PERMIC:
468  sRet = "kv";
469  break;
470  case USCRIPT_PHAGS_PA:
471  sRet = "xng";
472  break;
473  case USCRIPT_PHOENICIAN:
474  sRet = "phn";
475  break;
476  case USCRIPT_PHONETIC_POLLARD:
477  sRet = "hmd";
478  break;
479  case USCRIPT_RONGORONGO:
480  sRet = "rap";
481  break;
482  case USCRIPT_SARATI:
483  sRet = "qya";
484  break;
485  case USCRIPT_ESTRANGELO_SYRIAC:
486  sRet = "syr";
487  break;
488  case USCRIPT_WESTERN_SYRIAC:
489  sRet = "tru";
490  break;
491  case USCRIPT_EASTERN_SYRIAC:
492  sRet = "aii";
493  break;
494  case USCRIPT_TENGWAR:
495  sRet = "sjn";
496  break;
497  case USCRIPT_VAI:
498  sRet = "vai";
499  break;
500  case USCRIPT_VISIBLE_SPEECH:
501  sRet = "en";
502  break;
503  case USCRIPT_CUNEIFORM:
504  sRet = "akk";
505  break;
506  case USCRIPT_CARIAN:
507  sRet = "xcr";
508  break;
509  case USCRIPT_JAPANESE:
510  sRet = "ja";
511  break;
512  case USCRIPT_LANNA:
513  sRet = "nod";
514  break;
515  case USCRIPT_LYCIAN:
516  sRet = "xlc";
517  break;
518  case USCRIPT_LYDIAN:
519  sRet = "xld";
520  break;
521  case USCRIPT_OL_CHIKI:
522  sRet = "sat";
523  break;
524  case USCRIPT_REJANG:
525  sRet = "rej";
526  break;
527  case USCRIPT_SAURASHTRA:
528  sRet = "saz";
529  break;
530  case USCRIPT_SIGN_WRITING:
531  sRet = "en";
532  break;
533  case USCRIPT_SUNDANESE:
534  sRet = "su";
535  break;
536  case USCRIPT_MOON:
537  sRet = "en";
538  break;
539  case USCRIPT_MEITEI_MAYEK:
540  sRet = "mni";
541  break;
542  case USCRIPT_IMPERIAL_ARAMAIC:
543  sRet = "arc";
544  break;
545  case USCRIPT_AVESTAN:
546  sRet = "ae";
547  break;
548  case USCRIPT_CHAKMA:
549  sRet = "ccp";
550  break;
551  case USCRIPT_KOREAN:
552  sRet = "ko";
553  break;
554  case USCRIPT_KAITHI:
555  sRet = "awa";
556  break;
557  case USCRIPT_MANICHAEAN:
558  sRet = "xmn";
559  break;
560  case USCRIPT_INSCRIPTIONAL_PAHLAVI:
561  case USCRIPT_PSALTER_PAHLAVI:
562  case USCRIPT_BOOK_PAHLAVI:
563  case USCRIPT_INSCRIPTIONAL_PARTHIAN:
564  sRet = "xpr";
565  break;
566  case USCRIPT_SAMARITAN:
567  sRet = "heb";
568  break;
569  case USCRIPT_TAI_VIET:
570  sRet = "blt";
571  break;
572  case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
573  sRet = "mic";
574  break;
575  case USCRIPT_NABATAEAN: //no language with an assigned code yet
576  sRet = "mis";
577  break;
578  case USCRIPT_PALMYRENE: //no language with an assigned code yet
579  sRet = "mis";
580  break;
581  case USCRIPT_BAMUM:
582  sRet = "bax";
583  break;
584  case USCRIPT_LISU:
585  sRet = "lis";
586  break;
587  case USCRIPT_NAKHI_GEBA:
588  sRet = "nxq";
589  break;
590  case USCRIPT_OLD_SOUTH_ARABIAN:
591  sRet = "xsa";
592  break;
593  case USCRIPT_BASSA_VAH:
594  sRet = "bsq";
595  break;
596  case USCRIPT_DUPLOYAN_SHORTAND:
597  sRet = "fr";
598  break;
599  case USCRIPT_ELBASAN:
600  sRet = "sq";
601  break;
602  case USCRIPT_GRANTHA:
603  sRet = "ta";
604  break;
605  case USCRIPT_KPELLE:
606  sRet = "kpe";
607  break;
608  case USCRIPT_LOMA:
609  sRet = "lom";
610  break;
611  case USCRIPT_MENDE:
612  sRet = "men";
613  break;
614  case USCRIPT_MEROITIC_CURSIVE:
615  sRet = "xmr";
616  break;
617  case USCRIPT_OLD_NORTH_ARABIAN:
618  sRet = "xna";
619  break;
620  case USCRIPT_SINDHI:
621  sRet = "sd";
622  break;
623  case USCRIPT_WARANG_CITI:
624  sRet = "hoc";
625  break;
626 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
627  case USCRIPT_AFAKA:
628  sRet = "djk";
629  break;
630  case USCRIPT_JURCHEN:
631  sRet = "juc";
632  break;
633  case USCRIPT_MRO:
634  sRet = "cmr";
635  break;
636  case USCRIPT_NUSHU: //no language with an assigned code yet
637  sRet = "mis";
638  break;
639  case USCRIPT_SHARADA:
640  sRet = "sa";
641  break;
642  case USCRIPT_SORA_SOMPENG:
643  sRet = "srb";
644  break;
645  case USCRIPT_TAKRI:
646  sRet = "doi";
647  break;
648  case USCRIPT_TANGUT:
649  sRet = "txg";
650  break;
651  case USCRIPT_WOLEAI:
652  sRet = "woe";
653  break;
654 #endif
655 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
656  case USCRIPT_ANATOLIAN_HIEROGLYPHS:
657  sRet = "hlu";
658  break;
659  case USCRIPT_KHOJKI:
660  sRet = "gu";
661  break;
662  case USCRIPT_TIRHUTA:
663  sRet = "mai";
664  break;
665 #endif
666 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
667  case USCRIPT_CAUCASIAN_ALBANIAN:
668  sRet = "xag";
669  break;
670  case USCRIPT_MAHAJANI:
671  sRet = "mwr";
672  break;
673 #endif
674 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
675  case USCRIPT_AHOM:
676  sRet = "aho";
677  break;
678  case USCRIPT_HATRAN:
679  sRet = "qly-Hatr";
680  break;
681  case USCRIPT_MODI:
682  sRet = "mr-Modi";
683  break;
684  case USCRIPT_MULTANI:
685  sRet = "skr-Mutl";
686  break;
687  case USCRIPT_PAU_CIN_HAU:
688  sRet = "ctd-Pauc";
689  break;
690  case USCRIPT_SIDDHAM:
691  sRet = "sa-Sidd";
692  break;
693 #endif
694 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
695  case USCRIPT_ADLAM:
696  sRet = "mis"; // Adlm - Adlam for Fulani, no language code
697  break;
698  case USCRIPT_BHAIKSUKI:
699  sRet = "mis"; // Bhks - Bhaiksuki for some Buddhist texts, no language code
700  break;
701  case USCRIPT_MARCHEN:
702  sRet = "bo-Marc";
703  break;
704  case USCRIPT_NEWA:
705  sRet = "new-Newa";
706  break;
707  case USCRIPT_OSAGE:
708  sRet = "osa-Osge";
709  break;
710  case USCRIPT_HAN_WITH_BOPOMOFO:
711  sRet = "mis"; // Hanb - Han with Bopomofo, zh-Hanb ?
712  break;
713  case USCRIPT_JAMO:
714  sRet = "ko"; // Jamo - elements of Hangul Syllables
715  break;
716  case USCRIPT_SYMBOLS_EMOJI:
717  sRet = "mis"; // Zsye - Emoji variant
718  break;
719 #endif
720 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
721  case USCRIPT_MASARAM_GONDI:
722  sRet = "gon-Gonm"; // macro language code, could be wsg,esg,gno
723  break;
724  case USCRIPT_SOYOMBO:
725  sRet = "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
726  break;
727  case USCRIPT_ZANABAZAR_SQUARE:
728  sRet = "mn-Zanb"; // abugida to write Mongolian
729  break;
730 #endif
731 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
732  case USCRIPT_DOGRA:
733  sRet = "dgo"; // Dogri proper
734  break;
735  case USCRIPT_GUNJALA_GONDI:
736  sRet = "wsg"; // Adilabad Gondi
737  break;
738  case USCRIPT_MAKASAR:
739  sRet = "mak";
740  break;
741  case USCRIPT_MEDEFAIDRIN:
742  sRet = "mis-Medf"; // Uncoded with script
743  break;
744  case USCRIPT_HANIFI_ROHINGYA:
745  sRet = "rhg";
746  break;
747  case USCRIPT_SOGDIAN:
748  sRet = "sog";
749  break;
750  case USCRIPT_OLD_SOGDIAN:
751  sRet = "sog";
752  break;
753 #endif
754 #if (U_ICU_VERSION_MAJOR_NUM >= 64)
755  case USCRIPT_ELYMAIC:
756  sRet = "arc-Elym";
757  break;
758  case USCRIPT_NYIAKENG_PUACHUE_HMONG:
759  sRet = "hmn-Hmnp"; // macrolanguage code
760  break;
761  case USCRIPT_NANDINAGARI:
762  sRet = "sa-Nand";
763  break;
764  case USCRIPT_WANCHO:
765  sRet = "nnp-Wcho";
766  break;
767 #endif
768 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
769  case USCRIPT_CHORASMIAN:
770  sRet = "xco-Chrs";
771  break;
772  case USCRIPT_DIVES_AKURU:
773  sRet = "dv-Diak";
774  break;
775  case USCRIPT_KHITAN_SMALL_SCRIPT:
776  sRet = "zkt-Kits";
777  break;
778  case USCRIPT_YEZIDI:
779  sRet = "kmr-Yezi";
780  break;
781 #endif
782  }
783  return sRet;
784 }
785 
786 //Format a number as a percentage according to the rules of the given
787 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
788 OUString unicode::formatPercent(double dNumber,
789  const LanguageTag &rLangTag)
790 {
791  // get a currency formatter for this locale ID
792  UErrorCode errorCode=U_ZERO_ERROR;
793 
794  LanguageTag aLangTag(rLangTag);
795 
796  // As of CLDR Version 24 these languages were not listed as using spacing
797  // between number and % but are reported as such by our l10n groups
798  // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
799  // so format using French which has the desired rules
800  if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
801  aLangTag.reset("fr-FR");
802 
803  icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
804 
805  std::unique_ptr<icu::NumberFormat> xF(
806  icu::NumberFormat::createPercentInstance(aLocale, errorCode));
807  if(U_FAILURE(errorCode))
808  {
809  SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
810  return OUString::number(dNumber) + "%";
811  }
812 
813  icu::UnicodeString output;
814  xF->format(dNumber/100, output);
815  OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
816  output.length());
817  if (rLangTag.getLanguage() == "de")
818  {
819  //narrow no-break space instead of (normal) no-break space
820  return aRet.replace(0x00A0, 0x202F);
821  }
822  return aRet;
823 }
824 
826 {
827  //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
828  if( maInput.getLength() > 255 )
829  mbAllowMoreChars = false;
830 
831  if( !mbAllowMoreChars )
832  return false;
833 
834  bool bPreventNonHex = false;
835  if( maInput.indexOf("U+") != -1 )
836  bPreventNonHex = true;
837 
838  switch ( unicode::getUnicodeType(uChar) )
839  {
840  case css::i18n::UnicodeType::SURROGATE:
841  if( bPreventNonHex )
842  {
843  mbAllowMoreChars = false;
844  return false;
845  }
846 
847  if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
848  {
849  maUtf16.append(uChar);
850  return true;
851  }
852  if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
853  maUtf16.insert(0, uChar );
854  //end of hex strings, or unexpected order of high/low, so don't accept more
855  if( !maUtf16.isEmpty() )
856  maInput.append(maUtf16);
857  if( !maCombining.isEmpty() )
858  maInput.append(maCombining);
859  mbAllowMoreChars = false;
860  break;
861 
862  case css::i18n::UnicodeType::NON_SPACING_MARK:
863  case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
864  if( bPreventNonHex )
865  {
866  mbAllowMoreChars = false;
867  return false;
868  }
869 
870  //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
871  if( !maUtf16.isEmpty() )
872  {
873  maInput = maUtf16;
874  if( !maCombining.isEmpty() )
875  maInput.append(maCombining);
876  mbAllowMoreChars = false;
877  return false;
878  }
879  maCombining.insert(0, uChar);
880  break;
881 
882  default:
883  //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
884  if( !maUtf16.isEmpty() )
885  {
886  maInput = maUtf16;
887  if( !maCombining.isEmpty() )
888  maInput.append(maCombining);
889  mbAllowMoreChars = false;
890  return false;
891  }
892 
893  if( !maCombining.isEmpty() )
894  {
895  maCombining.insert(0, uChar);
897  mbAllowMoreChars = false;
898  return false;
899  }
900 
901  // 0 - 1f are control characters. Do not process those.
902  if( uChar < 0x20 )
903  {
904  mbAllowMoreChars = false;
905  return false;
906  }
907 
908  switch( uChar )
909  {
910  case 'u':
911  case 'U':
912  // U+ notation found. Continue looking for another one.
913  if( mbRequiresU )
914  {
915  mbRequiresU = false;
916  maInput.insert(0,"U+");
917  }
918  // treat as a normal character
919  else
920  {
921  mbAllowMoreChars = false;
922  if( !bPreventNonHex )
923  maInput.insertUtf32(0, uChar);
924  }
925  break;
926  case '+':
927  // + already found: skip when not U, or edge case of +U+xxxx
928  if( mbRequiresU || (maInput.indexOf("U+") == 0) )
929  mbAllowMoreChars = false;
930  // hex chars followed by '+' - now require a 'U'
931  else if ( !maInput.isEmpty() )
932  mbRequiresU = true;
933  // treat as a normal character
934  else
935  {
936  mbAllowMoreChars = false;
937  if( !bPreventNonHex )
938  maInput.insertUtf32(0, uChar);
939  }
940  break;
941  default:
942  // + already found. Since not U, cancel further input
943  if( mbRequiresU )
944  mbAllowMoreChars = false;
945  // maximum digits per notation is 8: only one notation
946  else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
947  mbAllowMoreChars = false;
948  // maximum digits per notation is 8: previous notation found
949  else if( maInput.indexOf("U+") == 8 )
950  mbAllowMoreChars = false;
951  // a hex character. Add to string.
952  else if( rtl::isAsciiHexDigit(uChar) )
953  {
954  mbIsHexString = true;
955  maInput.insertUtf32(0, uChar);
956  }
957  // not a hex character: stop input. keep if it is the first input provided
958  else
959  {
960  mbAllowMoreChars = false;
961  if( maInput.isEmpty() )
962  maInput.insertUtf32(0, uChar);
963  }
964  }
965  }
966  return mbAllowMoreChars;
967 }
968 
970 {
971  if( maInput.isEmpty() )
972  {
973  //edge case - input finished with incomplete low surrogate or combining characters without a base
974  if( mbAllowMoreChars )
975  {
976  if( !maUtf16.isEmpty() )
977  maInput = maUtf16;
978  if( !maCombining.isEmpty() )
979  maInput.append(maCombining);
980  }
981  return maInput.toString();
982  }
983 
984  if( !mbIsHexString )
985  return maInput.toString();
986 
987  //this function potentially modifies the input string. Prevent addition of further characters
988  mbAllowMoreChars = false;
989 
990  //validate unicode notation.
991  OUString sIn;
992  sal_uInt32 nUnicode = 0;
993  sal_Int32 nUPlus = maInput.indexOf("U+");
994  //if U+ notation used, strip off all extra chars added not in U+ notation
995  if( nUPlus != -1 )
996  {
997  maInput.remove(0, nUPlus);
998  sIn = maInput.copy(2).makeStringAndClear();
999  nUPlus = sIn.indexOf("U+");
1000  }
1001  else
1002  sIn = maInput.toString();
1003  while( nUPlus != -1 )
1004  {
1005  nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1006  //prevent creating control characters or invalid Unicode values
1007  if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1008  maInput = sIn.subView(nUPlus);
1009  sIn = sIn.copy(nUPlus+2);
1010  nUPlus = sIn.indexOf("U+");
1011  }
1012 
1013  nUnicode = sIn.toUInt32(16);
1014  if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1015  maInput.truncate().append( sIn[sIn.getLength()-1] );
1016  return maInput.toString();
1017 }
1018 
1020 {
1021  OUString sIn = StringToReplace();
1022  sal_Int32 nPos = 0;
1023  sal_uInt32 counter = 0;
1024  while( nPos < sIn.getLength() )
1025  {
1026  sIn.iterateCodePoints(&nPos);
1027  ++counter;
1028  }
1029  return counter;
1030 }
1031 
1033 {
1034  OUString sIn = StringToReplace();
1035  OUStringBuffer output = "";
1036  sal_Int32 nUPlus = sIn.indexOf("U+");
1037  // convert from hex notation to glyph
1038  if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1039  {
1040  sal_uInt32 nUnicode = 0;
1041  if( nUPlus == 0)
1042  {
1043  sIn = sIn.copy(2);
1044  nUPlus = sIn.indexOf("U+");
1045  }
1046  while( nUPlus > 0 )
1047  {
1048  nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1049  output.appendUtf32( nUnicode );
1050 
1051  sIn = sIn.copy(nUPlus+2);
1052  nUPlus = sIn.indexOf("U+");
1053  }
1054  nUnicode = sIn.toUInt32(16);
1055  output.appendUtf32( nUnicode );
1056  }
1057  // convert from glyph to hex notation
1058  else
1059  {
1060  sal_Int32 nPos = 0;
1061  while( nPos < sIn.getLength() )
1062  {
1063  OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1064  //pad with zeros - minimum length of 4.
1065  for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1066  aTmp.insert( 0,"0" );
1067  output.append( "U+" );
1068  output.append( aTmp );
1069  }
1070  }
1071  return output.toString();
1072 }
1073 
1074 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
static sal_uInt8 getUnicodeDirection(const sal_Unicode ch)
Definition: unicode.cxx:84
const sal_Int8 UnicodeDirectionIndex[]
Definition: unicode_data.h:908
static sal_Int16 getUnicodeScriptType(const sal_Unicode ch, const ScriptTypeList *typeList, sal_Int16 unknownType=0)
Definition: unicode.cxx:53
LanguageTag & reset(const OUString &rBcp47LanguageTag)
const sal_Int8 UnicodeTypeIndex[]
Definition: unicode_data.h:28
const sal_Int8 UnicodeTypeBlockValue[]
Definition: unicode_data.h:64
const sal_Unicode UnicodeScriptType[][2]
static sal_Unicode getUnicodeScriptStart(css::i18n::UnicodeScript type)
Definition: unicode.cxx:58
OUString ReplacementString()
Definition: unicode.cxx:1032
static bool isWhiteSpace(const sal_Unicode ch)
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
Definition: unicode.cxx:135
sal_uInt16 sal_Unicode
sal_uInt32 CharsToDelete()
While sInput.getLength() returns the number of utf16 units to delete, this function returns the numbe...
Definition: unicode.cxx:1019
OUString getLanguage() const
const sal_Int8 UnicodeTypeValue[]
Definition: unicode_data.h:69
#define bit(name)
Definition: unicode.cxx:98
UBlockCode from
OUStringBuffer maInput
Definition: unicode.hxx:99
const sal_Int8 UnicodeDirectionBlockValue[]
Definition: unicode_data.h:944
#define SAL_N_ELEMENTS(arr)
#define CONTROLSPACE
static T getScriptType(const sal_Unicode ch, const L *typeList, T unknownType)
Definition: unicode.cxx:39
#define CONTROLMASK
Definition: unicode.cxx:114
int i
OUString StringToReplace()
Validates (and potentially modifies) the input string.
Definition: unicode.cxx:969
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
static bool isSpace(const sal_Unicode ch)
const sal_Int8 UnicodeDirectionValue[]
Definition: unicode_data.h:949
static bool isControl(const sal_Unicode ch)
#define UnicodeTypeNumberBlock
Definition: unicode_data.h:26
static sal_Unicode getUnicodeScriptEnd(css::i18n::UnicodeScript type)
Definition: unicode.cxx:63
#define IsType(func, mask)
Definition: unicode.cxx:119
#define UnicodeScriptTypeTo
static OUString formatPercent(double dNumber, const LanguageTag &rLangTag)
Definition: unicode.cxx:788
bool AllowMoreInput(sal_Unicode uChar)
Build an input string of valid UTF16 units to toggle.
Definition: unicode.cxx:825
unsigned char sal_uInt8
#define UnicodeDirectionNumberBlock
Definition: unicode_data.h:906
#define SPACEMASK
Definition: unicode.cxx:110
OUStringBuffer maCombining
Definition: unicode.hxx:101
ResultType type
#define SAL_WARN(area, stream)
static sal_Int16 getUnicodeType(const sal_Unicode ch)
Definition: unicode.cxx:68
static OString getExemplarLanguageForUScriptCode(UScriptCode eScript)
Definition: unicode.cxx:192
#define ALPHAMASK
Definition: unicode.cxx:106
#define UnicodeScriptTypeFrom
sal_uInt16 nPos
OUStringBuffer maUtf16
Definition: unicode.hxx:100