LibreOffice Module i18nutil (master) 1
unicode.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <com/sun/star/i18n/UnicodeType.hpp>
21#include <com/sun/star/i18n/ScriptType.hpp>
24#include <i18nutil/unicode.hxx>
25#include <sal/log.hxx>
26#include <unicode/numfmt.h>
27#include "unicode_data.h"
28#include <rtl/character.hxx>
29#include <o3tl/string_view.hxx>
30#include <memory>
31
32// Workaround for glibc braindamage:
33// glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
34// which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
35#undef CURRENCY_SYMBOL
36
37using namespace ::com::sun::star::i18n;
38
39template<class L, typename T>
40static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
41
42 sal_Int16 i = 0;
43 css::i18n::UnicodeScript type = typeList[0].to;
44 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
45 type = typeList[++i].to;
46 }
47
48 return (type < UnicodeScript_kScriptCount &&
49 ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
50 typeList[i].value : unknownType;
51}
52
53sal_Int16
54unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
55 return getScriptType(ch, typeList, unknownType);
56}
57
59unicode::getUnicodeScriptStart( UnicodeScript type) {
60 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
61}
62
64unicode::getUnicodeScriptEnd( UnicodeScript type) {
65 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
66}
67
68sal_Int16
70 static sal_Unicode c = 0x00;
71 static sal_Int16 r = 0x00;
72
73 if (ch == c) return r;
74 else c = ch;
75
76 sal_Int16 address = UnicodeTypeIndex[ch >> 8];
77 r = static_cast<sal_Int16>(
78 (address < UnicodeTypeNumberBlock)
79 ? UnicodeTypeBlockValue[address]
80 : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
81 return r;
82}
83
86 static sal_Unicode c = 0x00;
87 static sal_uInt8 r = 0x00;
88
89 if (ch == c) return r;
90 else c = ch;
91
92 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
93 r = (address < UnicodeDirectionNumberBlock)
95 : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
96 return r;
97}
98
// Single-bit mask for a UnicodeType category value. The argument is
// parenthesised so that an expression such as bit(a | b) shifts by the whole
// expression instead of by its first operand only.
#define bit(name) (1U << (name))

#define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)

// The composite masks below are wrapped in parentheses so they expand to one
// operand wherever they are used (e.g. next to '&' or '~').

// Letter categories.
#define ALPHAMASK (UPPERMASK|LOWERMASK|TITLEMASK|\
                   bit(UnicodeType::MODIFIER_LETTER)|\
                   bit(UnicodeType::OTHER_LETTER))

// Separator categories.
#define SPACEMASK (bit(UnicodeType::SPACE_SEPARATOR)|\
                   bit(UnicodeType::LINE_SEPARATOR)|\
                   bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Control and format categories plus the line/paragraph separators.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
                     bit(UnicodeType::FORMAT)|\
                     bit(UnicodeType::LINE_SEPARATOR)|\
                     bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Defines `bool func(const sal_Unicode ch)`, which reports whether the
// UnicodeType of ch is one of the categories selected by mask.
#define IsType(func, mask) \
bool func( const sal_Unicode ch) {\
    return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
124
128
129#define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
130 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
131
132bool unicode::isWhiteSpace( const sal_Unicode ch) {
133 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
134}
135
136sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
137{
138 //See unicode/uscript.h
139 static const sal_Int16 scriptTypes[] =
140 {
141 ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
142 ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
143 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
144 // 15
145 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
146 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
147 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
148 // 30
149 ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
150 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
151 ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
152 // 45
153 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
154 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
155 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
156 // 60
157 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
158 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
159 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
160 // 75
161 ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
162 ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
163 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
164 // 90
165 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
166 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
167 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
168 // 105
169 ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
170 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
171 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
172 // 120
173 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
174 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
175 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
176 // 135
177 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
178 ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
179 ScriptType::COMPLEX,
180 ScriptType::WEAK
181 };
182
183 sal_Int16 nRet;
184 if (eScript < USCRIPT_COMMON)
185 nRet = ScriptType::WEAK;
186 else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
187 nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
188 else
189 nRet = scriptTypes[eScript];
190 return nRet;
191}
192
194{
195 OString sRet;
196 switch (eScript)
197 {
198 case USCRIPT_CODE_LIMIT:
199 case USCRIPT_INVALID_CODE:
200 sRet = "zxx";
201 break;
202 case USCRIPT_COMMON:
203 case USCRIPT_INHERITED:
204 sRet = "und";
205 break;
206 case USCRIPT_MATHEMATICAL_NOTATION:
207 case USCRIPT_SYMBOLS:
208 sRet = "zxx";
209 break;
210 case USCRIPT_UNWRITTEN_LANGUAGES:
211 case USCRIPT_UNKNOWN:
212 sRet = "und";
213 break;
214 case USCRIPT_ARABIC:
215 sRet = "ar";
216 break;
217 case USCRIPT_ARMENIAN:
218 sRet = "hy";
219 break;
220 case USCRIPT_BENGALI:
221 sRet = "bn";
222 break;
223 case USCRIPT_BOPOMOFO:
224 sRet = "zh";
225 break;
226 case USCRIPT_CHEROKEE:
227 sRet = "chr";
228 break;
229 case USCRIPT_COPTIC:
230 sRet = "cop";
231 break;
232 case USCRIPT_CYRILLIC:
233 sRet = "ru";
234 break;
235 case USCRIPT_DESERET:
236 sRet = "en";
237 break;
238 case USCRIPT_DEVANAGARI:
239 sRet = "hi";
240 break;
241 case USCRIPT_ETHIOPIC:
242 sRet = "am";
243 break;
244 case USCRIPT_GEORGIAN:
245 sRet = "ka";
246 break;
247 case USCRIPT_GOTHIC:
248 sRet = "got";
249 break;
250 case USCRIPT_GREEK:
251 sRet = "el";
252 break;
253 case USCRIPT_GUJARATI:
254 sRet = "gu";
255 break;
256 case USCRIPT_GURMUKHI:
257 sRet = "pa";
258 break;
259 case USCRIPT_HAN:
260 sRet = "zh";
261 break;
262 case USCRIPT_HANGUL:
263 sRet = "ko";
264 break;
265 case USCRIPT_HEBREW:
266 sRet = "hr";
267 break;
268 case USCRIPT_HIRAGANA:
269 sRet = "ja";
270 break;
271 case USCRIPT_KANNADA:
272 sRet = "kn";
273 break;
274 case USCRIPT_KATAKANA:
275 sRet = "ja";
276 break;
277 case USCRIPT_KHMER:
278 sRet = "km";
279 break;
280 case USCRIPT_LAO:
281 sRet = "lo";
282 break;
283 case USCRIPT_LATIN:
284 sRet = "en";
285 break;
286 case USCRIPT_MALAYALAM:
287 sRet = "ml";
288 break;
289 case USCRIPT_MONGOLIAN:
290 sRet = "mn";
291 break;
292 case USCRIPT_MYANMAR:
293 sRet = "my";
294 break;
295 case USCRIPT_OGHAM:
296 sRet = "pgl";
297 break;
298 case USCRIPT_OLD_ITALIC:
299 sRet = "osc";
300 break;
301 case USCRIPT_ORIYA:
302 sRet = "or";
303 break;
304 case USCRIPT_RUNIC:
305 sRet = "ang";
306 break;
307 case USCRIPT_SINHALA:
308 sRet = "si";
309 break;
310 case USCRIPT_SYRIAC:
311 sRet = "syr";
312 break;
313 case USCRIPT_TAMIL:
314 sRet = "ta";
315 break;
316 case USCRIPT_TELUGU:
317 sRet = "te";
318 break;
319 case USCRIPT_THAANA:
320 sRet = "dv";
321 break;
322 case USCRIPT_THAI:
323 sRet = "th";
324 break;
325 case USCRIPT_TIBETAN:
326 sRet = "bo";
327 break;
328 case USCRIPT_CANADIAN_ABORIGINAL:
329 sRet = "iu";
330 break;
331 case USCRIPT_YI:
332 sRet = "ii";
333 break;
334 case USCRIPT_TAGALOG:
335 sRet = "tl";
336 break;
337 case USCRIPT_HANUNOO:
338 sRet = "hnn";
339 break;
340 case USCRIPT_BUHID:
341 sRet = "bku";
342 break;
343 case USCRIPT_TAGBANWA:
344 sRet = "tbw";
345 break;
346 case USCRIPT_BRAILLE:
347 sRet = "en";
348 break;
349 case USCRIPT_CYPRIOT:
350 sRet = "ecy";
351 break;
352 case USCRIPT_LIMBU:
353 sRet = "lif";
354 break;
355 case USCRIPT_LINEAR_B:
356 sRet = "gmy";
357 break;
358 case USCRIPT_OSMANYA:
359 sRet = "so";
360 break;
361 case USCRIPT_SHAVIAN:
362 sRet = "en";
363 break;
364 case USCRIPT_TAI_LE:
365 sRet = "tdd";
366 break;
367 case USCRIPT_UGARITIC:
368 sRet = "uga";
369 break;
370 case USCRIPT_KATAKANA_OR_HIRAGANA:
371 sRet = "ja";
372 break;
373 case USCRIPT_BUGINESE:
374 sRet = "bug";
375 break;
376 case USCRIPT_GLAGOLITIC:
377 sRet = "ch";
378 break;
379 case USCRIPT_KHAROSHTHI:
380 sRet = "pra";
381 break;
382 case USCRIPT_SYLOTI_NAGRI:
383 sRet = "syl";
384 break;
385 case USCRIPT_NEW_TAI_LUE:
386 sRet = "khb";
387 break;
388 case USCRIPT_TIFINAGH:
389 sRet = "tmh";
390 break;
391 case USCRIPT_OLD_PERSIAN:
392 sRet = "peo";
393 break;
394 case USCRIPT_BALINESE:
395 sRet = "ban";
396 break;
397 case USCRIPT_BATAK:
398 sRet = "btk";
399 break;
400 case USCRIPT_BLISSYMBOLS:
401 sRet = "en";
402 break;
403 case USCRIPT_BRAHMI:
404 sRet = "pra";
405 break;
406 case USCRIPT_CHAM:
407 sRet = "cja";
408 break;
409 case USCRIPT_CIRTH:
410 sRet = "sjn";
411 break;
412 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
413 sRet = "cu";
414 break;
415 case USCRIPT_DEMOTIC_EGYPTIAN:
416 case USCRIPT_HIERATIC_EGYPTIAN:
417 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
418 sRet = "egy";
419 break;
420 case USCRIPT_KHUTSURI:
421 sRet = "ka";
422 break;
423 case USCRIPT_SIMPLIFIED_HAN:
424 sRet = "zh";
425 break;
426 case USCRIPT_TRADITIONAL_HAN:
427 sRet = "zh";
428 break;
429 case USCRIPT_PAHAWH_HMONG:
430 sRet = "blu";
431 break;
432 case USCRIPT_OLD_HUNGARIAN:
433 sRet = "ohu";
434 break;
435 case USCRIPT_HARAPPAN_INDUS:
436 sRet = "xiv";
437 break;
438 case USCRIPT_JAVANESE:
439 sRet = "kaw";
440 break;
441 case USCRIPT_KAYAH_LI:
442 sRet = "eky";
443 break;
444 case USCRIPT_LATIN_FRAKTUR:
445 sRet = "de";
446 break;
447 case USCRIPT_LATIN_GAELIC:
448 sRet = "ga";
449 break;
450 case USCRIPT_LEPCHA:
451 sRet = "lep";
452 break;
453 case USCRIPT_LINEAR_A:
454 sRet = "ecr";
455 break;
456 case USCRIPT_MAYAN_HIEROGLYPHS:
457 sRet = "myn";
458 break;
459 case USCRIPT_MEROITIC:
460 sRet = "xmr";
461 break;
462 case USCRIPT_NKO:
463 sRet = "nqo";
464 break;
465 case USCRIPT_ORKHON:
466 sRet = "otk";
467 break;
468 case USCRIPT_OLD_PERMIC:
469 sRet = "kv";
470 break;
471 case USCRIPT_PHAGS_PA:
472 sRet = "xng";
473 break;
474 case USCRIPT_PHOENICIAN:
475 sRet = "phn";
476 break;
477 case USCRIPT_PHONETIC_POLLARD:
478 sRet = "hmd";
479 break;
480 case USCRIPT_RONGORONGO:
481 sRet = "rap";
482 break;
483 case USCRIPT_SARATI:
484 sRet = "qya";
485 break;
486 case USCRIPT_ESTRANGELO_SYRIAC:
487 sRet = "syr";
488 break;
489 case USCRIPT_WESTERN_SYRIAC:
490 sRet = "tru";
491 break;
492 case USCRIPT_EASTERN_SYRIAC:
493 sRet = "aii";
494 break;
495 case USCRIPT_TENGWAR:
496 sRet = "sjn";
497 break;
498 case USCRIPT_VAI:
499 sRet = "vai";
500 break;
501 case USCRIPT_VISIBLE_SPEECH:
502 sRet = "en";
503 break;
504 case USCRIPT_CUNEIFORM:
505 sRet = "akk";
506 break;
507 case USCRIPT_CARIAN:
508 sRet = "xcr";
509 break;
510 case USCRIPT_JAPANESE:
511 sRet = "ja";
512 break;
513 case USCRIPT_LANNA:
514 sRet = "nod";
515 break;
516 case USCRIPT_LYCIAN:
517 sRet = "xlc";
518 break;
519 case USCRIPT_LYDIAN:
520 sRet = "xld";
521 break;
522 case USCRIPT_OL_CHIKI:
523 sRet = "sat";
524 break;
525 case USCRIPT_REJANG:
526 sRet = "rej";
527 break;
528 case USCRIPT_SAURASHTRA:
529 sRet = "saz";
530 break;
531 case USCRIPT_SIGN_WRITING:
532 sRet = "en";
533 break;
534 case USCRIPT_SUNDANESE:
535 sRet = "su";
536 break;
537 case USCRIPT_MOON:
538 sRet = "en";
539 break;
540 case USCRIPT_MEITEI_MAYEK:
541 sRet = "mni";
542 break;
543 case USCRIPT_IMPERIAL_ARAMAIC:
544 sRet = "arc";
545 break;
546 case USCRIPT_AVESTAN:
547 sRet = "ae";
548 break;
549 case USCRIPT_CHAKMA:
550 sRet = "ccp";
551 break;
552 case USCRIPT_KOREAN:
553 sRet = "ko";
554 break;
555 case USCRIPT_KAITHI:
556 sRet = "awa";
557 break;
558 case USCRIPT_MANICHAEAN:
559 sRet = "xmn";
560 break;
561 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
562 case USCRIPT_PSALTER_PAHLAVI:
563 case USCRIPT_BOOK_PAHLAVI:
564 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
565 sRet = "xpr";
566 break;
567 case USCRIPT_SAMARITAN:
568 sRet = "heb";
569 break;
570 case USCRIPT_TAI_VIET:
571 sRet = "blt";
572 break;
573 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
574 sRet = "mic";
575 break;
576 case USCRIPT_NABATAEAN:
577 sRet = "mis-Nbat"; // Uncoded with script
578 break;
579 case USCRIPT_PALMYRENE:
580 sRet = "mis-Palm"; // Uncoded with script
581 break;
582 case USCRIPT_BAMUM:
583 sRet = "bax";
584 break;
585 case USCRIPT_LISU:
586 sRet = "lis";
587 break;
588 case USCRIPT_NAKHI_GEBA:
589 sRet = "nxq";
590 break;
591 case USCRIPT_OLD_SOUTH_ARABIAN:
592 sRet = "xsa";
593 break;
594 case USCRIPT_BASSA_VAH:
595 sRet = "bsq";
596 break;
597 case USCRIPT_DUPLOYAN_SHORTAND:
598 sRet = "fr";
599 break;
600 case USCRIPT_ELBASAN:
601 sRet = "sq";
602 break;
603 case USCRIPT_GRANTHA:
604 sRet = "ta";
605 break;
606 case USCRIPT_KPELLE:
607 sRet = "kpe";
608 break;
609 case USCRIPT_LOMA:
610 sRet = "lom";
611 break;
612 case USCRIPT_MENDE:
613 sRet = "men";
614 break;
615 case USCRIPT_MEROITIC_CURSIVE:
616 sRet = "xmr";
617 break;
618 case USCRIPT_OLD_NORTH_ARABIAN:
619 sRet = "xna";
620 break;
621 case USCRIPT_SINDHI:
622 sRet = "sd";
623 break;
624 case USCRIPT_WARANG_CITI:
625 sRet = "hoc";
626 break;
627#if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
628 case USCRIPT_AFAKA:
629 sRet = "djk";
630 break;
631 case USCRIPT_JURCHEN:
632 sRet = "juc";
633 break;
634 case USCRIPT_MRO:
635 sRet = "cmr";
636 break;
637 case USCRIPT_NUSHU:
638 sRet = "mis-Nshu"; // Uncoded with script
639 break;
640 case USCRIPT_SHARADA:
641 sRet = "sa";
642 break;
643 case USCRIPT_SORA_SOMPENG:
644 sRet = "srb";
645 break;
646 case USCRIPT_TAKRI:
647 sRet = "doi";
648 break;
649 case USCRIPT_TANGUT:
650 sRet = "txg";
651 break;
652 case USCRIPT_WOLEAI:
653 sRet = "woe";
654 break;
655#endif
656#if (U_ICU_VERSION_MAJOR_NUM >= 49)
657 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
658 sRet = "hlu";
659 break;
660 case USCRIPT_KHOJKI:
661 sRet = "gu";
662 break;
663 case USCRIPT_TIRHUTA:
664 sRet = "mai";
665 break;
666#endif
667#if (U_ICU_VERSION_MAJOR_NUM >= 52)
668 case USCRIPT_CAUCASIAN_ALBANIAN:
669 sRet = "xag";
670 break;
671 case USCRIPT_MAHAJANI:
672 sRet = "mwr";
673 break;
674#endif
675#if (U_ICU_VERSION_MAJOR_NUM >= 54)
676 case USCRIPT_AHOM:
677 sRet = "aho";
678 break;
679 case USCRIPT_HATRAN:
680 sRet = "qly-Hatr";
681 break;
682 case USCRIPT_MODI:
683 sRet = "mr-Modi";
684 break;
685 case USCRIPT_MULTANI:
686 sRet = "skr-Mutl";
687 break;
688 case USCRIPT_PAU_CIN_HAU:
689 sRet = "ctd-Pauc";
690 break;
691 case USCRIPT_SIDDHAM:
692 sRet = "sa-Sidd";
693 break;
694#endif
695#if (U_ICU_VERSION_MAJOR_NUM >= 58)
696 case USCRIPT_ADLAM:
697 sRet = "mis-Adlm"; // Adlam for Fulani, no language code
698 break;
699 case USCRIPT_BHAIKSUKI:
700 sRet = "mis-Bhks"; // Bhaiksuki for some Buddhist texts, no language code
701 break;
702 case USCRIPT_MARCHEN:
703 sRet = "bo-Marc";
704 break;
705 case USCRIPT_NEWA:
706 sRet = "new-Newa";
707 break;
708 case USCRIPT_OSAGE:
709 sRet = "osa-Osge";
710 break;
711 case USCRIPT_HAN_WITH_BOPOMOFO:
712 sRet = "mis-Hanb"; // Han with Bopomofo, zh-Hanb ?
713 break;
714 case USCRIPT_JAMO:
715 sRet = "ko"; // Jamo - elements of Hangul Syllables
716 break;
717 case USCRIPT_SYMBOLS_EMOJI:
718 sRet = "mis-Zsye"; // Emoji variant
719 break;
720#endif
721#if (U_ICU_VERSION_MAJOR_NUM >= 60)
722 case USCRIPT_MASARAM_GONDI:
723 sRet = "gon-Gonm"; // macro language code, could be wsg,esg,gno
724 break;
725 case USCRIPT_SOYOMBO:
726 sRet = "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
727 break;
728 case USCRIPT_ZANABAZAR_SQUARE:
729 sRet = "mn-Zanb"; // abugida to write Mongolian
730 break;
731#endif
732#if (U_ICU_VERSION_MAJOR_NUM >= 62)
733 case USCRIPT_DOGRA:
734 sRet = "dgo"; // Dogri proper
735 break;
736 case USCRIPT_GUNJALA_GONDI:
737 sRet = "wsg"; // Adilabad Gondi
738 break;
739 case USCRIPT_MAKASAR:
740 sRet = "mak";
741 break;
742 case USCRIPT_MEDEFAIDRIN:
743 sRet = "dmf-Medf";
744 break;
745 case USCRIPT_HANIFI_ROHINGYA:
746 sRet = "rhg";
747 break;
748 case USCRIPT_SOGDIAN:
749 sRet = "sog";
750 break;
751 case USCRIPT_OLD_SOGDIAN:
752 sRet = "sog";
753 break;
754#endif
755#if (U_ICU_VERSION_MAJOR_NUM >= 64)
756 case USCRIPT_ELYMAIC:
757 sRet = "arc-Elym";
758 break;
759 case USCRIPT_NYIAKENG_PUACHUE_HMONG:
760 sRet = "hmn-Hmnp"; // macrolanguage code
761 break;
762 case USCRIPT_NANDINAGARI:
763 sRet = "sa-Nand";
764 break;
765 case USCRIPT_WANCHO:
766 sRet = "nnp-Wcho";
767 break;
768#endif
769#if (U_ICU_VERSION_MAJOR_NUM >= 66)
770 case USCRIPT_CHORASMIAN:
771 sRet = "xco-Chrs";
772 break;
773 case USCRIPT_DIVES_AKURU:
774 sRet = "dv-Diak";
775 break;
776 case USCRIPT_KHITAN_SMALL_SCRIPT:
777 sRet = "zkt-Kits";
778 break;
779 case USCRIPT_YEZIDI:
780 sRet = "kmr-Yezi";
781 break;
782#endif
783#if (U_ICU_VERSION_MAJOR_NUM >= 70)
784 case USCRIPT_CYPRO_MINOAN:
785 sRet = "mis-Cpmn"; // Uncoded with script
786 break;
787 case USCRIPT_OLD_UYGHUR:
788 sRet = "oui-Ougr";
789 break;
790 case USCRIPT_TANGSA:
791 sRet = "nst-Tnsa";
792 break;
793 case USCRIPT_TOTO:
794 sRet = "txo-Toto";
795 break;
796 case USCRIPT_VITHKUQI:
797 sRet = "sq-Vith"; // macrolanguage code
798 break;
799#endif
800 }
801 return sRet;
802}
803
804//Format a number as a percentage according to the rules of the given
805//language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
806OUString unicode::formatPercent(double dNumber,
807 const LanguageTag &rLangTag)
808{
809 // get a currency formatter for this locale ID
810 UErrorCode errorCode=U_ZERO_ERROR;
811
812 LanguageTag aLangTag(rLangTag);
813
814 // As of CLDR Version 24 these languages were not listed as using spacing
815 // between number and % but are reported as such by our l10n groups
816 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
817 // so format using French which has the desired rules
818 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
819 aLangTag.reset("fr-FR");
820
821 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
822
823 std::unique_ptr<icu::NumberFormat> xF(
824 icu::NumberFormat::createPercentInstance(aLocale, errorCode));
825 if(U_FAILURE(errorCode))
826 {
827 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
828 return OUString::number(dNumber) + "%";
829 }
830
831 icu::UnicodeString output;
832 xF->format(dNumber/100, output);
833 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
834 output.length());
835 if (rLangTag.getLanguage() == "de")
836 {
837 //narrow no-break space instead of (normal) no-break space
838 return aRet.replace(0x00A0, 0x202F);
839 }
840 return aRet;
841}
842
// Body of AllowMoreInput(sal_Unicode uChar): takes one UTF-16 code unit,
// updates maInput / maUtf16 / maCombining and the state flags, and returns
// whether another code unit may be supplied (false once mbAllowMoreChars has
// been cleared).
// Accepted units are mostly inserted at position 0 of the buffers, so the
// caller presumably supplies the text back to front - TODO confirm.
// NOTE(review): the signature line (843) and the statements on lines 891, 904
// and 914 are absent from this listing.
844{
845 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
846 if( maInput.getLength() > 255 )
847 mbAllowMoreChars = false;
848
849 if( !mbAllowMoreChars )
850 return false;
851
// Once "U+" has been collected, anything that is not part of a further hex
// notation ends the input (see the uses of bPreventNonHex below).
852 bool bPreventNonHex = false;
853 if( maInput.indexOf("U+") != -1 )
854 bPreventNonHex = true;
855
856 switch ( unicode::getUnicodeType(uChar) )
857 {
858 case css::i18n::UnicodeType::SURROGATE:
859 if( bPreventNonHex )
860 {
861 mbAllowMoreChars = false;
862 return false;
863 }
864
// A low surrogate arriving before anything else is buffered in maUtf16 and
// more input is requested.
865 if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
866 {
867 maUtf16.append(uChar);
868 return true;
869 }
// A high surrogate is placed in front of whatever maUtf16 already holds.
870 if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
871 maUtf16.insert(0, uChar );
872 //end of hex strings, or unexpected order of high/low, so don't accept more
873 if( !maUtf16.isEmpty() )
874 maInput.append(maUtf16);
875 if( !maCombining.isEmpty() )
876 maInput.append(maCombining);
877 mbAllowMoreChars = false;
878 break;
879
880 case css::i18n::UnicodeType::NON_SPACING_MARK:
881 case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
882 if( bPreventNonHex )
883 {
884 mbAllowMoreChars = false;
885 return false;
886 }
887
888 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
889 if( !maUtf16.isEmpty() )
890 {
// NOTE(review): line 891 is missing from this listing.
892 if( !maCombining.isEmpty() )
893 maInput.append(maCombining);
894 mbAllowMoreChars = false;
895 return false;
896 }
// Combining marks are collected separately, newest first.
897 maCombining.insert(0, uChar);
898 break;
899
900 default:
901 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
902 if( !maUtf16.isEmpty() )
903 {
// NOTE(review): line 904 is missing from this listing.
905 if( !maCombining.isEmpty() )
906 maInput.append(maCombining);
907 mbAllowMoreChars = false;
908 return false;
909 }
910
// Pending combining marks get this character put in front of them, and the
// input ends here.
911 if( !maCombining.isEmpty() )
912 {
913 maCombining.insert(0, uChar);
// NOTE(review): line 914 is missing from this listing.
915 mbAllowMoreChars = false;
916 return false;
917 }
918
919 // 0 - 1f are control characters. Do not process those.
920 if( uChar < 0x20 )
921 {
922 mbAllowMoreChars = false;
923 return false;
924 }
925
926 switch( uChar )
927 {
928 case 'u':
929 case 'U':
930 // U+ notation found. Continue looking for another one.
931 if( mbRequiresU )
932 {
933 mbRequiresU = false;
934 maInput.insert(0,"U+");
935 }
936 // treat as a normal character
937 else
938 {
939 mbAllowMoreChars = false;
940 if( !bPreventNonHex )
941 maInput.insertUtf32(0, uChar);
942 }
943 break;
944 case '+':
945 // + already found: skip when not U, or edge case of +U+xxxx
946 if( mbRequiresU || (maInput.indexOf("U+") == 0) )
947 mbAllowMoreChars = false;
948 // hex chars followed by '+' - now require a 'U'
949 else if ( !maInput.isEmpty() )
950 mbRequiresU = true;
951 // treat as a normal character
952 else
953 {
954 mbAllowMoreChars = false;
955 if( !bPreventNonHex )
956 maInput.insertUtf32(0, uChar);
957 }
958 break;
959 default:
960 // + already found. Since not U, cancel further input
961 if( mbRequiresU )
962 mbAllowMoreChars = false;
963 // maximum digits per notation is 8: only one notation
964 else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
965 mbAllowMoreChars = false;
966 // maximum digits per notation is 8: previous notation found
967 else if( maInput.indexOf("U+") == 8 )
968 mbAllowMoreChars = false;
969 // a hex character. Add to string.
970 else if( rtl::isAsciiHexDigit(uChar) )
971 {
972 mbIsHexString = true;
973 maInput.insertUtf32(0, uChar);
974 }
975 // not a hex character: stop input. keep if it is the first input provided
976 else
977 {
978 mbAllowMoreChars = false;
979 if( maInput.isEmpty() )
980 maInput.insertUtf32(0, uChar);
981 }
982 }
983 }
984 return mbAllowMoreChars;
985}
986
// Body of StringToReplace(): returns the collected input as an OUString.
// For hex input (mbIsHexString) it first clears mbAllowMoreChars, drops
// anything in front of the first "U+", and checks every hex value for being
// a Unicode code point not below 0x20.
// NOTE(review): the signature line (987) and the statement on line 995 are
// absent from this listing.
988{
989 if( maInput.isEmpty() )
990 {
991 //edge case - input finished with incomplete low surrogate or combining characters without a base
992 if( mbAllowMoreChars )
993 {
994 if( !maUtf16.isEmpty() )
// NOTE(review): line 995 (the statement guarded by the test above) is missing
// from this listing.
996 if( !maCombining.isEmpty() )
997 maInput.append(maCombining);
998 }
999 return maInput.toString();
1000 }
1001
// Input that is not hex notation is returned unchanged.
1002 if( !mbIsHexString )
1003 return maInput.toString();
1004
1005 //this function potentially modifies the input string. Prevent addition of further characters
1006 mbAllowMoreChars = false;
1007
1008 //validate unicode notation.
1009 OUString sIn;
1010 sal_uInt32 nUnicode = 0;
1011 sal_Int32 nUPlus = maInput.indexOf("U+");
1012 //if U+ notation used, strip off all extra chars added not in U+ notation
1013 if( nUPlus != -1 )
1014 {
1015 maInput.remove(0, nUPlus);
// sIn receives the text after the leading "U+".
1016 sIn = maInput.copy(2).makeStringAndClear();
1017 nUPlus = sIn.indexOf("U+");
1018 }
1019 else
1020 sIn = maInput.toString();
// Each pass checks the hex digits in front of the next "U+" and then moves
// sIn past that "U+".
1021 while( nUPlus != -1 )
1022 {
1023 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1024 //prevent creating control characters or invalid Unicode values
1025 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1026 maInput = sIn.subView(nUPlus);
1027 sIn = sIn.copy(nUPlus+2);
1028 nUPlus = sIn.indexOf("U+");
1029 }
1030
// The last (or only) hex value: when it is rejected, maInput is reduced to
// the final character of sIn.
1031 nUnicode = sIn.toUInt32(16);
1032 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1033 maInput.truncate().append( sIn[sIn.getLength()-1] );
1034 return maInput.toString();
1035}
1036
// Body of CharsToDelete(): returns the number of code points in the string
// produced by StringToReplace().
// NOTE(review): the signature line (1037) is absent from this listing.
1038{
1039 OUString sIn = StringToReplace();
1040 sal_Int32 nPos = 0;
1041 sal_uInt32 counter = 0;
// iterateCodePoints() moves nPos forward, so each pass counts one code point.
1042 while( nPos < sIn.getLength() )
1043 {
1044 sIn.iterateCodePoints(&nPos);
1045 ++counter;
1046 }
1047 return counter;
1048}
1049
// Body of ReplacementString(): converts the string from StringToReplace()
// in one of two directions. Text that contains "U+", or hex input longer than
// one character, becomes the code points named by its hex values; any other
// text becomes "U+" followed by the hex value of each of its code points.
// NOTE(review): the signature line (1050) is absent from this listing.
1051{
1052 OUString sIn = StringToReplace();
1053 OUStringBuffer output = "";
1054 sal_Int32 nUPlus = sIn.indexOf("U+");
1055 // convert from hex notation to glyph
1056 if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1057 {
1058 sal_uInt32 nUnicode = 0;
// Skip a leading "U+" so that sIn starts with hex digits.
1059 if( nUPlus == 0)
1060 {
1061 sIn = sIn.copy(2);
1062 nUPlus = sIn.indexOf("U+");
1063 }
// One code point per hex run that is followed by another "U+".
1064 while( nUPlus > 0 )
1065 {
1066 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1067 output.appendUtf32( nUnicode );
1068
1069 sIn = sIn.copy(nUPlus+2);
1070 nUPlus = sIn.indexOf("U+");
1071 }
// The remaining text is the last (or only) hex value.
1072 nUnicode = sIn.toUInt32(16);
1073 output.appendUtf32( nUnicode );
1074 }
1075 // convert from glyph to hex notation
1076 else
1077 {
1078 sal_Int32 nPos = 0;
1079 while( nPos < sIn.getLength() )
1080 {
1081 OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1082 //pad with zeros - minimum length of 4.
1083 for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1084 aTmp.insert( 0,"0" );
1085 output.append( "U+" );
1086 output.append( aTmp );
1087 }
1088 }
1089 return output.makeStringAndClear();
1090}
1091
1092/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
UBlockCode from
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
OUString getLanguage() const
LanguageTag & reset(const OUString &rBcp47LanguageTag)
OUString ReplacementString()
Definition: unicode.cxx:1050
OUStringBuffer maInput
Definition: unicode.hxx:99
bool AllowMoreInput(sal_Unicode uChar)
Build an input string of valid UTF16 units to toggle.
Definition: unicode.cxx:843
OUStringBuffer maUtf16
Definition: unicode.hxx:100
OUString StringToReplace()
Validates (and potentially modifies) the input string.
Definition: unicode.cxx:987
sal_uInt32 CharsToDelete()
While sInput.getLength() returns the number of utf16 units to delete, this function returns the number of code points to delete.
Definition: unicode.cxx:1037
OUStringBuffer maCombining
Definition: unicode.hxx:101
static sal_Int16 getUnicodeType(const sal_Unicode ch)
Definition: unicode.cxx:69
static bool isAlpha(const sal_Unicode ch)
static OUString formatPercent(double dNumber, const LanguageTag &rLangTag)
Definition: unicode.cxx:806
static bool isSpace(const sal_Unicode ch)
static sal_uInt8 getUnicodeDirection(const sal_Unicode ch)
Definition: unicode.cxx:85
static sal_Int16 getUnicodeScriptType(const sal_Unicode ch, const ScriptTypeList *typeList, sal_Int16 unknownType=0)
Definition: unicode.cxx:54
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
Definition: unicode.cxx:136
static sal_Unicode getUnicodeScriptStart(css::i18n::UnicodeScript type)
Definition: unicode.cxx:59
static bool isWhiteSpace(const sal_Unicode ch)
static bool isControl(const sal_Unicode ch)
static OString getExemplarLanguageForUScriptCode(UScriptCode eScript)
Definition: unicode.cxx:193
static sal_Unicode getUnicodeScriptEnd(css::i18n::UnicodeScript type)
Definition: unicode.cxx:64
sal_uInt16 nPos
#define SAL_WARN(area, stream)
#define SAL_N_ELEMENTS(arr)
int i
ScriptTypeList const typeList[]
sal_uInt32 toUInt32(std::u16string_view str, sal_Int16 radix=10)
css::i18n::UnicodeScript to
Definition: unicode.hxx:33
sal_Int16 value
Definition: unicode.hxx:34
unsigned char sal_uInt8
sal_uInt16 sal_Unicode
ResultType type
#define ALPHAMASK
Definition: unicode.cxx:107
#define SPACEMASK
Definition: unicode.cxx:111
#define IsType(func, mask)
Definition: unicode.cxx:120
#define CONTROLSPACE
#define bit(name)
Definition: unicode.cxx:99
#define CONTROLMASK
Definition: unicode.cxx:115
static T getScriptType(const sal_Unicode ch, const L *typeList, T unknownType)
Definition: unicode.cxx:40
const sal_Int8 UnicodeDirectionBlockValue[]
Definition: unicode_data.h:944
#define UnicodeScriptTypeFrom
const sal_Int8 UnicodeTypeBlockValue[]
Definition: unicode_data.h:64
const sal_Int8 UnicodeTypeIndex[]
Definition: unicode_data.h:28
#define UnicodeTypeNumberBlock
Definition: unicode_data.h:26
const sal_Int8 UnicodeDirectionIndex[]
Definition: unicode_data.h:908
const sal_Unicode UnicodeScriptType[][2]
const sal_Int8 UnicodeTypeValue[]
Definition: unicode_data.h:69
const sal_Int8 UnicodeDirectionValue[]
Definition: unicode_data.h:949
#define UnicodeScriptTypeTo
#define UnicodeDirectionNumberBlock
Definition: unicode_data.h:906