LibreOffice Module i18nutil (master) 1
unicode.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <com/sun/star/i18n/UnicodeType.hpp>
21#include <com/sun/star/i18n/ScriptType.hpp>
24#include <i18nutil/unicode.hxx>
25#include <sal/log.hxx>
26#include <unicode/numfmt.h>
27#include <unicode/uchar.h>
28#include "unicode_data.h"
29#include <rtl/character.hxx>
30#include <o3tl/string_view.hxx>
31#include <memory>
32
33// Workaround for glibc braindamage:
34// glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
35// which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
36#undef CURRENCY_SYMBOL
37
38using namespace ::com::sun::star::i18n;
39
40template<class L, typename T>
41static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
42
43 sal_Int16 i = 0;
44 css::i18n::UnicodeScript type = typeList[0].to;
45 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
46 type = typeList[++i].to;
47 }
48
49 return (type < UnicodeScript_kScriptCount &&
50 ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
51 typeList[i].value : unknownType;
52}
53
54sal_Int16
55unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
56 return getScriptType(ch, typeList, unknownType);
57}
58
60unicode::getUnicodeScriptStart( UnicodeScript type) {
61 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
62}
63
65unicode::getUnicodeScriptEnd( UnicodeScript type) {
66 return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
67}
68
69sal_Int16
70unicode::getUnicodeType(const sal_uInt32 ch)
71{
72 static sal_uInt32 c = 0x00;
73 static sal_uInt32 r = 0x00;
74
75 if (ch == c) return r;
76 else c = ch;
77
78 switch (u_charType(ch))
79 {
80 case U_UNASSIGNED:
81 r = css::i18n::UnicodeType::UNASSIGNED;
82 break;
83 case U_UPPERCASE_LETTER:
84 r = css::i18n::UnicodeType::UPPERCASE_LETTER;
85 break;
86 case U_LOWERCASE_LETTER:
87 r = css::i18n::UnicodeType::LOWERCASE_LETTER;
88 break;
89 case U_TITLECASE_LETTER:
90 r = css::i18n::UnicodeType::TITLECASE_LETTER;
91 break;
92 case U_MODIFIER_LETTER:
93 r = css::i18n::UnicodeType::MODIFIER_LETTER;
94 break;
95 case U_OTHER_LETTER:
96 r = css::i18n::UnicodeType::OTHER_LETTER;
97 break;
98 case U_NON_SPACING_MARK:
99 r = css::i18n::UnicodeType::NON_SPACING_MARK;
100 break;
101 case U_ENCLOSING_MARK:
102 r = css::i18n::UnicodeType::ENCLOSING_MARK;
103 break;
104 case U_COMBINING_SPACING_MARK:
105 r = css::i18n::UnicodeType::COMBINING_SPACING_MARK;
106 break;
107 case U_DECIMAL_DIGIT_NUMBER:
108 r = css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER;
109 break;
110 case U_LETTER_NUMBER:
111 r = css::i18n::UnicodeType::LETTER_NUMBER;
112 break;
113 case U_OTHER_NUMBER:
114 r = css::i18n::UnicodeType::OTHER_NUMBER;
115 break;
116 case U_SPACE_SEPARATOR:
117 r = css::i18n::UnicodeType::SPACE_SEPARATOR;
118 break;
119 case U_LINE_SEPARATOR:
120 r = css::i18n::UnicodeType::LINE_SEPARATOR;
121 break;
122 case U_PARAGRAPH_SEPARATOR:
123 r = css::i18n::UnicodeType::PARAGRAPH_SEPARATOR;
124 break;
125 case U_CONTROL_CHAR:
126 r = css::i18n::UnicodeType::CONTROL;
127 break;
128 case U_FORMAT_CHAR:
129 r = css::i18n::UnicodeType::FORMAT;
130 break;
131 case U_PRIVATE_USE_CHAR:
132 r = css::i18n::UnicodeType::PRIVATE_USE;
133 break;
134 case U_SURROGATE:
135 r = css::i18n::UnicodeType::SURROGATE;
136 break;
137 case U_DASH_PUNCTUATION:
138 r = css::i18n::UnicodeType::DASH_PUNCTUATION;
139 break;
140 case U_INITIAL_PUNCTUATION:
141 r = css::i18n::UnicodeType::INITIAL_PUNCTUATION;
142 break;
143 case U_FINAL_PUNCTUATION:
144 r = css::i18n::UnicodeType::FINAL_PUNCTUATION;
145 break;
146 case U_CONNECTOR_PUNCTUATION:
147 r = css::i18n::UnicodeType::CONNECTOR_PUNCTUATION;
148 break;
149 case U_OTHER_PUNCTUATION:
150 r = css::i18n::UnicodeType::OTHER_PUNCTUATION;
151 break;
152 case U_MATH_SYMBOL:
153 r = css::i18n::UnicodeType::MATH_SYMBOL;
154 break;
155 case U_CURRENCY_SYMBOL:
156 r = css::i18n::UnicodeType::CURRENCY_SYMBOL;
157 break;
158 case U_MODIFIER_SYMBOL:
159 r = css::i18n::UnicodeType::MODIFIER_SYMBOL;
160 break;
161 case U_OTHER_SYMBOL:
162 r = css::i18n::UnicodeType::OTHER_SYMBOL;
163 break;
164 case U_START_PUNCTUATION:
165 r = css::i18n::UnicodeType::START_PUNCTUATION;
166 break;
167 case U_END_PUNCTUATION:
168 r = css::i18n::UnicodeType::END_PUNCTUATION;
169 break;
170 }
171
172 return r;
173}
174
// NOTE(review): this is the body of the function the index lists as
// "static sal_uInt8 getUnicodeDirection(const sal_Unicode ch)". Its signature
// and opening brace, and the "?" arm of the conditional expression below, are
// not present in this listing -- restore them from the original file.
//
// One-entry cache of the last query: c is the code unit, r its result.
// NOTE(review): c and r both start at 0, so a first query for U+0000 returns
// 0 without consulting the tables -- confirm that is the intended value.
177 static sal_Unicode c = 0x00;
178 static sal_uInt8 r = 0x00;
179
180 if (ch == c) return r;
181 else c = ch;
182
// The high byte of ch selects an entry of UnicodeDirectionIndex. Values at or
// above UnicodeDirectionNumberBlock index 256-entry pages of
// UnicodeDirectionValue, with the low byte of ch as the offset in the page.
183 sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
184 r = (address < UnicodeDirectionNumberBlock)
186 : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
187 return r;
188}
189
190sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) {
191 nChar = u_charMirror(nChar);
192 return nChar;
193}
194
// Single-bit mask for a UnicodeType constant (or any small non-negative
// value). The argument is parenthesized so that expressions such as
// bit(a | b) shift by the whole expression.
#define bit(name) (1U << (name))

#define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)

#define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)

#define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)

// The multi-term masks are wrapped in parentheses so they stay a single
// operand wherever they are expanded (e.g. next to '&').
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

// Defines a predicate `func` that is true when the UnicodeType of ch is one
// of the types whose bits are set in `mask`.
#define IsType(func, mask)  \
bool func( const sal_uInt32 ch) {\
    return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
220
224
225#define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
226 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
227
228bool unicode::isWhiteSpace(const sal_uInt32 ch)
229{
230 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
231}
232
233sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
234{
235 //See unicode/uscript.h
236 sal_Int16 nRet;
237 switch (eScript)
238 {
239 case USCRIPT_INVALID_CODE:
240 case USCRIPT_COMMON:
241 case USCRIPT_INHERITED:
242 case USCRIPT_UNWRITTEN_LANGUAGES:
243 case USCRIPT_UNKNOWN:
244 case USCRIPT_MATHEMATICAL_NOTATION:
245 case USCRIPT_SYMBOLS:
246 case USCRIPT_CODE_LIMIT:
247 nRet = ScriptType::WEAK;
248 break;
249 case USCRIPT_ARMENIAN:
250 case USCRIPT_CHEROKEE:
251 case USCRIPT_COPTIC:
252 case USCRIPT_CYRILLIC:
253 case USCRIPT_GEORGIAN:
254 case USCRIPT_GOTHIC:
255 case USCRIPT_GREEK:
256 case USCRIPT_LATIN:
257 case USCRIPT_OGHAM:
258 case USCRIPT_OLD_ITALIC:
259 case USCRIPT_RUNIC:
260 case USCRIPT_CANADIAN_ABORIGINAL:
261 case USCRIPT_BRAILLE:
262 case USCRIPT_CYPRIOT:
263 case USCRIPT_OSMANYA:
264 case USCRIPT_SHAVIAN:
265 case USCRIPT_KATAKANA_OR_HIRAGANA:
266 case USCRIPT_GLAGOLITIC:
267 case USCRIPT_CIRTH:
268 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
269 case USCRIPT_OLD_HUNGARIAN:
270 case USCRIPT_LATIN_FRAKTUR:
271 case USCRIPT_LATIN_GAELIC:
272 nRet = ScriptType::LATIN;
273 break;
274 case USCRIPT_BOPOMOFO:
275 case USCRIPT_HAN:
276 case USCRIPT_HANGUL:
277 case USCRIPT_HIRAGANA:
278 case USCRIPT_KATAKANA:
279 case USCRIPT_YI:
280 case USCRIPT_SIMPLIFIED_HAN:
281 case USCRIPT_TRADITIONAL_HAN:
282 case USCRIPT_JAPANESE:
283 case USCRIPT_KOREAN:
284 case USCRIPT_TANGUT:
285 case USCRIPT_KHITAN_SMALL_SCRIPT:
286 nRet = ScriptType::ASIAN;
287 break;
288 case USCRIPT_ARABIC:
289 case USCRIPT_BENGALI:
290 case USCRIPT_DESERET:
291 case USCRIPT_DEVANAGARI:
292 case USCRIPT_ETHIOPIC:
293 case USCRIPT_GUJARATI:
294 case USCRIPT_GURMUKHI:
295 case USCRIPT_HEBREW:
296 case USCRIPT_KANNADA:
297 case USCRIPT_KHMER:
298 case USCRIPT_LAO:
299 case USCRIPT_MALAYALAM:
300 case USCRIPT_MONGOLIAN:
301 case USCRIPT_MYANMAR:
302 case USCRIPT_ORIYA:
303 case USCRIPT_SINHALA:
304 case USCRIPT_SYRIAC:
305 case USCRIPT_TAMIL:
306 case USCRIPT_TELUGU:
307 case USCRIPT_THAANA:
308 case USCRIPT_THAI:
309 case USCRIPT_TIBETAN:
310 case USCRIPT_TAGALOG:
311 case USCRIPT_HANUNOO:
312 case USCRIPT_BUHID:
313 case USCRIPT_TAGBANWA:
314 case USCRIPT_LIMBU:
315 case USCRIPT_LINEAR_B:
316 case USCRIPT_TAI_LE:
317 case USCRIPT_UGARITIC:
318 case USCRIPT_BUGINESE:
319 case USCRIPT_KHAROSHTHI:
320 case USCRIPT_SYLOTI_NAGRI:
321 case USCRIPT_NEW_TAI_LUE:
322 case USCRIPT_TIFINAGH:
323 case USCRIPT_OLD_PERSIAN:
324 case USCRIPT_BALINESE:
325 case USCRIPT_BATAK:
326 case USCRIPT_BLISSYMBOLS:
327 case USCRIPT_BRAHMI:
328 case USCRIPT_CHAM:
329 case USCRIPT_DEMOTIC_EGYPTIAN:
330 case USCRIPT_HIERATIC_EGYPTIAN:
331 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
332 case USCRIPT_KHUTSURI:
333 case USCRIPT_PAHAWH_HMONG:
334 case USCRIPT_HARAPPAN_INDUS:
335 case USCRIPT_JAVANESE:
336 case USCRIPT_KAYAH_LI:
337 case USCRIPT_LEPCHA:
338 case USCRIPT_LINEAR_A:
339 case USCRIPT_MANDAEAN:
340 case USCRIPT_MAYAN_HIEROGLYPHS:
341 case USCRIPT_MEROITIC:
342 case USCRIPT_NKO:
343 case USCRIPT_ORKHON:
344 case USCRIPT_OLD_PERMIC:
345 case USCRIPT_PHAGS_PA:
346 case USCRIPT_PHOENICIAN:
347 case USCRIPT_PHONETIC_POLLARD:
348 case USCRIPT_RONGORONGO:
349 case USCRIPT_SARATI:
350 case USCRIPT_ESTRANGELO_SYRIAC:
351 case USCRIPT_WESTERN_SYRIAC:
352 case USCRIPT_EASTERN_SYRIAC:
353 case USCRIPT_TENGWAR:
354 case USCRIPT_VAI:
355 case USCRIPT_VISIBLE_SPEECH:
356 case USCRIPT_CUNEIFORM:
357 case USCRIPT_CARIAN:
358 case USCRIPT_LANNA:
359 case USCRIPT_LYCIAN:
360 case USCRIPT_LYDIAN:
361 case USCRIPT_OL_CHIKI:
362 case USCRIPT_REJANG:
363 case USCRIPT_SAURASHTRA:
364 case USCRIPT_SIGN_WRITING:
365 case USCRIPT_SUNDANESE:
366 case USCRIPT_MOON:
367 case USCRIPT_MEITEI_MAYEK:
368 case USCRIPT_IMPERIAL_ARAMAIC:
369 case USCRIPT_AVESTAN:
370 case USCRIPT_CHAKMA:
371 case USCRIPT_KAITHI:
372 case USCRIPT_MANICHAEAN:
373 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
374 case USCRIPT_PSALTER_PAHLAVI:
375 case USCRIPT_BOOK_PAHLAVI:
376 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
377 case USCRIPT_SAMARITAN:
378 case USCRIPT_TAI_VIET:
379 case USCRIPT_BAMUM:
380 case USCRIPT_LISU:
381 case USCRIPT_NAKHI_GEBA:
382 case USCRIPT_OLD_SOUTH_ARABIAN:
383 case USCRIPT_BASSA_VAH:
384 case USCRIPT_DUPLOYAN_SHORTAND:
385 case USCRIPT_ELBASAN:
386 case USCRIPT_GRANTHA:
387 case USCRIPT_KPELLE:
388 case USCRIPT_LOMA:
389 case USCRIPT_MENDE:
390 case USCRIPT_MEROITIC_CURSIVE:
391 case USCRIPT_OLD_NORTH_ARABIAN:
392 case USCRIPT_NABATAEAN:
393 case USCRIPT_PALMYRENE:
394 case USCRIPT_SINDHI:
395 case USCRIPT_WARANG_CITI:
396 default: // anything new is going to be pretty wild
397 nRet = ScriptType::COMPLEX;
398 break;
399 }
400 return nRet;
401}
402
// NOTE(review): this is the body of the function the index lists as
// "static sal_Int16 getScriptClassFromLanguageTag(const LanguageTag &rLanguageTag)".
// Its signature line and the final statement before the closing brace are not
// present in this listing; as shown, the only return is the LATIN fallback.
// Presumably the lost statement classifies aBuf[0] -- restore it from the
// original file.
//
// Determines one ICU script code for the tag and stores it in aBuf[0]:
// from the tag's explicit script if it has one, otherwise from ICU's
// language/locale -> script lookup.
404{
// Capacity handed to uscript_getCode(); only the first result is used.
405 constexpr int32_t nBuf = 42;
406 UScriptCode aBuf[nBuf];
407 if (rLanguageTag.hasScript())
408 {
// The script subtag is looked up as a UCHAR_SCRIPT property value name.
409 aBuf[0] = static_cast<UScriptCode>(u_getPropertyValueEnum( UCHAR_SCRIPT,
410 OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US).getStr()));
411 }
412 else
413 {
// Query ICU with "language" or "language-country".
414 OUString aName;
415 if (rLanguageTag.getCountry().isEmpty())
416 aName = rLanguageTag.getLanguage();
417 else
418 aName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry();
419 UErrorCode status = U_ZERO_ERROR;
420 const int32_t nScripts = uscript_getCode(
421 OUStringToOString( aName, RTL_TEXTENCODING_ASCII_US).getStr(),
422 aBuf, nBuf, &status);
423 // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
424 // and required capacity returned, but really..
// No script found, or ICU reported an error: fall back to LATIN.
425 if (nScripts == 0 || !U_SUCCESS(status))
426 return css::i18n::ScriptType::LATIN;
427 }
429}
430
432{
433 OString sRet;
434 switch (eScript)
435 {
436 case USCRIPT_CODE_LIMIT:
437 case USCRIPT_INVALID_CODE:
438 sRet = "zxx";
439 break;
440 case USCRIPT_COMMON:
441 case USCRIPT_INHERITED:
442 sRet = "und";
443 break;
444 case USCRIPT_MATHEMATICAL_NOTATION:
445 case USCRIPT_SYMBOLS:
446 sRet = "zxx";
447 break;
448 case USCRIPT_UNWRITTEN_LANGUAGES:
449 case USCRIPT_UNKNOWN:
450 sRet = "und";
451 break;
452 case USCRIPT_ARABIC:
453 sRet = "ar";
454 break;
455 case USCRIPT_ARMENIAN:
456 sRet = "hy";
457 break;
458 case USCRIPT_BENGALI:
459 sRet = "bn";
460 break;
461 case USCRIPT_BOPOMOFO:
462 sRet = "zh";
463 break;
464 case USCRIPT_CHEROKEE:
465 sRet = "chr";
466 break;
467 case USCRIPT_COPTIC:
468 sRet = "cop";
469 break;
470 case USCRIPT_CYRILLIC:
471 sRet = "ru";
472 break;
473 case USCRIPT_DESERET:
474 sRet = "en";
475 break;
476 case USCRIPT_DEVANAGARI:
477 sRet = "hi";
478 break;
479 case USCRIPT_ETHIOPIC:
480 sRet = "am";
481 break;
482 case USCRIPT_GEORGIAN:
483 sRet = "ka";
484 break;
485 case USCRIPT_GOTHIC:
486 sRet = "got";
487 break;
488 case USCRIPT_GREEK:
489 sRet = "el";
490 break;
491 case USCRIPT_GUJARATI:
492 sRet = "gu";
493 break;
494 case USCRIPT_GURMUKHI:
495 sRet = "pa";
496 break;
497 case USCRIPT_HAN:
498 sRet = "zh";
499 break;
500 case USCRIPT_HANGUL:
501 sRet = "ko";
502 break;
503 case USCRIPT_HEBREW:
504 sRet = "hr";
505 break;
506 case USCRIPT_HIRAGANA:
507 sRet = "ja";
508 break;
509 case USCRIPT_KANNADA:
510 sRet = "kn";
511 break;
512 case USCRIPT_KATAKANA:
513 sRet = "ja";
514 break;
515 case USCRIPT_KHMER:
516 sRet = "km";
517 break;
518 case USCRIPT_LAO:
519 sRet = "lo";
520 break;
521 case USCRIPT_LATIN:
522 sRet = "en";
523 break;
524 case USCRIPT_MALAYALAM:
525 sRet = "ml";
526 break;
527 case USCRIPT_MONGOLIAN:
528 sRet = "mn";
529 break;
530 case USCRIPT_MYANMAR:
531 sRet = "my";
532 break;
533 case USCRIPT_OGHAM:
534 sRet = "pgl";
535 break;
536 case USCRIPT_OLD_ITALIC:
537 sRet = "osc";
538 break;
539 case USCRIPT_ORIYA:
540 sRet = "or";
541 break;
542 case USCRIPT_RUNIC:
543 sRet = "ang";
544 break;
545 case USCRIPT_SINHALA:
546 sRet = "si";
547 break;
548 case USCRIPT_SYRIAC:
549 sRet = "syr";
550 break;
551 case USCRIPT_TAMIL:
552 sRet = "ta";
553 break;
554 case USCRIPT_TELUGU:
555 sRet = "te";
556 break;
557 case USCRIPT_THAANA:
558 sRet = "dv";
559 break;
560 case USCRIPT_THAI:
561 sRet = "th";
562 break;
563 case USCRIPT_TIBETAN:
564 sRet = "bo";
565 break;
566 case USCRIPT_CANADIAN_ABORIGINAL:
567 sRet = "iu";
568 break;
569 case USCRIPT_YI:
570 sRet = "ii";
571 break;
572 case USCRIPT_TAGALOG:
573 sRet = "tl";
574 break;
575 case USCRIPT_HANUNOO:
576 sRet = "hnn";
577 break;
578 case USCRIPT_BUHID:
579 sRet = "bku";
580 break;
581 case USCRIPT_TAGBANWA:
582 sRet = "tbw";
583 break;
584 case USCRIPT_BRAILLE:
585 sRet = "en";
586 break;
587 case USCRIPT_CYPRIOT:
588 sRet = "ecy";
589 break;
590 case USCRIPT_LIMBU:
591 sRet = "lif";
592 break;
593 case USCRIPT_LINEAR_B:
594 sRet = "gmy";
595 break;
596 case USCRIPT_OSMANYA:
597 sRet = "so";
598 break;
599 case USCRIPT_SHAVIAN:
600 sRet = "en";
601 break;
602 case USCRIPT_TAI_LE:
603 sRet = "tdd";
604 break;
605 case USCRIPT_UGARITIC:
606 sRet = "uga";
607 break;
608 case USCRIPT_KATAKANA_OR_HIRAGANA:
609 sRet = "ja";
610 break;
611 case USCRIPT_BUGINESE:
612 sRet = "bug";
613 break;
614 case USCRIPT_GLAGOLITIC:
615 sRet = "ch";
616 break;
617 case USCRIPT_KHAROSHTHI:
618 sRet = "pra";
619 break;
620 case USCRIPT_SYLOTI_NAGRI:
621 sRet = "syl";
622 break;
623 case USCRIPT_NEW_TAI_LUE:
624 sRet = "khb";
625 break;
626 case USCRIPT_TIFINAGH:
627 sRet = "tmh";
628 break;
629 case USCRIPT_OLD_PERSIAN:
630 sRet = "peo";
631 break;
632 case USCRIPT_BALINESE:
633 sRet = "ban";
634 break;
635 case USCRIPT_BATAK:
636 sRet = "btk";
637 break;
638 case USCRIPT_BLISSYMBOLS:
639 sRet = "en";
640 break;
641 case USCRIPT_BRAHMI:
642 sRet = "pra";
643 break;
644 case USCRIPT_CHAM:
645 sRet = "cja";
646 break;
647 case USCRIPT_CIRTH:
648 sRet = "sjn";
649 break;
650 case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
651 sRet = "cu";
652 break;
653 case USCRIPT_DEMOTIC_EGYPTIAN:
654 case USCRIPT_HIERATIC_EGYPTIAN:
655 case USCRIPT_EGYPTIAN_HIEROGLYPHS:
656 sRet = "egy";
657 break;
658 case USCRIPT_KHUTSURI:
659 sRet = "ka";
660 break;
661 case USCRIPT_SIMPLIFIED_HAN:
662 sRet = "zh";
663 break;
664 case USCRIPT_TRADITIONAL_HAN:
665 sRet = "zh";
666 break;
667 case USCRIPT_PAHAWH_HMONG:
668 sRet = "blu";
669 break;
670 case USCRIPT_OLD_HUNGARIAN:
671 sRet = "ohu";
672 break;
673 case USCRIPT_HARAPPAN_INDUS:
674 sRet = "xiv";
675 break;
676 case USCRIPT_JAVANESE:
677 sRet = "kaw";
678 break;
679 case USCRIPT_KAYAH_LI:
680 sRet = "eky";
681 break;
682 case USCRIPT_LATIN_FRAKTUR:
683 sRet = "de";
684 break;
685 case USCRIPT_LATIN_GAELIC:
686 sRet = "ga";
687 break;
688 case USCRIPT_LEPCHA:
689 sRet = "lep";
690 break;
691 case USCRIPT_LINEAR_A:
692 sRet = "ecr";
693 break;
694 case USCRIPT_MAYAN_HIEROGLYPHS:
695 sRet = "myn";
696 break;
697 case USCRIPT_MEROITIC:
698 sRet = "xmr";
699 break;
700 case USCRIPT_NKO:
701 sRet = "nqo";
702 break;
703 case USCRIPT_ORKHON:
704 sRet = "otk";
705 break;
706 case USCRIPT_OLD_PERMIC:
707 sRet = "kv";
708 break;
709 case USCRIPT_PHAGS_PA:
710 sRet = "xng";
711 break;
712 case USCRIPT_PHOENICIAN:
713 sRet = "phn";
714 break;
715 case USCRIPT_PHONETIC_POLLARD:
716 sRet = "hmd";
717 break;
718 case USCRIPT_RONGORONGO:
719 sRet = "rap";
720 break;
721 case USCRIPT_SARATI:
722 sRet = "qya";
723 break;
724 case USCRIPT_ESTRANGELO_SYRIAC:
725 sRet = "syr";
726 break;
727 case USCRIPT_WESTERN_SYRIAC:
728 sRet = "tru";
729 break;
730 case USCRIPT_EASTERN_SYRIAC:
731 sRet = "aii";
732 break;
733 case USCRIPT_TENGWAR:
734 sRet = "sjn";
735 break;
736 case USCRIPT_VAI:
737 sRet = "vai";
738 break;
739 case USCRIPT_VISIBLE_SPEECH:
740 sRet = "en";
741 break;
742 case USCRIPT_CUNEIFORM:
743 sRet = "akk";
744 break;
745 case USCRIPT_CARIAN:
746 sRet = "xcr";
747 break;
748 case USCRIPT_JAPANESE:
749 sRet = "ja";
750 break;
751 case USCRIPT_LANNA:
752 sRet = "nod";
753 break;
754 case USCRIPT_LYCIAN:
755 sRet = "xlc";
756 break;
757 case USCRIPT_LYDIAN:
758 sRet = "xld";
759 break;
760 case USCRIPT_OL_CHIKI:
761 sRet = "sat";
762 break;
763 case USCRIPT_REJANG:
764 sRet = "rej";
765 break;
766 case USCRIPT_SAURASHTRA:
767 sRet = "saz";
768 break;
769 case USCRIPT_SIGN_WRITING:
770 sRet = "en";
771 break;
772 case USCRIPT_SUNDANESE:
773 sRet = "su";
774 break;
775 case USCRIPT_MOON:
776 sRet = "en";
777 break;
778 case USCRIPT_MEITEI_MAYEK:
779 sRet = "mni";
780 break;
781 case USCRIPT_IMPERIAL_ARAMAIC:
782 sRet = "arc";
783 break;
784 case USCRIPT_AVESTAN:
785 sRet = "ae";
786 break;
787 case USCRIPT_CHAKMA:
788 sRet = "ccp";
789 break;
790 case USCRIPT_KOREAN:
791 sRet = "ko";
792 break;
793 case USCRIPT_KAITHI:
794 sRet = "awa";
795 break;
796 case USCRIPT_MANICHAEAN:
797 sRet = "xmn";
798 break;
799 case USCRIPT_INSCRIPTIONAL_PAHLAVI:
800 case USCRIPT_PSALTER_PAHLAVI:
801 case USCRIPT_BOOK_PAHLAVI:
802 case USCRIPT_INSCRIPTIONAL_PARTHIAN:
803 sRet = "xpr";
804 break;
805 case USCRIPT_SAMARITAN:
806 sRet = "heb";
807 break;
808 case USCRIPT_TAI_VIET:
809 sRet = "blt";
810 break;
811 case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
812 sRet = "mic";
813 break;
814 case USCRIPT_NABATAEAN:
815 sRet = "mis-Nbat"; // Uncoded with script
816 break;
817 case USCRIPT_PALMYRENE:
818 sRet = "mis-Palm"; // Uncoded with script
819 break;
820 case USCRIPT_BAMUM:
821 sRet = "bax";
822 break;
823 case USCRIPT_LISU:
824 sRet = "lis";
825 break;
826 case USCRIPT_NAKHI_GEBA:
827 sRet = "nxq";
828 break;
829 case USCRIPT_OLD_SOUTH_ARABIAN:
830 sRet = "xsa";
831 break;
832 case USCRIPT_BASSA_VAH:
833 sRet = "bsq";
834 break;
835 case USCRIPT_DUPLOYAN_SHORTAND:
836 sRet = "fr";
837 break;
838 case USCRIPT_ELBASAN:
839 sRet = "sq";
840 break;
841 case USCRIPT_GRANTHA:
842 sRet = "ta";
843 break;
844 case USCRIPT_KPELLE:
845 sRet = "kpe";
846 break;
847 case USCRIPT_LOMA:
848 sRet = "lom";
849 break;
850 case USCRIPT_MENDE:
851 sRet = "men";
852 break;
853 case USCRIPT_MEROITIC_CURSIVE:
854 sRet = "xmr";
855 break;
856 case USCRIPT_OLD_NORTH_ARABIAN:
857 sRet = "xna";
858 break;
859 case USCRIPT_SINDHI:
860 sRet = "sd";
861 break;
862 case USCRIPT_WARANG_CITI:
863 sRet = "hoc";
864 break;
865 case USCRIPT_AFAKA:
866 sRet = "djk";
867 break;
868 case USCRIPT_JURCHEN:
869 sRet = "juc";
870 break;
871 case USCRIPT_MRO:
872 sRet = "cmr";
873 break;
874 case USCRIPT_NUSHU:
875 sRet = "mis-Nshu"; // Uncoded with script
876 break;
877 case USCRIPT_SHARADA:
878 sRet = "sa";
879 break;
880 case USCRIPT_SORA_SOMPENG:
881 sRet = "srb";
882 break;
883 case USCRIPT_TAKRI:
884 sRet = "doi";
885 break;
886 case USCRIPT_TANGUT:
887 sRet = "txg";
888 break;
889 case USCRIPT_WOLEAI:
890 sRet = "woe";
891 break;
892 case USCRIPT_ANATOLIAN_HIEROGLYPHS:
893 sRet = "hlu";
894 break;
895 case USCRIPT_KHOJKI:
896 sRet = "gu";
897 break;
898 case USCRIPT_TIRHUTA:
899 sRet = "mai";
900 break;
901 case USCRIPT_CAUCASIAN_ALBANIAN:
902 sRet = "xag";
903 break;
904 case USCRIPT_MAHAJANI:
905 sRet = "mwr";
906 break;
907 case USCRIPT_AHOM:
908 sRet = "aho";
909 break;
910 case USCRIPT_HATRAN:
911 sRet = "qly-Hatr";
912 break;
913 case USCRIPT_MODI:
914 sRet = "mr-Modi";
915 break;
916 case USCRIPT_MULTANI:
917 sRet = "skr-Mutl";
918 break;
919 case USCRIPT_PAU_CIN_HAU:
920 sRet = "ctd-Pauc";
921 break;
922 case USCRIPT_SIDDHAM:
923 sRet = "sa-Sidd";
924 break;
925 case USCRIPT_ADLAM:
926 sRet = "mis-Adlm"; // Adlam for Fulani, no language code
927 break;
928 case USCRIPT_BHAIKSUKI:
929 sRet = "mis-Bhks"; // Bhaiksuki for some Buddhist texts, no language code
930 break;
931 case USCRIPT_MARCHEN:
932 sRet = "bo-Marc";
933 break;
934 case USCRIPT_NEWA:
935 sRet = "new-Newa";
936 break;
937 case USCRIPT_OSAGE:
938 sRet = "osa-Osge";
939 break;
940 case USCRIPT_HAN_WITH_BOPOMOFO:
941 sRet = "mis-Hanb"; // Han with Bopomofo, zh-Hanb ?
942 break;
943 case USCRIPT_JAMO:
944 sRet = "ko"; // Jamo - elements of Hangul Syllables
945 break;
946 case USCRIPT_SYMBOLS_EMOJI:
947 sRet = "mis-Zsye"; // Emoji variant
948 break;
949 case USCRIPT_MASARAM_GONDI:
950 sRet = "gon-Gonm"; // macro language code, could be wsg,esg,gno
951 break;
952 case USCRIPT_SOYOMBO:
953 sRet = "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
954 break;
955 case USCRIPT_ZANABAZAR_SQUARE:
956 sRet = "mn-Zanb"; // abugida to write Mongolian
957 break;
958 case USCRIPT_DOGRA:
959 sRet = "dgo"; // Dogri proper
960 break;
961 case USCRIPT_GUNJALA_GONDI:
962 sRet = "wsg"; // Adilabad Gondi
963 break;
964 case USCRIPT_MAKASAR:
965 sRet = "mak";
966 break;
967 case USCRIPT_MEDEFAIDRIN:
968 sRet = "dmf-Medf";
969 break;
970 case USCRIPT_HANIFI_ROHINGYA:
971 sRet = "rhg";
972 break;
973 case USCRIPT_SOGDIAN:
974 sRet = "sog";
975 break;
976 case USCRIPT_OLD_SOGDIAN:
977 sRet = "sog";
978 break;
979 case USCRIPT_ELYMAIC:
980 sRet = "arc-Elym";
981 break;
982 case USCRIPT_NYIAKENG_PUACHUE_HMONG:
983 sRet = "hmn-Hmnp"; // macrolanguage code
984 break;
985 case USCRIPT_NANDINAGARI:
986 sRet = "sa-Nand";
987 break;
988 case USCRIPT_WANCHO:
989 sRet = "nnp-Wcho";
990 break;
991 case USCRIPT_CHORASMIAN:
992 sRet = "xco-Chrs";
993 break;
994 case USCRIPT_DIVES_AKURU:
995 sRet = "dv-Diak";
996 break;
997 case USCRIPT_KHITAN_SMALL_SCRIPT:
998 sRet = "zkt-Kits";
999 break;
1000 case USCRIPT_YEZIDI:
1001 sRet = "kmr-Yezi";
1002 break;
1003#if (U_ICU_VERSION_MAJOR_NUM >= 70)
1004 case USCRIPT_CYPRO_MINOAN:
1005 sRet = "mis-Cpmn"; // Uncoded with script
1006 break;
1007 case USCRIPT_OLD_UYGHUR:
1008 sRet = "oui-Ougr";
1009 break;
1010 case USCRIPT_TANGSA:
1011 sRet = "nst-Tnsa";
1012 break;
1013 case USCRIPT_TOTO:
1014 sRet = "txo-Toto";
1015 break;
1016 case USCRIPT_VITHKUQI:
1017 sRet = "sq-Vith"; // macrolanguage code
1018 break;
1019#endif
1020#if (U_ICU_VERSION_MAJOR_NUM >= 72)
1021 case USCRIPT_KAWI:
1022 sRet = "mis-Kawi"; // Uncoded with script
1023 break;
1024 case USCRIPT_NAG_MUNDARI:
1025 sRet = "unr-Nagm";
1026 break;
1027#endif
1028 }
1029 return sRet;
1030}
1031
1032//Format a number as a percentage according to the rules of the given
1033//language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
1034OUString unicode::formatPercent(double dNumber,
1035 const LanguageTag &rLangTag)
1036{
1037 // get a currency formatter for this locale ID
1038 UErrorCode errorCode=U_ZERO_ERROR;
1039
1040 LanguageTag aLangTag(rLangTag);
1041
1042 // As of CLDR Version 24 these languages were not listed as using spacing
1043 // between number and % but are reported as such by our l10n groups
1044 // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
1045 // so format using French which has the desired rules
1046 if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
1047 aLangTag.reset("fr-FR");
1048
1049 icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
1050
1051 std::unique_ptr<icu::NumberFormat> xF(
1052 icu::NumberFormat::createPercentInstance(aLocale, errorCode));
1053 if(U_FAILURE(errorCode))
1054 {
1055 SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
1056 return OUString::number(dNumber) + "%";
1057 }
1058
1059 icu::UnicodeString output;
1060 xF->format(dNumber/100, output);
1061 OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
1062 output.length());
1063 if (rLangTag.getLanguage() == "de")
1064 {
1065 //narrow no-break space instead of (normal) no-break space
1066 return aRet.replace(0x00A0, 0x202F);
1067 }
1068 return aRet;
1069}
1070
// NOTE(review): this is the body of the function the index lists as
// "bool AllowMoreInput(sal_Unicode uChar)"; its signature line is not present
// in this listing, and one statement inside the default branch is missing too
// (marked below).
//
// Offers one more UTF-16 unit to the input being collected and returns
// whether further units will be accepted. Accepted units are inserted at
// index 0 of maInput / maCombining, so presumably the caller feeds the text
// backwards, last character first -- confirm against the caller.
1072{
1073 //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
1074 if( maInput.getLength() > 255 )
1075 mbAllowMoreChars = false;
1076
1077 if( !mbAllowMoreChars )
1078 return false;
1079
// Once a "U+" has been collected, only further U+ notation is accepted.
1080 bool bPreventNonHex = false;
1081 if( maInput.indexOf("U+") != -1 )
1082 bPreventNonHex = true;
1083
1084 switch ( unicode::getUnicodeType(uChar) )
1085 {
1086 case css::i18n::UnicodeType::SURROGATE:
1087 if( bPreventNonHex )
1088 {
1089 mbAllowMoreChars = false;
1090 return false;
1091 }
1092
// A low surrogate as the very first unit is parked in maUtf16 and one more
// unit is requested.
1093 if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
1094 {
1095 maUtf16.append(uChar);
1096 return true;
1097 }
// A high surrogate is placed in front of whatever maUtf16 already holds.
1098 if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
1099 maUtf16.insert(0, uChar );
1100 //end of hex strings, or unexpected order of high/low, so don't accept more
1101 if( !maUtf16.isEmpty() )
1102 maInput.append(maUtf16);
1103 if( !maCombining.isEmpty() )
1104 maInput.append(maCombining);
1105 mbAllowMoreChars = false;
1106 break;
1107
1108 case css::i18n::UnicodeType::NON_SPACING_MARK:
1109 case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
1110 if( bPreventNonHex )
1111 {
1112 mbAllowMoreChars = false;
1113 return false;
1114 }
1115
1116 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
1117 if( !maUtf16.isEmpty() )
1118 {
1119 maInput = maUtf16;
1120 if( !maCombining.isEmpty() )
1121 maInput.append(maCombining);
1122 mbAllowMoreChars = false;
1123 return false;
1124 }
// Combining marks are collected separately; input continues.
1125 maCombining.insert(0, uChar);
1126 break;
1127
1128 default:
1129 //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
1130 if( !maUtf16.isEmpty() )
1131 {
1132 maInput = maUtf16;
1133 if( !maCombining.isEmpty() )
1134 maInput.append(maCombining);
1135 mbAllowMoreChars = false;
1136 return false;
1137 }
1138
// A non-combining character arriving after combining marks ends the input.
1139 if( !maCombining.isEmpty() )
1140 {
1141 maCombining.insert(0, uChar);
// NOTE(review): one statement is missing from this listing at this point.
1143 mbAllowMoreChars = false;
1144 return false;
1145 }
1146
1147 // 0 - 1f are control characters. Do not process those.
1148 if( uChar < 0x20 )
1149 {
1150 mbAllowMoreChars = false;
1151 return false;
1152 }
1153
1154 switch( uChar )
1155 {
1156 case 'u':
1157 case 'U':
1158 // U+ notation found. Continue looking for another one.
1159 if( mbRequiresU )
1160 {
1161 mbRequiresU = false;
1162 maInput.insert(0,"U+");
1163 }
1164 // treat as a normal character
1165 else
1166 {
1167 mbAllowMoreChars = false;
1168 if( !bPreventNonHex )
1169 maInput.insertUtf32(0, uChar);
1170 }
1171 break;
1172 case '+':
1173 // + already found: skip when not U, or edge case of +U+xxxx
1174 if( mbRequiresU || (maInput.indexOf("U+") == 0) )
1175 mbAllowMoreChars = false;
1176 // hex chars followed by '+' - now require a 'U'
1177 else if ( !maInput.isEmpty() )
1178 mbRequiresU = true;
1179 // treat as a normal character
1180 else
1181 {
1182 mbAllowMoreChars = false;
1183 if( !bPreventNonHex )
1184 maInput.insertUtf32(0, uChar);
1185 }
1186 break;
1187 default:
1188 // + already found. Since not U, cancel further input
1189 if( mbRequiresU )
1190 mbAllowMoreChars = false;
1191 // maximum digits per notation is 8: only one notation
1192 else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
1193 mbAllowMoreChars = false;
1194 // maximum digits per notation is 8: previous notation found
1195 else if( maInput.indexOf("U+") == 8 )
1196 mbAllowMoreChars = false;
1197 // a hex character. Add to string.
1198 else if( rtl::isAsciiHexDigit(uChar) )
1199 {
1200 mbIsHexString = true;
1201 maInput.insertUtf32(0, uChar);
1202 }
1203 // not a hex character: stop input. keep if it is the first input provided
1204 else
1205 {
1206 mbAllowMoreChars = false;
1207 if( maInput.isEmpty() )
1208 maInput.insertUtf32(0, uChar);
1209 }
1210 }
1211 }
1212 return mbAllowMoreChars;
1213}
1214
// NOTE(review): this is the body of the function the index lists as
// "OUString StringToReplace()"; its signature line is not present in this
// listing.
//
// Validates the collected input and returns it as a string. For hex input
// the buffer maInput may be rewritten, and further input is disabled.
1216{
1217 if( maInput.isEmpty() )
1218 {
1219 //edge case - input finished with incomplete low surrogate or combining characters without a base
1220 if( mbAllowMoreChars )
1221 {
1222 if( !maUtf16.isEmpty() )
1223 maInput = maUtf16;
1224 if( !maCombining.isEmpty() )
1225 maInput.append(maCombining);
1226 }
1227 return maInput.toString();
1228 }
1229
// Non-hex input is returned exactly as collected.
1230 if( !mbIsHexString )
1231 return maInput.toString();
1232
1233 //this function potentially modifies the input string. Prevent addition of further characters
1234 mbAllowMoreChars = false;
1235
1236 //validate unicode notation.
1237 OUString sIn;
1238 sal_uInt32 nUnicode = 0;
1239 sal_Int32 nUPlus = maInput.indexOf("U+");
1240 //if U+ notation used, strip off all extra chars added not in U+ notation
1241 if( nUPlus != -1 )
1242 {
1243 maInput.remove(0, nUPlus);
// sIn receives the text after the first "U+"; nUPlus then locates the next one.
1244 sIn = maInput.copy(2).makeStringAndClear();
1245 nUPlus = sIn.indexOf("U+");
1246 }
1247 else
1248 sIn = maInput.toString();
// Check each hex group that is followed by another "U+".
1249 while( nUPlus != -1 )
1250 {
1251 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1252 //prevent creating control characters or invalid Unicode values
// On a bad value, maInput is cut down to the text from the next "U+" onwards.
1253 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1254 maInput = sIn.subView(nUPlus);
1255 sIn = sIn.copy(nUPlus+2);
1256 nUPlus = sIn.indexOf("U+");
1257 }
1258
// Check the last (or only) hex group; if it is bad, keep just its final character.
// NOTE(review): sIn[sIn.getLength()-1] indexes -1 if sIn is empty here --
// confirm that cannot happen for the inputs AllowMoreInput() accepts.
1259 nUnicode = sIn.toUInt32(16);
1260 if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1261 maInput.truncate().append( sIn[sIn.getLength()-1] );
1262 return maInput.toString();
1263}
1264
// NOTE(review): this is the body of the function the index lists as
// "sal_uInt32 CharsToDelete()"; its signature line is not present in this
// listing.
//
// Counts the code points (not UTF-16 units) in the string returned by
// StringToReplace().
1266{
1267 OUString sIn = StringToReplace();
1268 sal_Int32 nPos = 0;
1269 sal_uInt32 counter = 0;
1270 while( nPos < sIn.getLength() )
1271 {
// iterateCodePoints() advances nPos past one code point; its value is unused.
1272 sIn.iterateCodePoints(&nPos);
1273 ++counter;
1274 }
1275 return counter;
1276}
1277
// NOTE(review): this is the body of the function the index lists as
// "OUString ReplacementString()"; its signature line is not present in this
// listing.
//
// Produces the toggled form of StringToReplace(): hex / "U+" notation is
// turned into the characters it names, and any other text is turned into
// "U+XXXX" notation, one group per code point.
1279{
1280 OUString sIn = StringToReplace();
1281 OUStringBuffer output = "";
1282 sal_Int32 nUPlus = sIn.indexOf("U+");
1283 // convert from hex notation to glyph
1284 if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1285 {
1286 sal_uInt32 nUnicode = 0;
// Drop a leading "U+" so that the text starts with hex digits.
1287 if( nUPlus == 0)
1288 {
1289 sIn = sIn.copy(2);
1290 nUPlus = sIn.indexOf("U+");
1291 }
// Convert every hex group that is followed by another "U+".
1292 while( nUPlus > 0 )
1293 {
1294 nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
1295 output.appendUtf32( nUnicode );
1296
1297 sIn = sIn.copy(nUPlus+2);
1298 nUPlus = sIn.indexOf("U+");
1299 }
// Convert the remaining (last) hex group.
1300 nUnicode = sIn.toUInt32(16);
1301 output.appendUtf32( nUnicode );
1302 }
1303 // convert from glyph to hex notation
1304 else
1305 {
1306 sal_Int32 nPos = 0;
1307 while( nPos < sIn.getLength() )
1308 {
1309 OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1310 //pad with zeros - minimum length of 4.
1311 for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1312 aTmp.insert( 0,"0" );
1313 output.append( "U+" + aTmp );
1314 }
1315 }
1316 return output.makeStringAndClear();
1317}
1318
1319/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
UBlockCode from
static icu::Locale getIcuLocale(const LanguageTag &rLanguageTag)
OUString getLanguage() const
OUString getScript() const
bool hasScript() const
OUString getCountry() const
LanguageTag & reset(const OUString &rBcp47LanguageTag)
OUString ReplacementString()
Definition: unicode.cxx:1278
OUStringBuffer maInput
Definition: unicode.hxx:96
bool AllowMoreInput(sal_Unicode uChar)
Build an input string of valid UTF16 units to toggle.
Definition: unicode.cxx:1071
OUStringBuffer maUtf16
Definition: unicode.hxx:97
OUString StringToReplace()
Validates (and potentially modifies) the input string.
Definition: unicode.cxx:1215
sal_uInt32 CharsToDelete()
While sInput.getLength() returns the number of utf16 units to delete, this function returns the number of code points to delete.
Definition: unicode.cxx:1265
OUStringBuffer maCombining
Definition: unicode.hxx:98
static OUString formatPercent(double dNumber, const LanguageTag &rLangTag)
Definition: unicode.cxx:1034
static bool isWhiteSpace(const sal_uInt32 ch)
static sal_uInt32 GetMirroredChar(sal_uInt32)
Definition: unicode.cxx:190
static sal_uInt8 getUnicodeDirection(const sal_Unicode ch)
Definition: unicode.cxx:176
static sal_Int16 getUnicodeScriptType(const sal_Unicode ch, const ScriptTypeList *typeList, sal_Int16 unknownType=0)
Definition: unicode.cxx:55
static bool isSpace(const sal_uInt32 ch)
static sal_Int16 getScriptClassFromUScriptCode(UScriptCode eScript)
Definition: unicode.cxx:233
static sal_Unicode getUnicodeScriptStart(css::i18n::UnicodeScript type)
Definition: unicode.cxx:60
static sal_Int16 getScriptClassFromLanguageTag(const LanguageTag &rLanguageTag)
Map a LanguageTag's language ISO 639 code or script ISO 15924 code or language-script or locale to La...
Definition: unicode.cxx:403
static bool isAlpha(const sal_uInt32 ch)
static OString getExemplarLanguageForUScriptCode(UScriptCode eScript)
Definition: unicode.cxx:431
static sal_Int16 getUnicodeType(const sal_uInt32 ch)
Definition: unicode.cxx:70
static bool isControl(const sal_uInt32 ch)
static sal_Unicode getUnicodeScriptEnd(css::i18n::UnicodeScript type)
Definition: unicode.cxx:65
OUString aName
sal_uInt16 nPos
#define SAL_WARN(area, stream)
aBuf
int i
ScriptTypeList const typeList[]
sal_uInt32 toUInt32(std::u16string_view str, sal_Int16 radix=10)
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
css::i18n::UnicodeScript to
Definition: unicode.hxx:34
sal_Int16 value
Definition: unicode.hxx:35
unsigned char sal_uInt8
sal_uInt16 sal_Unicode
ResultType type
#define ALPHAMASK
Definition: unicode.cxx:203
#define SPACEMASK
Definition: unicode.cxx:207
#define IsType(func, mask)
Definition: unicode.cxx:216
#define CONTROLSPACE
#define bit(name)
Definition: unicode.cxx:195
#define CONTROLMASK
Definition: unicode.cxx:211
static T getScriptType(const sal_Unicode ch, const L *typeList, T unknownType)
Definition: unicode.cxx:41
const sal_Int8 UnicodeDirectionBlockValue[]
Definition: unicode_data.h:64
#define UnicodeScriptTypeFrom
Definition: unicode_data.h:668
const sal_Int8 UnicodeDirectionIndex[]
Definition: unicode_data.h:28
const sal_Unicode UnicodeScriptType[][2]
Definition: unicode_data.h:671
const sal_Int8 UnicodeDirectionValue[]
Definition: unicode_data.h:69
#define UnicodeScriptTypeTo
Definition: unicode_data.h:669
#define UnicodeDirectionNumberBlock
Definition: unicode_data.h:26