LibreOffice Module i18npool (master) 1
cclass_unicode_parser.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20
21#include <cclass_unicode.hxx>
22#include <unicode/uchar.h>
23#include <rtl/character.hxx>
24#include <rtl/math.hxx>
25#include <rtl/ustring.hxx>
26#include <com/sun/star/i18n/KParseTokens.hpp>
27#include <com/sun/star/i18n/KParseType.hpp>
28#include <com/sun/star/i18n/LocaleData2.hpp>
29#include <com/sun/star/i18n/NativeNumberMode.hpp>
30#include <com/sun/star/i18n/NativeNumberSupplier.hpp>
31
32#include <string.h>
33#include <string_view>
34
35using namespace ::com::sun::star::uno;
36using namespace ::com::sun::star::i18n;
37using namespace ::com::sun::star::lang;
38
39#define TOKEN_DIGIT_FLAGS (ParserFlags::CHAR_VALUE | ParserFlags::VALUE | ParserFlags::VALUE_EXP | ParserFlags::VALUE_EXP_VALUE | ParserFlags::VALUE_DIGIT)
40
41namespace i18npool {
42
43// Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
44
47{
48// (...) == Calc formula compiler specific, commented out and modified
49
50 /* \0 */ ParserFlags::EXCLUDED,
59 /* 9 \t */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
61 /* 11 \v */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
85 /* 35 # */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD_SEP)
86 /* 36 $ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
87 /* 37 % */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::VALUE)
89 /* 39 ' */ ParserFlags::NAME_SEP,
94 /* 44 , */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
96 /* 46 . */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD | ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
98 //for ( i = 48; i < 58; i++ )
109 /* 58 : */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD)
114 /* 63 ? */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
115 /* 64 @ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
116 //for ( i = 65; i < 91; i++ )
143 /* 91 [ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
144 /* 92 \ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
145 /* 93 ] */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
148 /* 96 ` */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
149 //for ( i = 97; i < 123; i++ )
176 /* 123 { */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
177 /* 124 | */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
178 /* 125 } */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
179 /* 126 ~ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
180 /* 127 */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP // (ParserFlags::ILLEGAL // UNUSED)
181};
182
183
184const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] =
185{
186 /* \0 */ KParseTokens::ASC_OTHER,
187 KParseTokens::ASC_CONTROL,
188 KParseTokens::ASC_CONTROL,
189 KParseTokens::ASC_CONTROL,
190 KParseTokens::ASC_CONTROL,
191 KParseTokens::ASC_CONTROL,
192 KParseTokens::ASC_CONTROL,
193 KParseTokens::ASC_CONTROL,
194 KParseTokens::ASC_CONTROL,
195 /* 9 \t */ KParseTokens::ASC_CONTROL,
196 KParseTokens::ASC_CONTROL,
197 /* 11 \v */ KParseTokens::ASC_CONTROL,
198 KParseTokens::ASC_CONTROL,
199 KParseTokens::ASC_CONTROL,
200 KParseTokens::ASC_CONTROL,
201 KParseTokens::ASC_CONTROL,
202 KParseTokens::ASC_CONTROL,
203 KParseTokens::ASC_CONTROL,
204 KParseTokens::ASC_CONTROL,
205 KParseTokens::ASC_CONTROL,
206 KParseTokens::ASC_CONTROL,
207 KParseTokens::ASC_CONTROL,
208 KParseTokens::ASC_CONTROL,
209 KParseTokens::ASC_CONTROL,
210 KParseTokens::ASC_CONTROL,
211 KParseTokens::ASC_CONTROL,
212 KParseTokens::ASC_CONTROL,
213 KParseTokens::ASC_CONTROL,
214 KParseTokens::ASC_CONTROL,
215 KParseTokens::ASC_CONTROL,
216 KParseTokens::ASC_CONTROL,
217 KParseTokens::ASC_CONTROL,
218 /* 32 */ KParseTokens::ASC_OTHER,
219 /* 33 ! */ KParseTokens::ASC_OTHER,
220 /* 34 " */ KParseTokens::ASC_OTHER,
221 /* 35 # */ KParseTokens::ASC_OTHER,
222 /* 36 $ */ KParseTokens::ASC_DOLLAR,
223 /* 37 % */ KParseTokens::ASC_OTHER,
224 /* 38 & */ KParseTokens::ASC_OTHER,
225 /* 39 ' */ KParseTokens::ASC_OTHER,
226 /* 40 ( */ KParseTokens::ASC_OTHER,
227 /* 41 ) */ KParseTokens::ASC_OTHER,
228 /* 42 * */ KParseTokens::ASC_OTHER,
229 /* 43 + */ KParseTokens::ASC_OTHER,
230 /* 44 , */ KParseTokens::ASC_OTHER,
231 /* 45 - */ KParseTokens::ASC_OTHER,
232 /* 46 . */ KParseTokens::ASC_DOT,
233 /* 47 / */ KParseTokens::ASC_OTHER,
234 //for ( i = 48; i < 58; i++ )
235 /* 48 0 */ KParseTokens::ASC_DIGIT,
236 /* 49 1 */ KParseTokens::ASC_DIGIT,
237 /* 50 2 */ KParseTokens::ASC_DIGIT,
238 /* 51 3 */ KParseTokens::ASC_DIGIT,
239 /* 52 4 */ KParseTokens::ASC_DIGIT,
240 /* 53 5 */ KParseTokens::ASC_DIGIT,
241 /* 54 6 */ KParseTokens::ASC_DIGIT,
242 /* 55 7 */ KParseTokens::ASC_DIGIT,
243 /* 56 8 */ KParseTokens::ASC_DIGIT,
244 /* 57 9 */ KParseTokens::ASC_DIGIT,
245 /* 58 : */ KParseTokens::ASC_COLON,
246 /* 59 ; */ KParseTokens::ASC_OTHER,
247 /* 60 < */ KParseTokens::ASC_OTHER,
248 /* 61 = */ KParseTokens::ASC_OTHER,
249 /* 62 > */ KParseTokens::ASC_OTHER,
250 /* 63 ? */ KParseTokens::ASC_OTHER,
251 /* 64 @ */ KParseTokens::ASC_OTHER,
252 //for ( i = 65; i < 91; i++ )
253 /* 65 A */ KParseTokens::ASC_UPALPHA,
254 /* 66 B */ KParseTokens::ASC_UPALPHA,
255 /* 67 C */ KParseTokens::ASC_UPALPHA,
256 /* 68 D */ KParseTokens::ASC_UPALPHA,
257 /* 69 E */ KParseTokens::ASC_UPALPHA,
258 /* 70 F */ KParseTokens::ASC_UPALPHA,
259 /* 71 G */ KParseTokens::ASC_UPALPHA,
260 /* 72 H */ KParseTokens::ASC_UPALPHA,
261 /* 73 I */ KParseTokens::ASC_UPALPHA,
262 /* 74 J */ KParseTokens::ASC_UPALPHA,
263 /* 75 K */ KParseTokens::ASC_UPALPHA,
264 /* 76 L */ KParseTokens::ASC_UPALPHA,
265 /* 77 M */ KParseTokens::ASC_UPALPHA,
266 /* 78 N */ KParseTokens::ASC_UPALPHA,
267 /* 79 O */ KParseTokens::ASC_UPALPHA,
268 /* 80 P */ KParseTokens::ASC_UPALPHA,
269 /* 81 Q */ KParseTokens::ASC_UPALPHA,
270 /* 82 R */ KParseTokens::ASC_UPALPHA,
271 /* 83 S */ KParseTokens::ASC_UPALPHA,
272 /* 84 T */ KParseTokens::ASC_UPALPHA,
273 /* 85 U */ KParseTokens::ASC_UPALPHA,
274 /* 86 V */ KParseTokens::ASC_UPALPHA,
275 /* 87 W */ KParseTokens::ASC_UPALPHA,
276 /* 88 X */ KParseTokens::ASC_UPALPHA,
277 /* 89 Y */ KParseTokens::ASC_UPALPHA,
278 /* 90 Z */ KParseTokens::ASC_UPALPHA,
279 /* 91 [ */ KParseTokens::ASC_OTHER,
280 /* 92 \ */ KParseTokens::ASC_OTHER,
281 /* 93 ] */ KParseTokens::ASC_OTHER,
282 /* 94 ^ */ KParseTokens::ASC_OTHER,
283 /* 95 _ */ KParseTokens::ASC_UNDERSCORE,
284 /* 96 ` */ KParseTokens::ASC_OTHER,
285 //for ( i = 97; i < 123; i++ )
286 /* 97 a */ KParseTokens::ASC_LOALPHA,
287 /* 98 b */ KParseTokens::ASC_LOALPHA,
288 /* 99 c */ KParseTokens::ASC_LOALPHA,
289 /* 100 d */ KParseTokens::ASC_LOALPHA,
290 /* 101 e */ KParseTokens::ASC_LOALPHA,
291 /* 102 f */ KParseTokens::ASC_LOALPHA,
292 /* 103 g */ KParseTokens::ASC_LOALPHA,
293 /* 104 h */ KParseTokens::ASC_LOALPHA,
294 /* 105 i */ KParseTokens::ASC_LOALPHA,
295 /* 106 j */ KParseTokens::ASC_LOALPHA,
296 /* 107 k */ KParseTokens::ASC_LOALPHA,
297 /* 108 l */ KParseTokens::ASC_LOALPHA,
298 /* 109 m */ KParseTokens::ASC_LOALPHA,
299 /* 110 n */ KParseTokens::ASC_LOALPHA,
300 /* 111 o */ KParseTokens::ASC_LOALPHA,
301 /* 112 p */ KParseTokens::ASC_LOALPHA,
302 /* 113 q */ KParseTokens::ASC_LOALPHA,
303 /* 114 r */ KParseTokens::ASC_LOALPHA,
304 /* 115 s */ KParseTokens::ASC_LOALPHA,
305 /* 116 t */ KParseTokens::ASC_LOALPHA,
306 /* 117 u */ KParseTokens::ASC_LOALPHA,
307 /* 118 v */ KParseTokens::ASC_LOALPHA,
308 /* 119 w */ KParseTokens::ASC_LOALPHA,
309 /* 120 x */ KParseTokens::ASC_LOALPHA,
310 /* 121 y */ KParseTokens::ASC_LOALPHA,
311 /* 122 z */ KParseTokens::ASC_LOALPHA,
312 /* 123 { */ KParseTokens::ASC_OTHER,
313 /* 124 | */ KParseTokens::ASC_OTHER,
314 /* 125 } */ KParseTokens::ASC_OTHER,
315 /* 126 ~ */ KParseTokens::ASC_OTHER,
316 /* 127 */ KParseTokens::ASC_OTHER
317};
318
319
320// static
321const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_uInt32 c )
322{
323 if ( !pStr )
324 return nullptr;
325 sal_Unicode cs[2];
326 auto const n = rtl::splitSurrogates(c, cs);
327 while ( *pStr )
328 {
329 if ( *pStr == cs[0] && (n == 1 || pStr[1] == cs[1]) )
330 return pStr;
331 pStr++;
332 }
333 return nullptr;
334}
335
336
337sal_Int32 cclass_Unicode::getParseTokensType(sal_uInt32 const c, bool const isFirst)
338{
339 if ( c < nDefCnt )
340 return pParseTokensType[ sal_uInt8(c) ];
341 else
342 {
343
345 switch (u_charType(c))
346 {
347 case U_UPPERCASE_LETTER :
348 return KParseTokens::UNI_UPALPHA;
349 case U_LOWERCASE_LETTER :
350 return KParseTokens::UNI_LOALPHA;
351 case U_TITLECASE_LETTER :
352 return KParseTokens::UNI_TITLE_ALPHA;
353 case U_MODIFIER_LETTER :
354 return KParseTokens::UNI_MODIFIER_LETTER;
355 case U_OTHER_LETTER :
356 // Non_Spacing_Mark could not be as leading character
357 if (isFirst) break;
358 [[fallthrough]]; // treat it as Other_Letter.
359 case U_NON_SPACING_MARK :
360 return KParseTokens::UNI_OTHER_LETTER;
361 case U_DECIMAL_DIGIT_NUMBER :
362 return KParseTokens::UNI_DIGIT;
363 case U_LETTER_NUMBER :
364 return KParseTokens::UNI_LETTER_NUMBER;
365 case U_OTHER_NUMBER :
366 return KParseTokens::UNI_OTHER_NUMBER;
367 }
368
369 return KParseTokens::UNI_OTHER;
370 }
371}
372
373void cclass_Unicode::setupInternational( const Locale& rLocale )
374{
375 bool bChanged = (aParserLocale.Language != rLocale.Language
376 || aParserLocale.Country != rLocale.Country
377 || aParserLocale.Variant != rLocale.Variant);
378 if ( bChanged )
379 {
380 aParserLocale.Language = rLocale.Language;
381 aParserLocale.Country = rLocale.Country;
382 aParserLocale.Variant = rLocale.Variant;
383 }
384 if ( !mxLocaleData.is() )
385 {
386 mxLocaleData.set( LocaleData2::create(m_xContext) );
387 }
388}
389
390
391void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
392 const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
393 const OUString& userDefinedCharactersCont )
394{
395 bool bIntlEqual = (rLocale.Language == aParserLocale.Language &&
396 rLocale.Country == aParserLocale.Country &&
397 rLocale.Variant == aParserLocale.Variant);
398 if ( !pTable || !bIntlEqual ||
399 startCharTokenType != nStartTypes ||
400 contCharTokenType != nContTypes ||
401 userDefinedCharactersStart != aStartChars ||
402 userDefinedCharactersCont != aContChars )
403 initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
404 contCharTokenType, userDefinedCharactersCont );
405}
406
407
408void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
409 const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
410 const OUString& userDefinedCharactersCont )
411{
412 // (Re)Init
413 setupInternational( rLocale );
414 // Memory of pTable is reused.
415 if ( !pTable )
416 pTable.reset(new ParserFlags[nDefCnt]);
417 memcpy( pTable.get(), pDefaultParserTable, sizeof(ParserFlags) * nDefCnt );
418 // Start and cont tables only need reallocation if different length.
419 if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() )
420 {
421 pStart.reset();
422 }
423 if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() )
424 {
425 pCont.reset();
426 }
427 nStartTypes = startCharTokenType;
428 nContTypes = contCharTokenType;
429 aStartChars = userDefinedCharactersStart;
430 aContChars = userDefinedCharactersCont;
431
432 // specials
433 if( mxLocaleData.is() )
434 {
435 LocaleDataItem2 aItem =
436 mxLocaleData->getLocaleItem2( aParserLocale );
439 cGroupSep = aItem.thousandSeparator[0];
440 cDecimalSep = aItem.decimalSeparator[0];
441 cDecimalSepAlt = aItem.decimalSeparatorAlternative.toChar();
442 }
443
444 if (nContTypes & KParseTokens::GROUP_SEPARATOR_IN_NUMBER)
445 {
446 if ( cGroupSep < nDefCnt )
448 }
449 else
450 {
451 cGroupSep = 0;
452 }
453 if ( cDecimalSep < nDefCnt )
457
458 // Modify characters according to KParseTokens definitions.
459 {
460 using namespace KParseTokens;
461 sal_uInt8 i;
462
463 if ( !(nStartTypes & ASC_UPALPHA) )
464 for ( i = 65; i < 91; i++ )
465 pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
466 if ( !(nContTypes & ASC_UPALPHA) )
467 for ( i = 65; i < 91; i++ )
468 pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
469
470 if ( !(nStartTypes & ASC_LOALPHA) )
471 for ( i = 97; i < 123; i++ )
472 pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
473 if ( !(nContTypes & ASC_LOALPHA) )
474 for ( i = 97; i < 123; i++ )
475 pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
476
477 if ( nStartTypes & ASC_DIGIT )
478 for ( i = 48; i < 58; i++ )
479 pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
480 if ( !(nContTypes & ASC_DIGIT) )
481 for ( i = 48; i < 58; i++ )
482 pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
483
484 if ( !(nStartTypes & ASC_UNDERSCORE) )
485 pTable[95] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
486 if ( !(nContTypes & ASC_UNDERSCORE) )
487 pTable[95] &= ~ParserFlags::WORD; // not allowed as cont character
488
489 if ( nStartTypes & ASC_DOLLAR )
490 pTable[36] |= ParserFlags::CHAR_WORD; // allowed as start character
491 if ( nContTypes & ASC_DOLLAR )
492 pTable[36] |= ParserFlags::WORD; // allowed as cont character
493
494 if ( nStartTypes & ASC_DOT )
495 pTable[46] |= ParserFlags::CHAR_WORD; // allowed as start character
496 if ( nContTypes & ASC_DOT )
497 pTable[46] |= ParserFlags::WORD; // allowed as cont character
498
499 if ( nStartTypes & ASC_COLON )
500 pTable[58] |= ParserFlags::CHAR_WORD; // allowed as start character
501 if ( nContTypes & ASC_COLON )
502 pTable[58] |= ParserFlags::WORD; // allowed as cont character
503
504 if ( nStartTypes & ASC_CONTROL )
505 for ( i = 1; i < 32; i++ )
506 pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
507 if ( nContTypes & ASC_CONTROL )
508 for ( i = 1; i < 32; i++ )
509 pTable[i] |= ParserFlags::WORD; // allowed as cont character
510
511 if ( nStartTypes & ASC_ANY_BUT_CONTROL )
512 for ( i = 32; i < nDefCnt; i++ )
513 pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
514 if ( nContTypes & ASC_ANY_BUT_CONTROL )
515 for ( i = 32; i < nDefCnt; i++ )
516 pTable[i] |= ParserFlags::WORD; // allowed as cont character
517
518 }
519
520 // Merge in (positively override with) user defined characters.
521 // StartChars
522 sal_Int32 nLen = aStartChars.getLength();
523 if ( nLen )
524 {
525 if ( !pStart )
526 pStart.reset(new ParserFlags[ nLen ]);
527 const sal_Unicode* p = aStartChars.getStr();
528 for ( sal_Int32 j=0; j<nLen; j++, p++ )
529 {
531 if ( *p < nDefCnt )
533 }
534 }
535 // ContChars
536 nLen = aContChars.getLength();
537 if ( nLen )
538 {
539 if ( !pCont )
540 pCont.reset(new ParserFlags[ nLen ]);
541 const sal_Unicode* p = aContChars.getStr();
542 for ( sal_Int32 j=0; j<nLen; j++ )
543 {
545 if ( *p < nDefCnt )
547 }
548 }
549}
550
551
553{
554 pCont.reset();
555 pStart.reset();
556 pTable.reset();
557}
558
559
561{
562 ParserFlags nMask;
563 if ( c < nDefCnt )
564 nMask = pTable[ sal_uInt8(c) ];
565 else
566 nMask = getFlagsExtended(c, eState);
567 switch ( eState )
568 {
569 case ssGetChar :
570 case ssRewindFromValue :
572 case ssGetWordFirstChar :
573 if ( !(nMask & ParserFlags::CHAR_WORD) )
574 {
575 nMask |= getStartCharsFlags( c );
576 if ( nMask & ParserFlags::CHAR_WORD )
577 nMask &= ~ParserFlags::EXCLUDED;
578 }
579 break;
580 case ssGetValue :
581 case ssGetWord :
582 if ( !(nMask & ParserFlags::WORD) )
583 {
584 nMask |= getContCharsFlags( c );
585 if ( nMask & ParserFlags::WORD )
586 nMask &= ~ParserFlags::EXCLUDED;
587 }
588 break;
589 default:
590 ; // other cases aren't needed, no compiler warning
591 }
592 return nMask;
593}
594
595
597{
598 if ( c == cGroupSep )
599 return ParserFlags::VALUE;
600 else if ( c == cDecimalSep )
602 else if ( cDecimalSepAlt && c == cDecimalSepAlt )
604 bool bStart = (eState == ssGetChar || eState == ssGetWordFirstChar ||
605 eState == ssRewindFromValue || eState == ssIgnoreLeadingInRewind);
606 sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes);
607
609 switch (u_charType(c))
610 {
611 case U_UPPERCASE_LETTER :
612 return (nTypes & KParseTokens::UNI_UPALPHA) ?
615 case U_LOWERCASE_LETTER :
616 return (nTypes & KParseTokens::UNI_LOALPHA) ?
619 case U_TITLECASE_LETTER :
620 return (nTypes & KParseTokens::UNI_TITLE_ALPHA) ?
623 case U_MODIFIER_LETTER :
624 return (nTypes & KParseTokens::UNI_MODIFIER_LETTER) ?
627 case U_NON_SPACING_MARK :
628 case U_COMBINING_SPACING_MARK :
629 // Non_Spacing_Mark can't be a leading character,
630 // nor can a spacing combining mark.
631 if (bStart)
633 [[fallthrough]]; // treat it as Other_Letter.
634 case U_OTHER_LETTER :
635 return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
638 case U_DECIMAL_DIGIT_NUMBER :
639 return ((nTypes & KParseTokens::UNI_DIGIT) ?
642 case U_LETTER_NUMBER :
643 return ((nTypes & KParseTokens::UNI_LETTER_NUMBER) ?
646 case U_OTHER_NUMBER :
647 return ((nTypes & KParseTokens::UNI_OTHER_NUMBER) ?
650 case U_SPACE_SEPARATOR :
651 return ((nTypes & KParseTokens::IGNORE_LEADING_WS) ?
653 case U_OTHER_PUNCTUATION:
654 // fdo#61754 Lets see (if we not at the start) if this is midletter
655 // punctuation and allow it in a word if it is similarly to
656 // U_NON_SPACING_MARK, for example U+00B7 MIDDLE DOT.
657 // tdf#123575 for U+30FB KATAKANA MIDDLE DOT property is not
658 // U_WB_MIDLETTER but U_WB_KATAKANA instead, explicitly test that
659 // and U+FF65 HALFWIDTH KATAKANA MIDDLE DOT.
660 if (bStart || (U_WB_MIDLETTER != u_getIntPropertyValue(c, UCHAR_WORD_BREAK)
661 && c != 0x30FB && c != 0xFF65))
663 else
664 {
665 //allowing it to continue the word
666 return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
668 }
669 break;
670 }
671
673}
674
675
677{
678 if ( pStart )
679 {
680 const sal_Unicode* pStr = aStartChars.getStr();
681 const sal_Unicode* p = StrChr( pStr, c );
682 if ( p )
683 return pStart[ p - pStr ];
684 }
686}
687
688
690{
691 if ( pCont )
692 {
693 const sal_Unicode* pStr = aContChars.getStr();
694 const sal_Unicode* p = StrChr( pStr, c );
695 if ( p )
696 return pCont[ p - pStr ];
697 }
699}
700
701
702void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType )
703{
704 assert(r.LeadingWhiteSpace == 0);
705 ScanState eState = ssGetChar;
706
708 OUStringBuffer aSymbol;
709 bool isFirst(true);
710 sal_Int32 index(nPos); // index of next code point after current
711 sal_Int32 postSymbolIndex(index); // index of code point following last quote
712 sal_uInt32 current((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
713 sal_uInt32 cLast = 0;
714 sal_Int32 nCodePoints(0);
715 int nDecSeps = 0;
716 bool bQuote = false;
717 bool bMightBeWord = true;
718 bool bMightBeWordLast = true;
719 bool bDecSepAltUsed = false;
721 sal_Int32 nextCharIndex(nPos); // == index of nextChar
722
723 while ((current != 0) && (eState != ssStop))
724 {
725 ++nCodePoints;
726 ParserFlags nMask = getFlags(current, eState);
727 if ( nMask & ParserFlags::EXCLUDED )
728 eState = ssBounce;
729 if ( bMightBeWord )
730 { // only relevant for ssGetValue fall back
731 if ( eState == ssGetChar || eState == ssRewindFromValue ||
732 eState == ssIgnoreLeadingInRewind )
733 bMightBeWord = bool(nMask & ParserFlags::CHAR_WORD);
734 else
735 bMightBeWord = bool(nMask & ParserFlags::WORD);
736 }
737 sal_Int32 nParseTokensType = getParseTokensType(current, isFirst);
738 isFirst = false;
739 sal_Int32 const nextIndex(nextCharIndex); // == index of char following current
740 nextCharIndex = index; // == index of nextChar
741 sal_uInt32 nextChar((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
742 switch (eState)
743 {
744 case ssGetChar :
745 case ssRewindFromValue :
747 {
748 if ( (nMask & ParserFlags::CHAR_VALUE) && eState != ssRewindFromValue
749 && eState != ssIgnoreLeadingInRewind )
750 {
751 eState = ssGetValue;
752 if ( nMask & ParserFlags::VALUE_DIGIT )
753 {
754 if (128 <= current)
755 r.TokenType = KParseType::UNI_NUMBER;
756 else
757 r.TokenType = KParseType::ASC_NUMBER;
758 }
759 else if (current == cDecimalSep || (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt)))
760 {
761 if (nextChar)
762 ++nDecSeps;
763 else
764 eState = ssRewindFromValue;
765 // retry for ONE_SINGLE_CHAR or others
766 }
767 }
768 else if ( nMask & ParserFlags::CHAR_WORD )
769 {
770 eState = ssGetWord;
771 r.TokenType = KParseType::IDENTNAME;
772 }
773 else if ( nMask & ParserFlags::NAME_SEP )
774 {
775 eState = ssGetWordFirstChar;
776 bQuote = true;
777 postSymbolIndex = nextCharIndex;
778 nParseTokensType = 0; // will be taken of first real character
779 r.TokenType = KParseType::SINGLE_QUOTE_NAME;
780 }
781 else if ( nMask & ParserFlags::CHAR_STRING )
782 {
783 eState = ssGetString;
784 postSymbolIndex = nextCharIndex;
785 nParseTokensType = 0; // will be taken of first real character
786 r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
787 }
788 else if ( nMask & ParserFlags::CHAR_DONTCARE )
789 {
790 if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS )
791 {
792 if (eState == ssRewindFromValue)
794 r.LeadingWhiteSpace = nextCharIndex - nPos;
795 nCodePoints--; // exclude leading whitespace
796 postSymbolIndex = nextCharIndex;
797 nParseTokensType = 0; // wait until real character
798 bMightBeWord = true;
799 }
800 else
801 eState = ssBounce;
802 }
803 else if ( nMask & ParserFlags::CHAR_BOOL )
804 {
805 eState = ssGetBool;
806 r.TokenType = KParseType::BOOLEAN;
807 }
808 else if ( nMask & ParserFlags::CHAR )
809 {
810 eState = ssStop;
811 r.TokenType = KParseType::ONE_SINGLE_CHAR;
812 }
813 else
814 eState = ssBounce; // not known
815 }
816 break;
817 case ssGetValue :
818 {
819 if ( nMask & ParserFlags::VALUE_DIGIT )
820 {
821 if (128 <= current)
822 r.TokenType = KParseType::UNI_NUMBER;
823 else if ( r.TokenType != KParseType::UNI_NUMBER )
824 r.TokenType = KParseType::ASC_NUMBER;
825 }
826 if ( nMask & ParserFlags::VALUE )
827 {
828 if (current == cGroupSep)
829 {
830 // accept only if it is followed by 3 digits
831 sal_Int32 tempIndex(index);
832 sal_uInt32 const nextChar2((tempIndex < rText.getLength()) ? rText.iterateCodePoints(&tempIndex) : 0);
833 sal_uInt32 const nextChar3((tempIndex < rText.getLength()) ? rText.iterateCodePoints(&tempIndex) : 0);
834 if (getFlags(nextChar, eState) & ParserFlags::VALUE_DIGIT
835 && getFlags(nextChar2, eState) & ParserFlags::VALUE_DIGIT
836 && getFlags(nextChar3, eState) & ParserFlags::VALUE_DIGIT)
837 {
838 nParseTokensType |= KParseTokens::GROUP_SEPARATOR_IN_NUMBER;
839 }
840 else
841 {
842 // Trailing group separator character is not a
843 // group separator.
844 eState = ssStopBack;
845 }
846 }
847 else if ((current == cDecimalSep ||
848 (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt))) &&
849 ++nDecSeps > 1)
850 {
851 if (nCodePoints == 2)
852 eState = ssRewindFromValue;
853 // consecutive separators
854 else
855 eState = ssStopBack;
856 }
857 // else keep it going
858 }
859 else if (current == 'E' || current == 'e')
860 {
861 ParserFlags nNext = getFlags(nextChar, eState);
862 if ( nNext & ParserFlags::VALUE_EXP )
863 ; // keep it going
864 else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
865 { // might be a numerical name (1.2efg)
866 eState = ssGetWord;
867 r.TokenType = KParseType::IDENTNAME;
868 }
869 else
870 eState = ssStopBack;
871 }
872 else if ( nMask & ParserFlags::VALUE_SIGN )
873 {
874 if ( (cLast == 'E') || (cLast == 'e') )
875 {
876 ParserFlags nNext = getFlags(nextChar, eState);
877 if ( nNext & ParserFlags::VALUE_EXP_VALUE )
878 ; // keep it going
879 else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
880 { // might be a numerical name (1.2e+fg)
881 eState = ssGetWord;
882 r.TokenType = KParseType::IDENTNAME;
883 }
884 else
885 eState = ssStopBack;
886 }
887 else if ( bMightBeWord )
888 { // might be a numerical name (1.2+fg)
889 eState = ssGetWord;
890 r.TokenType = KParseType::IDENTNAME;
891 }
892 else
893 eState = ssStopBack;
894 }
895 else if ( bMightBeWord && (nMask & ParserFlags::WORD) )
896 { // might be a numerical name (1995.A1)
897 eState = ssGetWord;
898 r.TokenType = KParseType::IDENTNAME;
899 }
900 else
901 eState = ssStopBack;
902 }
903 break;
904 case ssGetWordFirstChar :
905 eState = ssGetWord;
906 [[fallthrough]];
907 case ssGetWord :
908 {
909 if ( nMask & ParserFlags::WORD )
910 ; // keep it going
911 else if ( nMask & ParserFlags::NAME_SEP )
912 {
913 if ( bQuote )
914 {
915 if ( cLast == '\\' )
916 { // escaped
917 aSymbol.append(
918 OUString::Concat(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2))
919 + OUString(&current, 1));
920 }
921 else
922 {
923 eState = ssStop;
924 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
925 }
926 postSymbolIndex = nextCharIndex;
927 }
928 else
929 eState = ssStopBack;
930 }
931 else if ( bQuote )
932 ; // keep it going
933 else
934 eState = ssStopBack;
935 }
936 break;
937 case ssGetString :
938 {
939 if ( nMask & ParserFlags::STRING_SEP )
940 {
941 if ( cLast == '\\' )
942 { // escaped
943 aSymbol.append(
944 rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2)
945 + OUString(&current, 1));
946 }
947 else if (current == nextChar &&
948 !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
949 { // "" => literal " escaped
950 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex));
951 nextCharIndex = index;
952 if (index < rText.getLength()) { ++nCodePoints; }
953 nextChar = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
954 }
955 else
956 {
957 eState = ssStop;
958 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
959 }
960 postSymbolIndex = nextCharIndex;
961 }
962 }
963 break;
964 case ssGetBool :
965 {
966 if ( nMask & ParserFlags::BOOL )
967 eState = ssStop; // maximum 2: <, >, <>, <=, >=
968 else
969 eState = ssStopBack;
970 }
971 break;
972 case ssStopBack :
973 case ssBounce :
974 case ssStop :
975 ; // nothing, no compiler warning
976 break;
977 }
978 if ( eState == ssRewindFromValue )
979 {
980 r = ParseResult();
981 index = nPos;
982 postSymbolIndex = nPos;
983 nextCharIndex = nPos;
984 aSymbol.setLength(0);
985 current = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
986 nCodePoints = (nPos < rText.getLength()) ? 1 : 0;
987 isFirst = true;
988 cLast = 0;
989 nDecSeps = 0;
990 bQuote = false;
991 bMightBeWord = true;
992 bMightBeWordLast = true;
993 bDecSepAltUsed = false;
994 }
995 else
996 {
997 if ( !(r.TokenType & nTokenType) )
998 {
999 if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
1000 && (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
1001 ; // keep a number that might be a word
1002 else if (r.LeadingWhiteSpace == (nextCharIndex - nPos))
1003 ; // keep ignored white space
1004 else if ( !r.TokenType && eState == ssGetValue && (nMask & ParserFlags::VALUE_SEP) )
1005 ; // keep uncertain value
1006 else
1007 eState = ssBounce;
1008 }
1009 if ( eState == ssBounce )
1010 {
1011 r.TokenType = 0;
1012 eState = ssStopBack;
1013 }
1014 if ( eState == ssStopBack )
1015 { // put back
1016 nextChar = rText.iterateCodePoints(&index, -1);
1017 nextCharIndex = nextIndex;
1018 --nCodePoints;
1019 bMightBeWord = bMightBeWordLast;
1020 eState = ssStop;
1021 }
1022 if ( eState != ssStop )
1023 {
1024 if ( !r.StartFlags )
1025 r.StartFlags |= nParseTokensType;
1026 else
1027 r.ContFlags |= nParseTokensType;
1028 }
1029 bMightBeWordLast = bMightBeWord;
1030 cLast = current;
1031 current = nextChar;
1032 }
1033 }
1034 // r.CharLen is the length in characters (not code units) of the parsed
1035 // token not including any leading white space.
1036 r.CharLen = nCodePoints;
1037 r.EndPos = nextCharIndex;
1038 if ( r.TokenType & KParseType::ASC_NUMBER )
1039 {
1040 r.Value = rtl_math_uStringToDouble(rText.getStr() + nPos + r.LeadingWhiteSpace,
1041 rText.getStr() + r.EndPos, (bDecSepAltUsed ? cDecimalSepAlt : cDecimalSep), cGroupSep, nullptr, nullptr);
1042 if ( bMightBeWord )
1043 r.TokenType |= KParseType::IDENTNAME;
1044 }
1045 else if ( r.TokenType & KParseType::UNI_NUMBER )
1046 {
1047 if ( !xNatNumSup.is() )
1048 {
1049 if ( m_xContext.is() )
1050 {
1051 xNatNumSup = NativeNumberSupplier::create( m_xContext );
1052 }
1053 }
1054 OUString aTmp(rText.getStr() + nPos + r.LeadingWhiteSpace,
1055 r.EndPos - nPos - r.LeadingWhiteSpace);
1056 // transliterate to ASCII
1057 aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale,
1058 NativeNumberMode::NATNUM0 );
1059 r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep );
1060 if ( bMightBeWord )
1061 r.TokenType |= KParseType::IDENTNAME;
1062 }
1063 else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
1064 {
1065 if (postSymbolIndex < nextCharIndex)
1066 {
1067 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
1068 r.TokenType |= KParseType::MISSING_QUOTE;
1069 }
1070 r.DequotedNameOrString = aSymbol.makeStringAndClear();
1071 }
1072}
1073
1074}
1075
1076/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
ParserFlags
Flag values of table.
#define TOKEN_DIGIT_FLAGS
css::lang::Locale aParserLocale
used for parser only
static const ParserFlags pDefaultParserTable[]
void parseText(css::i18n::ParseResult &r, const OUString &rText, sal_Int32 nPos, sal_Int32 nTokenType=0xffffffff)
Parse a text.
void setupInternational(const css::lang::Locale &rLocale)
Setup International class, new'ed only if different from existing.
static const sal_uInt8 nDefCnt
std::unique_ptr< ParserFlags[]> pTable
void setupParserTable(const css::lang::Locale &rLocale, sal_Int32 startCharTokenType, const OUString &userDefinedCharactersStart, sal_Int32 contCharTokenType, const OUString &userDefinedCharactersCont)
Setup parser table. Calls initParserTable() only if needed.
ParserFlags getStartCharsFlags(sal_uInt32 c)
Access parser table flags for user defined start characters.
css::uno::Reference< css::i18n::XNativeNumberSupplier > xNatNumSup
std::unique_ptr< ParserFlags[]> pCont
ParserFlags getFlagsExtended(sal_uInt32 c, ScanState eState) const
Access parser flags via International and special definitions.
void destroyParserTable()
Destroy parser table.
css::uno::Reference< css::uno::XComponentContext > m_xContext
ParserFlags getFlags(sal_uInt32 c, ScanState eState)
Access parser table flags.
std::unique_ptr< ParserFlags[]> pStart
static const sal_Unicode * StrChr(const sal_Unicode *pStr, sal_uInt32 c)
If and where c occurs in pStr.
static sal_Int32 getParseTokensType(sal_uInt32 c, bool isFirst)
Get corresponding KParseTokens flag for a character.
static const sal_Int32 pParseTokensType[]
css::uno::Reference< css::i18n::XLocaleData5 > mxLocaleData
void initParserTable(const css::lang::Locale &rLocale, sal_Int32 startCharTokenType, const OUString &userDefinedCharactersStart, sal_Int32 contCharTokenType, const OUString &userDefinedCharactersCont)
Init parser table.
ParserFlags getContCharsFlags(sal_Unicode c)
Access parser table flags for user defined continuation characters.
void * p
sal_Int64 n
sal_uInt16 nPos
unsigned short WORD
int i
Constant values shared between i18npool and, for example, the number formatter.
index
unsigned char sal_uInt8
sal_uInt16 sal_Unicode