LibreOffice Module i18npool (master)  1
cclass_unicode_parser.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 
21 #include <cclass_unicode.hxx>
22 #include <unicode/uchar.h>
23 #include <rtl/character.hxx>
24 #include <rtl/math.hxx>
25 #include <rtl/ustring.hxx>
26 #include <com/sun/star/i18n/KParseTokens.hpp>
27 #include <com/sun/star/i18n/KParseType.hpp>
28 #include <com/sun/star/i18n/LocaleData2.hpp>
29 #include <com/sun/star/i18n/NativeNumberMode.hpp>
30 #include <com/sun/star/i18n/NativeNumberSupplier.hpp>
31 
32 #include <string.h>
33 #include <string_view>
34 
35 using namespace ::com::sun::star::uno;
36 using namespace ::com::sun::star::i18n;
37 using namespace ::com::sun::star::lang;
38 
// Combined value/digit flag set. Further down in this file it is OR-ed into
// the result returned for the U_LETTER_NUMBER and U_OTHER_NUMBER categories.
39 #define TOKEN_DIGIT_FLAGS (ParserFlags::CHAR_VALUE | ParserFlags::VALUE | ParserFlags::VALUE_EXP | ParserFlags::VALUE_EXP_VALUE | ParserFlags::VALUE_DIGIT)
40 
41 namespace i18npool {
42 
43 // Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
44 
// NOTE(review): the declaration line that introduces this initializer list is
// not present in this listing, and the row comments jump (0, 9, 11, 35, ...),
// so most rows are missing from view. Presumably this is
// cclass_Unicode::pDefaultParserTable, which initParserTable() copies into
// pTable - confirm against the complete file.
// Each visible row holds the ParserFlags for one ASCII code (the code and
// character are given in the leading comment of the row).
47 {
48 // (...) == Calc formula compiler specific, commented out and modified
49 
50  /* \0 */ ParserFlags::EXCLUDED,
59  /* 9 \t */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
61  /* 11 \v */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL)
85  /* 35 # */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD_SEP)
86  /* 36 $ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
87  /* 37 % */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::VALUE)
89  /* 39 ' */ ParserFlags::NAME_SEP,
94  /* 44 , */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
96  /* 46 . */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD | ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
98  //for ( i = 48; i < 58; i++ )
109  /* 58 : */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::WORD)
114  /* 63 ? */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
115  /* 64 @ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
116  //for ( i = 65; i < 91; i++ )
143  /* 91 [ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
144  /* 92 \ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
145  /* 93 ] */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
148  /* 96 ` */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
149  //for ( i = 97; i < 123; i++ )
176  /* 123 { */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
177  /* 124 | */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
178  /* 125 } */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
179  /* 126 ~ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP, // (ParserFlags::ILLEGAL // UNUSED)
180  /* 127 */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP // (ParserFlags::ILLEGAL // UNUSED)
181 };
182 
183 
// KParseTokens classification for every code below nDefCnt, one entry per
// code in ascending order (codes 0..127 are listed). getParseTokensType()
// indexes this table directly with the character code, so the position of an
// entry is significant. Codes 1..31 are ASC_CONTROL; code 0 and most
// punctuation are ASC_OTHER; '$', '.', ':', '_', digits and upper/lower case
// letters each have a dedicated token bit.
184 const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] =
185 {
186  /* \0 */ KParseTokens::ASC_OTHER,
187  KParseTokens::ASC_CONTROL,
188  KParseTokens::ASC_CONTROL,
189  KParseTokens::ASC_CONTROL,
190  KParseTokens::ASC_CONTROL,
191  KParseTokens::ASC_CONTROL,
192  KParseTokens::ASC_CONTROL,
193  KParseTokens::ASC_CONTROL,
194  KParseTokens::ASC_CONTROL,
195  /* 9 \t */ KParseTokens::ASC_CONTROL,
196  KParseTokens::ASC_CONTROL,
197  /* 11 \v */ KParseTokens::ASC_CONTROL,
198  KParseTokens::ASC_CONTROL,
199  KParseTokens::ASC_CONTROL,
200  KParseTokens::ASC_CONTROL,
201  KParseTokens::ASC_CONTROL,
202  KParseTokens::ASC_CONTROL,
203  KParseTokens::ASC_CONTROL,
204  KParseTokens::ASC_CONTROL,
205  KParseTokens::ASC_CONTROL,
206  KParseTokens::ASC_CONTROL,
207  KParseTokens::ASC_CONTROL,
208  KParseTokens::ASC_CONTROL,
209  KParseTokens::ASC_CONTROL,
210  KParseTokens::ASC_CONTROL,
211  KParseTokens::ASC_CONTROL,
212  KParseTokens::ASC_CONTROL,
213  KParseTokens::ASC_CONTROL,
214  KParseTokens::ASC_CONTROL,
215  KParseTokens::ASC_CONTROL,
216  KParseTokens::ASC_CONTROL,
217  KParseTokens::ASC_CONTROL,
218  /* 32 */ KParseTokens::ASC_OTHER,
219  /* 33 ! */ KParseTokens::ASC_OTHER,
220  /* 34 " */ KParseTokens::ASC_OTHER,
221  /* 35 # */ KParseTokens::ASC_OTHER,
222  /* 36 $ */ KParseTokens::ASC_DOLLAR,
223  /* 37 % */ KParseTokens::ASC_OTHER,
224  /* 38 & */ KParseTokens::ASC_OTHER,
225  /* 39 ' */ KParseTokens::ASC_OTHER,
226  /* 40 ( */ KParseTokens::ASC_OTHER,
227  /* 41 ) */ KParseTokens::ASC_OTHER,
228  /* 42 * */ KParseTokens::ASC_OTHER,
229  /* 43 + */ KParseTokens::ASC_OTHER,
230  /* 44 , */ KParseTokens::ASC_OTHER,
231  /* 45 - */ KParseTokens::ASC_OTHER,
232  /* 46 . */ KParseTokens::ASC_DOT,
233  /* 47 / */ KParseTokens::ASC_OTHER,
234  //for ( i = 48; i < 58; i++ )
235  /* 48 0 */ KParseTokens::ASC_DIGIT,
236  /* 49 1 */ KParseTokens::ASC_DIGIT,
237  /* 50 2 */ KParseTokens::ASC_DIGIT,
238  /* 51 3 */ KParseTokens::ASC_DIGIT,
239  /* 52 4 */ KParseTokens::ASC_DIGIT,
240  /* 53 5 */ KParseTokens::ASC_DIGIT,
241  /* 54 6 */ KParseTokens::ASC_DIGIT,
242  /* 55 7 */ KParseTokens::ASC_DIGIT,
243  /* 56 8 */ KParseTokens::ASC_DIGIT,
244  /* 57 9 */ KParseTokens::ASC_DIGIT,
245  /* 58 : */ KParseTokens::ASC_COLON,
246  /* 59 ; */ KParseTokens::ASC_OTHER,
247  /* 60 < */ KParseTokens::ASC_OTHER,
248  /* 61 = */ KParseTokens::ASC_OTHER,
249  /* 62 > */ KParseTokens::ASC_OTHER,
250  /* 63 ? */ KParseTokens::ASC_OTHER,
251  /* 64 @ */ KParseTokens::ASC_OTHER,
252  //for ( i = 65; i < 91; i++ )
253  /* 65 A */ KParseTokens::ASC_UPALPHA,
254  /* 66 B */ KParseTokens::ASC_UPALPHA,
255  /* 67 C */ KParseTokens::ASC_UPALPHA,
256  /* 68 D */ KParseTokens::ASC_UPALPHA,
257  /* 69 E */ KParseTokens::ASC_UPALPHA,
258  /* 70 F */ KParseTokens::ASC_UPALPHA,
259  /* 71 G */ KParseTokens::ASC_UPALPHA,
260  /* 72 H */ KParseTokens::ASC_UPALPHA,
261  /* 73 I */ KParseTokens::ASC_UPALPHA,
262  /* 74 J */ KParseTokens::ASC_UPALPHA,
263  /* 75 K */ KParseTokens::ASC_UPALPHA,
264  /* 76 L */ KParseTokens::ASC_UPALPHA,
265  /* 77 M */ KParseTokens::ASC_UPALPHA,
266  /* 78 N */ KParseTokens::ASC_UPALPHA,
267  /* 79 O */ KParseTokens::ASC_UPALPHA,
268  /* 80 P */ KParseTokens::ASC_UPALPHA,
269  /* 81 Q */ KParseTokens::ASC_UPALPHA,
270  /* 82 R */ KParseTokens::ASC_UPALPHA,
271  /* 83 S */ KParseTokens::ASC_UPALPHA,
272  /* 84 T */ KParseTokens::ASC_UPALPHA,
273  /* 85 U */ KParseTokens::ASC_UPALPHA,
274  /* 86 V */ KParseTokens::ASC_UPALPHA,
275  /* 87 W */ KParseTokens::ASC_UPALPHA,
276  /* 88 X */ KParseTokens::ASC_UPALPHA,
277  /* 89 Y */ KParseTokens::ASC_UPALPHA,
278  /* 90 Z */ KParseTokens::ASC_UPALPHA,
279  /* 91 [ */ KParseTokens::ASC_OTHER,
280  /* 92 \ */ KParseTokens::ASC_OTHER,
281  /* 93 ] */ KParseTokens::ASC_OTHER,
282  /* 94 ^ */ KParseTokens::ASC_OTHER,
283  /* 95 _ */ KParseTokens::ASC_UNDERSCORE,
284  /* 96 ` */ KParseTokens::ASC_OTHER,
285  //for ( i = 97; i < 123; i++ )
286  /* 97 a */ KParseTokens::ASC_LOALPHA,
287  /* 98 b */ KParseTokens::ASC_LOALPHA,
288  /* 99 c */ KParseTokens::ASC_LOALPHA,
289  /* 100 d */ KParseTokens::ASC_LOALPHA,
290  /* 101 e */ KParseTokens::ASC_LOALPHA,
291  /* 102 f */ KParseTokens::ASC_LOALPHA,
292  /* 103 g */ KParseTokens::ASC_LOALPHA,
293  /* 104 h */ KParseTokens::ASC_LOALPHA,
294  /* 105 i */ KParseTokens::ASC_LOALPHA,
295  /* 106 j */ KParseTokens::ASC_LOALPHA,
296  /* 107 k */ KParseTokens::ASC_LOALPHA,
297  /* 108 l */ KParseTokens::ASC_LOALPHA,
298  /* 109 m */ KParseTokens::ASC_LOALPHA,
299  /* 110 n */ KParseTokens::ASC_LOALPHA,
300  /* 111 o */ KParseTokens::ASC_LOALPHA,
301  /* 112 p */ KParseTokens::ASC_LOALPHA,
302  /* 113 q */ KParseTokens::ASC_LOALPHA,
303  /* 114 r */ KParseTokens::ASC_LOALPHA,
304  /* 115 s */ KParseTokens::ASC_LOALPHA,
305  /* 116 t */ KParseTokens::ASC_LOALPHA,
306  /* 117 u */ KParseTokens::ASC_LOALPHA,
307  /* 118 v */ KParseTokens::ASC_LOALPHA,
308  /* 119 w */ KParseTokens::ASC_LOALPHA,
309  /* 120 x */ KParseTokens::ASC_LOALPHA,
310  /* 121 y */ KParseTokens::ASC_LOALPHA,
311  /* 122 z */ KParseTokens::ASC_LOALPHA,
312  /* 123 { */ KParseTokens::ASC_OTHER,
313  /* 124 | */ KParseTokens::ASC_OTHER,
314  /* 125 } */ KParseTokens::ASC_OTHER,
315  /* 126 ~ */ KParseTokens::ASC_OTHER,
316  /* 127 */ KParseTokens::ASC_OTHER
317 };
318 
319 
320 // static
321 const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_uInt32 c )
322 {
323  if ( !pStr )
324  return nullptr;
325  sal_Unicode cs[2];
326  auto const n = rtl::splitSurrogates(c, cs);
327  while ( *pStr )
328  {
329  if ( *pStr == cs[0] && (n == 1 || pStr[1] == cs[1]) )
330  return pStr;
331  pStr++;
332  }
333  return nullptr;
334 }
335 
336 
337 sal_Int32 cclass_Unicode::getParseTokensType(sal_uInt32 const c, bool const isFirst)
338 {
339  if ( c < nDefCnt )
340  return pParseTokensType[ sal_uInt8(c) ];
341  else
342  {
343 
345  switch (u_charType(c))
346  {
347  case U_UPPERCASE_LETTER :
348  return KParseTokens::UNI_UPALPHA;
349  case U_LOWERCASE_LETTER :
350  return KParseTokens::UNI_LOALPHA;
351  case U_TITLECASE_LETTER :
352  return KParseTokens::UNI_TITLE_ALPHA;
353  case U_MODIFIER_LETTER :
354  return KParseTokens::UNI_MODIFIER_LETTER;
355  case U_OTHER_LETTER :
356  // Non_Spacing_Mark could not be as leading character
357  if (isFirst) break;
358  [[fallthrough]]; // treat it as Other_Letter.
359  case U_NON_SPACING_MARK :
360  return KParseTokens::UNI_OTHER_LETTER;
361  case U_DECIMAL_DIGIT_NUMBER :
362  return KParseTokens::UNI_DIGIT;
363  case U_LETTER_NUMBER :
364  return KParseTokens::UNI_LETTER_NUMBER;
365  case U_OTHER_NUMBER :
366  return KParseTokens::UNI_OTHER_NUMBER;
367  }
368 
369  return KParseTokens::UNI_OTHER;
370  }
371 }
372 
373 void cclass_Unicode::setupInternational( const Locale& rLocale )
374 {
375  bool bChanged = (aParserLocale.Language != rLocale.Language
376  || aParserLocale.Country != rLocale.Country
377  || aParserLocale.Variant != rLocale.Variant);
378  if ( bChanged )
379  {
380  aParserLocale.Language = rLocale.Language;
381  aParserLocale.Country = rLocale.Country;
382  aParserLocale.Variant = rLocale.Variant;
383  }
384  if ( !mxLocaleData.is() )
385  {
386  mxLocaleData.set( LocaleData2::create(m_xContext) );
387  }
388 }
389 
390 
391 void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
392  const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
393  const OUString& userDefinedCharactersCont )
394 {
395  bool bIntlEqual = (rLocale.Language == aParserLocale.Language &&
396  rLocale.Country == aParserLocale.Country &&
397  rLocale.Variant == aParserLocale.Variant);
398  if ( !pTable || !bIntlEqual ||
399  startCharTokenType != nStartTypes ||
400  contCharTokenType != nContTypes ||
401  userDefinedCharactersStart != aStartChars ||
402  userDefinedCharactersCont != aContChars )
403  initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
404  contCharTokenType, userDefinedCharactersCont );
405 }
406 
407 
// (Re)build the parser tables for the given locale, token type masks and
// user-defined character sets.
//
// Steps, in order:
//  1. setupInternational() stores the locale and ensures mxLocaleData exists.
//  2. pTable is allocated if necessary and reset to pDefaultParserTable.
//  3. pStart / pCont are dropped if the length of the user-defined strings
//     changed, and the new settings are stored in nStartTypes, nContTypes,
//     aStartChars and aContChars.
//  4. The group and decimal separators are read from the locale data.
//  5. pTable entries of ASCII ranges are adjusted according to the
//     KParseTokens bits set in nStartTypes / nContTypes.
//  6. The user-defined start and continuation characters are merged in.
//
// NOTE(review): several statements of this function are not present in this
// listing (its embedded line numbering skips 437-438, 447, 454, 456, 530, 532
// and 544), so some `if` lines below appear without the statement they guard.
408 void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
409  const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
410  const OUString& userDefinedCharactersCont )
411 {
412  // (Re)Init
413  setupInternational( rLocale );
414  // Memory of pTable is reused.
415  if ( !pTable )
416  pTable.reset(new ParserFlags[nDefCnt]);
417  memcpy( pTable.get(), pDefaultParserTable, sizeof(ParserFlags) * nDefCnt );
418  // Start and cont tables only need reallocation if different length.
419  if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() )
420  {
421  pStart.reset();
422  }
423  if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() )
424  {
425  pCont.reset();
426  }
427  nStartTypes = startCharTokenType;
428  nContTypes = contCharTokenType;
429  aStartChars = userDefinedCharactersStart;
430  aContChars = userDefinedCharactersCont;
431 
432  // specials
433  if( mxLocaleData.is() )
434  {
435  LocaleDataItem2 aItem =
436  mxLocaleData->getLocaleItem2( aParserLocale );
 // Only the first code unit of the group and decimal separator is used.
439  cGroupSep = aItem.thousandSeparator[0];
440  cDecimalSep = aItem.decimalSeparator[0];
441  cDecimalSepAlt = aItem.decimalSeparatorAlternative.toChar();
442  }
443 
 // The group separator is only kept if the continuation mask asks for it;
 // otherwise it is cleared to 0.
444  if (nContTypes & KParseTokens::GROUP_SEPARATOR_IN_NUMBER)
445  {
446  if ( cGroupSep < nDefCnt )
448  }
449  else
450  {
451  cGroupSep = 0;
452  }
453  if ( cDecimalSep < nDefCnt )
455  if ( cDecimalSepAlt && cDecimalSepAlt < nDefCnt )
457 
458  // Modify characters according to KParseTokens definitions.
459  {
460  using namespace KParseTokens;
461  sal_uInt8 i;
462 
 // Upper case letters A-Z.
463  if ( !(nStartTypes & ASC_UPALPHA) )
464  for ( i = 65; i < 91; i++ )
465  pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
466  if ( !(nContTypes & ASC_UPALPHA) )
467  for ( i = 65; i < 91; i++ )
468  pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
469 
 // Lower case letters a-z.
470  if ( !(nStartTypes & ASC_LOALPHA) )
471  for ( i = 97; i < 123; i++ )
472  pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
473  if ( !(nContTypes & ASC_LOALPHA) )
474  for ( i = 97; i < 123; i++ )
475  pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
476 
 // Digits 0-9: the start flag is added on request, the cont flag is
 // removed unless requested.
477  if ( nStartTypes & ASC_DIGIT )
478  for ( i = 48; i < 58; i++ )
479  pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
480  if ( !(nContTypes & ASC_DIGIT) )
481  for ( i = 48; i < 58; i++ )
482  pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character
483 
 // 95 == '_'
484  if ( !(nStartTypes & ASC_UNDERSCORE) )
485  pTable[95] &= ~ParserFlags::CHAR_WORD; // not allowed as start character
486  if ( !(nContTypes & ASC_UNDERSCORE) )
487  pTable[95] &= ~ParserFlags::WORD; // not allowed as cont character
488 
 // 36 == '$'
489  if ( nStartTypes & ASC_DOLLAR )
490  pTable[36] |= ParserFlags::CHAR_WORD; // allowed as start character
491  if ( nContTypes & ASC_DOLLAR )
492  pTable[36] |= ParserFlags::WORD; // allowed as cont character
493 
 // 46 == '.'
494  if ( nStartTypes & ASC_DOT )
495  pTable[46] |= ParserFlags::CHAR_WORD; // allowed as start character
496  if ( nContTypes & ASC_DOT )
497  pTable[46] |= ParserFlags::WORD; // allowed as cont character
498 
 // 58 == ':'
499  if ( nStartTypes & ASC_COLON )
500  pTable[58] |= ParserFlags::CHAR_WORD; // allowed as start character
501  if ( nContTypes & ASC_COLON )
502  pTable[58] |= ParserFlags::WORD; // allowed as cont character
503 
 // Control characters 1..31 (code 0 is not touched).
504  if ( nStartTypes & ASC_CONTROL )
505  for ( i = 1; i < 32; i++ )
506  pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
507  if ( nContTypes & ASC_CONTROL )
508  for ( i = 1; i < 32; i++ )
509  pTable[i] |= ParserFlags::WORD; // allowed as cont character
510 
 // Everything from 32 up to (but excluding) nDefCnt.
511  if ( nStartTypes & ASC_ANY_BUT_CONTROL )
512  for ( i = 32; i < nDefCnt; i++ )
513  pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character
514  if ( nContTypes & ASC_ANY_BUT_CONTROL )
515  for ( i = 32; i < nDefCnt; i++ )
516  pTable[i] |= ParserFlags::WORD; // allowed as cont character
517 
518  }
519 
520  // Merge in (positively override with) user defined characters.
521  // StartChars
522  sal_Int32 nLen = aStartChars.getLength();
523  if ( nLen )
524  {
 // pStart gets one ParserFlags slot per code unit of aStartChars.
525  if ( !pStart )
526  pStart.reset(new ParserFlags[ nLen ]);
527  const sal_Unicode* p = aStartChars.getStr();
528  for ( sal_Int32 j=0; j<nLen; j++, p++ )
529  {
531  if ( *p < nDefCnt )
533  }
534  }
535  // ContChars
536  nLen = aContChars.getLength();
537  if ( nLen )
538  {
 // pCont gets one ParserFlags slot per code unit of aContChars.
539  if ( !pCont )
540  pCont.reset(new ParserFlags[ nLen ]);
541  const sal_Unicode* p = aContChars.getStr();
 // NOTE(review): unlike the StartChars loop above, the visible code never
 // advances p in this loop, so every iteration tests the first character
 // of aContChars - confirm whether a p++ is missing here.
542  for ( sal_Int32 j=0; j<nLen; j++ )
543  {
545  if ( *p < nDefCnt )
546  pTable[*p] |= ParserFlags::WORD;
547  }
548  }
549 }
550 
551 
// NOTE(review): the signature line of this function is not present in this
// listing; presumably this is cclass_Unicode::destroyParserTable() - confirm.
// Releases the three parser tables (continuation, start and main table).
553 {
554  pCont.reset();
555  pStart.reset();
556  pTable.reset();
557 }
558 
559 
// NOTE(review): the signature line of this function is not present in this
// listing; presumably this is cclass_Unicode::getFlags(sal_uInt32 c) - confirm.
//
// Returns the parser flags of code point c for the current parser state
// (eState). Codes below nDefCnt are read from pTable, everything else from
// getFlagsExtended(). If the character is not already flagged as a valid
// start (CHAR_WORD) or continuation (WORD) character for the current state,
// the flags of the user-defined start/continuation characters are merged in;
// if that makes it valid, the EXCLUDED flag is cleared.
561 {
562  ParserFlags nMask;
563  if ( c < nDefCnt )
564  nMask = pTable[ sal_uInt8(c) ];
565  else
566  nMask = getFlagsExtended(c);
567  switch ( eState )
568  {
 // States in which c is a candidate for a start character.
 // NOTE(review): one line (571) of this case list is missing in this listing.
569  case ssGetChar :
570  case ssRewindFromValue :
572  case ssGetWordFirstChar :
573  if ( !(nMask & ParserFlags::CHAR_WORD) )
574  {
575  nMask |= getStartCharsFlags( c );
576  if ( nMask & ParserFlags::CHAR_WORD )
577  nMask &= ~ParserFlags::EXCLUDED;
578  }
579  break;
 // States in which c is a candidate for a continuation character.
580  case ssGetValue :
581  case ssGetWord :
582  if ( !(nMask & ParserFlags::WORD) )
583  {
584  nMask |= getContCharsFlags( c );
585  if ( nMask & ParserFlags::WORD )
586  nMask &= ~ParserFlags::EXCLUDED;
587  }
588  break;
589  default:
590  ; // other cases aren't needed, no compiler warning
591  }
592  return nMask;
593 }
594 
595 
// NOTE(review): the signature line and many interior lines of this function
// are not present in this listing (the embedded numbering skips e.g. 601, 603,
// 605, 613-614, 617-618, ...); presumably this is
// cclass_Unicode::getFlagsExtended(sal_uInt32 c) - confirm. Most `return`
// expressions below are therefore cut off after the `?`.
//
// Visible behaviour: the locale separators are checked first (the group
// separator yields ParserFlags::VALUE). Otherwise the start or continuation
// token mask is selected depending on eState, and the result is derived from
// the ICU general category of c. Categories not listed yield
// ParserFlags::ILLEGAL.
597 {
598  if ( c == cGroupSep )
599  return ParserFlags::VALUE;
600  else if ( c == cDecimalSep )
602  else if ( cDecimalSepAlt && c == cDecimalSepAlt )
604  bool bStart = (eState == ssGetChar || eState == ssGetWordFirstChar ||
 // nTypes is the KParseTokens mask that applies at the current position.
606  sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes);
607 
609  switch (u_charType(c))
610  {
611  case U_UPPERCASE_LETTER :
612  return (nTypes & KParseTokens::UNI_UPALPHA) ?
615  case U_LOWERCASE_LETTER :
616  return (nTypes & KParseTokens::UNI_LOALPHA) ?
619  case U_TITLECASE_LETTER :
620  return (nTypes & KParseTokens::UNI_TITLE_ALPHA) ?
623  case U_MODIFIER_LETTER :
624  return (nTypes & KParseTokens::UNI_MODIFIER_LETTER) ?
627  case U_NON_SPACING_MARK :
628  case U_COMBINING_SPACING_MARK :
629  // Non_Spacing_Mark can't be a leading character,
630  // nor can a spacing combining mark.
631  if (bStart)
632  return ParserFlags::ILLEGAL;
633  [[fallthrough]]; // treat it as Other_Letter.
634  case U_OTHER_LETTER :
635  return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
638  case U_DECIMAL_DIGIT_NUMBER :
639  return ((nTypes & KParseTokens::UNI_DIGIT) ?
 // For letter numbers and other numbers the digit flags are always added
 // to whatever the mask test yields.
642  case U_LETTER_NUMBER :
643  return ((nTypes & KParseTokens::UNI_LETTER_NUMBER) ?
645  ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS;
646  case U_OTHER_NUMBER :
647  return ((nTypes & KParseTokens::UNI_OTHER_NUMBER) ?
649  ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS;
650  case U_SPACE_SEPARATOR :
651  return ((nTypes & KParseTokens::IGNORE_LEADING_WS) ?
653  case U_OTHER_PUNCTUATION:
654  // fdo#61754 Lets see (if we not at the start) if this is midletter
655  // punctuation and allow it in a word if it is similarly to
656  // U_NON_SPACING_MARK, for example U+00B7 MIDDLE DOT.
657  // tdf#123575 for U+30FB KATAKANA MIDDLE DOT property is not
658  // U_WB_MIDLETTER but U_WB_KATAKANA instead, explicitly test that
659  // and U+FF65 HALFWIDTH KATAKANA MIDDLE DOT.
660  if (bStart || (U_WB_MIDLETTER != u_getIntPropertyValue(c, UCHAR_WORD_BREAK)
661  && c != 0x30FB && c != 0xFF65))
662  return ParserFlags::ILLEGAL;
663  else
664  {
665  //allowing it to continue the word
666  return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
668  }
669  break;
670  }
671 
672  return ParserFlags::ILLEGAL;
673 }
674 
675 
// NOTE(review): the signature line of this function is not present in this
// listing; presumably this is cclass_Unicode::getStartCharsFlags() - confirm.
//
// Looks c up in the user-defined start characters (aStartChars) and returns
// the flags stored in pStart at the position where it was found. Returns
// ParserFlags::ILLEGAL if there is no pStart table or c is not contained.
677 {
678  if ( pStart )
679  {
680  const sal_Unicode* pStr = aStartChars.getStr();
681  const sal_Unicode* p = StrChr( pStr, c );
682  if ( p )
 // p - pStr is the code unit offset of the match, used as index into pStart.
683  return pStart[ p - pStr ];
684  }
685  return ParserFlags::ILLEGAL;
686 }
687 
688 
// NOTE(review): the signature line of this function is not present in this
// listing; presumably this is cclass_Unicode::getContCharsFlags() - confirm.
//
// Looks c up in the user-defined continuation characters (aContChars) and
// returns the flags stored in pCont at the position where it was found.
// Returns ParserFlags::ILLEGAL if there is no pCont table or c is not
// contained.
690 {
691  if ( pCont )
692  {
693  const sal_Unicode* pStr = aContChars.getStr();
694  const sal_Unicode* p = StrChr( pStr, c );
695  if ( p )
 // p - pStr is the code unit offset of the match, used as index into pCont.
696  return pCont[ p - pStr ];
697  }
698  return ParserFlags::ILLEGAL;
699 }
700 
701 
// Parse one token out of rText, starting at index nPos, and store the result
// in r.
//
// The text is consumed code point by code point. eState drives a state
// machine: the first character selects the kind of token (number, identifier,
// quoted name, quoted string, boolean operator, single character), the
// following states consume continuation characters until the token ends.
// ssStopBack ends the token before the current character (it is put back),
// ssBounce additionally discards the token type, ssRewindFromValue restarts
// scanning at nPos.
//
// @param r           result; r.LeadingWhiteSpace is asserted to be 0 on entry.
//                    On return TokenType, StartFlags, ContFlags, CharLen,
//                    EndPos, LeadingWhiteSpace and, depending on the token
//                    type, Value or DequotedNameOrString are set.
// @param rText       text to parse.
// @param nPos        index in rText at which parsing starts.
// @param nTokenType  mask of KParseType values the caller accepts; tokens of
//                    other types are bounced (see the check after the switch).
//
// NOTE(review): several lines of this function are not present in this
// listing (the embedded numbering skips 707, 720, 732, 746, 748-749, 764, 775,
// 783, 793 and 844), so a few conditions and statements appear incomplete.
702 void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType )
703 {
704  assert(r.LeadingWhiteSpace == 0);
705  eState = ssGetChar;
706 
 // aSymbol collects the dequoted content of quoted names and strings.
708  OUStringBuffer aSymbol;
709  bool isFirst(true);
710  sal_Int32 index(nPos); // index of next code point after current
711  sal_Int32 postSymbolIndex(index); // index of code point following last quote
712  sal_uInt32 current((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
713  sal_uInt32 cLast = 0;
714  sal_Int32 nCodePoints(0);
715  int nDecSeps = 0;
716  bool bQuote = false;
717  bool bMightBeWord = true;
718  bool bMightBeWordLast = true;
719  bool bDecSepAltUsed = false;
721  sal_Int32 nextCharIndex(nPos); // == index of nextChar
722 
 // Main loop: a code point value of 0 (end of text or an embedded NUL)
 // terminates parsing, as does reaching ssStop.
723  while ((current != 0) && (eState != ssStop))
724  {
725  ++nCodePoints;
726  ParserFlags nMask = getFlags(current);
727  if ( nMask & ParserFlags::EXCLUDED )
728  eState = ssBounce;
729  if ( bMightBeWord )
730  { // only relevant for ssGetValue fall back
731  if ( eState == ssGetChar || eState == ssRewindFromValue ||
733  bMightBeWord = bool(nMask & ParserFlags::CHAR_WORD);
734  else
735  bMightBeWord = bool(nMask & ParserFlags::WORD);
736  }
737  sal_Int32 nParseTokensType = getParseTokensType(current, isFirst);
738  isFirst = false;
 // Look ahead one code point; nextIndex remembers the previous look-ahead
 // position so that a put-back (ssStopBack) can restore it.
739  sal_Int32 const nextIndex(nextCharIndex); // == index of char following current
740  nextCharIndex = index; // == index of nextChar
741  sal_uInt32 nextChar((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0);
742  switch (eState)
743  {
 // First character of a token: decide what kind of token starts here.
744  case ssGetChar :
745  case ssRewindFromValue :
747  {
750  {
751  eState = ssGetValue;
752  if ( nMask & ParserFlags::VALUE_DIGIT )
753  {
 // Digits at or above 128 are non-ASCII and make this a Unicode number.
754  if (128 <= current)
755  r.TokenType = KParseType::UNI_NUMBER;
756  else
757  r.TokenType = KParseType::ASC_NUMBER;
758  }
759  else if (current == cDecimalSep || (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt)))
760  {
761  if (nextChar)
762  ++nDecSeps;
763  else
765  // retry for ONE_SINGLE_CHAR or others
766  }
767  }
768  else if ( nMask & ParserFlags::CHAR_WORD )
769  {
770  eState = ssGetWord;
771  r.TokenType = KParseType::IDENTNAME;
772  }
773  else if ( nMask & ParserFlags::NAME_SEP )
774  {
776  bQuote = true;
777  postSymbolIndex = nextCharIndex;
778  nParseTokensType = 0; // will be taken of first real character
779  r.TokenType = KParseType::SINGLE_QUOTE_NAME;
780  }
781  else if ( nMask & ParserFlags::CHAR_STRING )
782  {
784  postSymbolIndex = nextCharIndex;
785  nParseTokensType = 0; // will be taken of first real character
786  r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
787  }
788  else if ( nMask & ParserFlags::CHAR_DONTCARE )
789  {
790  if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS )
791  {
792  if (eState == ssRewindFromValue)
794  r.LeadingWhiteSpace = nextCharIndex - nPos;
795  nCodePoints--; // exclude leading whitespace
796  postSymbolIndex = nextCharIndex;
797  nParseTokensType = 0; // wait until real character
798  bMightBeWord = true;
799  }
800  else
801  eState = ssBounce;
802  }
803  else if ( nMask & ParserFlags::CHAR_BOOL )
804  {
805  eState = ssGetBool;
806  r.TokenType = KParseType::BOOLEAN;
807  }
808  else if ( nMask & ParserFlags::CHAR )
809  {
810  eState = ssStop;
811  r.TokenType = KParseType::ONE_SINGLE_CHAR;
812  }
813  else
814  eState = ssBounce; // not known
815  }
816  break;
 // Inside a number: digits, separators, exponent and sign handling, with
 // a fall back to an identifier if the text may still be a word.
817  case ssGetValue :
818  {
819  if ( nMask & ParserFlags::VALUE_DIGIT )
820  {
821  if (128 <= current)
822  r.TokenType = KParseType::UNI_NUMBER;
823  else if ( r.TokenType != KParseType::UNI_NUMBER )
824  r.TokenType = KParseType::ASC_NUMBER;
825  }
826  if ( nMask & ParserFlags::VALUE )
827  {
828  if (current == cGroupSep)
829  {
830  if (getFlags(nextChar) & ParserFlags::VALUE_DIGIT)
831  nParseTokensType |= KParseTokens::GROUP_SEPARATOR_IN_NUMBER;
832  else
833  {
834  // Trailing group separator character is not a
835  // group separator.
836  eState = ssStopBack;
837  }
838  }
839  else if ((current == cDecimalSep ||
840  (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt))) &&
841  ++nDecSeps > 1)
842  {
843  if (nCodePoints == 2)
845  // consecutive separators
846  else
847  eState = ssStopBack;
848  }
849  // else keep it going
850  }
851  else if (current == 'E' || current == 'e')
852  {
853  ParserFlags nNext = getFlags(nextChar);
854  if ( nNext & ParserFlags::VALUE_EXP )
855  ; // keep it going
856  else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
857  { // might be a numerical name (1.2efg)
858  eState = ssGetWord;
859  r.TokenType = KParseType::IDENTNAME;
860  }
861  else
862  eState = ssStopBack;
863  }
864  else if ( nMask & ParserFlags::VALUE_SIGN )
865  {
 // A sign is only part of the number directly after the exponent character.
866  if ( (cLast == 'E') || (cLast == 'e') )
867  {
868  ParserFlags nNext = getFlags(nextChar);
869  if ( nNext & ParserFlags::VALUE_EXP_VALUE )
870  ; // keep it going
871  else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar))
872  { // might be a numerical name (1.2e+fg)
873  eState = ssGetWord;
874  r.TokenType = KParseType::IDENTNAME;
875  }
876  else
877  eState = ssStopBack;
878  }
879  else if ( bMightBeWord )
880  { // might be a numerical name (1.2+fg)
881  eState = ssGetWord;
882  r.TokenType = KParseType::IDENTNAME;
883  }
884  else
885  eState = ssStopBack;
886  }
887  else if ( bMightBeWord && (nMask & ParserFlags::WORD) )
888  { // might be a numerical name (1995.A1)
889  eState = ssGetWord;
890  r.TokenType = KParseType::IDENTNAME;
891  }
892  else
893  eState = ssStopBack;
894  }
895  break;
896  case ssGetWordFirstChar :
897  eState = ssGetWord;
898  [[fallthrough]];
 // Inside an identifier or a single-quoted name (bQuote).
899  case ssGetWord :
900  {
901  if ( nMask & ParserFlags::WORD )
902  ; // keep it going
903  else if ( nMask & ParserFlags::NAME_SEP )
904  {
905  if ( bQuote )
906  {
907  if ( cLast == '\\' )
908  { // escaped
 // Append the text up to, but excluding, the backslash, then the quote itself.
909  aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2));
910  aSymbol.append(OUString(&current, 1));
911  }
912  else
913  {
914  eState = ssStop;
915  aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
916  }
917  postSymbolIndex = nextCharIndex;
918  }
919  else
920  eState = ssStopBack;
921  }
922  else if ( bQuote )
923  ; // keep it going
924  else
925  eState = ssStopBack;
926  }
927  break;
 // Inside a double-quoted string.
928  case ssGetString :
929  {
930  if ( nMask & ParserFlags::STRING_SEP )
931  {
932  if ( cLast == '\\' )
933  { // escaped
934  aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2));
935  aSymbol.append(OUString(&current, 1));
936  }
937  else if (current == nextChar &&
938  !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
939  { // "" => literal " escaped
 // Keep one quote in aSymbol and skip the second one by advancing the look-ahead.
940  aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex));
941  nextCharIndex = index;
942  if (index < rText.getLength()) { ++nCodePoints; }
943  nextChar = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
944  }
945  else
946  {
947  eState = ssStop;
948  aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
949  }
950  postSymbolIndex = nextCharIndex;
951  }
952  }
953  break;
954  case ssGetBool :
955  {
956  if ( nMask & ParserFlags::BOOL )
957  eState = ssStop; // maximum 2: <, >, <>, <=, >=
958  else
959  eState = ssStopBack;
960  }
961  break;
962  case ssStopBack :
963  case ssBounce :
964  case ssStop :
965  ; // nothing, no compiler warning
966  break;
967  }
 // Rewind: discard everything collected so far and restart at nPos.
968  if ( eState == ssRewindFromValue )
969  {
970  r = ParseResult();
971  index = nPos;
972  postSymbolIndex = nPos;
973  nextCharIndex = nPos;
974  aSymbol.setLength(0);
975  current = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0;
976  nCodePoints = (nPos < rText.getLength()) ? 1 : 0;
977  isFirst = true;
978  cLast = 0;
979  nDecSeps = 0;
980  bQuote = false;
981  bMightBeWord = true;
982  bMightBeWordLast = true;
983  bDecSepAltUsed = false;
984  }
985  else
986  {
 // Bounce tokens whose type the caller did not ask for, with the
 // exceptions listed below.
987  if ( !(r.TokenType & nTokenType) )
988  {
989  if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
990  && (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
991  ; // keep a number that might be a word
992  else if (r.LeadingWhiteSpace == (nextCharIndex - nPos))
993  ; // keep ignored white space
994  else if ( !r.TokenType && eState == ssGetValue && (nMask & ParserFlags::VALUE_SEP) )
995  ; // keep uncertain value
996  else
997  eState = ssBounce;
998  }
999  if ( eState == ssBounce )
1000  {
1001  r.TokenType = 0;
1002  eState = ssStopBack;
1003  }
1004  if ( eState == ssStopBack )
1005  { // put back
1006  nextChar = rText.iterateCodePoints(&index, -1);
1007  nextCharIndex = nextIndex;
1008  --nCodePoints;
1009  bMightBeWord = bMightBeWordLast;
1010  eState = ssStop;
1011  }
 // The first classified character goes into StartFlags, all following
 // ones are accumulated in ContFlags.
1012  if ( eState != ssStop )
1013  {
1014  if ( !r.StartFlags )
1015  r.StartFlags |= nParseTokensType;
1016  else
1017  r.ContFlags |= nParseTokensType;
1018  }
1019  bMightBeWordLast = bMightBeWord;
1020  cLast = current;
1021  current = nextChar;
1022  }
1023  }
1024  // r.CharLen is the length in characters (not code units) of the parsed
1025  // token not including any leading white space.
1026  r.CharLen = nCodePoints;
1027  r.EndPos = nextCharIndex;
 // Post-processing: convert numbers to r.Value, finish quoted tokens.
1028  if ( r.TokenType & KParseType::ASC_NUMBER )
1029  {
1030  r.Value = rtl_math_uStringToDouble(rText.getStr() + nPos + r.LeadingWhiteSpace,
1031  rText.getStr() + r.EndPos, (bDecSepAltUsed ? cDecimalSepAlt : cDecimalSep), cGroupSep, nullptr, nullptr);
1032  if ( bMightBeWord )
1033  r.TokenType |= KParseType::IDENTNAME;
1034  }
1035  else if ( r.TokenType & KParseType::UNI_NUMBER )
1036  {
 // NOTE(review): if m_xContext is not set, xNatNumSup stays unset and is
 // still dereferenced below - confirm that m_xContext is always valid.
1037  if ( !xNatNumSup.is() )
1038  {
1039  if ( m_xContext.is() )
1040  {
1041  xNatNumSup = NativeNumberSupplier::create( m_xContext );
1042  }
1043  }
1044  OUString aTmp(rText.getStr() + nPos + r.LeadingWhiteSpace,
1045  r.EndPos - nPos - r.LeadingWhiteSpace);
1046  // transliterate to ASCII
1047  aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale,
1048  NativeNumberMode::NATNUM0 );
1049  r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep );
1050  if ( bMightBeWord )
1051  r.TokenType |= KParseType::IDENTNAME;
1052  }
1053  else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
1054  {
 // Text left over after the last handled quote means the closing quote
 // was never seen; the token is flagged with MISSING_QUOTE.
1055  if (postSymbolIndex < nextCharIndex)
1056  {
1057  aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
1058  r.TokenType |= KParseType::MISSING_QUOTE;
1059  }
1060  r.DequotedNameOrString = aSymbol.toString();
1061  }
1062 }
1063 
1064 }
1065 
1066 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
void destroyParserTable()
Destroy parser table.
void parseText(css::i18n::ParseResult &r, const OUString &rText, sal_Int32 nPos, sal_Int32 nTokenType=0xffffffff)
Parse a text.
static const sal_Unicode * StrChr(const sal_Unicode *pStr, sal_uInt32 c)
If and where c occurs in pStr.
css::uno::Reference< css::i18n::XNativeNumberSupplier > xNatNumSup
ParserFlags getStartCharsFlags(sal_uInt32 c)
Access parser table flags for user defined start characters.
std::unique_ptr< ParserFlags[]> pStart
void setupInternational(const css::lang::Locale &rLocale)
Setup International class, new'ed only if different from existing.
sal_Int64 n
css::lang::Locale aParserLocale
used for parser only
static const sal_uInt8 nDefCnt
#define TOKEN_DIGIT_FLAGS
sal_uInt16 sal_Unicode
css::uno::Reference< css::i18n::XLocaleData5 > mxLocaleData
static sal_Int32 getParseTokensType(sal_uInt32 c, bool isFirst)
Get corresponding KParseTokens flag for a character.
std::unique_ptr< ParserFlags[]> pTable
static const sal_Int32 pParseTokensType[]
ParserFlags getFlagsExtended(sal_uInt32 c) const
Access parser flags via International and special definitions.
int i
ParserFlags getContCharsFlags(sal_Unicode c)
Access parser table flags for user defined continuation characters.
void initParserTable(const css::lang::Locale &rLocale, sal_Int32 startCharTokenType, const OUString &userDefinedCharactersStart, sal_Int32 contCharTokenType, const OUString &userDefinedCharactersCont)
Init parser table.
Constant values shared between i18npool and, for example, the number formatter.
unsigned short WORD
tuple index
static const ParserFlags pDefaultParserTable[]
std::unique_ptr< ParserFlags[]> pCont
ParserFlags
Flag values of table.
unsigned char sal_uInt8
void * p
ParserFlags getFlags(sal_uInt32 c)
Access parser table flags.
void setupParserTable(const css::lang::Locale &rLocale, sal_Int32 startCharTokenType, const OUString &userDefinedCharactersStart, sal_Int32 contCharTokenType, const OUString &userDefinedCharactersCont)
Setup parser table. Calls initParserTable() only if needed.
css::uno::Reference< css::uno::XComponentContext > m_xContext