22#include <unicode/uchar.h>
23#include <rtl/character.hxx>
24#include <rtl/math.hxx>
25#include <rtl/ustring.hxx>
26#include <com/sun/star/i18n/KParseTokens.hpp>
27#include <com/sun/star/i18n/KParseType.hpp>
28#include <com/sun/star/i18n/LocaleData2.hpp>
29#include <com/sun/star/i18n/NativeNumberMode.hpp>
30#include <com/sun/star/i18n/NativeNumberSupplier.hpp>
39#define TOKEN_DIGIT_FLAGS (ParserFlags::CHAR_VALUE | ParserFlags::VALUE | ParserFlags::VALUE_EXP | ParserFlags::VALUE_EXP_VALUE | ParserFlags::VALUE_DIGIT)
186 KParseTokens::ASC_OTHER,
187 KParseTokens::ASC_CONTROL,
188 KParseTokens::ASC_CONTROL,
189 KParseTokens::ASC_CONTROL,
190 KParseTokens::ASC_CONTROL,
191 KParseTokens::ASC_CONTROL,
192 KParseTokens::ASC_CONTROL,
193 KParseTokens::ASC_CONTROL,
194 KParseTokens::ASC_CONTROL,
195 KParseTokens::ASC_CONTROL,
196 KParseTokens::ASC_CONTROL,
197 KParseTokens::ASC_CONTROL,
198 KParseTokens::ASC_CONTROL,
199 KParseTokens::ASC_CONTROL,
200 KParseTokens::ASC_CONTROL,
201 KParseTokens::ASC_CONTROL,
202 KParseTokens::ASC_CONTROL,
203 KParseTokens::ASC_CONTROL,
204 KParseTokens::ASC_CONTROL,
205 KParseTokens::ASC_CONTROL,
206 KParseTokens::ASC_CONTROL,
207 KParseTokens::ASC_CONTROL,
208 KParseTokens::ASC_CONTROL,
209 KParseTokens::ASC_CONTROL,
210 KParseTokens::ASC_CONTROL,
211 KParseTokens::ASC_CONTROL,
212 KParseTokens::ASC_CONTROL,
213 KParseTokens::ASC_CONTROL,
214 KParseTokens::ASC_CONTROL,
215 KParseTokens::ASC_CONTROL,
216 KParseTokens::ASC_CONTROL,
217 KParseTokens::ASC_CONTROL,
218 KParseTokens::ASC_OTHER,
219 KParseTokens::ASC_OTHER,
220 KParseTokens::ASC_OTHER,
221 KParseTokens::ASC_OTHER,
222 KParseTokens::ASC_DOLLAR,
223 KParseTokens::ASC_OTHER,
224 KParseTokens::ASC_OTHER,
225 KParseTokens::ASC_OTHER,
226 KParseTokens::ASC_OTHER,
227 KParseTokens::ASC_OTHER,
228 KParseTokens::ASC_OTHER,
229 KParseTokens::ASC_OTHER,
230 KParseTokens::ASC_OTHER,
231 KParseTokens::ASC_OTHER,
232 KParseTokens::ASC_DOT,
233 KParseTokens::ASC_OTHER,
235 KParseTokens::ASC_DIGIT,
236 KParseTokens::ASC_DIGIT,
237 KParseTokens::ASC_DIGIT,
238 KParseTokens::ASC_DIGIT,
239 KParseTokens::ASC_DIGIT,
240 KParseTokens::ASC_DIGIT,
241 KParseTokens::ASC_DIGIT,
242 KParseTokens::ASC_DIGIT,
243 KParseTokens::ASC_DIGIT,
244 KParseTokens::ASC_DIGIT,
245 KParseTokens::ASC_COLON,
246 KParseTokens::ASC_OTHER,
247 KParseTokens::ASC_OTHER,
248 KParseTokens::ASC_OTHER,
249 KParseTokens::ASC_OTHER,
250 KParseTokens::ASC_OTHER,
251 KParseTokens::ASC_OTHER,
253 KParseTokens::ASC_UPALPHA,
254 KParseTokens::ASC_UPALPHA,
255 KParseTokens::ASC_UPALPHA,
256 KParseTokens::ASC_UPALPHA,
257 KParseTokens::ASC_UPALPHA,
258 KParseTokens::ASC_UPALPHA,
259 KParseTokens::ASC_UPALPHA,
260 KParseTokens::ASC_UPALPHA,
261 KParseTokens::ASC_UPALPHA,
262 KParseTokens::ASC_UPALPHA,
263 KParseTokens::ASC_UPALPHA,
264 KParseTokens::ASC_UPALPHA,
265 KParseTokens::ASC_UPALPHA,
266 KParseTokens::ASC_UPALPHA,
267 KParseTokens::ASC_UPALPHA,
268 KParseTokens::ASC_UPALPHA,
269 KParseTokens::ASC_UPALPHA,
270 KParseTokens::ASC_UPALPHA,
271 KParseTokens::ASC_UPALPHA,
272 KParseTokens::ASC_UPALPHA,
273 KParseTokens::ASC_UPALPHA,
274 KParseTokens::ASC_UPALPHA,
275 KParseTokens::ASC_UPALPHA,
276 KParseTokens::ASC_UPALPHA,
277 KParseTokens::ASC_UPALPHA,
278 KParseTokens::ASC_UPALPHA,
279 KParseTokens::ASC_OTHER,
280 KParseTokens::ASC_OTHER,
281 KParseTokens::ASC_OTHER,
282 KParseTokens::ASC_OTHER,
283 KParseTokens::ASC_UNDERSCORE,
284 KParseTokens::ASC_OTHER,
286 KParseTokens::ASC_LOALPHA,
287 KParseTokens::ASC_LOALPHA,
288 KParseTokens::ASC_LOALPHA,
289 KParseTokens::ASC_LOALPHA,
290 KParseTokens::ASC_LOALPHA,
291 KParseTokens::ASC_LOALPHA,
292 KParseTokens::ASC_LOALPHA,
293 KParseTokens::ASC_LOALPHA,
294 KParseTokens::ASC_LOALPHA,
295 KParseTokens::ASC_LOALPHA,
296 KParseTokens::ASC_LOALPHA,
297 KParseTokens::ASC_LOALPHA,
298 KParseTokens::ASC_LOALPHA,
299 KParseTokens::ASC_LOALPHA,
300 KParseTokens::ASC_LOALPHA,
301 KParseTokens::ASC_LOALPHA,
302 KParseTokens::ASC_LOALPHA,
303 KParseTokens::ASC_LOALPHA,
304 KParseTokens::ASC_LOALPHA,
305 KParseTokens::ASC_LOALPHA,
306 KParseTokens::ASC_LOALPHA,
307 KParseTokens::ASC_LOALPHA,
308 KParseTokens::ASC_LOALPHA,
309 KParseTokens::ASC_LOALPHA,
310 KParseTokens::ASC_LOALPHA,
311 KParseTokens::ASC_LOALPHA,
312 KParseTokens::ASC_OTHER,
313 KParseTokens::ASC_OTHER,
314 KParseTokens::ASC_OTHER,
315 KParseTokens::ASC_OTHER,
316 KParseTokens::ASC_OTHER
326 auto const n = rtl::splitSurrogates(c, cs);
329 if ( *pStr == cs[0] && (
n == 1 || pStr[1] == cs[1]) )
345 switch (u_charType(c))
347 case U_UPPERCASE_LETTER :
348 return KParseTokens::UNI_UPALPHA;
349 case U_LOWERCASE_LETTER :
350 return KParseTokens::UNI_LOALPHA;
351 case U_TITLECASE_LETTER :
352 return KParseTokens::UNI_TITLE_ALPHA;
353 case U_MODIFIER_LETTER :
354 return KParseTokens::UNI_MODIFIER_LETTER;
355 case U_OTHER_LETTER :
359 case U_NON_SPACING_MARK :
360 return KParseTokens::UNI_OTHER_LETTER;
361 case U_DECIMAL_DIGIT_NUMBER :
362 return KParseTokens::UNI_DIGIT;
363 case U_LETTER_NUMBER :
364 return KParseTokens::UNI_LETTER_NUMBER;
365 case U_OTHER_NUMBER :
366 return KParseTokens::UNI_OTHER_NUMBER;
369 return KParseTokens::UNI_OTHER;
392 const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
393 const OUString& userDefinedCharactersCont )
395 bool bIntlEqual = (rLocale.Language ==
aParserLocale.Language &&
398 if ( !
pTable || !bIntlEqual ||
403 initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
404 contCharTokenType, userDefinedCharactersCont );
409 const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
410 const OUString& userDefinedCharactersCont )
419 if (
pStart && userDefinedCharactersStart.getLength() !=
aStartChars.getLength() )
423 if (
pCont && userDefinedCharactersCont.getLength() !=
aContChars.getLength() )
435 LocaleDataItem2 aItem =
444 if (
nContTypes & KParseTokens::GROUP_SEPARATOR_IN_NUMBER)
460 using namespace KParseTokens;
464 for (
i = 65;
i < 91;
i++ )
467 for (
i = 65;
i < 91;
i++ )
471 for (
i = 97;
i < 123;
i++ )
474 for (
i = 97;
i < 123;
i++ )
478 for (
i = 48;
i < 58;
i++ )
481 for (
i = 48;
i < 58;
i++ )
505 for (
i = 1;
i < 32;
i++ )
508 for (
i = 1;
i < 32;
i++ )
528 for ( sal_Int32 j=0; j<nLen; j++,
p++ )
542 for ( sal_Int32 j=0; j<nLen; j++ )
609 switch (u_charType(c))
611 case U_UPPERCASE_LETTER :
612 return (nTypes & KParseTokens::UNI_UPALPHA) ?
615 case U_LOWERCASE_LETTER :
616 return (nTypes & KParseTokens::UNI_LOALPHA) ?
619 case U_TITLECASE_LETTER :
620 return (nTypes & KParseTokens::UNI_TITLE_ALPHA) ?
623 case U_MODIFIER_LETTER :
624 return (nTypes & KParseTokens::UNI_MODIFIER_LETTER) ?
627 case U_NON_SPACING_MARK :
628 case U_COMBINING_SPACING_MARK :
634 case U_OTHER_LETTER :
635 return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
638 case U_DECIMAL_DIGIT_NUMBER :
639 return ((nTypes & KParseTokens::UNI_DIGIT) ?
642 case U_LETTER_NUMBER :
643 return ((nTypes & KParseTokens::UNI_LETTER_NUMBER) ?
646 case U_OTHER_NUMBER :
647 return ((nTypes & KParseTokens::UNI_OTHER_NUMBER) ?
650 case U_SPACE_SEPARATOR :
651 return ((nTypes & KParseTokens::IGNORE_LEADING_WS) ?
653 case U_OTHER_PUNCTUATION:
660 if (bStart || (U_WB_MIDLETTER != u_getIntPropertyValue(c, UCHAR_WORD_BREAK)
661 && c != 0x30FB && c != 0xFF65))
666 return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
704 assert(r.LeadingWhiteSpace == 0);
708 OUStringBuffer aSymbol;
711 sal_Int32 postSymbolIndex(
index);
712 sal_uInt32 current((
index < rText.getLength()) ? rText.iterateCodePoints(&
index) : 0);
713 sal_uInt32 cLast = 0;
714 sal_Int32 nCodePoints(0);
717 bool bMightBeWord =
true;
718 bool bMightBeWordLast =
true;
719 bool bDecSepAltUsed =
false;
721 sal_Int32 nextCharIndex(
nPos);
723 while ((current != 0) && (eState !=
ssStop))
739 sal_Int32
const nextIndex(nextCharIndex);
740 nextCharIndex =
index;
741 sal_uInt32 nextChar((
index < rText.getLength()) ? rText.iterateCodePoints(&
index) : 0);
755 r.TokenType = KParseType::UNI_NUMBER;
757 r.TokenType = KParseType::ASC_NUMBER;
771 r.TokenType = KParseType::IDENTNAME;
777 postSymbolIndex = nextCharIndex;
778 nParseTokensType = 0;
779 r.TokenType = KParseType::SINGLE_QUOTE_NAME;
784 postSymbolIndex = nextCharIndex;
785 nParseTokensType = 0;
786 r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
790 if (
nStartTypes & KParseTokens::IGNORE_LEADING_WS )
794 r.LeadingWhiteSpace = nextCharIndex -
nPos;
796 postSymbolIndex = nextCharIndex;
797 nParseTokensType = 0;
806 r.TokenType = KParseType::BOOLEAN;
811 r.TokenType = KParseType::ONE_SINGLE_CHAR;
822 r.TokenType = KParseType::UNI_NUMBER;
823 else if ( r.TokenType != KParseType::UNI_NUMBER )
824 r.TokenType = KParseType::ASC_NUMBER;
831 sal_Int32 tempIndex(
index);
832 sal_uInt32
const nextChar2((tempIndex < rText.getLength()) ? rText.iterateCodePoints(&tempIndex) : 0);
833 sal_uInt32
const nextChar3((tempIndex < rText.getLength()) ? rText.iterateCodePoints(&tempIndex) : 0);
838 nParseTokensType |= KParseTokens::GROUP_SEPARATOR_IN_NUMBER;
851 if (nCodePoints == 2)
859 else if (current ==
'E' || current ==
'e')
867 r.TokenType = KParseType::IDENTNAME;
874 if ( (cLast ==
'E') || (cLast ==
'e') )
882 r.TokenType = KParseType::IDENTNAME;
887 else if ( bMightBeWord )
890 r.TokenType = KParseType::IDENTNAME;
898 r.TokenType = KParseType::IDENTNAME;
918 OUString::Concat(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2))
919 + OUString(&current, 1));
924 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
926 postSymbolIndex = nextCharIndex;
944 rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2)
945 + OUString(&current, 1));
947 else if (current == nextChar &&
948 !(
nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
950 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex));
951 nextCharIndex =
index;
952 if (
index < rText.getLength()) { ++nCodePoints; }
953 nextChar = (
index < rText.getLength()) ? rText.iterateCodePoints(&
index) : 0;
958 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
960 postSymbolIndex = nextCharIndex;
982 postSymbolIndex =
nPos;
983 nextCharIndex =
nPos;
984 aSymbol.setLength(0);
985 current = (
index < rText.getLength()) ? rText.iterateCodePoints(&
index) : 0;
986 nCodePoints = (
nPos < rText.getLength()) ? 1 : 0;
992 bMightBeWordLast =
true;
993 bDecSepAltUsed =
false;
997 if ( !(r.TokenType & nTokenType) )
999 if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
1000 && (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
1002 else if (r.LeadingWhiteSpace == (nextCharIndex -
nPos))
1016 nextChar = rText.iterateCodePoints(&
index, -1);
1017 nextCharIndex = nextIndex;
1019 bMightBeWord = bMightBeWordLast;
1024 if ( !r.StartFlags )
1025 r.StartFlags |= nParseTokensType;
1027 r.ContFlags |= nParseTokensType;
1029 bMightBeWordLast = bMightBeWord;
1036 r.CharLen = nCodePoints;
1037 r.EndPos = nextCharIndex;
1038 if ( r.TokenType & KParseType::ASC_NUMBER )
1040 r.Value = rtl_math_uStringToDouble(rText.getStr() +
nPos + r.LeadingWhiteSpace,
1043 r.TokenType |= KParseType::IDENTNAME;
1045 else if ( r.TokenType & KParseType::UNI_NUMBER )
1054 OUString aTmp(rText.getStr() +
nPos + r.LeadingWhiteSpace,
1055 r.EndPos -
nPos - r.LeadingWhiteSpace);
1058 NativeNumberMode::NATNUM0 );
1061 r.TokenType |= KParseType::IDENTNAME;
1063 else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
1065 if (postSymbolIndex < nextCharIndex)
1067 aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1));
1068 r.TokenType |= KParseType::MISSING_QUOTE;
1070 r.DequotedNameOrString = aSymbol.makeStringAndClear();
ParserFlags
Flag values of table.
#define TOKEN_DIGIT_FLAGS
css::lang::Locale aParserLocale
used for parser only
static const ParserFlags pDefaultParserTable[]
void parseText(css::i18n::ParseResult &r, const OUString &rText, sal_Int32 nPos, sal_Int32 nTokenType=0xffffffff)
Parse a text.
void setupInternational(const css::lang::Locale &rLocale)
Setup International class, new'ed only if different from existing.
static const sal_uInt8 nDefCnt
std::unique_ptr< ParserFlags[]> pTable
void setupParserTable(const css::lang::Locale &rLocale, sal_Int32 startCharTokenType, const OUString &userDefinedCharactersStart, sal_Int32 contCharTokenType, const OUString &userDefinedCharactersCont)
Setup parser table. Calls initParserTable() only if needed.
ParserFlags getStartCharsFlags(sal_uInt32 c)
Access parser table flags for user defined start characters.
sal_Unicode cDecimalSepAlt
css::uno::Reference< css::i18n::XNativeNumberSupplier > xNatNumSup
std::unique_ptr< ParserFlags[]> pCont
ParserFlags getFlagsExtended(sal_uInt32 c, ScanState eState) const
Access parser flags via International and special definitions.
void destroyParserTable()
Destroy parser table.
css::uno::Reference< css::uno::XComponentContext > m_xContext
ParserFlags getFlags(sal_uInt32 c, ScanState eState)
Access parser table flags.
std::unique_ptr< ParserFlags[]> pStart
static const sal_Unicode * StrChr(const sal_Unicode *pStr, sal_uInt32 c)
If and where c occurs in pStr.
static sal_Int32 getParseTokensType(sal_uInt32 c, bool isFirst)
Get corresponding KParseTokens flag for a character.
static const sal_Int32 pParseTokensType[]
css::uno::Reference< css::i18n::XLocaleData5 > mxLocaleData
void initParserTable(const css::lang::Locale &rLocale, sal_Int32 startCharTokenType, const OUString &userDefinedCharactersStart, sal_Int32 contCharTokenType, const OUString &userDefinedCharactersCont)
Init parser table.
@ ssIgnoreLeadingInRewind
ParserFlags getContCharsFlags(sal_Unicode c)
Access parser table flags for user defined continuation characters.
Constant values shared between i18npool and, for example, the number formatter.