LibreOffice Module comphelper (master) 1
syntaxhighlight.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <sal/config.h>
21
22#include <cassert>
23
24#include <rtl/character.hxx>
25#include <unicode/uchar.h>
28
29namespace {
30
// Classification bits for a single character. One character may carry
// several of them at once; the tokenizer stores one CharFlags value per
// code point in its lookup table and tests them while scanning.
enum class CharFlags {
    StartIdentifier = 1 << 0, // may begin an identifier
    InIdentifier    = 1 << 1, // may continue an identifier
    StartNumber     = 1 << 2, // may begin a numeric literal
    InNumber        = 1 << 3, // may continue a decimal literal
    InHexNumber     = 1 << 4, // may continue a hexadecimal literal
    InOctNumber     = 1 << 5, // may continue an octal literal
    StartString     = 1 << 6, // opens a string (or bracketed name)
    Operator        = 1 << 7, // operator / punctuation character
    Space           = 1 << 8, // horizontal white space
    EOL             = 1 << 9  // end-of-line character
};
44
45}
46
namespace o3tl {
    // Registers CharFlags as a typed-flags enum. The mask 0x03ff is the
    // union of every CharFlags bit (0x0001 .. 0x0200), so it has to be
    // widened whenever a new flag is added to the enum.
    // NOTE(review): presumably this specialization is what makes |, & and |=
    // usable on the scoped enum elsewhere in this file - confirm against
    // o3tl/typed_flags_set.hxx.
    template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
}
50
// ##########################################################################
// Keyword table for BASIC.
// ATTENTION: all these words need to be in lower case, because the
// tokenizer lower-cases an identifier before looking it up here.
// ATTENTION: the entries must stay sorted in ascending strcmp() order,
// because the lookup is a bsearch() over this array.
// NOTE(review): the multi-word entries ("end if", "line input", ...) can
// never equal a single identifier token, since a space terminates the
// identifier scan in getNextToken - confirm whether they are still needed.
// ##########################################################################
static const char* strListBasicKeyWords[] = {
    "access",
    "alias",
    "and",
    "any",
    "append",
    "as",
    "attribute",
    "base",
    "binary",
    "boolean",
    "byref",
    "byte",
    "byval",
    "call",
    "case",
    "cdecl",
    "classmodule",
    "close",
    "compare",
    "compatible",
    "const",
    "currency",
    "date",
    "declare",
    "defbool",
    "defcur",
    "defdate",
    "defdbl",
    "deferr",
    "defint",
    "deflng",
    "defobj",
    "defsng",
    "defstr",
    "defvar",
    "dim",
    "do",
    "doevents",
    "double",
    "each",
    "else",
    "elseif",
    "end",
    "end enum",
    "end function",
    "end if",
    "end property",
    "end select",
    "end sub",
    "end type",
    "endif",
    "enum",
    "eqv",
    "erase",
    "error",
    "exit",
    "explicit",
    "for",
    "function",
    "get",
    "global",
    "gosub",
    "goto",
    "if",
    "imp",
    "implements",
    "in",
    "input",
    "integer",
    "is",
    "let",
    "lib",
    "like",
    "line",
    "line input",
    "local",
    "lock",
    "long",
    "loop",
    "lprint",
    "lset",
    "mod",
    "name",
    "new",
    "next",
    "not",
    "object",
    "on",
    "open",
    "option",
    "optional",
    "or",
    "output",
    "paramarray",
    "preserve",
    "print",
    "private",
    "property",
    "public",
    "random",
    "read",
    "redim",
    "rem",
    "resume",
    "return",
    "rset",
    "select",
    "set",
    "shared",
    "single",
    "static",
    "step",
    "stop",
    "string",
    "sub",
    "system",
    "text",
    "then",
    "to",
    "type",
    "typeof",
    "until",
    "variant",
    "vbasupport",
    "wend",
    "while",
    "with",
    "withevents",
    "write",
    "xor"
};
186
187
// Keyword table for SQL.
// As with the BASIC table, every entry must be lower case (the tokenizer
// lower-cases the identifier before the lookup) and the entries must stay
// sorted in ascending strcmp() order, because the lookup is a bsearch().
static const char* strListSqlKeyWords[] = {
    "all",
    "and",
    "any",
    "as",
    "asc",
    "avg",
    "between",
    "by",
    "cast",
    "corresponding",
    "count",
    "create",
    "cross",
    "delete",
    "desc",
    "distinct",
    "drop",
    "escape",
    "except",
    "exists",
    "false",
    "from",
    "full",
    "global",
    "group",
    "having",
    "in",
    "inner",
    "insert",
    "intersect",
    "into",
    "is",
    "join",
    "left",
    "like",
    "limit",
    "local",
    "match",
    "max",
    "min",
    "natural",
    "not",
    "null",
    "on",
    "or",
    "order",
    "outer",
    "right",
    "select",
    "set",
    "some",
    "sum",
    "table",
    "temporary",
    "true",
    "union",
    "unique",
    "unknown",
    "update",
    "using",
    "values",
    "where"
};
252
253
extern "C" {

// Comparator for bsearch() over an array of C strings.
// arg1 is the search key itself (a NUL-terminated string), arg2 points at
// one array element, i.e. at a pointer to a NUL-terminated string.
// Returns the strcmp() ordering of the key relative to that element.
static int compare_strings( const void *arg1, const void *arg2 )
{
    char const * const pKey = static_cast<char const *>(arg1);
    char const * const pEntry = *static_cast<char const * const *>(arg2);
    return strcmp( pKey, pEntry );
}

}
262
263namespace
264{
265 bool isAlpha(sal_Unicode c)
266 {
267 if (rtl::isAsciiAlpha(c))
268 return true;
269 return u_isalpha(c);
270 }
271}
272
{
    // NOTE(review): the class header line that should precede this brace is
    // not present in this copy of the file - confirm against upstream.

    // Character information table: the CharFlags of every code point
    // 0..255, value-initialized here and filled in by the constructor
    CharFlags aCharTypeTab[256] = {};

    // Auxiliary function: true when c carries any of the bits in nTestFlags
    bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;

    // Scan the next token starting at pos; returns false when pos has
    // already reached end, i.e. there is nothing more to read
    bool getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end, /*out*/TokenType& reType,
        /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const;

    // Keyword table used to promote identifiers to keywords and its number
    // of entries; the table is not owned, setKeyWords() only stores the pointer
    const char** ppListKeyWords;
    sal_uInt16 nKeyWordCount;

public:
    // NOTE(review): member functions in this file read a member named
    // aLanguage, but its declaration is not visible in this copy -
    // presumably it belongs here; confirm against upstream.

    explicit Tokenizer( HighlighterLanguage aLang );

    // Split rLine into tokens and append one HighlightPortion per token
    void getHighlightPortions(std::u16string_view rLine,
                    /*out*/std::vector<HighlightPortion>& portions) const;
    // Install the keyword table (ppKeyWords, nCount entries)
    void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
};
297
// Helper function: test character flag.
// Yields true when character c carries at least one of the bits given in
// nTestFlags, false otherwise.
// NOTE(review): the signature line that should sit between this comment and
// the opening brace is not present in this copy of the file - confirm
// against upstream.
{
    bool bRet = false;
    // Code points 1..255 are classified through the lookup table; NUL falls
    // through both branches and therefore never matches any flag
    if( c != 0 && c <= 255 )
    {
        bRet = bool(aCharTypeTab[c] & nTestFlags);
    }
    // Outside the table only the two identifier flags can match, and only
    // for characters that isAlpha() accepts
    else if( c > 255 )
    {
        bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
            && isAlpha(c);
    }
    return bRet;
}
313
314void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
315{
316 ppListKeyWords = ppKeyWords;
317 nKeyWordCount = nCount;
318}
319
// Scans exactly one token beginning at pos and advances pos past it.
//
// pos        in/out: current read position; on return it is the first
//            character after the token
// end        end of the text being scanned
// reType     out: classification of the token (TokenType::Unknown when no
//            branch below recognizes the first character)
// rpStartPos out: position the token starts at (pos on entry)
// rpEndPos   out: position just past the token; left untouched when the
//            function returns false
//
// Returns false only when pos == end on entry, true otherwise.
//
// The order of the branches matters: a character is claimed by the first
// branch whose test succeeds (space, identifier, operator, object separator,
// number, string, end of line).
bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end,
                /*out*/TokenType& reType,
                /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const
{
    reType = TokenType::Unknown;

    rpStartPos = pos;

    if( pos == end )
        return false;

    // The first character is always consumed, even if it stays Unknown
    sal_Unicode c = *pos;
    ++pos;

    //*** Go through all possibilities ***
    // Space? A run of space characters forms one Whitespace token
    if ( testCharFlags( c, CharFlags::Space ) )
    {
        while( pos != end && testCharFlags( *pos, CharFlags::Space ) )
            ++pos;

        reType = TokenType::Whitespace;
    }

    // Identifier?
    else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
    {
        // Consume every following character that may continue an identifier
        bool bIdentifierChar;
        do
        {
            if (pos == end)
                break;
            // Fetch next character
            c = *pos;
            bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
            if( bIdentifierChar )
                ++pos;
        }
        while( bIdentifierChar );

        reType = TokenType::Identifier;

        // Keyword table: an identifier found in it is promoted to a keyword
        if (ppListKeyWords != nullptr)
        {
            int nCount = pos - rpStartPos;

            // No keyword if string contains char > 255
            bool bCanBeKeyword = true;
            for( int i = 0 ; i < nCount ; i++ )
            {
                if( rpStartPos[i] > 255 )
                {
                    bCanBeKeyword = false;
                    break;
                }
            }

            if( bCanBeKeyword )
            {
                // The lookup is case-insensitive: the identifier is
                // lower-cased and the table holds lower-case words only
                std::u16string_view aKWString(&*rpStartPos, nCount);
                OString aByteStr = OUStringToOString(aKWString,
                                                     RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
                // NOTE(review): the bsearch() call below is truncated in this
                // copy of the file - the comparator argument and the closing
                // parentheses are missing. Presumably the comparator is
                // compare_strings defined earlier in this file; confirm
                // against upstream before building.
                if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
                {
                    reType = TokenType::Keywords;

                    // "rem" starts a comment that runs to the end of the line
                    if( aByteStr == "rem" )
                    {
                        // Remove all characters until end of line or EOF
                        // (the EOL character itself is not consumed)
                        for (;;)
                        {
                            if (pos == end)
                                break;
                            sal_Unicode cPeek = *pos;
                            if ( testCharFlags( cPeek, CharFlags::EOL ) )
                                break;
                            ++pos;
                        }

                        reType = TokenType::Comment;
                    }
                }
            }
        }
    }

    // Operator?
    // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
    else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
    {
        // parameters for SQL view: '?' alone, or ':' followed by letters
        if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
        {
            if (c!='?')
            {
                // Named parameter: consume the letters following the ':'
                bool bIdentifierChar;
                do
                {
                    // Get next character
                    if (pos == end)
                        break;
                    c = *pos;
                    bIdentifierChar = isAlpha(c);
                    if( bIdentifierChar )
                        ++pos;
                }
                while( bIdentifierChar );
            }
            reType = TokenType::Parameter;
        }
        // SQL: "--" starts a comment, a single '-' is an operator
        else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
        {
            if (pos != end && *pos=='-')
            {
                // Remove all characters until end of line or EOF
                while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
                {
                    ++pos;
                }
                reType = TokenType::Comment;
            }
            else
                reType = TokenType::Operator;
        }
        // SQL: "//" starts a comment, a single '/' is an operator
        else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
        {
            if (pos != end && *pos=='/')
            {
                // Remove all characters until end of line or EOF
                while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
                {
                    ++pos;
                }
                reType = TokenType::Comment;
            }
            else
                reType = TokenType::Operator;
        }
        else
        {
            // Apostrophe is Basic comment
            if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
            {
                // Skip all characters until end of input or end of line:
                for (;;) {
                    if (pos == end)
                        break;
                    c = *pos;
                    if (testCharFlags(c, CharFlags::EOL)) {
                        break;
                    }
                    ++pos;
                }

                reType = TokenType::Comment;
            }

            // The real operator; can be easily used since not the actual
            // operator (e.g. +=) is concerned, but the fact that it is one
            if( reType != TokenType::Comment )
            {
                reType = TokenType::Operator;
            }

        }
    }

    // Object separator? Must be handled before Number
    // (a '.' that is not followed by a digit is an operator, not a number)
    else if( c == '.' && ( pos == end || *pos < '0' || *pos > '9' ) )
    {
        reType = TokenType::Operator;
    }

    // Number?
    else if( testCharFlags( c, CharFlags::StartNumber ) )
    {
        reType = TokenType::Number;

        // Number system, 10 = normal, it is changed for Oct/Hex
        int nRadix = 10;

        // Is it an Oct or a Hex number?
        if( c == '&' )
        {
            // Octal?
            if( pos != end && (*pos == 'o' || *pos == 'O' ))
            {
                // consume the 'o' / 'O' radix marker
                ++pos;
                nRadix = 8; // Octal base

                // Read all numbers
                while( pos != end && testCharFlags( *pos, CharFlags::InOctNumber ) )
                    ++pos;
            }
            // Hexadecimal?
            else if( pos != end && (*pos == 'h' || *pos == 'H' ))
            {
                // consume the 'h' / 'H' radix marker
                ++pos;
                nRadix = 16; // Hexadecimal base

                // Read all numbers
                while( pos != end && testCharFlags( *pos, CharFlags::InHexNumber ) )
                    ++pos;
            }
            else
            {
                // A '&' without a radix marker is a plain operator
                reType = TokenType::Operator;
            }
        }

        // When it is not Oct or Hex, then it is double
        if( reType == TokenType::Number && nRadix == 10 )
        {
            // Flag if the last character is an exponent
            bool bAfterExpChar = false;

            // Read all numbers
            while( pos != end && (testCharFlags( *pos, CharFlags::InNumber ) ||
                (bAfterExpChar && *pos == '+' ) ||
                (bAfterExpChar && *pos == '-' ) ))
                // After exponent +/- are OK, too
            {
                c = *pos++;
                bAfterExpChar = ( c == 'e' || c == 'E' );
            }
        }
    }

    // String?
    else if( testCharFlags( c, CharFlags::StartString ) )
    {
        // Remember which character has opened the string
        // ('[' is closed by ']', every other opener closes itself)
        sal_Unicode cEndString = c;
        if( c == '[' )
            cEndString = ']';

        // Read all characters up to the closing character
        while( pos == end || *pos != cEndString )
        {
            // Detect EOF before reading next char, so we do not lose EOF
            if( pos == end )
            {
                // ERROR: unterminated string literal
                reType = TokenType::Error;
                break;
            }
            c = *pos++;
            if( testCharFlags( c, CharFlags::EOL ) )
            {
                // ERROR: unterminated string literal
                // (the EOL character has already been consumed here, so it
                // is part of the error token)
                reType = TokenType::Error;
                break;
            }
        }

        if( reType != TokenType::Error )
        {
            // Consume the closing character; a bracketed name counts as an
            // identifier, everything else as a string
            ++pos;
            if( cEndString == ']' )
                reType = TokenType::Identifier;
            else
                reType = TokenType::String;
        }
    }

    // End of line?
    else if( testCharFlags( c, CharFlags::EOL ) )
    {
        // If another, different EOL character comes, read it
        // (so a two-character line ending forms a single EOL token)
        if (pos != end)
        {
            sal_Unicode cNext = *pos;
            if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
                ++pos;
        }

        reType = TokenType::EOL;
    }

    // All other will remain TokenType::Unknown

    // Save end position
    rpEndPos = pos;
    return true;
}
609
{
    // NOTE(review): the signature line that should precede this body is not
    // present in this copy of the file; from its content (it fills
    // aCharTypeTab and clears the keyword table) this is presumably the
    // Tokenizer constructor - confirm against upstream.

    // Fill character table
    sal_uInt16 i;

    // Allowed characters for identifiers: letters, '_' and '$' may both
    // start and continue an identifier
    CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
    for( i = 'a' ; i <= 'z' ; i++ )
        aCharTypeTab[i] |= nHelpMask;
    for( i = 'A' ; i <= 'Z' ; i++ )
        aCharTypeTab[i] |= nHelpMask;
    aCharTypeTab[int('_')] |= nHelpMask;
    aCharTypeTab[int('$')] |= nHelpMask;

    // Digit (can be identifier and number); digits may continue an
    // identifier but not start one
    nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
                 CharFlags::InNumber | CharFlags::InHexNumber;
    for( i = '0' ; i <= '9' ; i++ )
        aCharTypeTab[i] |= nHelpMask;

    // Add e, E, . and & here manually
    // ('&' only starts a number, it is deliberately not an operator)
    aCharTypeTab[int('e')] |= CharFlags::InNumber;
    aCharTypeTab[int('E')] |= CharFlags::InNumber;
    aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
    aCharTypeTab[int('&')] |= CharFlags::StartNumber;

    // Hexadecimal digit
    for( i = 'a' ; i <= 'f' ; i++ )
        aCharTypeTab[i] |= CharFlags::InHexNumber;
    for( i = 'A' ; i <= 'F' ; i++ )
        aCharTypeTab[i] |= CharFlags::InHexNumber;

    // Octal digit
    for( i = '0' ; i <= '7' ; i++ )
        aCharTypeTab[i] |= CharFlags::InOctNumber;

    // String literal start/end characters
    aCharTypeTab[int('\'')] |= CharFlags::StartString;
    aCharTypeTab[int('\"')] |= CharFlags::StartString;
    aCharTypeTab[int('[')] |= CharFlags::StartString;
    aCharTypeTab[int('`')] |= CharFlags::StartString;

    // Operator characters
    aCharTypeTab[int('!')] |= CharFlags::Operator;
    aCharTypeTab[int('%')] |= CharFlags::Operator;
    // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
    aCharTypeTab[int('(')] |= CharFlags::Operator;
    aCharTypeTab[int(')')] |= CharFlags::Operator;
    aCharTypeTab[int('*')] |= CharFlags::Operator;
    aCharTypeTab[int('+')] |= CharFlags::Operator;
    aCharTypeTab[int(',')] |= CharFlags::Operator;
    aCharTypeTab[int('-')] |= CharFlags::Operator;
    aCharTypeTab[int('/')] |= CharFlags::Operator;
    aCharTypeTab[int(':')] |= CharFlags::Operator;
    aCharTypeTab[int('<')] |= CharFlags::Operator;
    aCharTypeTab[int('=')] |= CharFlags::Operator;
    aCharTypeTab[int('>')] |= CharFlags::Operator;
    aCharTypeTab[int('?')] |= CharFlags::Operator;
    aCharTypeTab[int('^')] |= CharFlags::Operator;
    aCharTypeTab[int('|')] |= CharFlags::Operator;
    aCharTypeTab[int('~')] |= CharFlags::Operator;
    aCharTypeTab[int('{')] |= CharFlags::Operator;
    aCharTypeTab[int('}')] |= CharFlags::Operator;
    // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
    aCharTypeTab[int(']')] |= CharFlags::Operator;
    aCharTypeTab[int(';')] |= CharFlags::Operator;

    // Space
    aCharTypeTab[int(' ') ] |= CharFlags::Space;
    aCharTypeTab[int('\t')] |= CharFlags::Space;

    // End of line characters
    aCharTypeTab[int('\r')] |= CharFlags::EOL;
    aCharTypeTab[int('\n')] |= CharFlags::EOL;

    // No keyword table yet; one is installed later through setKeyWords()
    ppListKeyWords = nullptr;
    nKeyWordCount = 0;
}
688
                    /*out*/std::vector<HighlightPortion>& portions) const
{
    // Tokenizes rLine from start to end and appends one entry per token to
    // portions: the token's start offset, its end offset (both relative to
    // the beginning of rLine) and its type. Existing entries are kept.
    // NOTE(review): the first line of the signature and the declaration of
    // eType are not present in this copy of the file - confirm against
    // upstream.

    // Set the position to the beginning of the source string
    auto pos = rLine.begin();

    // Variables for the out parameter
    std::u16string_view::const_iterator pStartPos;
    std::u16string_view::const_iterator pEndPos;

    // Loop over all the tokens; getNextToken returns false once pos has
    // reached the end of the line
    while( getNextToken( pos, rLine.end(), eType, pStartPos, pEndPos ) )
    {
        portions.emplace_back(
            pStartPos - rLine.begin(), pEndPos - rLine.begin(), eType);
    }
}
707
708
{
    // Selects the keyword table that matches the requested language and
    // hands it to the tokenizer.
    // NOTE(review): the signature line and the two case labels of this
    // switch are not present in this copy of the file; presumably the first
    // group belongs to the BASIC case and the second to the SQL case -
    // confirm against upstream.
    switch (language)
    {
            m_tokenizer->setKeyWords( strListBasicKeyWords,
                                            std::size( strListBasicKeyWords ));
            break;
            m_tokenizer->setKeyWords( strListSqlKeyWords,
                                            std::size( strListSqlKeyWords ));
            break;
        default:
            assert(false); // this cannot happen
    }
}
726
728
729void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine,
730 /*out*/std::vector<HighlightPortion>& portions) const
731{
732 m_tokenizer->getHighlightPortions( rLine, portions );
733}
734
{
    // Reports the language stored in the tokenizer's aLanguage member.
    // NOTE(review): the signature line that should precede this body is not
    // present in this copy of the file - confirm against upstream.
    return m_tokenizer->aLanguage;
}
739
740/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
bool getNextToken(std::u16string_view::const_iterator &pos, std::u16string_view::const_iterator end, TokenType &reType, std::u16string_view::const_iterator &rpStartPos, std::u16string_view::const_iterator &rpEndPos) const
Tokenizer(HighlighterLanguage aLang)
void getHighlightPortions(std::u16string_view rLine, std::vector< HighlightPortion > &portions) const
HighlighterLanguage const aLanguage
bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
void setKeyWords(const char **ppKeyWords, sal_uInt16 nCount)
void getHighlightPortions(std::u16string_view rLine, std::vector< HighlightPortion > &pPortions) const
SyntaxHighlighter(const SyntaxHighlighter &)=delete
std::unique_ptr< Tokenizer > m_tokenizer
HighlighterLanguage GetLanguage() const
int nCount
DocumentType eType
bool isAlpha(sal_Unicode c, bool bCompatible)
int i
end
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
static int compare_strings(const void *arg1, const void *arg2)
static const char * strListSqlKeyWords[]
static const char * strListBasicKeyWords[]
HighlighterLanguage
TokenType
sal_uInt16 sal_Unicode
size_t pos