LibreOffice Module comphelper (master)  1
syntaxhighlight.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cassert>
23 
24 #include <rtl/character.hxx>
25 #include <unicode/uchar.h>
27 #include <o3tl/typed_flags_set.hxx>
28 
29 namespace {
30 
// Classification bits stored per character in the tokenizer's lookup table.
// Several bits may be set for the same character (a digit, for example, is
// both a number character and an identifier character).
enum class CharFlags {
    StartIdentifier = 1 << 0,   // may begin an identifier
    InIdentifier    = 1 << 1,   // may continue an identifier
    StartNumber     = 1 << 2,   // may begin a numeric literal
    InNumber        = 1 << 3,   // may continue a decimal literal
    InHexNumber     = 1 << 4,   // hexadecimal digit
    InOctNumber     = 1 << 5,   // octal digit
    StartString     = 1 << 6,   // opens a quoted string / bracketed name
    Operator        = 1 << 7,   // operator or punctuation character
    Space           = 1 << 8,   // horizontal whitespace
    EOL             = 1 << 9    // line terminator
};
44 
45 }
46 
47 namespace o3tl {
48  template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
49 }
50 
// ##########################################################################
// ATTENTION: all these words need to be in lower case, and the table must
// stay sorted in strcmp() order because it is searched with bsearch().
// Entries are grouped by initial letter purely for readability.
// ##########################################################################
static const char* strListBasicKeyWords[] = {
    // a
    "access", "alias", "and", "any", "append", "as", "attribute",
    // b
    "base", "binary", "boolean", "byref", "byte", "byval",
    // c
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    // d
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "doevents", "double",
    // e
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end property", "end select", "end sub", "end type", "endif", "enum",
    "eqv", "erase", "error", "exit", "explicit",
    // f
    "for", "function",
    // g
    "get", "global", "gosub", "goto",
    // i
    "if", "imp", "implements", "in", "input", "integer", "is",
    // l
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    // m
    "mod",
    // n
    "name", "new", "next", "not",
    // o
    "object", "on", "open", "option", "optional", "or", "output",
    // p
    "paramarray", "preserve", "print", "private", "property", "public",
    // r
    "random", "read", "redim", "rem", "resume", "return", "rset",
    // s
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    // t
    "text", "then", "to", "type", "typeof",
    // u
    "until",
    // v
    "variant", "vbasupport",
    // w
    "wend", "while", "with", "withevents", "write",
    // x
    "xor"
};
186 
187 
// SQL keyword table. Lower case, and sorted in strcmp() order because it is
// searched with bsearch(). Entries are grouped by initial letter purely for
// readability.
static const char* strListSqlKeyWords[] = {
    // a
    "all", "and", "any", "as", "asc", "avg",
    // b
    "between", "by",
    // c
    "cast", "corresponding", "count", "create", "cross",
    // d
    "delete", "desc", "distinct", "drop",
    // e
    "escape", "except", "exists",
    // f
    "false", "from", "full",
    // g
    "global", "group",
    // h
    "having",
    // i
    "in", "inner", "insert", "intersect", "into", "is",
    // j
    "join",
    // l
    "left", "like", "limit", "local",
    // m
    "match", "max", "min",
    // n
    "natural", "not", "null",
    // o
    "on", "or", "order", "outer",
    // r
    "right",
    // s
    "select", "set", "some", "sum",
    // t
    "table", "temporary", "true",
    // u
    "union", "unique", "unknown", "update", "using",
    // v
    "values",
    // w
    "where"
};
252 
253 
extern "C" {

// Comparator for bsearch() over a table of C strings.
//
// bsearch() hands the search key through unchanged as arg1 and a pointer to
// the current table element as arg2. The key is therefore the string itself
// (const char*), while the element is a pointer to a const char* entry and
// needs one extra dereference. Returns the strcmp() ordering of key versus
// entry.
static int compare_strings( const void *arg1, const void *arg2 )
{
    const char* const pKey = static_cast<const char*>(arg1);
    // Keep the pointee const: the table entries are 'const char*'.
    const char* const pEntry = *static_cast<const char* const*>(arg2);
    return strcmp( pKey, pEntry );
}

}
262 
263 namespace
264 {
265  bool isAlpha(sal_Unicode c)
266  {
267  if (rtl::isAsciiAlpha(c))
268  return true;
269  return u_isalpha(c);
270  }
271 }
272 
274 {
275  // Character information tables
277 
278  // Auxiliary function: testing of the character flags
279  bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
280 
281  // Get new token, EmptyString == nothing more over there
282  bool getNextToken(const sal_Unicode*& pos, /*out*/TokenType& reType,
283  /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const;
284 
285  const char** ppListKeyWords;
286  sal_uInt16 nKeyWordCount;
287 
288 public:
290 
291  explicit Tokenizer( HighlighterLanguage aLang );
292 
293  void getHighlightPortions(const OUString& rLine,
294  /*out*/std::vector<HighlightPortion>& portions) const;
295  void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
296 };
297 
298 // Helper function: test character flag
300 {
301  bool bRet = false;
302  if( c != 0 && c <= 255 )
303  {
304  bRet = bool(aCharTypeTab[c] & nTestFlags);
305  }
306  else if( c > 255 )
307  {
308  bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
309  && isAlpha(c);
310  }
311  return bRet;
312 }
313 
314 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
315 {
316  ppListKeyWords = ppKeyWords;
317  nKeyWordCount = nCount;
318 }
319 
321  /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const
322 {
323  reType = TokenType::Unknown;
324 
325  rpStartPos = pos;
326 
327  sal_Unicode c = *pos;
328  if( c == 0 )
329  return false;
330 
331  ++pos;
332 
333  //*** Go through all possibilities ***
334  // Space?
335  if ( testCharFlags( c, CharFlags::Space ) )
336  {
337  while( testCharFlags( *pos, CharFlags::Space ) )
338  ++pos;
339 
340  reType = TokenType::Whitespace;
341  }
342 
343  // Identifier?
344  else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
345  {
346  bool bIdentifierChar;
347  do
348  {
349  // Fetch next character
350  c = *pos;
351  bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
352  if( bIdentifierChar )
353  ++pos;
354  }
355  while( bIdentifierChar );
356 
357  reType = TokenType::Identifier;
358 
359  // Keyword table
360  if (ppListKeyWords != nullptr)
361  {
362  int nCount = pos - rpStartPos;
363 
364  // No keyword if string contains char > 255
365  bool bCanBeKeyword = true;
366  for( int i = 0 ; i < nCount ; i++ )
367  {
368  if( rpStartPos[i] > 255 )
369  {
370  bCanBeKeyword = false;
371  break;
372  }
373  }
374 
375  if( bCanBeKeyword )
376  {
377  OUString aKWString(rpStartPos, nCount);
378  OString aByteStr = OUStringToOString(aKWString,
379  RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
380  if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
381  compare_strings ) )
382  {
383  reType = TokenType::Keywords;
384 
385  if( aByteStr == "rem" )
386  {
387  // Remove all characters until end of line or EOF
388  sal_Unicode cPeek = *pos;
389  while( cPeek != 0 && !testCharFlags( cPeek, CharFlags::EOL ) )
390  {
391  cPeek = *++pos;
392  }
393 
394  reType = TokenType::Comment;
395  }
396  }
397  }
398  }
399  }
400 
401  // Operator?
402  // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
403  else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
404  {
405  // parameters for SQL view
406  if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
407  {
408  if (c!='?')
409  {
410  bool bIdentifierChar;
411  do
412  {
413  // Get next character
414  c = *pos;
415  bIdentifierChar = isAlpha(c);
416  if( bIdentifierChar )
417  ++pos;
418  }
419  while( bIdentifierChar );
420  }
421  reType = TokenType::Parameter;
422  }
423  else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
424  {
425  sal_Unicode cPeekNext = *pos;
426  if (cPeekNext=='-')
427  {
428  // Remove all characters until end of line or EOF
429  while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
430  {
431  ++pos;
432  cPeekNext = *pos;
433  }
434  reType = TokenType::Comment;
435  }
436  else
437  reType = TokenType::Operator;
438  }
439  else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
440  {
441  sal_Unicode cPeekNext = *pos;
442  if (cPeekNext=='/')
443  {
444  // Remove all characters until end of line or EOF
445  while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
446  {
447  ++pos;
448  cPeekNext = *pos;
449  }
450  reType = TokenType::Comment;
451  }
452  else
453  reType = TokenType::Operator;
454  }
455  else
456  {
457  // Apostrophe is Basic comment
458  if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
459  {
460  // Skip all characters until end of input or end of line:
461  for (;;) {
462  c = *pos;
463  if (c == 0 || testCharFlags(c, CharFlags::EOL)) {
464  break;
465  }
466  ++pos;
467  }
468 
469  reType = TokenType::Comment;
470  }
471 
472  // The real operator; can be easily used since not the actual
473  // operator (e.g. +=) is concerned, but the fact that it is one
474  if( reType != TokenType::Comment )
475  {
476  reType = TokenType::Operator;
477  }
478 
479  }
480  }
481 
482  // Object separator? Must be handled before Number
483  else if( c == '.' && ( *pos < '0' || *pos > '9' ) )
484  {
485  reType = TokenType::Operator;
486  }
487 
488  // Number?
489  else if( testCharFlags( c, CharFlags::StartNumber ) )
490  {
491  reType = TokenType::Number;
492 
493  // Number system, 10 = normal, it is changed for Oct/Hex
494  int nRadix = 10;
495 
496  // Is it an Oct or a Hex number?
497  if( c == '&' )
498  {
499  // Octal?
500  if( *pos == 'o' || *pos == 'O' )
501  {
502  // remove o
503  ++pos;
504  nRadix = 8; // Octal base
505 
506  // Read all numbers
507  while( testCharFlags( *pos, CharFlags::InOctNumber ) )
508  ++pos;
509  }
510  // Hexadecimal?
511  else if( *pos == 'h' || *pos == 'H' )
512  {
513  // remove x
514  ++pos;
515  nRadix = 16; // Hexadecimal base
516 
517  // Read all numbers
518  while( testCharFlags( *pos, CharFlags::InHexNumber ) )
519  ++pos;
520  }
521  else
522  {
523  reType = TokenType::Operator;
524  }
525  }
526 
527  // When it is not Oct or Hex, then it is double
528  if( reType == TokenType::Number && nRadix == 10 )
529  {
530  // Flag if the last character is an exponent
531  bool bAfterExpChar = false;
532 
533  // Read all numbers
534  while( testCharFlags( *pos, CharFlags::InNumber ) ||
535  (bAfterExpChar && *pos == '+' ) ||
536  (bAfterExpChar && *pos == '-' ) )
537  // After exponent +/- are OK, too
538  {
539  c = *pos++;
540  bAfterExpChar = ( c == 'e' || c == 'E' );
541  }
542  }
543  }
544 
545  // String?
546  else if( testCharFlags( c, CharFlags::StartString ) )
547  {
548  // Remember which character has opened the string
549  sal_Unicode cEndString = c;
550  if( c == '[' )
551  cEndString = ']';
552 
553  // Read all characters
554  while( *pos != cEndString )
555  {
556  // Detect EOF before reading next char, so we do not lose EOF
557  if( *pos == 0 )
558  {
559  // ERROR: unterminated string literal
560  reType = TokenType::Error;
561  break;
562  }
563  c = *pos++;
564  if( testCharFlags( c, CharFlags::EOL ) )
565  {
566  // ERROR: unterminated string literal
567  reType = TokenType::Error;
568  break;
569  }
570  }
571 
572  if( reType != TokenType::Error )
573  {
574  ++pos;
575  if( cEndString == ']' )
576  reType = TokenType::Identifier;
577  else
578  reType = TokenType::String;
579  }
580  }
581 
582  // End of line?
583  else if( testCharFlags( c, CharFlags::EOL ) )
584  {
585  // If another EOL character comes, read it
586  sal_Unicode cNext = *pos;
587  if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
588  ++pos;
589 
590  reType = TokenType::EOL;
591  }
592 
593  // All other will remain TokenType::Unknown
594 
595  // Save end position
596  rpEndPos = pos;
597  return true;
598 }
599 
601 {
602  // Fill character table
603  sal_uInt16 i;
604 
605  // Allowed characters for identifiers
606  CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
607  for( i = 'a' ; i <= 'z' ; i++ )
608  aCharTypeTab[i] |= nHelpMask;
609  for( i = 'A' ; i <= 'Z' ; i++ )
610  aCharTypeTab[i] |= nHelpMask;
611  aCharTypeTab[int('_')] |= nHelpMask;
612  aCharTypeTab[int('$')] |= nHelpMask;
613 
614  // Digit (can be identifier and number)
615  nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
616  CharFlags::InNumber | CharFlags::InHexNumber;
617  for( i = '0' ; i <= '9' ; i++ )
618  aCharTypeTab[i] |= nHelpMask;
619 
620  // Add e, E, . and & here manually
621  aCharTypeTab[int('e')] |= CharFlags::InNumber;
622  aCharTypeTab[int('E')] |= CharFlags::InNumber;
623  aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
624  aCharTypeTab[int('&')] |= CharFlags::StartNumber;
625 
626  // Hexadecimal digit
627  for( i = 'a' ; i <= 'f' ; i++ )
628  aCharTypeTab[i] |= CharFlags::InHexNumber;
629  for( i = 'A' ; i <= 'F' ; i++ )
630  aCharTypeTab[i] |= CharFlags::InHexNumber;
631 
632  // Octal digit
633  for( i = '0' ; i <= '7' ; i++ )
634  aCharTypeTab[i] |= CharFlags::InOctNumber;
635 
636  // String literal start/end characters
637  aCharTypeTab[int('\'')] |= CharFlags::StartString;
638  aCharTypeTab[int('\"')] |= CharFlags::StartString;
639  aCharTypeTab[int('[')] |= CharFlags::StartString;
640  aCharTypeTab[int('`')] |= CharFlags::StartString;
641 
642  // Operator characters
643  aCharTypeTab[int('!')] |= CharFlags::Operator;
644  aCharTypeTab[int('%')] |= CharFlags::Operator;
645  // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
646  aCharTypeTab[int('(')] |= CharFlags::Operator;
647  aCharTypeTab[int(')')] |= CharFlags::Operator;
648  aCharTypeTab[int('*')] |= CharFlags::Operator;
649  aCharTypeTab[int('+')] |= CharFlags::Operator;
650  aCharTypeTab[int(',')] |= CharFlags::Operator;
651  aCharTypeTab[int('-')] |= CharFlags::Operator;
652  aCharTypeTab[int('/')] |= CharFlags::Operator;
653  aCharTypeTab[int(':')] |= CharFlags::Operator;
654  aCharTypeTab[int('<')] |= CharFlags::Operator;
655  aCharTypeTab[int('=')] |= CharFlags::Operator;
656  aCharTypeTab[int('>')] |= CharFlags::Operator;
657  aCharTypeTab[int('?')] |= CharFlags::Operator;
658  aCharTypeTab[int('^')] |= CharFlags::Operator;
659  aCharTypeTab[int('|')] |= CharFlags::Operator;
660  aCharTypeTab[int('~')] |= CharFlags::Operator;
661  aCharTypeTab[int('{')] |= CharFlags::Operator;
662  aCharTypeTab[int('}')] |= CharFlags::Operator;
663  // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
664  aCharTypeTab[int(']')] |= CharFlags::Operator;
665  aCharTypeTab[int(';')] |= CharFlags::Operator;
666 
667  // Space
668  aCharTypeTab[int(' ') ] |= CharFlags::Space;
669  aCharTypeTab[int('\t')] |= CharFlags::Space;
670 
671  // End of line characters
672  aCharTypeTab[int('\r')] |= CharFlags::EOL;
673  aCharTypeTab[int('\n')] |= CharFlags::EOL;
674 
675  ppListKeyWords = nullptr;
676  nKeyWordCount = 0;
677 }
678 
680  /*out*/std::vector<HighlightPortion>& portions) const
681 {
682  // Set the position to the beginning of the source string
683  const sal_Unicode* pos = rLine.getStr();
684 
685  // Variables for the out parameter
687  const sal_Unicode* pStartPos;
688  const sal_Unicode* pEndPos;
689 
690  // Loop over all the tokens
691  while( getNextToken( pos, eType, pStartPos, pEndPos ) )
692  {
693  portions.emplace_back(
694  pStartPos - rLine.getStr(), pEndPos - rLine.getStr(), eType);
695  }
696 }
697 
698 
700  m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
701 {
702  switch (language)
703  {
705  m_tokenizer->setKeyWords( strListBasicKeyWords,
707  break;
709  m_tokenizer->setKeyWords( strListSqlKeyWords,
711  break;
712  default:
713  assert(false); // this cannot happen
714  }
715 }
716 
718 
719 void SyntaxHighlighter::getHighlightPortions(const OUString& rLine,
720  /*out*/std::vector<HighlightPortion>& portions) const
721 {
722  m_tokenizer->getHighlightPortions( rLine, portions );
723 }
724 
726 {
727  return m_tokenizer->aLanguage;
728 }
729 
730 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Tokenizer(HighlighterLanguage aLang)
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
static int compare_strings(const void *arg1, const void *arg2)
void getHighlightPortions(const OUString &rLine, std::vector< HighlightPortion > &pPortions) const
HighlighterLanguage GetLanguage() const
sal_uInt16 sal_Unicode
size_t pos
int nCount
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
#define EOL
#define SAL_N_ELEMENTS(arr)
DocumentType eType
HighlighterLanguage const aLanguage
int i
bool isAlpha(sal_Unicode c, bool bCompatible)
std::unique_ptr< Tokenizer > m_tokenizer
HighlighterLanguage
void getHighlightPortions(const OUString &rLine, std::vector< HighlightPortion > &portions) const
TokenType
bool getNextToken(const sal_Unicode *&pos, TokenType &reType, const sal_Unicode *&rpStartPos, const sal_Unicode *&rpEndPos) const
SyntaxHighlighter(const SyntaxHighlighter &)=delete
static const char * strListSqlKeyWords[]
CharFlags
void setKeyWords(const char **ppKeyWords, sal_uInt16 nCount)
static const char * strListBasicKeyWords[]