LibreOffice Module comphelper (master)  1
syntaxhighlight.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cassert>
23 
24 #include <rtl/character.hxx>
25 #include <unicode/uchar.h>
27 #include <o3tl/typed_flags_set.hxx>
28 
namespace {

// Classification bits for a single character. Every flag owns exactly one
// bit, so a table entry can carry any combination of them.
enum class CharFlags {
    StartIdentifier = 1 << 0,   // 0x0001: tested on the first character of an identifier
    InIdentifier    = 1 << 1,   // 0x0002: tested on the following identifier characters
    StartNumber     = 1 << 2,   // 0x0004: tested on the first character of a number
    InNumber        = 1 << 3,   // 0x0008: tested while reading a decimal number
    InHexNumber     = 1 << 4,   // 0x0010: tested while reading digits after &h / &H
    InOctNumber     = 1 << 5,   // 0x0020: tested while reading digits after &o / &O
    StartString     = 1 << 6,   // 0x0040: tested on the opening character of a string
    Operator        = 1 << 7,   // 0x0080: operator character
    Space           = 1 << 8,   // 0x0100: whitespace within a line
    EOL             = 1 << 9    // 0x0200: end-of-line character
};

}
46 
namespace o3tl {
    // Specialization that opts CharFlags into o3tl's typed_flags machinery
    // (presumably what provides the | and & operators this file applies to
    // CharFlags values -- confirm in o3tl/typed_flags_set.hxx).
    // The mask 0x03ff covers bits 0-9, i.e. exactly the ten CharFlags values
    // 0x0001 .. 0x0200; it has to grow if another flag is added.
    template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
}
50 
// ##########################################################################
// ATTENTION: all these words need to be in lower case
// ##########################################################################
// Keyword table for BASIC. The tokenizer looks words up with bsearch() and
// strcmp(), so the entries must also stay in ascending strcmp() order.
// Entries are grouped by initial letter purely for readability.
// NOTE(review): entries containing a space ("end if", "line input", ...) look
// unreachable through the identifier lookup, because a space never carries the
// InIdentifier flag -- confirm before relying on them.
static const char* strListBasicKeyWords[] = {
    // a
    "access", "alias", "and", "any", "append", "as", "attribute",
    // b
    "base", "binary", "boolean", "byref", "byte", "byval",
    // c
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    // d
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "doevents", "double",
    // e
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end property", "end select", "end sub", "end type", "endif", "enum",
    "eqv", "erase", "error", "exit", "explicit",
    // f
    "for", "function",
    // g
    "get", "global", "gosub", "goto",
    // i
    "if", "imp", "implements", "in", "input", "integer", "is",
    // l
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    // m
    "mod",
    // n
    "name", "new", "next", "not",
    // o
    "object", "on", "open", "option", "optional", "or", "output",
    // p
    "paramarray", "preserve", "print", "private", "property", "public",
    // r
    "random", "read", "redim", "rem", "resume", "return", "rset",
    // s
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    // t
    "text", "then", "to", "type", "typeof",
    // u
    "until",
    // v
    "variant", "vbasupport",
    // w
    "wend", "while", "with", "withevents", "write",
    // x
    "xor"
};
186 
187 
// Keyword table for SQL. As with the BASIC table, entries are lower case and
// kept in ascending strcmp() order because the tokenizer searches the active
// table with bsearch(). Entries are grouped by initial letter for readability.
static const char* strListSqlKeyWords[] = {
    // a
    "all", "and", "any", "as", "asc", "avg",
    // b
    "between", "by",
    // c
    "cast", "corresponding", "count", "create", "cross",
    // d
    "delete", "desc", "distinct", "drop",
    // e
    "escape", "except", "exists",
    // f
    "false", "from", "full",
    // g
    "global", "group",
    // h
    "having",
    // i
    "in", "inner", "insert", "intersect", "into", "is",
    // j
    "join",
    // l
    "left", "like", "limit", "local",
    // m
    "match", "max", "min",
    // n
    "natural", "not", "null",
    // o
    "on", "or", "order", "outer",
    // r
    "right",
    // s
    "select", "set", "some", "sum",
    // t
    "table", "temporary", "true",
    // u
    "union", "unique", "unknown", "update", "using",
    // v
    "values",
    // w
    "where"
};
252 
253 
extern "C" {

// Comparator in the shape bsearch() expects.
//   arg1: the search key, passed as the C string itself
//   arg2: address of a table slot, i.e. a pointer to a C string pointer
// Returns the strcmp() ordering of the key relative to the table entry.
static int compare_strings( const void *arg1, const void *arg2 )
{
    char const * const pKey = static_cast<char const *>(arg1);
    char * const * const ppSlot = static_cast<char * const *>(arg2);
    return strcmp( pKey, *ppSlot );
}

}
262 
263 namespace
264 {
265  bool isAlpha(sal_Unicode c)
266  {
267  if (rtl::isAsciiAlpha(c))
268  return true;
269  return u_isalpha(c);
270  }
271 }
272 
274 {
275  // Character information tables
277 
278  // Auxiliary function: testing of the character flags
279  bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
280 
281  // Get new token, EmptyString == nothing more over there
282  bool getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end, /*out*/TokenType& reType,
283  /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const;
284 
285  const char** ppListKeyWords;
286  sal_uInt16 nKeyWordCount;
287 
288 public:
290 
291  explicit Tokenizer( HighlighterLanguage aLang );
292 
293  void getHighlightPortions(std::u16string_view rLine,
294  /*out*/std::vector<HighlightPortion>& portions) const;
295  void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
296 };
297 
298 // Helper function: test character flag
300 {
301  bool bRet = false;
302  if( c != 0 && c <= 255 )
303  {
304  bRet = bool(aCharTypeTab[c] & nTestFlags);
305  }
306  else if( c > 255 )
307  {
308  bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
309  && isAlpha(c);
310  }
311  return bRet;
312 }
313 
314 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
315 {
316  ppListKeyWords = ppKeyWords;
317  nKeyWordCount = nCount;
318 }
319 
320 bool SyntaxHighlighter::Tokenizer::getNextToken(std::u16string_view::const_iterator& pos, std::u16string_view::const_iterator end,
321  /*out*/TokenType& reType,
322  /*out*/std::u16string_view::const_iterator& rpStartPos, /*out*/std::u16string_view::const_iterator& rpEndPos) const
323 {
324  reType = TokenType::Unknown;
325 
326  rpStartPos = pos;
327 
328  if( pos == end )
329  return false;
330 
331  sal_Unicode c = *pos;
332  ++pos;
333 
334  //*** Go through all possibilities ***
335  // Space?
336  if ( testCharFlags( c, CharFlags::Space ) )
337  {
338  while( pos != end && testCharFlags( *pos, CharFlags::Space ) )
339  ++pos;
340 
341  reType = TokenType::Whitespace;
342  }
343 
344  // Identifier?
345  else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
346  {
347  bool bIdentifierChar;
348  do
349  {
350  if (pos == end)
351  break;
352  // Fetch next character
353  c = *pos;
354  bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
355  if( bIdentifierChar )
356  ++pos;
357  }
358  while( bIdentifierChar );
359 
360  reType = TokenType::Identifier;
361 
362  // Keyword table
363  if (ppListKeyWords != nullptr)
364  {
365  int nCount = pos - rpStartPos;
366 
367  // No keyword if string contains char > 255
368  bool bCanBeKeyword = true;
369  for( int i = 0 ; i < nCount ; i++ )
370  {
371  if( rpStartPos[i] > 255 )
372  {
373  bCanBeKeyword = false;
374  break;
375  }
376  }
377 
378  if( bCanBeKeyword )
379  {
380  std::u16string_view aKWString(&*rpStartPos, nCount);
381  OString aByteStr = OUStringToOString(aKWString,
382  RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
383  if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
384  compare_strings ) )
385  {
386  reType = TokenType::Keywords;
387 
388  if( aByteStr == "rem" )
389  {
390  // Remove all characters until end of line or EOF
391  for (;;)
392  {
393  if (pos == end)
394  break;
395  sal_Unicode cPeek = *pos;
396  if ( testCharFlags( cPeek, CharFlags::EOL ) )
397  break;
398  ++pos;
399  }
400 
401  reType = TokenType::Comment;
402  }
403  }
404  }
405  }
406  }
407 
408  // Operator?
409  // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
410  else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
411  {
412  // parameters for SQL view
413  if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
414  {
415  if (c!='?')
416  {
417  bool bIdentifierChar;
418  do
419  {
420  // Get next character
421  if (pos == end)
422  break;
423  c = *pos;
424  bIdentifierChar = isAlpha(c);
425  if( bIdentifierChar )
426  ++pos;
427  }
428  while( bIdentifierChar );
429  }
430  reType = TokenType::Parameter;
431  }
432  else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
433  {
434  if (pos != end && *pos=='-')
435  {
436  // Remove all characters until end of line or EOF
437  while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
438  {
439  ++pos;
440  }
441  reType = TokenType::Comment;
442  }
443  else
444  reType = TokenType::Operator;
445  }
446  else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
447  {
448  if (pos != end && *pos=='/')
449  {
450  // Remove all characters until end of line or EOF
451  while( pos != end && !testCharFlags( *pos, CharFlags::EOL ) )
452  {
453  ++pos;
454  }
455  reType = TokenType::Comment;
456  }
457  else
458  reType = TokenType::Operator;
459  }
460  else
461  {
462  // Apostrophe is Basic comment
463  if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
464  {
465  // Skip all characters until end of input or end of line:
466  for (;;) {
467  if (pos == end)
468  break;
469  c = *pos;
470  if (testCharFlags(c, CharFlags::EOL)) {
471  break;
472  }
473  ++pos;
474  }
475 
476  reType = TokenType::Comment;
477  }
478 
479  // The real operator; can be easily used since not the actual
480  // operator (e.g. +=) is concerned, but the fact that it is one
481  if( reType != TokenType::Comment )
482  {
483  reType = TokenType::Operator;
484  }
485 
486  }
487  }
488 
489  // Object separator? Must be handled before Number
490  else if( c == '.' && ( pos == end || *pos < '0' || *pos > '9' ) )
491  {
492  reType = TokenType::Operator;
493  }
494 
495  // Number?
496  else if( testCharFlags( c, CharFlags::StartNumber ) )
497  {
498  reType = TokenType::Number;
499 
500  // Number system, 10 = normal, it is changed for Oct/Hex
501  int nRadix = 10;
502 
503  // Is it an Oct or a Hex number?
504  if( c == '&' )
505  {
506  // Octal?
507  if( pos != end && (*pos == 'o' || *pos == 'O' ))
508  {
509  // remove o
510  ++pos;
511  nRadix = 8; // Octal base
512 
513  // Read all numbers
514  while( pos != end && testCharFlags( *pos, CharFlags::InOctNumber ) )
515  ++pos;
516  }
517  // Hexadecimal?
518  else if( pos != end && (*pos == 'h' || *pos == 'H' ))
519  {
520  // remove x
521  ++pos;
522  nRadix = 16; // Hexadecimal base
523 
524  // Read all numbers
525  while( pos != end && testCharFlags( *pos, CharFlags::InHexNumber ) )
526  ++pos;
527  }
528  else
529  {
530  reType = TokenType::Operator;
531  }
532  }
533 
534  // When it is not Oct or Hex, then it is double
535  if( reType == TokenType::Number && nRadix == 10 )
536  {
537  // Flag if the last character is an exponent
538  bool bAfterExpChar = false;
539 
540  // Read all numbers
541  while( pos != end && (testCharFlags( *pos, CharFlags::InNumber ) ||
542  (bAfterExpChar && *pos == '+' ) ||
543  (bAfterExpChar && *pos == '-' ) ))
544  // After exponent +/- are OK, too
545  {
546  c = *pos++;
547  bAfterExpChar = ( c == 'e' || c == 'E' );
548  }
549  }
550  }
551 
552  // String?
553  else if( testCharFlags( c, CharFlags::StartString ) )
554  {
555  // Remember which character has opened the string
556  sal_Unicode cEndString = c;
557  if( c == '[' )
558  cEndString = ']';
559 
560  // Read all characters
561  while( pos == end || *pos != cEndString )
562  {
563  // Detect EOF before reading next char, so we do not lose EOF
564  if( pos == end )
565  {
566  // ERROR: unterminated string literal
567  reType = TokenType::Error;
568  break;
569  }
570  c = *pos++;
571  if( testCharFlags( c, CharFlags::EOL ) )
572  {
573  // ERROR: unterminated string literal
574  reType = TokenType::Error;
575  break;
576  }
577  }
578 
579  if( reType != TokenType::Error )
580  {
581  ++pos;
582  if( cEndString == ']' )
583  reType = TokenType::Identifier;
584  else
585  reType = TokenType::String;
586  }
587  }
588 
589  // End of line?
590  else if( testCharFlags( c, CharFlags::EOL ) )
591  {
592  // If another EOL character comes, read it
593  if (pos != end)
594  {
595  sal_Unicode cNext = *pos;
596  if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
597  ++pos;
598  }
599 
600  reType = TokenType::EOL;
601  }
602 
603  // All other will remain TokenType::Unknown
604 
605  // Save end position
606  rpEndPos = pos;
607  return true;
608 }
609 
611 {
612  // Fill character table
613  sal_uInt16 i;
614 
615  // Allowed characters for identifiers
616  CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
617  for( i = 'a' ; i <= 'z' ; i++ )
618  aCharTypeTab[i] |= nHelpMask;
619  for( i = 'A' ; i <= 'Z' ; i++ )
620  aCharTypeTab[i] |= nHelpMask;
621  aCharTypeTab[int('_')] |= nHelpMask;
622  aCharTypeTab[int('$')] |= nHelpMask;
623 
624  // Digit (can be identifier and number)
625  nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
626  CharFlags::InNumber | CharFlags::InHexNumber;
627  for( i = '0' ; i <= '9' ; i++ )
628  aCharTypeTab[i] |= nHelpMask;
629 
630  // Add e, E, . and & here manually
631  aCharTypeTab[int('e')] |= CharFlags::InNumber;
632  aCharTypeTab[int('E')] |= CharFlags::InNumber;
633  aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
634  aCharTypeTab[int('&')] |= CharFlags::StartNumber;
635 
636  // Hexadecimal digit
637  for( i = 'a' ; i <= 'f' ; i++ )
638  aCharTypeTab[i] |= CharFlags::InHexNumber;
639  for( i = 'A' ; i <= 'F' ; i++ )
640  aCharTypeTab[i] |= CharFlags::InHexNumber;
641 
642  // Octal digit
643  for( i = '0' ; i <= '7' ; i++ )
644  aCharTypeTab[i] |= CharFlags::InOctNumber;
645 
646  // String literal start/end characters
647  aCharTypeTab[int('\'')] |= CharFlags::StartString;
648  aCharTypeTab[int('\"')] |= CharFlags::StartString;
649  aCharTypeTab[int('[')] |= CharFlags::StartString;
650  aCharTypeTab[int('`')] |= CharFlags::StartString;
651 
652  // Operator characters
653  aCharTypeTab[int('!')] |= CharFlags::Operator;
654  aCharTypeTab[int('%')] |= CharFlags::Operator;
655  // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
656  aCharTypeTab[int('(')] |= CharFlags::Operator;
657  aCharTypeTab[int(')')] |= CharFlags::Operator;
658  aCharTypeTab[int('*')] |= CharFlags::Operator;
659  aCharTypeTab[int('+')] |= CharFlags::Operator;
660  aCharTypeTab[int(',')] |= CharFlags::Operator;
661  aCharTypeTab[int('-')] |= CharFlags::Operator;
662  aCharTypeTab[int('/')] |= CharFlags::Operator;
663  aCharTypeTab[int(':')] |= CharFlags::Operator;
664  aCharTypeTab[int('<')] |= CharFlags::Operator;
665  aCharTypeTab[int('=')] |= CharFlags::Operator;
666  aCharTypeTab[int('>')] |= CharFlags::Operator;
667  aCharTypeTab[int('?')] |= CharFlags::Operator;
668  aCharTypeTab[int('^')] |= CharFlags::Operator;
669  aCharTypeTab[int('|')] |= CharFlags::Operator;
670  aCharTypeTab[int('~')] |= CharFlags::Operator;
671  aCharTypeTab[int('{')] |= CharFlags::Operator;
672  aCharTypeTab[int('}')] |= CharFlags::Operator;
673  // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
674  aCharTypeTab[int(']')] |= CharFlags::Operator;
675  aCharTypeTab[int(';')] |= CharFlags::Operator;
676 
677  // Space
678  aCharTypeTab[int(' ') ] |= CharFlags::Space;
679  aCharTypeTab[int('\t')] |= CharFlags::Space;
680 
681  // End of line characters
682  aCharTypeTab[int('\r')] |= CharFlags::EOL;
683  aCharTypeTab[int('\n')] |= CharFlags::EOL;
684 
685  ppListKeyWords = nullptr;
686  nKeyWordCount = 0;
687 }
688 
690  /*out*/std::vector<HighlightPortion>& portions) const
691 {
692  // Set the position to the beginning of the source string
693  auto pos = rLine.begin();
694 
695  // Variables for the out parameter
697  std::u16string_view::const_iterator pStartPos;
698  std::u16string_view::const_iterator pEndPos;
699 
700  // Loop over all the tokens
701  while( getNextToken( pos, rLine.end(), eType, pStartPos, pEndPos ) )
702  {
703  portions.emplace_back(
704  pStartPos - rLine.begin(), pEndPos - rLine.begin(), eType);
705  }
706 }
707 
708 
710  m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
711 {
712  switch (language)
713  {
715  m_tokenizer->setKeyWords( strListBasicKeyWords,
716  std::size( strListBasicKeyWords ));
717  break;
719  m_tokenizer->setKeyWords( strListSqlKeyWords,
720  std::size( strListSqlKeyWords ));
721  break;
722  default:
723  assert(false); // this cannot happen
724  }
725 }
726 
728 
// Splits rLine into highlight portions by forwarding to the tokenizer held in
// m_tokenizer.
//   rLine:    the text to tokenize
//   portions: out parameter handed through unchanged to the tokenizer, which
//             appends one entry per token it reads
void SyntaxHighlighter::getHighlightPortions(std::u16string_view rLine,
    /*out*/std::vector<HighlightPortion>& portions) const
{
    m_tokenizer->getHighlightPortions( rLine, portions );
}
734 
736 {
737  return m_tokenizer->aLanguage;
738 }
739 
740 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Tokenizer(HighlighterLanguage aLang)
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
static int compare_strings(const void *arg1, const void *arg2)
HighlighterLanguage GetLanguage() const
sal_uInt16 sal_Unicode
size_t pos
int nCount
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
DocumentType eType
HighlighterLanguage const aLanguage
int i
bool isAlpha(sal_Unicode c, bool bCompatible)
std::unique_ptr< Tokenizer > m_tokenizer
HighlighterLanguage
bool getNextToken(std::u16string_view::const_iterator &pos, std::u16string_view::const_iterator end, TokenType &reType, std::u16string_view::const_iterator &rpStartPos, std::u16string_view::const_iterator &rpEndPos) const
TokenType
SyntaxHighlighter(const SyntaxHighlighter &)=delete
void getHighlightPortions(std::u16string_view rLine, std::vector< HighlightPortion > &pPortions) const
void getHighlightPortions(std::u16string_view rLine, std::vector< HighlightPortion > &portions) const
static const char * strListSqlKeyWords[]
CharFlags
void setKeyWords(const char **ppKeyWords, sal_uInt16 nCount)
static const char * strListBasicKeyWords[]