LibreOffice Module comphelper (master)  1
syntaxhighlight.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cassert>
23 
24 #include <rtl/character.hxx>
25 #include <unicode/uchar.h>
27 #include <o3tl/typed_flags_set.hxx>
28 
namespace {

// Classification bits for a single character.
// Every enumerator owns exactly one bit, so several properties can be
// combined in one table entry and tested with a mask.
enum class CharFlags {
    StartIdentifier = 1 << 0,   // 0x0001
    InIdentifier    = 1 << 1,   // 0x0002
    StartNumber     = 1 << 2,   // 0x0004
    InNumber        = 1 << 3,   // 0x0008
    InHexNumber     = 1 << 4,   // 0x0010
    InOctNumber     = 1 << 5,   // 0x0020
    StartString     = 1 << 6,   // 0x0040
    Operator        = 1 << 7,   // 0x0080
    Space           = 1 << 8,   // 0x0100
    EOL             = 1 << 9    // 0x0200
};

}
46 
// Registers CharFlags with o3tl's typed-flags facility. The mask 0x03ff is the
// union of all ten CharFlags bits (0x0001 .. 0x0200).
// NOTE(review): presumably this is what supplies the |, & and |= operators used
// on CharFlags further down - confirm against o3tl/typed_flags_set.hxx.
47 namespace o3tl {
48  template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
49 }
50 
// Keyword table for Basic, grouped below by initial letter.
// Two invariants must hold for every entry:
//  * lower case only - identifiers are lower-cased before they are looked up;
//  * ascending strcmp() order - the lookup is a bsearch() over this array.
static const char* strListBasicKeyWords[] = {
    // a
    "access", "alias", "and", "any", "append", "as", "attribute",
    // b
    "base", "binary", "boolean", "byref", "byte", "byval",
    // c
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    // d
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "doevents", "double",
    // e
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end property", "end select", "end sub", "end type", "endif", "enum",
    "eqv", "erase", "error", "exit", "explicit",
    // f
    "for", "function",
    // g
    "get", "global", "gosub", "goto",
    // i
    "if", "imp", "implements", "in", "input", "integer", "is",
    // l
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    // m
    "mod",
    // n
    "name", "new", "next", "not",
    // o
    "object", "on", "open", "option", "optional", "or", "output",
    // p
    "paramarray", "preserve", "print", "private", "property", "public",
    // r
    "random", "read", "redim", "rem", "resume", "return", "rset",
    // s
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    // t
    "text", "then", "to", "type", "typeof",
    // u
    "until",
    // v
    "variant", "vbasupport",
    // w
    "wend", "while", "with", "withevents", "write",
    // x
    "xor"
};
186 
187 
// Keyword table for SQL, grouped below by initial letter.
// Entries are lower case and in ascending strcmp() order, because the
// keyword lookup lower-cases the identifier and then runs bsearch() here.
static const char* strListSqlKeyWords[] = {
    // a
    "all", "and", "any", "as", "asc", "avg",
    // b
    "between", "by",
    // c
    "cast", "corresponding", "count", "create", "cross",
    // d
    "delete", "desc", "distinct", "drop",
    // e
    "escape", "except", "exists",
    // f
    "false", "from", "full",
    // g
    "global", "group",
    // h
    "having",
    // i
    "in", "inner", "insert", "intersect", "into", "is",
    // j
    "join",
    // l
    "left", "like", "limit", "local",
    // m
    "match", "max", "min",
    // n
    "natural", "not", "null",
    // o
    "on", "or", "order", "outer",
    // r
    "right",
    // s
    "select", "set", "some", "sum",
    // t
    "table", "temporary", "true",
    // u
    "union", "unique", "unknown", "update", "using",
    // v
    "values",
    // w
    "where"
};
252 
253 
extern "C" {

// Comparator for bsearch() over a table of C strings.
// arg1 is the search key itself (a NUL-terminated string); arg2 is the
// address of a table slot, i.e. a pointer to a string pointer.
// Returns the strcmp() result of key versus table entry.
static int compare_strings( const void *arg1, const void *arg2 )
{
    const char* const pKey = static_cast<const char*>(arg1);
    const char* const pEntry = *static_cast<const char* const*>(arg2);
    return strcmp( pKey, pEntry );
}

}
262 
263 namespace
264 {
265  bool isAlpha(sal_Unicode c)
266  {
267  if (rtl::isAsciiAlpha(c))
268  return true;
269  return u_isalpha(c);
270  }
271 }
272 
// Body of the tokenizer class used by the highlighter.
// NOTE(review): the class head and the declarations of the character table
// (aCharTypeTab) and of the language member (aLanguage) do not appear in this
// listing; both names are used by the member functions further down.
274 {
275  // Character information table: indexed by character code, holds the CharFlags of each character up to 255
277 
278  // Auxiliary function: true if character c carries any of the flags in nTestFlags
279  bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
280 
281  // Scan the next token starting at pos; returns false when pos is at the terminating 0.
  // On success pos is advanced past the token and the three out parameters describe it.
282  bool getNextToken(const sal_Unicode*& pos, /*out*/TokenType& reType,
283  /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const;
284 
  // Keyword table searched for every identifier (nullptr = no keyword detection)
  // and its number of entries; the table is only referenced, not copied.
285  const char** ppListKeyWords;
286  sal_uInt16 nKeyWordCount;
287 
288 public:
290 
  // Builds the character table; starts without a keyword table.
291  explicit Tokenizer( HighlighterLanguage aLang );
292 
  // Tokenizes rLine and appends one portion per token to portions.
293  void getHighlightPortions(const OUString& rLine,
294  /*out*/std::vector<HighlightPortion>& portions) const;
  // Installs the keyword table; it must be lower case and sorted for bsearch().
295  void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
296 };
297 
298 // Helper function: test character flags
// Returns true if character c carries at least one of the flags in nTestFlags.
//  - c == 0 never matches.
//  - 1..255 are looked up in aCharTypeTab.
//  - Characters above 255 have no table entry: they match only when nTestFlags
//    contains StartIdentifier and/or InIdentifier and isAlpha(c) is true.
300 {
301  bool bRet = false;
302  if( c != 0 && c <= 255 )
303  {
  // Any overlapping bit counts as a match
304  bRet = bool(aCharTypeTab[c] & nTestFlags);
305  }
306  else if( c > 255 )
307  {
  // Outside the table: only "is this an identifier character" can be answered
308  bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
309  && isAlpha(c);
310  }
311  return bRet;
312 }
313 
314 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
315 {
316  ppListKeyWords = ppKeyWords;
317  nKeyWordCount = nCount;
318 }
319 
// Scans exactly one token starting at pos.
//  pos        in/out: start of the scan; advanced past the token on success
//  reType     out: classification of the token (Unknown if no branch matches)
//  rpStartPos out: first character of the token (always set to the incoming pos)
//  rpEndPos   out: one past the last character of the token (set only on success)
// Returns false, without advancing pos, when *pos is the terminating 0;
// returns true otherwise. The order of the branches below matters: each
// character is claimed by the first test that matches it.
321  /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const
322 {
323  reType = TokenType::Unknown;
324 
325  rpStartPos = pos;
326 
327  sal_Unicode c = *pos;
328  if( c == 0 )
329  return false;
330 
  // c is the first character of the token; pos now points at the character after it
331  ++pos;
332 
333  //*** Go through all possibilities ***
334  // Space? A run of space characters forms one Whitespace token
335  if ( testCharFlags( c, CharFlags::Space ) )
336  {
337  while( testCharFlags( *pos, CharFlags::Space ) )
338  ++pos;
339 
340  reType = TokenType::Whitespace;
341  }
342 
343  // Identifier?
344  else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
345  {
346  bool bIdentifierChar;
347  do
348  {
349  // Look at the next character and consume it only if it belongs to the identifier
350  c = *pos;
351  bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
352  if( bIdentifierChar )
353  ++pos;
354  }
355  while( bIdentifierChar );
356 
357  reType = TokenType::Identifier;
358 
359  // Keyword table: an identifier found in the table is reclassified as a keyword
360  if (ppListKeyWords != nullptr)
361  {
362  int nCount = pos - rpStartPos;
363 
364  // No keyword if string contains char > 255
365  bool bCanBeKeyword = true;
366  for( int i = 0 ; i < nCount ; i++ )
367  {
368  if( rpStartPos[i] > 255 )
369  {
370  bCanBeKeyword = false;
371  break;
372  }
373  }
374 
375  if( bCanBeKeyword )
376  {
  // Lower-case the identifier and binary-search the table, which therefore
  // has to be lower case and sorted
377  OUString aKWString(rpStartPos, nCount);
378  OString aByteStr = OUStringToOString(aKWString,
379  RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
380  if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
381  compare_strings ) )
382  {
383  reType = TokenType::Keywords;
384 
  // The keyword "rem" turns the rest of the line into a comment
385  if( aByteStr == "rem" )
386  {
387  // Consume all characters up to, but not including, the end of line or end of input
388  sal_Unicode cPeek = *pos;
389  while( cPeek != 0 && !testCharFlags( cPeek, CharFlags::EOL ) )
390  {
391  cPeek = *++pos;
392  }
393 
394  reType = TokenType::Comment;
395  }
396  }
397  }
398  }
399  }
400 
401  // Operator?
402  // Only in Basic does '\'' start a comment; in other languages it opens a string and is handled in the string branch
403  else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
404  {
405  // SQL parameters: ':' followed by letters, or a single '?'
406  if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
407  {
408  if (c!='?')
409  {
410  bool bIdentifierChar;
411  do
412  {
413  // Look at the next character; only letters (isAlpha) extend the parameter name
414  c = *pos;
415  bIdentifierChar = isAlpha(c);
416  if( bIdentifierChar )
417  ++pos;
418  }
419  while( bIdentifierChar );
420  }
421  reType = TokenType::Parameter;
422  }
  // SQL line comment introduced by "--"
  // NOTE(review): a '-' that is not followed by a second '-' leaves reType at
  // Unknown instead of Operator - confirm that this is intended.
423  else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
424  {
425  sal_Unicode cPeekNext = *pos;
426  if (cPeekNext=='-')
427  {
428  // Consume all characters up to, but not including, the end of line or end of input
429  while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
430  {
431  ++pos;
432  cPeekNext = *pos;
433  }
434  reType = TokenType::Comment;
435  }
436  }
  // SQL line comment introduced by "//"
  // NOTE(review): as above, a single '/' leaves reType at Unknown - confirm.
437  else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
438  {
439  sal_Unicode cPeekNext = *pos;
440  if (cPeekNext=='/')
441  {
442  // Consume all characters up to, but not including, the end of line or end of input
443  while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
444  {
445  ++pos;
446  cPeekNext = *pos;
447  }
448  reType = TokenType::Comment;
449  }
450  }
451  else
452  {
453  // Apostrophe is Basic comment
454  if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
455  {
456  // Skip all characters until end of input or end of line:
457  for (;;) {
458  c = *pos;
459  if (c == 0 || testCharFlags(c, CharFlags::EOL)) {
460  break;
461  }
462  ++pos;
463  }
464 
465  reType = TokenType::Comment;
466  }
467 
468  // The real operator: only the single character is consumed, so a
469  // multi-character operator (e.g. +=) is reported as consecutive Operator tokens
470  if( reType != TokenType::Comment )
471  {
472  reType = TokenType::Operator;
473  }
474 
475  }
476  }
477 
478  // Object separator? A '.' not followed by a digit. Must be handled before Number, because '.' may also start a number
479  else if( c == '.' && ( *pos < '0' || *pos > '9' ) )
480  {
481  reType = TokenType::Operator;
482  }
483 
484  // Number?
485  else if( testCharFlags( c, CharFlags::StartNumber ) )
486  {
487  reType = TokenType::Number;
488 
489  // Number system, 10 = normal, it is changed for Oct/Hex
490  int nRadix = 10;
491 
492  // Is it an Oct or a Hex number? ("&o..." / "&h...")
493  if( c == '&' )
494  {
495  // Octal?
496  if( *pos == 'o' || *pos == 'O' )
497  {
498  // skip the 'o'
499  ++pos;
500  nRadix = 8; // Octal base
501 
502  // Read all octal digits
503  while( testCharFlags( *pos, CharFlags::InOctNumber ) )
504  ++pos;
505  }
506  // Hexadecimal?
507  else if( *pos == 'h' || *pos == 'H' )
508  {
509  // skip the 'h'
510  ++pos;
511  nRadix = 16; // Hexadecimal base
512 
513  // Read all hexadecimal digits
514  while( testCharFlags( *pos, CharFlags::InHexNumber ) )
515  ++pos;
516  }
517  else
518  {
  // A '&' followed by neither o nor h is not a number prefix
519  reType = TokenType::Operator;
520  }
521  }
522 
523  // When it is neither Oct nor Hex, read it as a decimal / floating point literal
524  if( reType == TokenType::Number && nRadix == 10 )
525  {
526  // Flag: the character consumed last was the exponent marker (e or E)
527  bool bAfterExpChar = false;
528 
529  // Read all characters that may occur inside a number
530  while( testCharFlags( *pos, CharFlags::InNumber ) ||
531  (bAfterExpChar && *pos == '+' ) ||
532  (bAfterExpChar && *pos == '-' ) )
533  // After exponent +/- are OK, too
534  {
535  c = *pos++;
536  bAfterExpChar = ( c == 'e' || c == 'E' );
537  }
538  }
539  }
540 
541  // String?
542  else if( testCharFlags( c, CharFlags::StartString ) )
543  {
544  // Remember which character closes the string: the opening character itself, or ']' for '['
545  sal_Unicode cEndString = c;
546  if( c == '[' )
547  cEndString = ']';
548 
549  // Read all characters up to the closing character
550  while( *pos != cEndString )
551  {
552  // Detect EOF before reading next char, so we do not lose EOF
553  if( *pos == 0 )
554  {
555  // ERROR: unterminated string literal
556  reType = TokenType::Error;
557  break;
558  }
559  c = *pos++;
560  if( testCharFlags( c, CharFlags::EOL ) )
561  {
562  // ERROR: unterminated string literal (the end-of-line character has been consumed)
563  reType = TokenType::Error;
564  break;
565  }
566  }
567 
568  if( reType != TokenType::Error )
569  {
  // Consume the closing character; a [bracketed] name is reported as an identifier
570  ++pos;
571  if( cEndString == ']' )
572  reType = TokenType::Identifier;
573  else
574  reType = TokenType::String;
575  }
576  }
577 
578  // End of line?
579  else if( testCharFlags( c, CharFlags::EOL ) )
580  {
581  // If a different EOL character follows (e.g. "\r\n"), it belongs to the same token
582  sal_Unicode cNext = *pos;
583  if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
584  ++pos;
585 
586  reType = TokenType::EOL;
587  }
588 
589  // All other will remain TokenType::Unknown
590 
591  // Save end position
592  rpEndPos = pos;
593  return true;
594 }
595 
// Constructor body: fills the character classification table aCharTypeTab and
// starts without a keyword table (setKeyWords installs one later).
// NOTE(review): flags are only ever OR-ed in, so this assumes aCharTypeTab
// starts out zeroed - its declaration is not visible in this listing.
597 {
598  // Fill character table
599  sal_uInt16 i;
600 
601  // Allowed characters for identifiers: letters, '_' and '$' may start and continue one
602  CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
603  for( i = 'a' ; i <= 'z' ; i++ )
604  aCharTypeTab[i] |= nHelpMask;
605  for( i = 'A' ; i <= 'Z' ; i++ )
606  aCharTypeTab[i] |= nHelpMask;
607  aCharTypeTab[int('_')] |= nHelpMask;
608  aCharTypeTab[int('$')] |= nHelpMask;
609 
610  // Digit (can continue an identifier, and start or continue a number)
611  nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
612  CharFlags::InNumber | CharFlags::InHexNumber;
613  for( i = '0' ; i <= '9' ; i++ )
614  aCharTypeTab[i] |= nHelpMask;
615 
616  // Add e, E, . and & here manually: exponent markers, decimal point, and the "&o"/"&h" radix prefix
617  aCharTypeTab[int('e')] |= CharFlags::InNumber;
618  aCharTypeTab[int('E')] |= CharFlags::InNumber;
619  aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
620  aCharTypeTab[int('&')] |= CharFlags::StartNumber;
621 
622  // Hexadecimal digit
623  for( i = 'a' ; i <= 'f' ; i++ )
624  aCharTypeTab[i] |= CharFlags::InHexNumber;
625  for( i = 'A' ; i <= 'F' ; i++ )
626  aCharTypeTab[i] |= CharFlags::InHexNumber;
627 
628  // Octal digit
629  for( i = '0' ; i <= '7' ; i++ )
630  aCharTypeTab[i] |= CharFlags::InOctNumber;
631 
632  // String literal start/end characters
633  aCharTypeTab[int('\'')] |= CharFlags::StartString;
634  aCharTypeTab[int('\"')] |= CharFlags::StartString;
635  aCharTypeTab[int('[')] |= CharFlags::StartString;
636  aCharTypeTab[int('`')] |= CharFlags::StartString;
637 
638  // Operator characters
639  aCharTypeTab[int('!')] |= CharFlags::Operator;
640  aCharTypeTab[int('%')] |= CharFlags::Operator;
641  // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
642  aCharTypeTab[int('(')] |= CharFlags::Operator;
643  aCharTypeTab[int(')')] |= CharFlags::Operator;
644  aCharTypeTab[int('*')] |= CharFlags::Operator;
645  aCharTypeTab[int('+')] |= CharFlags::Operator;
646  aCharTypeTab[int(',')] |= CharFlags::Operator;
647  aCharTypeTab[int('-')] |= CharFlags::Operator;
648  aCharTypeTab[int('/')] |= CharFlags::Operator;
649  aCharTypeTab[int(':')] |= CharFlags::Operator;
650  aCharTypeTab[int('<')] |= CharFlags::Operator;
651  aCharTypeTab[int('=')] |= CharFlags::Operator;
652  aCharTypeTab[int('>')] |= CharFlags::Operator;
653  aCharTypeTab[int('?')] |= CharFlags::Operator;
654  aCharTypeTab[int('^')] |= CharFlags::Operator;
655  aCharTypeTab[int('|')] |= CharFlags::Operator;
656  aCharTypeTab[int('~')] |= CharFlags::Operator;
657  aCharTypeTab[int('{')] |= CharFlags::Operator;
658  aCharTypeTab[int('}')] |= CharFlags::Operator;
659  // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
660  aCharTypeTab[int(']')] |= CharFlags::Operator;
661  aCharTypeTab[int(';')] |= CharFlags::Operator;
662 
663  // Space
664  aCharTypeTab[int(' ') ] |= CharFlags::Space;
665  aCharTypeTab[int('\t')] |= CharFlags::Space;
666 
667  // End of line characters
668  aCharTypeTab[int('\r')] |= CharFlags::EOL;
669  aCharTypeTab[int('\n')] |= CharFlags::EOL;
670 
  // No keyword table yet: identifiers are not reclassified until setKeyWords is called
671  ppListKeyWords = nullptr;
672  nKeyWordCount = 0;
673 }
674 
// Splits rLine into tokens and appends one entry per token to portions
// (entries already in the vector are kept). Each entry is built from the
// token's start offset, its end offset (one past the last character) and its
// token type; both offsets are counted from the start of rLine.
// NOTE(review): the declaration of eType is not visible in this listing.
676  /*out*/std::vector<HighlightPortion>& portions) const
677 {
678  // Set the position to the beginning of the source string
679  const sal_Unicode* pos = rLine.getStr();
680 
681  // Variables for the out parameters of getNextToken
683  const sal_Unicode* pStartPos;
684  const sal_Unicode* pEndPos;
685 
686  // Loop over all the tokens; getNextToken returns false at the terminating 0
687  while( getNextToken( pos, eType, pStartPos, pEndPos ) )
688  {
  // Convert the token's pointers into offsets relative to the line start
689  portions.emplace_back(
690  pStartPos - rLine.getStr(), pEndPos - rLine.getStr(), eType);
691  }
692 }
693 
694 
// Creates the tokenizer for the given language and installs the keyword table
// that belongs to it; any other language value trips the assertion.
// NOTE(review): the case labels and the second argument of each setKeyWords
// call are not visible in this listing - presumably one case per language,
// each passing the element count of its table. Confirm against the full file.
696  m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
697 {
698  switch (language)
699  {
  // Basic keyword table
701  m_tokenizer->setKeyWords( strListBasicKeyWords,
703  break;
  // SQL keyword table
705  m_tokenizer->setKeyWords( strListSqlKeyWords,
707  break;
708  default:
709  assert(false); // this cannot happen
710  }
711 }
712 
714 
715 void SyntaxHighlighter::getHighlightPortions(const OUString& rLine,
716  /*out*/std::vector<HighlightPortion>& portions) const
717 {
718  m_tokenizer->getHighlightPortions( rLine, portions );
719 }
720 
// Returns the language stored in the tokenizer's aLanguage member.
722 {
723  return m_tokenizer->aLanguage;
724 }
725 
726 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Tokenizer(HighlighterLanguage aLang)
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
static int compare_strings(const void *arg1, const void *arg2)
void getHighlightPortions(const OUString &rLine, std::vector< HighlightPortion > &pPortions) const
HighlighterLanguage GetLanguage() const
sal_uInt16 sal_Unicode
const BorderLinePrimitive2D *pCandidateB assert(pCandidateA)
size_t pos
int nCount
bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
#define EOL
#define SAL_N_ELEMENTS(arr)
DocumentType eType
HighlighterLanguage const aLanguage
int i
bool isAlpha(sal_Unicode c, bool bCompatible)
std::unique_ptr< Tokenizer > m_tokenizer
HighlighterLanguage
OString OUStringToOString(const OUString &str, ConnectionSettings const *settings)
void getHighlightPortions(const OUString &rLine, std::vector< HighlightPortion > &portions) const
TokenType
bool getNextToken(const sal_Unicode *&pos, TokenType &reType, const sal_Unicode *&rpStartPos, const sal_Unicode *&rpEndPos) const
SyntaxHighlighter(const SyntaxHighlighter &)=delete
static const char * strListSqlKeyWords[]
CharFlags
void setKeyWords(const char **ppKeyWords, sal_uInt16 nCount)
static const char * strListBasicKeyWords[]