1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20 #include <sal/config.h>
21
22 #include <cassert>
23
24 #include <rtl/character.hxx>
25 #include <unicode/uchar.h>
26 #include <comphelper/syntaxhighlight.hxx>
27 #include <o3tl/typed_flags_set.hxx>
28
namespace {

// Bit flags describing the roles a character may play for the tokenizer.
// Each enumerator occupies exactly one bit so that several roles can be
// combined for a single character.
enum class CharFlags {
    StartIdentifier = 1 << 0,   // may begin an identifier
    InIdentifier    = 1 << 1,   // may continue an identifier
    StartNumber     = 1 << 2,   // may begin a numeric literal
    InNumber        = 1 << 3,   // may continue a decimal literal
    InHexNumber     = 1 << 4,   // may continue a hexadecimal literal
    InOctNumber     = 1 << 5,   // may continue an octal literal
    StartString     = 1 << 6,   // opens a quoted string / bracketed name
    Operator        = 1 << 7,   // operator character
    Space           = 1 << 8,   // horizontal whitespace
    EOL             = 1 << 9    // end-of-line character
};

}
46
47 namespace o3tl {
48 template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
49 }
50
// ##########################################################################
// Basic keyword table.
// ATTENTION: every word must be written in lower case, because candidate
// identifiers are lower-cased before the lookup.  The table is searched
// with bsearch()/strcmp(), so the entries must also stay sorted in strcmp()
// order.
// ##########################################################################
static const char* strListBasicKeyWords[] = {
    // a
    "access", "alias", "and", "any", "append", "as", "attribute",
    // b
    "base", "binary", "boolean", "byref", "byte", "byval",
    // c
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    // d
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "doevents", "double",
    // e
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end property", "end select", "end sub", "end type", "endif", "enum",
    "eqv", "erase", "error", "exit", "explicit",
    // f
    "for", "function",
    // g
    "get", "global", "gosub", "goto",
    // i
    "if", "imp", "implements", "in", "input", "integer", "is",
    // l
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    // m
    "mod",
    // n
    "name", "new", "next", "not",
    // o
    "object", "on", "open", "option", "optional", "or", "output",
    // p
    "paramarray", "preserve", "print", "private", "property", "public",
    // r
    "random", "read", "redim", "rem", "resume", "return", "rset",
    // s
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    // t
    "text", "then", "to", "type", "typeof",
    // u
    "until",
    // v
    "variant", "vbasupport",
    // w
    "wend", "while", "with", "withevents", "write",
    // x
    "xor"
};
186
187
// SQL keyword table.  Entries are lower case and kept in strcmp() order,
// because the table is searched with bsearch()/strcmp() using a lower-cased
// key.
static const char* strListSqlKeyWords[] = {
    // a - b
    "all", "and", "any", "as", "asc", "avg",
    "between", "by",
    // c - d
    "cast", "corresponding", "count", "create", "cross",
    "delete", "desc", "distinct", "drop",
    // e - h
    "escape", "except", "exists",
    "false", "from", "full",
    "global", "group",
    "having",
    // i - l
    "in", "inner", "insert", "intersect", "into", "is",
    "join",
    "left", "like", "limit", "local",
    // m - o
    "match", "max", "min",
    "natural", "not", "null",
    "on", "or", "order", "outer",
    // r - t
    "right",
    "select", "set", "some", "sum",
    "table", "temporary", "true",
    // u - w
    "union", "unique", "unknown", "update", "using",
    "values",
    "where"
};
252
253
254 extern "C" {
255
// Comparison callback for bsearch() over a table of C strings.
// arg1 is the search key (a NUL-terminated string), arg2 points at a table
// element (a pointer to a NUL-terminated string).  Returns the strcmp()
// ordering of the key relative to that element.
static int compare_strings( const void *arg1, const void *arg2 )
{
    const char* pKey = static_cast<const char*>(arg1);
    const char* pEntry = *static_cast<const char* const*>(arg2);
    return strcmp( pKey, pEntry );
}
260
261 }
262
263 namespace
264 {
isAlpha(sal_Unicode c)265 bool isAlpha(sal_Unicode c)
266 {
267 if (rtl::isAsciiAlpha(c))
268 return true;
269 return u_isalpha(c);
270 }
271 }
272
273 class SyntaxHighlighter::Tokenizer
274 {
275 // Character information tables
276 CharFlags aCharTypeTab[256] = {};
277
278 // Auxiliary function: testing of the character flags
279 bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
280
281 // Get new token, EmptyString == nothing more over there
282 bool getNextToken(const sal_Unicode*& pos, /*out*/TokenType& reType,
283 /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const;
284
285 const char** ppListKeyWords;
286 sal_uInt16 nKeyWordCount;
287
288 public:
289 HighlighterLanguage const aLanguage;
290
291 explicit Tokenizer( HighlighterLanguage aLang );
292
293 void getHighlightPortions(const OUString& rLine,
294 /*out*/std::vector<HighlightPortion>& portions) const;
295 void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
296 };
297
298 // Helper function: test character flag
testCharFlags(sal_Unicode c,CharFlags nTestFlags) const299 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
300 {
301 bool bRet = false;
302 if( c != 0 && c <= 255 )
303 {
304 bRet = bool(aCharTypeTab[c] & nTestFlags);
305 }
306 else if( c > 255 )
307 {
308 bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
309 && isAlpha(c);
310 }
311 return bRet;
312 }
313
setKeyWords(const char ** ppKeyWords,sal_uInt16 nCount)314 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
315 {
316 ppListKeyWords = ppKeyWords;
317 nKeyWordCount = nCount;
318 }
319
getNextToken(const sal_Unicode * & pos,TokenType & reType,const sal_Unicode * & rpStartPos,const sal_Unicode * & rpEndPos) const320 bool SyntaxHighlighter::Tokenizer::getNextToken(const sal_Unicode*& pos, /*out*/TokenType& reType,
321 /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const
322 {
323 reType = TokenType::Unknown;
324
325 rpStartPos = pos;
326
327 sal_Unicode c = *pos;
328 if( c == 0 )
329 return false;
330
331 ++pos;
332
333 //*** Go through all possibilities ***
334 // Space?
335 if ( testCharFlags( c, CharFlags::Space ) )
336 {
337 while( testCharFlags( *pos, CharFlags::Space ) )
338 ++pos;
339
340 reType = TokenType::Whitespace;
341 }
342
343 // Identifier?
344 else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
345 {
346 bool bIdentifierChar;
347 do
348 {
349 // Fetch next character
350 c = *pos;
351 bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
352 if( bIdentifierChar )
353 ++pos;
354 }
355 while( bIdentifierChar );
356
357 reType = TokenType::Identifier;
358
359 // Keyword table
360 if (ppListKeyWords != nullptr)
361 {
362 int nCount = pos - rpStartPos;
363
364 // No keyword if string contains char > 255
365 bool bCanBeKeyword = true;
366 for( int i = 0 ; i < nCount ; i++ )
367 {
368 if( rpStartPos[i] > 255 )
369 {
370 bCanBeKeyword = false;
371 break;
372 }
373 }
374
375 if( bCanBeKeyword )
376 {
377 OUString aKWString(rpStartPos, nCount);
378 OString aByteStr = OUStringToOString(aKWString,
379 RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
380 if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
381 compare_strings ) )
382 {
383 reType = TokenType::Keywords;
384
385 if( aByteStr == "rem" )
386 {
387 // Remove all characters until end of line or EOF
388 sal_Unicode cPeek = *pos;
389 while( cPeek != 0 && !testCharFlags( cPeek, CharFlags::EOL ) )
390 {
391 cPeek = *++pos;
392 }
393
394 reType = TokenType::Comment;
395 }
396 }
397 }
398 }
399 }
400
401 // Operator?
402 // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
403 else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
404 {
405 // parameters for SQL view
406 if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
407 {
408 if (c!='?')
409 {
410 bool bIdentifierChar;
411 do
412 {
413 // Get next character
414 c = *pos;
415 bIdentifierChar = isAlpha(c);
416 if( bIdentifierChar )
417 ++pos;
418 }
419 while( bIdentifierChar );
420 }
421 reType = TokenType::Parameter;
422 }
423 else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
424 {
425 sal_Unicode cPeekNext = *pos;
426 if (cPeekNext=='-')
427 {
428 // Remove all characters until end of line or EOF
429 while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
430 {
431 ++pos;
432 cPeekNext = *pos;
433 }
434 reType = TokenType::Comment;
435 }
436 else
437 reType = TokenType::Operator;
438 }
439 else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
440 {
441 sal_Unicode cPeekNext = *pos;
442 if (cPeekNext=='/')
443 {
444 // Remove all characters until end of line or EOF
445 while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
446 {
447 ++pos;
448 cPeekNext = *pos;
449 }
450 reType = TokenType::Comment;
451 }
452 else
453 reType = TokenType::Operator;
454 }
455 else
456 {
457 // Apostrophe is Basic comment
458 if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
459 {
460 // Skip all characters until end of input or end of line:
461 for (;;) {
462 c = *pos;
463 if (c == 0 || testCharFlags(c, CharFlags::EOL)) {
464 break;
465 }
466 ++pos;
467 }
468
469 reType = TokenType::Comment;
470 }
471
472 // The real operator; can be easily used since not the actual
473 // operator (e.g. +=) is concerned, but the fact that it is one
474 if( reType != TokenType::Comment )
475 {
476 reType = TokenType::Operator;
477 }
478
479 }
480 }
481
482 // Object separator? Must be handled before Number
483 else if( c == '.' && ( *pos < '0' || *pos > '9' ) )
484 {
485 reType = TokenType::Operator;
486 }
487
488 // Number?
489 else if( testCharFlags( c, CharFlags::StartNumber ) )
490 {
491 reType = TokenType::Number;
492
493 // Number system, 10 = normal, it is changed for Oct/Hex
494 int nRadix = 10;
495
496 // Is it an Oct or a Hex number?
497 if( c == '&' )
498 {
499 // Octal?
500 if( *pos == 'o' || *pos == 'O' )
501 {
502 // remove o
503 ++pos;
504 nRadix = 8; // Octal base
505
506 // Read all numbers
507 while( testCharFlags( *pos, CharFlags::InOctNumber ) )
508 ++pos;
509 }
510 // Hexadecimal?
511 else if( *pos == 'h' || *pos == 'H' )
512 {
513 // remove x
514 ++pos;
515 nRadix = 16; // Hexadecimal base
516
517 // Read all numbers
518 while( testCharFlags( *pos, CharFlags::InHexNumber ) )
519 ++pos;
520 }
521 else
522 {
523 reType = TokenType::Operator;
524 }
525 }
526
527 // When it is not Oct or Hex, then it is double
528 if( reType == TokenType::Number && nRadix == 10 )
529 {
530 // Flag if the last character is an exponent
531 bool bAfterExpChar = false;
532
533 // Read all numbers
534 while( testCharFlags( *pos, CharFlags::InNumber ) ||
535 (bAfterExpChar && *pos == '+' ) ||
536 (bAfterExpChar && *pos == '-' ) )
537 // After exponent +/- are OK, too
538 {
539 c = *pos++;
540 bAfterExpChar = ( c == 'e' || c == 'E' );
541 }
542 }
543 }
544
545 // String?
546 else if( testCharFlags( c, CharFlags::StartString ) )
547 {
548 // Remember which character has opened the string
549 sal_Unicode cEndString = c;
550 if( c == '[' )
551 cEndString = ']';
552
553 // Read all characters
554 while( *pos != cEndString )
555 {
556 // Detect EOF before reading next char, so we do not lose EOF
557 if( *pos == 0 )
558 {
559 // ERROR: unterminated string literal
560 reType = TokenType::Error;
561 break;
562 }
563 c = *pos++;
564 if( testCharFlags( c, CharFlags::EOL ) )
565 {
566 // ERROR: unterminated string literal
567 reType = TokenType::Error;
568 break;
569 }
570 }
571
572 if( reType != TokenType::Error )
573 {
574 ++pos;
575 if( cEndString == ']' )
576 reType = TokenType::Identifier;
577 else
578 reType = TokenType::String;
579 }
580 }
581
582 // End of line?
583 else if( testCharFlags( c, CharFlags::EOL ) )
584 {
585 // If another EOL character comes, read it
586 sal_Unicode cNext = *pos;
587 if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
588 ++pos;
589
590 reType = TokenType::EOL;
591 }
592
593 // All other will remain TokenType::Unknown
594
595 // Save end position
596 rpEndPos = pos;
597 return true;
598 }
599
Tokenizer(HighlighterLanguage aLang)600 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
601 {
602 // Fill character table
603 sal_uInt16 i;
604
605 // Allowed characters for identifiers
606 CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
607 for( i = 'a' ; i <= 'z' ; i++ )
608 aCharTypeTab[i] |= nHelpMask;
609 for( i = 'A' ; i <= 'Z' ; i++ )
610 aCharTypeTab[i] |= nHelpMask;
611 aCharTypeTab[int('_')] |= nHelpMask;
612 aCharTypeTab[int('$')] |= nHelpMask;
613
614 // Digit (can be identifier and number)
615 nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
616 CharFlags::InNumber | CharFlags::InHexNumber;
617 for( i = '0' ; i <= '9' ; i++ )
618 aCharTypeTab[i] |= nHelpMask;
619
620 // Add e, E, . and & here manually
621 aCharTypeTab[int('e')] |= CharFlags::InNumber;
622 aCharTypeTab[int('E')] |= CharFlags::InNumber;
623 aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
624 aCharTypeTab[int('&')] |= CharFlags::StartNumber;
625
626 // Hexadecimal digit
627 for( i = 'a' ; i <= 'f' ; i++ )
628 aCharTypeTab[i] |= CharFlags::InHexNumber;
629 for( i = 'A' ; i <= 'F' ; i++ )
630 aCharTypeTab[i] |= CharFlags::InHexNumber;
631
632 // Octal digit
633 for( i = '0' ; i <= '7' ; i++ )
634 aCharTypeTab[i] |= CharFlags::InOctNumber;
635
636 // String literal start/end characters
637 aCharTypeTab[int('\'')] |= CharFlags::StartString;
638 aCharTypeTab[int('\"')] |= CharFlags::StartString;
639 aCharTypeTab[int('[')] |= CharFlags::StartString;
640 aCharTypeTab[int('`')] |= CharFlags::StartString;
641
642 // Operator characters
643 aCharTypeTab[int('!')] |= CharFlags::Operator;
644 aCharTypeTab[int('%')] |= CharFlags::Operator;
645 // aCharTypeTab[(int)'&'] |= CharFlags::Operator; Removed because of #i14140
646 aCharTypeTab[int('(')] |= CharFlags::Operator;
647 aCharTypeTab[int(')')] |= CharFlags::Operator;
648 aCharTypeTab[int('*')] |= CharFlags::Operator;
649 aCharTypeTab[int('+')] |= CharFlags::Operator;
650 aCharTypeTab[int(',')] |= CharFlags::Operator;
651 aCharTypeTab[int('-')] |= CharFlags::Operator;
652 aCharTypeTab[int('/')] |= CharFlags::Operator;
653 aCharTypeTab[int(':')] |= CharFlags::Operator;
654 aCharTypeTab[int('<')] |= CharFlags::Operator;
655 aCharTypeTab[int('=')] |= CharFlags::Operator;
656 aCharTypeTab[int('>')] |= CharFlags::Operator;
657 aCharTypeTab[int('?')] |= CharFlags::Operator;
658 aCharTypeTab[int('^')] |= CharFlags::Operator;
659 aCharTypeTab[int('|')] |= CharFlags::Operator;
660 aCharTypeTab[int('~')] |= CharFlags::Operator;
661 aCharTypeTab[int('{')] |= CharFlags::Operator;
662 aCharTypeTab[int('}')] |= CharFlags::Operator;
663 // aCharTypeTab[(int)'['] |= CharFlags::Operator; Removed because of #i17826
664 aCharTypeTab[int(']')] |= CharFlags::Operator;
665 aCharTypeTab[int(';')] |= CharFlags::Operator;
666
667 // Space
668 aCharTypeTab[int(' ') ] |= CharFlags::Space;
669 aCharTypeTab[int('\t')] |= CharFlags::Space;
670
671 // End of line characters
672 aCharTypeTab[int('\r')] |= CharFlags::EOL;
673 aCharTypeTab[int('\n')] |= CharFlags::EOL;
674
675 ppListKeyWords = nullptr;
676 nKeyWordCount = 0;
677 }
678
getHighlightPortions(const OUString & rLine,std::vector<HighlightPortion> & portions) const679 void SyntaxHighlighter::Tokenizer::getHighlightPortions(const OUString& rLine,
680 /*out*/std::vector<HighlightPortion>& portions) const
681 {
682 // Set the position to the beginning of the source string
683 const sal_Unicode* pos = rLine.getStr();
684
685 // Variables for the out parameter
686 TokenType eType;
687 const sal_Unicode* pStartPos;
688 const sal_Unicode* pEndPos;
689
690 // Loop over all the tokens
691 while( getNextToken( pos, eType, pStartPos, pEndPos ) )
692 {
693 portions.emplace_back(
694 pStartPos - rLine.getStr(), pEndPos - rLine.getStr(), eType);
695 }
696 }
697
698
SyntaxHighlighter(HighlighterLanguage language)699 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
700 m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
701 {
702 switch (language)
703 {
704 case HighlighterLanguage::Basic:
705 m_tokenizer->setKeyWords( strListBasicKeyWords,
706 SAL_N_ELEMENTS( strListBasicKeyWords ));
707 break;
708 case HighlighterLanguage::SQL:
709 m_tokenizer->setKeyWords( strListSqlKeyWords,
710 SAL_N_ELEMENTS( strListSqlKeyWords ));
711 break;
712 default:
713 assert(false); // this cannot happen
714 }
715 }
716
~SyntaxHighlighter()717 SyntaxHighlighter::~SyntaxHighlighter() {}
718
getHighlightPortions(const OUString & rLine,std::vector<HighlightPortion> & portions) const719 void SyntaxHighlighter::getHighlightPortions(const OUString& rLine,
720 /*out*/std::vector<HighlightPortion>& portions) const
721 {
722 m_tokenizer->getHighlightPortions( rLine, portions );
723 }
724
GetLanguage() const725 HighlighterLanguage SyntaxHighlighter::GetLanguage() const
726 {
727 return m_tokenizer->aLanguage;
728 }
729
730 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
731