1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cassert>
23 
24 #include <rtl/character.hxx>
25 #include <unicode/uchar.h>
26 #include <comphelper/syntaxhighlight.hxx>
27 #include <o3tl/typed_flags_set.hxx>
28 
namespace {

// Bit flags describing the lexical roles a single character can play.
// One character may carry several flags at once (e.g. a digit is valid
// inside identifiers and also starts/continues numbers).
enum class CharFlags {
    StartIdentifier   = 1 << 0,   // may begin an identifier
    InIdentifier      = 1 << 1,   // may continue an identifier
    StartNumber       = 1 << 2,   // may begin a numeric literal
    InNumber          = 1 << 3,   // may continue a decimal literal
    InHexNumber       = 1 << 4,   // may continue a hexadecimal literal
    InOctNumber       = 1 << 5,   // may continue an octal literal
    StartString       = 1 << 6,   // opens a string / quoted identifier
    Operator          = 1 << 7,   // operator or punctuation character
    Space             = 1 << 8,   // horizontal whitespace
    EOL               = 1 << 9    // end-of-line character
};

}
46 
47 namespace o3tl {
48     template<> struct typed_flags<CharFlags> : is_typed_flags<CharFlags, 0x03ff> {};
49 }
50 
51 // ##########################################################################
52 // ATTENTION: all these words need to be in lower case
53 // ##########################################################################
// Keyword table for BASIC, one row per initial letter.
// The table is searched with bsearch() using strcmp() ordering, so besides
// being lower case the entries MUST stay sorted in ascending strcmp() order.
// NOTE(review): the multi-word entries ("end if", "line input", ...) look
// unreachable through the identifier lookup, because a space is not an
// identifier character in the tokenizer's table -- confirm before relying
// on them.
static const char* strListBasicKeyWords[] = {
    /* a */ "access", "alias", "and", "any", "append", "as", "attribute",
    /* b */ "base", "binary", "boolean", "byref", "byte", "byval",
    /* c */ "call", "case", "cdecl", "classmodule", "close", "compare",
            "compatible", "const", "currency",
    /* d */ "date", "declare", "defbool", "defcur", "defdate", "defdbl",
            "deferr", "defint", "deflng", "defobj", "defsng", "defstr",
            "defvar", "dim", "do", "doevents", "double",
    /* e */ "each", "else", "elseif", "end", "end enum", "end function",
            "end if", "end property", "end select", "end sub", "end type",
            "endif", "enum", "eqv", "erase", "error", "exit", "explicit",
    /* f */ "for", "function",
    /* g */ "get", "global", "gosub", "goto",
    /* i */ "if", "imp", "implements", "in", "input", "integer", "is",
    /* l */ "let", "lib", "like", "line", "line input", "local", "lock",
            "long", "loop", "lprint", "lset",
    /* m */ "mod",
    /* n */ "name", "new", "next", "not",
    /* o */ "object", "on", "open", "option", "optional", "or", "output",
    /* p */ "paramarray", "preserve", "print", "private", "property",
            "public",
    /* r */ "random", "read", "redim", "rem", "resume", "return", "rset",
    /* s */ "select", "set", "shared", "single", "static", "step", "stop",
            "string", "sub", "system",
    /* t */ "text", "then", "to", "type", "typeof",
    /* u */ "until",
    /* v */ "variant", "vbasupport",
    /* w */ "wend", "while", "with", "withevents", "write",
    /* x */ "xor"
};
186 
187 
// Keyword table for SQL, one row per initial letter.
// Like the BASIC table it is searched with bsearch() using strcmp()
// ordering: entries must be lower case and sorted in ascending strcmp()
// order.
static const char* strListSqlKeyWords[] = {
    /* a */ "all", "and", "any", "as", "asc", "avg",
    /* b */ "between", "by",
    /* c */ "cast", "corresponding", "count", "create", "cross",
    /* d */ "delete", "desc", "distinct", "drop",
    /* e */ "escape", "except", "exists",
    /* f */ "false", "from", "full",
    /* g */ "global", "group",
    /* h */ "having",
    /* i */ "in", "inner", "insert", "intersect", "into", "is",
    /* j */ "join",
    /* l */ "left", "like", "limit", "local",
    /* m */ "match", "max", "min",
    /* n */ "natural", "not", "null",
    /* o */ "on", "or", "order", "outer",
    /* r */ "right",
    /* s */ "select", "set", "some", "sum",
    /* t */ "table", "temporary", "true",
    /* u */ "union", "unique", "unknown", "update", "using",
    /* v */ "values",
    /* w */ "where"
};
252 
253 
extern "C" {

// Comparison callback for bsearch() over a keyword table.
// arg1 is the search key and is itself a NUL-terminated string; arg2 points
// at a table element, i.e. at a pointer to a NUL-terminated string.
// Returns the strcmp() result of key versus table entry.
static int compare_strings( const void *arg1, const void *arg2 )
{
    char const * const pKey = static_cast<char const *>(arg1);
    char * const * const ppEntry = static_cast<char * const *>(arg2);
    return strcmp( pKey, *ppEntry );
}

}
262 
263 namespace
264 {
isAlpha(sal_Unicode c)265     bool isAlpha(sal_Unicode c)
266     {
267         if (rtl::isAsciiAlpha(c))
268             return true;
269         return u_isalpha(c);
270     }
271 }
272 
273 class SyntaxHighlighter::Tokenizer
274 {
275     // Character information tables
276     CharFlags aCharTypeTab[256] = {};
277 
278     // Auxiliary function: testing of the character flags
279     bool testCharFlags(sal_Unicode c, CharFlags nTestFlags) const;
280 
281     // Get new token, EmptyString == nothing more over there
282     bool getNextToken(const sal_Unicode*& pos, /*out*/TokenType& reType,
283         /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const;
284 
285     const char** ppListKeyWords;
286     sal_uInt16 nKeyWordCount;
287 
288 public:
289     HighlighterLanguage const aLanguage;
290 
291     explicit Tokenizer( HighlighterLanguage aLang );
292 
293     void getHighlightPortions(const OUString& rLine,
294                                /*out*/std::vector<HighlightPortion>& portions) const;
295     void setKeyWords( const char** ppKeyWords, sal_uInt16 nCount );
296 };
297 
298 // Helper function: test character flag
testCharFlags(sal_Unicode c,CharFlags nTestFlags) const299 bool SyntaxHighlighter::Tokenizer::testCharFlags(sal_Unicode c, CharFlags nTestFlags) const
300 {
301     bool bRet = false;
302     if( c != 0 && c <= 255 )
303     {
304         bRet = bool(aCharTypeTab[c] & nTestFlags);
305     }
306     else if( c > 255 )
307     {
308         bRet = (( CharFlags::StartIdentifier | CharFlags::InIdentifier ) & nTestFlags)
309             && isAlpha(c);
310     }
311     return bRet;
312 }
313 
setKeyWords(const char ** ppKeyWords,sal_uInt16 nCount)314 void SyntaxHighlighter::Tokenizer::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
315 {
316     ppListKeyWords = ppKeyWords;
317     nKeyWordCount = nCount;
318 }
319 
getNextToken(const sal_Unicode * & pos,TokenType & reType,const sal_Unicode * & rpStartPos,const sal_Unicode * & rpEndPos) const320 bool SyntaxHighlighter::Tokenizer::getNextToken(const sal_Unicode*& pos, /*out*/TokenType& reType,
321     /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos) const
322 {
323     reType = TokenType::Unknown;
324 
325     rpStartPos = pos;
326 
327     sal_Unicode c = *pos;
328     if( c == 0 )
329         return false;
330 
331     ++pos;
332 
333     //*** Go through all possibilities ***
334     // Space?
335     if ( testCharFlags( c, CharFlags::Space ) )
336     {
337         while( testCharFlags( *pos, CharFlags::Space ) )
338             ++pos;
339 
340         reType = TokenType::Whitespace;
341     }
342 
343     // Identifier?
344     else if ( testCharFlags( c, CharFlags::StartIdentifier ) )
345     {
346         bool bIdentifierChar;
347         do
348         {
349             // Fetch next character
350             c = *pos;
351             bIdentifierChar = testCharFlags( c, CharFlags::InIdentifier );
352             if( bIdentifierChar )
353                 ++pos;
354         }
355         while( bIdentifierChar );
356 
357         reType = TokenType::Identifier;
358 
359         // Keyword table
360         if (ppListKeyWords != nullptr)
361         {
362             int nCount = pos - rpStartPos;
363 
364             // No keyword if string contains char > 255
365             bool bCanBeKeyword = true;
366             for( int i = 0 ; i < nCount ; i++ )
367             {
368                 if( rpStartPos[i] > 255 )
369                 {
370                     bCanBeKeyword = false;
371                     break;
372                 }
373             }
374 
375             if( bCanBeKeyword )
376             {
377                 OUString aKWString(rpStartPos, nCount);
378                 OString aByteStr = OUStringToOString(aKWString,
379                     RTL_TEXTENCODING_ASCII_US).toAsciiLowerCase();
380                 if ( bsearch( aByteStr.getStr(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
381                                                                         compare_strings ) )
382                 {
383                     reType = TokenType::Keywords;
384 
385                     if( aByteStr == "rem" )
386                     {
387                         // Remove all characters until end of line or EOF
388                         sal_Unicode cPeek = *pos;
389                         while( cPeek != 0 && !testCharFlags( cPeek, CharFlags::EOL ) )
390                         {
391                             cPeek = *++pos;
392                         }
393 
394                         reType = TokenType::Comment;
395                     }
396                 }
397             }
398         }
399     }
400 
401     // Operator?
402     // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
403     else if ( testCharFlags( c, CharFlags::Operator ) || ( (c == '\'') && (aLanguage==HighlighterLanguage::Basic)) )
404     {
405         // parameters for SQL view
406         if (((c==':') || (c=='?')) && (aLanguage == HighlighterLanguage::SQL))
407         {
408             if (c!='?')
409             {
410                 bool bIdentifierChar;
411                 do
412                 {
413                     // Get next character
414                     c = *pos;
415                     bIdentifierChar = isAlpha(c);
416                     if( bIdentifierChar )
417                         ++pos;
418                 }
419                 while( bIdentifierChar );
420             }
421             reType = TokenType::Parameter;
422         }
423         else if ((c=='-') && (aLanguage == HighlighterLanguage::SQL))
424         {
425             sal_Unicode cPeekNext = *pos;
426             if (cPeekNext=='-')
427             {
428                 // Remove all characters until end of line or EOF
429                 while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
430                 {
431                     ++pos;
432                     cPeekNext = *pos;
433                 }
434                 reType = TokenType::Comment;
435             }
436             else
437                 reType = TokenType::Operator;
438         }
439         else if ((c=='/') && (aLanguage == HighlighterLanguage::SQL))
440         {
441             sal_Unicode cPeekNext = *pos;
442             if (cPeekNext=='/')
443             {
444                 // Remove all characters until end of line or EOF
445                 while( cPeekNext != 0 && !testCharFlags( cPeekNext, CharFlags::EOL ) )
446                 {
447                     ++pos;
448                     cPeekNext = *pos;
449                 }
450                 reType = TokenType::Comment;
451             }
452             else
453                 reType = TokenType::Operator;
454         }
455         else
456         {
457             // Apostrophe is Basic comment
458             if (( c == '\'') && (aLanguage == HighlighterLanguage::Basic))
459             {
460                 // Skip all characters until end of input or end of line:
461                 for (;;) {
462                     c = *pos;
463                     if (c == 0 || testCharFlags(c, CharFlags::EOL)) {
464                         break;
465                     }
466                     ++pos;
467                 }
468 
469                 reType = TokenType::Comment;
470             }
471 
472             // The real operator; can be easily used since not the actual
473             // operator (e.g. +=) is concerned, but the fact that it is one
474             if( reType != TokenType::Comment )
475             {
476                 reType = TokenType::Operator;
477             }
478 
479         }
480     }
481 
482     // Object separator? Must be handled before Number
483     else if( c == '.' && ( *pos < '0' || *pos > '9' ) )
484     {
485         reType = TokenType::Operator;
486     }
487 
488     // Number?
489     else if( testCharFlags( c, CharFlags::StartNumber ) )
490     {
491         reType = TokenType::Number;
492 
493         // Number system, 10 = normal, it is changed for Oct/Hex
494         int nRadix = 10;
495 
496         // Is it an Oct or a Hex number?
497         if( c == '&' )
498         {
499             // Octal?
500             if( *pos == 'o' || *pos == 'O' )
501             {
502                 // remove o
503                 ++pos;
504                 nRadix = 8;     // Octal base
505 
506                 // Read all numbers
507                 while( testCharFlags( *pos, CharFlags::InOctNumber ) )
508                     ++pos;
509             }
510             // Hexadecimal?
511             else if( *pos == 'h' || *pos == 'H' )
512             {
513                 // remove x
514                 ++pos;
515                 nRadix = 16;     // Hexadecimal base
516 
517                 // Read all numbers
518                 while( testCharFlags( *pos, CharFlags::InHexNumber ) )
519                     ++pos;
520             }
521             else
522             {
523                 reType = TokenType::Operator;
524             }
525         }
526 
527         // When it is not Oct or Hex, then it is double
528         if( reType == TokenType::Number && nRadix == 10 )
529         {
530             // Flag if the last character is an exponent
531             bool bAfterExpChar = false;
532 
533             // Read all numbers
534             while( testCharFlags( *pos, CharFlags::InNumber ) ||
535                     (bAfterExpChar && *pos == '+' ) ||
536                     (bAfterExpChar && *pos == '-' ) )
537                     // After exponent +/- are OK, too
538             {
539                 c = *pos++;
540                 bAfterExpChar = ( c == 'e' || c == 'E' );
541             }
542         }
543     }
544 
545     // String?
546     else if( testCharFlags( c, CharFlags::StartString ) )
547     {
548         // Remember which character has opened the string
549         sal_Unicode cEndString = c;
550         if( c == '[' )
551             cEndString = ']';
552 
553         // Read all characters
554         while( *pos != cEndString )
555         {
556             // Detect EOF before reading next char, so we do not lose EOF
557             if( *pos == 0 )
558             {
559                 // ERROR: unterminated string literal
560                 reType = TokenType::Error;
561                 break;
562             }
563             c = *pos++;
564             if( testCharFlags( c, CharFlags::EOL ) )
565             {
566                 // ERROR: unterminated string literal
567                 reType = TokenType::Error;
568                 break;
569             }
570         }
571 
572         if( reType != TokenType::Error )
573         {
574             ++pos;
575             if( cEndString == ']' )
576                 reType = TokenType::Identifier;
577             else
578                 reType = TokenType::String;
579         }
580     }
581 
582     // End of line?
583     else if( testCharFlags( c, CharFlags::EOL ) )
584     {
585         // If another EOL character comes, read it
586         sal_Unicode cNext = *pos;
587         if( cNext != c && testCharFlags( cNext, CharFlags::EOL ) )
588             ++pos;
589 
590         reType = TokenType::EOL;
591     }
592 
593     // All other will remain TokenType::Unknown
594 
595     // Save end position
596     rpEndPos = pos;
597     return true;
598 }
599 
Tokenizer(HighlighterLanguage aLang)600 SyntaxHighlighter::Tokenizer::Tokenizer( HighlighterLanguage aLang ): aLanguage(aLang)
601 {
602     // Fill character table
603     sal_uInt16 i;
604 
605     // Allowed characters for identifiers
606     CharFlags nHelpMask = CharFlags::StartIdentifier | CharFlags::InIdentifier;
607     for( i = 'a' ; i <= 'z' ; i++ )
608         aCharTypeTab[i] |= nHelpMask;
609     for( i = 'A' ; i <= 'Z' ; i++ )
610         aCharTypeTab[i] |= nHelpMask;
611     aCharTypeTab[int('_')] |= nHelpMask;
612     aCharTypeTab[int('$')] |= nHelpMask;
613 
614     // Digit (can be identifier and number)
615     nHelpMask = CharFlags::InIdentifier | CharFlags::StartNumber |
616                          CharFlags::InNumber | CharFlags::InHexNumber;
617     for( i = '0' ; i <= '9' ; i++ )
618         aCharTypeTab[i] |= nHelpMask;
619 
620     // Add e, E, . and & here manually
621     aCharTypeTab[int('e')] |= CharFlags::InNumber;
622     aCharTypeTab[int('E')] |= CharFlags::InNumber;
623     aCharTypeTab[int('.')] |= CharFlags::InNumber | CharFlags::StartNumber;
624     aCharTypeTab[int('&')] |= CharFlags::StartNumber;
625 
626     // Hexadecimal digit
627     for( i = 'a' ; i <= 'f' ; i++ )
628         aCharTypeTab[i] |= CharFlags::InHexNumber;
629     for( i = 'A' ; i <= 'F' ; i++ )
630         aCharTypeTab[i] |= CharFlags::InHexNumber;
631 
632     // Octal digit
633     for( i = '0' ; i <= '7' ; i++ )
634         aCharTypeTab[i] |= CharFlags::InOctNumber;
635 
636     // String literal start/end characters
637     aCharTypeTab[int('\'')] |= CharFlags::StartString;
638     aCharTypeTab[int('\"')] |= CharFlags::StartString;
639     aCharTypeTab[int('[')]  |= CharFlags::StartString;
640     aCharTypeTab[int('`')]  |= CharFlags::StartString;
641 
642     // Operator characters
643     aCharTypeTab[int('!')] |= CharFlags::Operator;
644     aCharTypeTab[int('%')] |= CharFlags::Operator;
645     // aCharTypeTab[(int)'&'] |= CharFlags::Operator;     Removed because of #i14140
646     aCharTypeTab[int('(')] |= CharFlags::Operator;
647     aCharTypeTab[int(')')] |= CharFlags::Operator;
648     aCharTypeTab[int('*')] |= CharFlags::Operator;
649     aCharTypeTab[int('+')] |= CharFlags::Operator;
650     aCharTypeTab[int(',')] |= CharFlags::Operator;
651     aCharTypeTab[int('-')] |= CharFlags::Operator;
652     aCharTypeTab[int('/')] |= CharFlags::Operator;
653     aCharTypeTab[int(':')] |= CharFlags::Operator;
654     aCharTypeTab[int('<')] |= CharFlags::Operator;
655     aCharTypeTab[int('=')] |= CharFlags::Operator;
656     aCharTypeTab[int('>')] |= CharFlags::Operator;
657     aCharTypeTab[int('?')] |= CharFlags::Operator;
658     aCharTypeTab[int('^')] |= CharFlags::Operator;
659     aCharTypeTab[int('|')] |= CharFlags::Operator;
660     aCharTypeTab[int('~')] |= CharFlags::Operator;
661     aCharTypeTab[int('{')] |= CharFlags::Operator;
662     aCharTypeTab[int('}')] |= CharFlags::Operator;
663     // aCharTypeTab[(int)'['] |= CharFlags::Operator;     Removed because of #i17826
664     aCharTypeTab[int(']')] |= CharFlags::Operator;
665     aCharTypeTab[int(';')] |= CharFlags::Operator;
666 
667     // Space
668     aCharTypeTab[int(' ') ] |= CharFlags::Space;
669     aCharTypeTab[int('\t')] |= CharFlags::Space;
670 
671     // End of line characters
672     aCharTypeTab[int('\r')] |= CharFlags::EOL;
673     aCharTypeTab[int('\n')] |= CharFlags::EOL;
674 
675     ppListKeyWords = nullptr;
676     nKeyWordCount = 0;
677 }
678 
getHighlightPortions(const OUString & rLine,std::vector<HighlightPortion> & portions) const679 void SyntaxHighlighter::Tokenizer::getHighlightPortions(const OUString& rLine,
680                                                  /*out*/std::vector<HighlightPortion>& portions) const
681 {
682     // Set the position to the beginning of the source string
683     const sal_Unicode* pos = rLine.getStr();
684 
685     // Variables for the out parameter
686     TokenType eType;
687     const sal_Unicode* pStartPos;
688     const sal_Unicode* pEndPos;
689 
690     // Loop over all the tokens
691     while( getNextToken( pos, eType, pStartPos, pEndPos ) )
692     {
693         portions.emplace_back(
694                 pStartPos - rLine.getStr(), pEndPos - rLine.getStr(), eType);
695     }
696 }
697 
698 
SyntaxHighlighter(HighlighterLanguage language)699 SyntaxHighlighter::SyntaxHighlighter(HighlighterLanguage language):
700     m_tokenizer(new SyntaxHighlighter::Tokenizer(language))
701 {
702     switch (language)
703     {
704         case HighlighterLanguage::Basic:
705             m_tokenizer->setKeyWords( strListBasicKeyWords,
706                                       SAL_N_ELEMENTS( strListBasicKeyWords ));
707             break;
708         case HighlighterLanguage::SQL:
709             m_tokenizer->setKeyWords( strListSqlKeyWords,
710                                       SAL_N_ELEMENTS( strListSqlKeyWords ));
711             break;
712         default:
713             assert(false); // this cannot happen
714     }
715 }
716 
~SyntaxHighlighter()717 SyntaxHighlighter::~SyntaxHighlighter() {}
718 
getHighlightPortions(const OUString & rLine,std::vector<HighlightPortion> & portions) const719 void SyntaxHighlighter::getHighlightPortions(const OUString& rLine,
720                                               /*out*/std::vector<HighlightPortion>& portions) const
721 {
722     m_tokenizer->getHighlightPortions( rLine, portions );
723 }
724 
GetLanguage() const725 HighlighterLanguage SyntaxHighlighter::GetLanguage() const
726 {
727     return m_tokenizer->aLanguage;
728 }
729 
730 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
731