1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3 ** Lexer for Objective Caml.
4 **/
5 // Copyright 2005-2009 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /* Release History
8 20050204 Initial release.
9 20050205 Quick compiler standards/"cleanliness" adjustment.
10 20050206 Added cast for IsLeadByte().
11 20050209 Changes to "external" build support.
12 20050306 Fix for 1st-char-in-doc "corner" case.
13 20050502 Fix for [harmless] one-past-the-end coloring.
14 20050515 Refined numeric token recognition logic.
15 20051125 Added 2nd "optional" keywords class.
16 20051129 Support "magic" (read-only) comments for RCaml.
17 20051204 Swtich to using StyleContext infrastructure.
18 20090629 Add full Standard ML '97 support.
19 */
20
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdarg.h>
25 #include <assert.h>
26 #include <ctype.h>
27
28 #include "ILexer.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
31
32 #include "PropSetSimple.h"
33 #include "WordList.h"
34 #include "LexAccessor.h"
35 #include "Accessor.h"
36 #include "StyleContext.h"
37 #include "CharacterSet.h"
38 #include "LexerModule.h"
39
40 #if defined(__clang__)
41 #pragma clang diagnostic ignored "-Wcomma"
42 #endif
43
44 // Since the Microsoft __iscsym[f] funcs are not ANSI...
iscaml(int c)45 inline int iscaml(int c) {return isalnum(c) || c == '_';}
iscamlf(int c)46 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
47
48 static const int baseT[24] = {
49 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A - L */
50 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,16 /* M - X */
51 };
52
53 using namespace Scintilla;
54
55 #ifdef BUILD_AS_EXTERNAL_LEXER
56 /*
57 (actually seems to work!)
58 */
59 #include <string>
60 #include "WindowAccessor.h"
61 #include "ExternalLexer.h"
62
63 #undef EXT_LEXER_DECL
64 #define EXT_LEXER_DECL __declspec( dllexport ) __stdcall
65
66 #if PLAT_WIN
67 #include <windows.h>
68 #endif
69
70 static void ColouriseCamlDoc(
71 Sci_PositionU startPos, Sci_Position length,
72 int initStyle,
73 WordList *keywordlists[],
74 Accessor &styler);
75
76 static void FoldCamlDoc(
77 Sci_PositionU startPos, Sci_Position length,
78 int initStyle,
79 WordList *keywordlists[],
80 Accessor &styler);
81
82 static void InternalLexOrFold(int lexOrFold, Sci_PositionU startPos, Sci_Position length,
83 int initStyle, char *words[], WindowID window, char *props);
84
85 static const char* LexerName = "caml";
86
87 #ifdef TRACE
DebugPrintf(const char * format,...)88 void Platform::DebugPrintf(const char *format, ...) {
89 char buffer[2000];
90 va_list pArguments;
91 va_start(pArguments, format);
92 vsprintf(buffer,format,pArguments);
93 va_end(pArguments);
94 Platform::DebugDisplay(buffer);
95 }
96 #else
DebugPrintf(const char *,...)97 void Platform::DebugPrintf(const char *, ...) {
98 }
99 #endif
100
IsDBCSLeadByte(int codePage,char ch)101 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
102 return ::IsDBCSLeadByteEx(codePage, ch) != 0;
103 }
104
SendScintilla(WindowID w,unsigned int msg,unsigned long wParam,long lParam)105 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
106 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
107 }
108
SendScintillaPointer(WindowID w,unsigned int msg,unsigned long wParam,void * lParam)109 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
110 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
111 reinterpret_cast<LPARAM>(lParam));
112 }
113
Fold(unsigned int lexer,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)114 void EXT_LEXER_DECL Fold(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
115 int initStyle, char *words[], WindowID window, char *props)
116 {
117 // below useless evaluation(s) to supress "not used" warnings
118 lexer;
119 // build expected data structures and do the Fold
120 InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
121
122 }
123
GetLexerCount()124 int EXT_LEXER_DECL GetLexerCount()
125 {
126 return 1; // just us [Objective] Caml lexers here!
127 }
128
GetLexerName(unsigned int Index,char * name,int buflength)129 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
130 {
131 // below useless evaluation(s) to supress "not used" warnings
132 Index;
133 // return as much of our lexer name as will fit (what's up with Index?)
134 if (buflength > 0) {
135 buflength--;
136 int n = strlen(LexerName);
137 if (n > buflength)
138 n = buflength;
139 memcpy(name, LexerName, n), name[n] = '\0';
140 }
141 }
142
Lex(unsigned int lexer,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)143 void EXT_LEXER_DECL Lex(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
144 int initStyle, char *words[], WindowID window, char *props)
145 {
146 // below useless evaluation(s) to supress "not used" warnings
147 lexer;
148 // build expected data structures and do the Lex
149 InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
150 }
151
InternalLexOrFold(int foldOrLex,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)152 static void InternalLexOrFold(int foldOrLex, Sci_PositionU startPos, Sci_Position length,
153 int initStyle, char *words[], WindowID window, char *props)
154 {
155 // create and initialize a WindowAccessor (including contained PropSet)
156 PropSetSimple ps;
157 ps.SetMultiple(props);
158 WindowAccessor wa(window, ps);
159 // create and initialize WordList(s)
160 int nWL = 0;
161 for (; words[nWL]; nWL++) ; // count # of WordList PTRs needed
162 WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
163 int i = 0;
164 for (; i < nWL; i++) {
165 wl[i] = new WordList(); // (works or THROWS bad_alloc EXCEPTION)
166 wl[i]->Set(words[i]);
167 }
168 wl[i] = 0;
169 // call our "internal" folder/lexer (... then do Flush!)
170 if (foldOrLex)
171 FoldCamlDoc(startPos, length, initStyle, wl, wa);
172 else
173 ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
174 wa.Flush();
175 // clean up before leaving
176 for (i = nWL - 1; i >= 0; i--)
177 delete wl[i];
178 delete [] wl;
179 }
180
181 static
182 #endif /* BUILD_AS_EXTERNAL_LEXER */
183
ColouriseCamlDoc(Sci_PositionU startPos,Sci_Position length,int initStyle,WordList * keywordlists[],Accessor & styler)184 void ColouriseCamlDoc(
185 Sci_PositionU startPos, Sci_Position length,
186 int initStyle,
187 WordList *keywordlists[],
188 Accessor &styler)
189 {
190 // initialize styler
191 StyleContext sc(startPos, length, initStyle, styler);
192
193 Sci_PositionU chToken = 0;
194 int chBase = 0, chLit = 0;
195 WordList& keywords = *keywordlists[0];
196 WordList& keywords2 = *keywordlists[1];
197 WordList& keywords3 = *keywordlists[2];
198 const bool isSML = keywords.InList("andalso");
199 const int useMagic = styler.GetPropertyInt("lexer.caml.magic", 0);
200
201 // set up [initial] state info (terminating states that shouldn't "bleed")
202 const int state_ = sc.state & 0x0f;
203 if (state_ <= SCE_CAML_CHAR
204 || (isSML && state_ == SCE_CAML_STRING))
205 sc.state = SCE_CAML_DEFAULT;
206 int nesting = (state_ >= SCE_CAML_COMMENT)? (state_ - SCE_CAML_COMMENT): 0;
207
208 // foreach char in range...
209 while (sc.More()) {
210 // set up [per-char] state info
211 int state2 = -1; // (ASSUME no state change)
212 Sci_Position chColor = sc.currentPos - 1;// (ASSUME standard coloring range)
213 bool advance = true; // (ASSUME scanner "eats" 1 char)
214
215 // step state machine
216 switch (sc.state & 0x0f) {
217 case SCE_CAML_DEFAULT:
218 chToken = sc.currentPos; // save [possible] token start (JIC)
219 // it's wide open; what do we have?
220 if (iscamlf(sc.ch))
221 state2 = SCE_CAML_IDENTIFIER;
222 else if (!isSML && sc.Match('`') && iscamlf(sc.chNext))
223 state2 = SCE_CAML_TAGNAME;
224 else if (!isSML && sc.Match('#') && isdigit(sc.chNext))
225 state2 = SCE_CAML_LINENUM;
226 else if (isdigit(sc.ch)) {
227 // it's a number, assume base 10
228 state2 = SCE_CAML_NUMBER, chBase = 10;
229 if (sc.Match('0')) {
230 // there MAY be a base specified...
231 const char* baseC = "bBoOxX";
232 if (isSML) {
233 if (sc.chNext == 'w')
234 sc.Forward(); // (consume SML "word" indicator)
235 baseC = "x";
236 }
237 // ... change to specified base AS REQUIRED
238 if (strchr(baseC, sc.chNext))
239 chBase = baseT[tolower(sc.chNext) - 'a'], sc.Forward();
240 }
241 } else if (!isSML && sc.Match('\'')) // (Caml char literal?)
242 state2 = SCE_CAML_CHAR, chLit = 0;
243 else if (isSML && sc.Match('#', '"')) // (SML char literal?)
244 state2 = SCE_CAML_CHAR, sc.Forward();
245 else if (sc.Match('"'))
246 state2 = SCE_CAML_STRING;
247 else if (sc.Match('(', '*'))
248 state2 = SCE_CAML_COMMENT, sc.Forward(), sc.ch = ' '; // (*)...
249 else if (strchr("!?~" /* Caml "prefix-symbol" */
250 "=<>@^|&+-*/$%" /* Caml "infix-symbol" */
251 "()[]{};,:.#", sc.ch) // Caml "bracket" or ;,:.#
252 // SML "extra" ident chars
253 || (isSML && (sc.Match('\\') || sc.Match('`'))))
254 state2 = SCE_CAML_OPERATOR;
255 break;
256
257 case SCE_CAML_IDENTIFIER:
258 // [try to] interpret as [additional] identifier char
259 if (!(iscaml(sc.ch) || sc.Match('\''))) {
260 const Sci_Position n = sc.currentPos - chToken;
261 if (n < 24) {
262 // length is believable as keyword, [re-]construct token
263 char t[24];
264 for (Sci_Position i = -n; i < 0; i++)
265 t[n + i] = static_cast<char>(sc.GetRelative(i));
266 t[n] = '\0';
267 // special-case "_" token as KEYWORD
268 if ((n == 1 && sc.chPrev == '_') || keywords.InList(t))
269 sc.ChangeState(SCE_CAML_KEYWORD);
270 else if (keywords2.InList(t))
271 sc.ChangeState(SCE_CAML_KEYWORD2);
272 else if (keywords3.InList(t))
273 sc.ChangeState(SCE_CAML_KEYWORD3);
274 }
275 state2 = SCE_CAML_DEFAULT, advance = false;
276 }
277 break;
278
279 case SCE_CAML_TAGNAME:
280 // [try to] interpret as [additional] tagname char
281 if (!(iscaml(sc.ch) || sc.Match('\'')))
282 state2 = SCE_CAML_DEFAULT, advance = false;
283 break;
284
285 /*case SCE_CAML_KEYWORD:
286 case SCE_CAML_KEYWORD2:
287 case SCE_CAML_KEYWORD3:
288 // [try to] interpret as [additional] keyword char
289 if (!iscaml(ch))
290 state2 = SCE_CAML_DEFAULT, advance = false;
291 break;*/
292
293 case SCE_CAML_LINENUM:
294 // [try to] interpret as [additional] linenum directive char
295 if (!isdigit(sc.ch))
296 state2 = SCE_CAML_DEFAULT, advance = false;
297 break;
298
299 case SCE_CAML_OPERATOR: {
300 // [try to] interpret as [additional] operator char
301 const char* o = 0;
302 if (iscaml(sc.ch) || isspace(sc.ch) // ident or whitespace
303 || (o = strchr(")]};,\'\"#", sc.ch),o) // "termination" chars
304 || (!isSML && sc.Match('`')) // Caml extra term char
305 || (!strchr("!$%&*+-./:<=>?@^|~", sc.ch)// "operator" chars
306 // SML extra ident chars
307 && !(isSML && (sc.Match('\\') || sc.Match('`'))))) {
308 // check for INCLUSIVE termination
309 if (o && strchr(")]};,", sc.ch)) {
310 if ((sc.Match(')') && sc.chPrev == '(')
311 || (sc.Match(']') && sc.chPrev == '['))
312 // special-case "()" and "[]" tokens as KEYWORDS
313 sc.ChangeState(SCE_CAML_KEYWORD);
314 chColor++;
315 } else
316 advance = false;
317 state2 = SCE_CAML_DEFAULT;
318 }
319 break;
320 }
321
322 case SCE_CAML_NUMBER:
323 // [try to] interpret as [additional] numeric literal char
324 if ((!isSML && sc.Match('_')) || IsADigit(sc.ch, chBase))
325 break;
326 // how about an integer suffix?
327 if (!isSML && (sc.Match('l') || sc.Match('L') || sc.Match('n'))
328 && (sc.chPrev == '_' || IsADigit(sc.chPrev, chBase)))
329 break;
330 // or a floating-point literal?
331 if (chBase == 10) {
332 // with a decimal point?
333 if (sc.Match('.')
334 && ((!isSML && sc.chPrev == '_')
335 || IsADigit(sc.chPrev, chBase)))
336 break;
337 // with an exponent? (I)
338 if ((sc.Match('e') || sc.Match('E'))
339 && ((!isSML && (sc.chPrev == '.' || sc.chPrev == '_'))
340 || IsADigit(sc.chPrev, chBase)))
341 break;
342 // with an exponent? (II)
343 if (((!isSML && (sc.Match('+') || sc.Match('-')))
344 || (isSML && sc.Match('~')))
345 && (sc.chPrev == 'e' || sc.chPrev == 'E'))
346 break;
347 }
348 // it looks like we have run out of number
349 state2 = SCE_CAML_DEFAULT, advance = false;
350 break;
351
352 case SCE_CAML_CHAR:
353 if (!isSML) {
354 // [try to] interpret as [additional] char literal char
355 if (sc.Match('\\')) {
356 chLit = 1; // (definitely IS a char literal)
357 if (sc.chPrev == '\\')
358 sc.ch = ' '; // (...\\')
359 // should we be terminating - one way or another?
360 } else if ((sc.Match('\'') && sc.chPrev != '\\')
361 || sc.atLineEnd) {
362 state2 = SCE_CAML_DEFAULT;
363 if (sc.Match('\''))
364 chColor++;
365 else
366 sc.ChangeState(SCE_CAML_IDENTIFIER);
367 // ... maybe a char literal, maybe not
368 } else if (chLit < 1 && sc.currentPos - chToken >= 2)
369 sc.ChangeState(SCE_CAML_IDENTIFIER), advance = false;
370 break;
371 }/* else
372 // fall through for SML char literal (handle like string) */
373 // Falls through.
374
375 case SCE_CAML_STRING:
376 // [try to] interpret as [additional] [SML char/] string literal char
377 if (isSML && sc.Match('\\') && sc.chPrev != '\\' && isspace(sc.chNext))
378 state2 = SCE_CAML_WHITE;
379 else if (sc.Match('\\') && sc.chPrev == '\\')
380 sc.ch = ' '; // (...\\")
381 // should we be terminating - one way or another?
382 else if ((sc.Match('"') && sc.chPrev != '\\')
383 || (isSML && sc.atLineEnd)) {
384 state2 = SCE_CAML_DEFAULT;
385 if (sc.Match('"'))
386 chColor++;
387 }
388 break;
389
390 case SCE_CAML_WHITE:
391 // [try to] interpret as [additional] SML embedded whitespace char
392 if (sc.Match('\\')) {
393 // style this puppy NOW...
394 state2 = SCE_CAML_STRING, sc.ch = ' ' /* (...\") */, chColor++,
395 styler.ColourTo(chColor, SCE_CAML_WHITE), styler.Flush();
396 // ... then backtrack to determine original SML literal type
397 Sci_Position p = chColor - 2;
398 for (; p >= 0 && styler.StyleAt(p) == SCE_CAML_WHITE; p--) ;
399 if (p >= 0)
400 state2 = static_cast<int>(styler.StyleAt(p));
401 // take care of state change NOW
402 sc.ChangeState(state2), state2 = -1;
403 }
404 break;
405
406 case SCE_CAML_COMMENT:
407 case SCE_CAML_COMMENT1:
408 case SCE_CAML_COMMENT2:
409 case SCE_CAML_COMMENT3:
410 // we're IN a comment - does this start a NESTED comment?
411 if (sc.Match('(', '*'))
412 state2 = sc.state + 1, chToken = sc.currentPos,
413 sc.Forward(), sc.ch = ' ' /* (*)... */, nesting++;
414 // [try to] interpret as [additional] comment char
415 else if (sc.Match(')') && sc.chPrev == '*') {
416 if (nesting)
417 state2 = (sc.state & 0x0f) - 1, chToken = 0, nesting--;
418 else
419 state2 = SCE_CAML_DEFAULT;
420 chColor++;
421 // enable "magic" (read-only) comment AS REQUIRED
422 } else if (useMagic && sc.currentPos - chToken == 4
423 && sc.Match('c') && sc.chPrev == 'r' && sc.GetRelative(-2) == '@')
424 sc.state |= 0x10; // (switch to read-only comment style)
425 break;
426 }
427
428 // handle state change and char coloring AS REQUIRED
429 if (state2 >= 0)
430 styler.ColourTo(chColor, sc.state), sc.ChangeState(state2);
431 // move to next char UNLESS re-scanning current char
432 if (advance)
433 sc.Forward();
434 }
435
436 // do any required terminal char coloring (JIC)
437 sc.Complete();
438 }
439
440 #ifdef BUILD_AS_EXTERNAL_LEXER
441 static
442 #endif /* BUILD_AS_EXTERNAL_LEXER */
FoldCamlDoc(Sci_PositionU,Sci_Position,int,WordList * [],Accessor &)443 void FoldCamlDoc(
444 Sci_PositionU, Sci_Position,
445 int,
446 WordList *[],
447 Accessor &)
448 {
449 }
450
451 static const char * const camlWordListDesc[] = {
452 "Keywords", // primary Objective Caml keywords
453 "Keywords2", // "optional" keywords (typically from Pervasives)
454 "Keywords3", // "optional" keywords (typically typenames)
455 0
456 };
457
458 #ifndef BUILD_AS_EXTERNAL_LEXER
459 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
460 #endif /* BUILD_AS_EXTERNAL_LEXER */
461