1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3  ** Lexer for Objective Caml.
4  **/
5 // Copyright 2005-2009 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /*	Release History
8 	20050204 Initial release.
9 	20050205 Quick compiler standards/"cleanliness" adjustment.
10 	20050206 Added cast for IsLeadByte().
11 	20050209 Changes to "external" build support.
12 	20050306 Fix for 1st-char-in-doc "corner" case.
13 	20050502 Fix for [harmless] one-past-the-end coloring.
14 	20050515 Refined numeric token recognition logic.
15 	20051125 Added 2nd "optional" keywords class.
16 	20051129 Support "magic" (read-only) comments for RCaml.
17 	20051204 Swtich to using StyleContext infrastructure.
18 	20090629 Add full Standard ML '97 support.
19 */
20 
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdarg.h>
25 #include <assert.h>
26 #include <ctype.h>
27 
28 #include "ILexer.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
31 
32 #include "PropSetSimple.h"
33 #include "WordList.h"
34 #include "LexAccessor.h"
35 #include "Accessor.h"
36 #include "StyleContext.h"
37 #include "CharacterSet.h"
38 #include "LexerModule.h"
39 
40 #if defined(__clang__)
41 #pragma clang diagnostic ignored "-Wcomma"
42 #endif
43 
44 //	Since the Microsoft __iscsym[f] funcs are not ANSI...
iscaml(int c)45 inline int  iscaml(int c) {return isalnum(c) || c == '_';}
iscamlf(int c)46 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
47 
48 static const int baseT[24] = {
49 	0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A - L */
50 	0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,16	/* M - X */
51 };
52 
53 using namespace Scintilla;
54 
55 #ifdef BUILD_AS_EXTERNAL_LEXER
56 /*
57 	(actually seems to work!)
58 */
59 #include <string>
60 #include "WindowAccessor.h"
61 #include "ExternalLexer.h"
62 
63 #undef EXT_LEXER_DECL
64 #define EXT_LEXER_DECL __declspec( dllexport ) __stdcall
65 
66 #if PLAT_WIN
67 #include <windows.h>
68 #endif
69 
70 static void ColouriseCamlDoc(
71 	Sci_PositionU startPos, Sci_Position length,
72 	int initStyle,
73 	WordList *keywordlists[],
74 	Accessor &styler);
75 
76 static void FoldCamlDoc(
77 	Sci_PositionU startPos, Sci_Position length,
78 	int initStyle,
79 	WordList *keywordlists[],
80 	Accessor &styler);
81 
82 static void InternalLexOrFold(int lexOrFold, Sci_PositionU startPos, Sci_Position length,
83 	int initStyle, char *words[], WindowID window, char *props);
84 
85 static const char* LexerName = "caml";
86 
87 #ifdef TRACE
DebugPrintf(const char * format,...)88 void Platform::DebugPrintf(const char *format, ...) {
89 	char buffer[2000];
90 	va_list pArguments;
91 	va_start(pArguments, format);
92 	vsprintf(buffer,format,pArguments);
93 	va_end(pArguments);
94 	Platform::DebugDisplay(buffer);
95 }
96 #else
DebugPrintf(const char *,...)97 void Platform::DebugPrintf(const char *, ...) {
98 }
99 #endif
100 
IsDBCSLeadByte(int codePage,char ch)101 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
102 	return ::IsDBCSLeadByteEx(codePage, ch) != 0;
103 }
104 
SendScintilla(WindowID w,unsigned int msg,unsigned long wParam,long lParam)105 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
106 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
107 }
108 
SendScintillaPointer(WindowID w,unsigned int msg,unsigned long wParam,void * lParam)109 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
110 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
111 		reinterpret_cast<LPARAM>(lParam));
112 }
113 
Fold(unsigned int lexer,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)114 void EXT_LEXER_DECL Fold(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
115 	int initStyle, char *words[], WindowID window, char *props)
116 {
117 	// below useless evaluation(s) to supress "not used" warnings
118 	lexer;
119 	// build expected data structures and do the Fold
120 	InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
121 
122 }
123 
GetLexerCount()124 int EXT_LEXER_DECL GetLexerCount()
125 {
126 	return 1;	// just us [Objective] Caml lexers here!
127 }
128 
GetLexerName(unsigned int Index,char * name,int buflength)129 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
130 {
131 	// below useless evaluation(s) to supress "not used" warnings
132 	Index;
133 	// return as much of our lexer name as will fit (what's up with Index?)
134 	if (buflength > 0) {
135 		buflength--;
136 		int n = strlen(LexerName);
137 		if (n > buflength)
138 			n = buflength;
139 		memcpy(name, LexerName, n), name[n] = '\0';
140 	}
141 }
142 
Lex(unsigned int lexer,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)143 void EXT_LEXER_DECL Lex(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
144 	int initStyle, char *words[], WindowID window, char *props)
145 {
146 	// below useless evaluation(s) to supress "not used" warnings
147 	lexer;
148 	// build expected data structures and do the Lex
149 	InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
150 }
151 
InternalLexOrFold(int foldOrLex,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)152 static void InternalLexOrFold(int foldOrLex, Sci_PositionU startPos, Sci_Position length,
153 	int initStyle, char *words[], WindowID window, char *props)
154 {
155 	// create and initialize a WindowAccessor (including contained PropSet)
156 	PropSetSimple ps;
157 	ps.SetMultiple(props);
158 	WindowAccessor wa(window, ps);
159 	// create and initialize WordList(s)
160 	int nWL = 0;
161 	for (; words[nWL]; nWL++) ;	// count # of WordList PTRs needed
162 	WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
163 	int i = 0;
164 	for (; i < nWL; i++) {
165 		wl[i] = new WordList();	// (works or THROWS bad_alloc EXCEPTION)
166 		wl[i]->Set(words[i]);
167 	}
168 	wl[i] = 0;
169 	// call our "internal" folder/lexer (... then do Flush!)
170 	if (foldOrLex)
171 		FoldCamlDoc(startPos, length, initStyle, wl, wa);
172 	else
173 		ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
174 	wa.Flush();
175 	// clean up before leaving
176 	for (i = nWL - 1; i >= 0; i--)
177 		delete wl[i];
178 	delete [] wl;
179 }
180 
181 static
182 #endif	/* BUILD_AS_EXTERNAL_LEXER */
183 
ColouriseCamlDoc(Sci_PositionU startPos,Sci_Position length,int initStyle,WordList * keywordlists[],Accessor & styler)184 void ColouriseCamlDoc(
185 	Sci_PositionU startPos, Sci_Position length,
186 	int initStyle,
187 	WordList *keywordlists[],
188 	Accessor &styler)
189 {
190 	// initialize styler
191 	StyleContext sc(startPos, length, initStyle, styler);
192 
193 	Sci_PositionU chToken = 0;
194 	int chBase = 0, chLit = 0;
195 	WordList& keywords  = *keywordlists[0];
196 	WordList& keywords2 = *keywordlists[1];
197 	WordList& keywords3 = *keywordlists[2];
198 	const bool isSML = keywords.InList("andalso");
199 	const int useMagic = styler.GetPropertyInt("lexer.caml.magic", 0);
200 
201 	// set up [initial] state info (terminating states that shouldn't "bleed")
202 	const int state_ = sc.state & 0x0f;
203 	if (state_ <= SCE_CAML_CHAR
204 		|| (isSML && state_ == SCE_CAML_STRING))
205 		sc.state = SCE_CAML_DEFAULT;
206 	int nesting = (state_ >= SCE_CAML_COMMENT)? (state_ - SCE_CAML_COMMENT): 0;
207 
208 	// foreach char in range...
209 	while (sc.More()) {
210 		// set up [per-char] state info
211 		int state2 = -1;				// (ASSUME no state change)
212 		Sci_Position chColor = sc.currentPos - 1;// (ASSUME standard coloring range)
213 		bool advance = true;			// (ASSUME scanner "eats" 1 char)
214 
215 		// step state machine
216 		switch (sc.state & 0x0f) {
217 		case SCE_CAML_DEFAULT:
218 			chToken = sc.currentPos;	// save [possible] token start (JIC)
219 			// it's wide open; what do we have?
220 			if (iscamlf(sc.ch))
221 				state2 = SCE_CAML_IDENTIFIER;
222 			else if (!isSML && sc.Match('`') && iscamlf(sc.chNext))
223 				state2 = SCE_CAML_TAGNAME;
224 			else if (!isSML && sc.Match('#') && isdigit(sc.chNext))
225 				state2 = SCE_CAML_LINENUM;
226 			else if (isdigit(sc.ch)) {
227 				// it's a number, assume base 10
228 				state2 = SCE_CAML_NUMBER, chBase = 10;
229 				if (sc.Match('0')) {
230 					// there MAY be a base specified...
231 					const char* baseC = "bBoOxX";
232 					if (isSML) {
233 						if (sc.chNext == 'w')
234 							sc.Forward();	// (consume SML "word" indicator)
235 						baseC = "x";
236 					}
237 					// ... change to specified base AS REQUIRED
238 					if (strchr(baseC, sc.chNext))
239 						chBase = baseT[tolower(sc.chNext) - 'a'], sc.Forward();
240 				}
241 			} else if (!isSML && sc.Match('\''))	// (Caml char literal?)
242 				state2 = SCE_CAML_CHAR, chLit = 0;
243 			else if (isSML && sc.Match('#', '"'))	// (SML char literal?)
244 				state2 = SCE_CAML_CHAR, sc.Forward();
245 			else if (sc.Match('"'))
246 				state2 = SCE_CAML_STRING;
247 			else if (sc.Match('(', '*'))
248 				state2 = SCE_CAML_COMMENT, sc.Forward(), sc.ch = ' '; // (*)...
249 			else if (strchr("!?~"			/* Caml "prefix-symbol" */
250 					"=<>@^|&+-*/$%"			/* Caml "infix-symbol" */
251 					"()[]{};,:.#", sc.ch)	// Caml "bracket" or ;,:.#
252 											// SML "extra" ident chars
253 				|| (isSML && (sc.Match('\\') || sc.Match('`'))))
254 				state2 = SCE_CAML_OPERATOR;
255 			break;
256 
257 		case SCE_CAML_IDENTIFIER:
258 			// [try to] interpret as [additional] identifier char
259 			if (!(iscaml(sc.ch) || sc.Match('\''))) {
260 				const Sci_Position n = sc.currentPos - chToken;
261 				if (n < 24) {
262 					// length is believable as keyword, [re-]construct token
263 					char t[24];
264 					for (Sci_Position i = -n; i < 0; i++)
265 						t[n + i] = static_cast<char>(sc.GetRelative(i));
266 					t[n] = '\0';
267 					// special-case "_" token as KEYWORD
268 					if ((n == 1 && sc.chPrev == '_') || keywords.InList(t))
269 						sc.ChangeState(SCE_CAML_KEYWORD);
270 					else if (keywords2.InList(t))
271 						sc.ChangeState(SCE_CAML_KEYWORD2);
272 					else if (keywords3.InList(t))
273 						sc.ChangeState(SCE_CAML_KEYWORD3);
274 				}
275 				state2 = SCE_CAML_DEFAULT, advance = false;
276 			}
277 			break;
278 
279 		case SCE_CAML_TAGNAME:
280 			// [try to] interpret as [additional] tagname char
281 			if (!(iscaml(sc.ch) || sc.Match('\'')))
282 				state2 = SCE_CAML_DEFAULT, advance = false;
283 			break;
284 
285 		/*case SCE_CAML_KEYWORD:
286 		case SCE_CAML_KEYWORD2:
287 		case SCE_CAML_KEYWORD3:
288 			// [try to] interpret as [additional] keyword char
289 			if (!iscaml(ch))
290 				state2 = SCE_CAML_DEFAULT, advance = false;
291 			break;*/
292 
293 		case SCE_CAML_LINENUM:
294 			// [try to] interpret as [additional] linenum directive char
295 			if (!isdigit(sc.ch))
296 				state2 = SCE_CAML_DEFAULT, advance = false;
297 			break;
298 
299 		case SCE_CAML_OPERATOR: {
300 			// [try to] interpret as [additional] operator char
301 			const char* o = 0;
302 			if (iscaml(sc.ch) || isspace(sc.ch)			// ident or whitespace
303 				|| (o = strchr(")]};,\'\"#", sc.ch),o)	// "termination" chars
304 				|| (!isSML && sc.Match('`'))			// Caml extra term char
305 				|| (!strchr("!$%&*+-./:<=>?@^|~", sc.ch)// "operator" chars
306 														// SML extra ident chars
307 					&& !(isSML && (sc.Match('\\') || sc.Match('`'))))) {
308 				// check for INCLUSIVE termination
309 				if (o && strchr(")]};,", sc.ch)) {
310 					if ((sc.Match(')') && sc.chPrev == '(')
311 						|| (sc.Match(']') && sc.chPrev == '['))
312 						// special-case "()" and "[]" tokens as KEYWORDS
313 						sc.ChangeState(SCE_CAML_KEYWORD);
314 					chColor++;
315 				} else
316 					advance = false;
317 				state2 = SCE_CAML_DEFAULT;
318 			}
319 			break;
320 		}
321 
322 		case SCE_CAML_NUMBER:
323 			// [try to] interpret as [additional] numeric literal char
324 			if ((!isSML && sc.Match('_')) || IsADigit(sc.ch, chBase))
325 				break;
326 			// how about an integer suffix?
327 			if (!isSML && (sc.Match('l') || sc.Match('L') || sc.Match('n'))
328 				&& (sc.chPrev == '_' || IsADigit(sc.chPrev, chBase)))
329 				break;
330 			// or a floating-point literal?
331 			if (chBase == 10) {
332 				// with a decimal point?
333 				if (sc.Match('.')
334 					&& ((!isSML && sc.chPrev == '_')
335 						|| IsADigit(sc.chPrev, chBase)))
336 					break;
337 				// with an exponent? (I)
338 				if ((sc.Match('e') || sc.Match('E'))
339 					&& ((!isSML && (sc.chPrev == '.' || sc.chPrev == '_'))
340 						|| IsADigit(sc.chPrev, chBase)))
341 					break;
342 				// with an exponent? (II)
343 				if (((!isSML && (sc.Match('+') || sc.Match('-')))
344 						|| (isSML && sc.Match('~')))
345 					&& (sc.chPrev == 'e' || sc.chPrev == 'E'))
346 					break;
347 			}
348 			// it looks like we have run out of number
349 			state2 = SCE_CAML_DEFAULT, advance = false;
350 			break;
351 
352 		case SCE_CAML_CHAR:
353 			if (!isSML) {
354 				// [try to] interpret as [additional] char literal char
355 				if (sc.Match('\\')) {
356 					chLit = 1;	// (definitely IS a char literal)
357 					if (sc.chPrev == '\\')
358 						sc.ch = ' ';	// (...\\')
359 				// should we be terminating - one way or another?
360 				} else if ((sc.Match('\'') && sc.chPrev != '\\')
361 					|| sc.atLineEnd) {
362 					state2 = SCE_CAML_DEFAULT;
363 					if (sc.Match('\''))
364 						chColor++;
365 					else
366 						sc.ChangeState(SCE_CAML_IDENTIFIER);
367 				// ... maybe a char literal, maybe not
368 				} else if (chLit < 1 && sc.currentPos - chToken >= 2)
369 					sc.ChangeState(SCE_CAML_IDENTIFIER), advance = false;
370 				break;
371 			}/* else
372 				// fall through for SML char literal (handle like string) */
373 			// Falls through.
374 
375 		case SCE_CAML_STRING:
376 			// [try to] interpret as [additional] [SML char/] string literal char
377 			if (isSML && sc.Match('\\') && sc.chPrev != '\\' && isspace(sc.chNext))
378 				state2 = SCE_CAML_WHITE;
379 			else if (sc.Match('\\') && sc.chPrev == '\\')
380 				sc.ch = ' ';	// (...\\")
381 			// should we be terminating - one way or another?
382 			else if ((sc.Match('"') && sc.chPrev != '\\')
383 				|| (isSML && sc.atLineEnd)) {
384 				state2 = SCE_CAML_DEFAULT;
385 				if (sc.Match('"'))
386 					chColor++;
387 			}
388 			break;
389 
390 		case SCE_CAML_WHITE:
391 			// [try to] interpret as [additional] SML embedded whitespace char
392 			if (sc.Match('\\')) {
393 				// style this puppy NOW...
394 				state2 = SCE_CAML_STRING, sc.ch = ' ' /* (...\") */, chColor++,
395 					styler.ColourTo(chColor, SCE_CAML_WHITE), styler.Flush();
396 				// ... then backtrack to determine original SML literal type
397 				Sci_Position p = chColor - 2;
398 				for (; p >= 0 && styler.StyleAt(p) == SCE_CAML_WHITE; p--) ;
399 				if (p >= 0)
400 					state2 = static_cast<int>(styler.StyleAt(p));
401 				// take care of state change NOW
402 				sc.ChangeState(state2), state2 = -1;
403 			}
404 			break;
405 
406 		case SCE_CAML_COMMENT:
407 		case SCE_CAML_COMMENT1:
408 		case SCE_CAML_COMMENT2:
409 		case SCE_CAML_COMMENT3:
410 			// we're IN a comment - does this start a NESTED comment?
411 			if (sc.Match('(', '*'))
412 				state2 = sc.state + 1, chToken = sc.currentPos,
413 					sc.Forward(), sc.ch = ' ' /* (*)... */, nesting++;
414 			// [try to] interpret as [additional] comment char
415 			else if (sc.Match(')') && sc.chPrev == '*') {
416 				if (nesting)
417 					state2 = (sc.state & 0x0f) - 1, chToken = 0, nesting--;
418 				else
419 					state2 = SCE_CAML_DEFAULT;
420 				chColor++;
421 			// enable "magic" (read-only) comment AS REQUIRED
422 			} else if (useMagic && sc.currentPos - chToken == 4
423 				&& sc.Match('c') && sc.chPrev == 'r' && sc.GetRelative(-2) == '@')
424 				sc.state |= 0x10;	// (switch to read-only comment style)
425 			break;
426 		}
427 
428 		// handle state change and char coloring AS REQUIRED
429 		if (state2 >= 0)
430 			styler.ColourTo(chColor, sc.state), sc.ChangeState(state2);
431 		// move to next char UNLESS re-scanning current char
432 		if (advance)
433 			sc.Forward();
434 	}
435 
436 	// do any required terminal char coloring (JIC)
437 	sc.Complete();
438 }
439 
440 #ifdef BUILD_AS_EXTERNAL_LEXER
441 static
442 #endif	/* BUILD_AS_EXTERNAL_LEXER */
FoldCamlDoc(Sci_PositionU,Sci_Position,int,WordList * [],Accessor &)443 void FoldCamlDoc(
444 	Sci_PositionU, Sci_Position,
445 	int,
446 	WordList *[],
447 	Accessor &)
448 {
449 }
450 
451 static const char * const camlWordListDesc[] = {
452 	"Keywords",		// primary Objective Caml keywords
453 	"Keywords2",	// "optional" keywords (typically from Pervasives)
454 	"Keywords3",	// "optional" keywords (typically typenames)
455 	0
456 };
457 
458 #ifndef BUILD_AS_EXTERNAL_LEXER
459 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
460 #endif	/* BUILD_AS_EXTERNAL_LEXER */
461