1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3  ** Lexer for Objective Caml.
4  **/
5 // Copyright 2005-2009 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /*	Release History
8 	20050204 Initial release.
9 	20050205 Quick compiler standards/"cleanliness" adjustment.
10 	20050206 Added cast for IsLeadByte().
11 	20050209 Changes to "external" build support.
12 	20050306 Fix for 1st-char-in-doc "corner" case.
13 	20050502 Fix for [harmless] one-past-the-end coloring.
14 	20050515 Refined numeric token recognition logic.
15 	20051125 Added 2nd "optional" keywords class.
16 	20051129 Support "magic" (read-only) comments for RCaml.
17 	20051204 Swtich to using StyleContext infrastructure.
18 	20090629 Add full Standard ML '97 support.
19 */
20 
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdarg.h>
25 #include <assert.h>
26 #include <ctype.h>
27 
28 #include "ILexer.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
31 
32 #include "PropSetSimple.h"
33 #include "WordList.h"
34 #include "LexAccessor.h"
35 #include "Accessor.h"
36 #include "StyleContext.h"
37 #include "CharacterSet.h"
38 #include "LexerModule.h"
39 
40 //	Since the Microsoft __iscsym[f] funcs are not ANSI...
iscaml(int c)41 inline int  iscaml(int c) {return isalnum(c) || c == '_';}
iscamlf(int c)42 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
43 
44 static const int baseT[24] = {
45 	0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A - L */
46 	0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,16	/* M - X */
47 };
48 
49 #ifdef SCI_NAMESPACE
50 using namespace Scintilla;
51 #endif
52 
53 #ifdef BUILD_AS_EXTERNAL_LEXER
54 /*
55 	(actually seems to work!)
56 */
57 #include <string>
58 #include "WindowAccessor.h"
59 #include "ExternalLexer.h"
60 
61 #undef EXT_LEXER_DECL
62 #define EXT_LEXER_DECL __declspec( dllexport ) __stdcall
63 
64 #if PLAT_WIN
65 #include <windows.h>
66 #endif
67 
68 static void ColouriseCamlDoc(
69 	Sci_PositionU startPos, Sci_Position length,
70 	int initStyle,
71 	WordList *keywordlists[],
72 	Accessor &styler);
73 
74 static void FoldCamlDoc(
75 	Sci_PositionU startPos, Sci_Position length,
76 	int initStyle,
77 	WordList *keywordlists[],
78 	Accessor &styler);
79 
80 static void InternalLexOrFold(int lexOrFold, Sci_PositionU startPos, Sci_Position length,
81 	int initStyle, char *words[], WindowID window, char *props);
82 
83 static const char* LexerName = "caml";
84 
85 #ifdef TRACE
DebugPrintf(const char * format,...)86 void Platform::DebugPrintf(const char *format, ...) {
87 	char buffer[2000];
88 	va_list pArguments;
89 	va_start(pArguments, format);
90 	vsprintf(buffer,format,pArguments);
91 	va_end(pArguments);
92 	Platform::DebugDisplay(buffer);
93 }
94 #else
DebugPrintf(const char *,...)95 void Platform::DebugPrintf(const char *, ...) {
96 }
97 #endif
98 
IsDBCSLeadByte(int codePage,char ch)99 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
100 	return ::IsDBCSLeadByteEx(codePage, ch) != 0;
101 }
102 
SendScintilla(WindowID w,unsigned int msg,unsigned long wParam,long lParam)103 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
104 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
105 }
106 
SendScintillaPointer(WindowID w,unsigned int msg,unsigned long wParam,void * lParam)107 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
108 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
109 		reinterpret_cast<LPARAM>(lParam));
110 }
111 
Fold(unsigned int lexer,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)112 void EXT_LEXER_DECL Fold(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
113 	int initStyle, char *words[], WindowID window, char *props)
114 {
115 	// below useless evaluation(s) to supress "not used" warnings
116 	lexer;
117 	// build expected data structures and do the Fold
118 	InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
119 
120 }
121 
GetLexerCount()122 int EXT_LEXER_DECL GetLexerCount()
123 {
124 	return 1;	// just us [Objective] Caml lexers here!
125 }
126 
GetLexerName(unsigned int Index,char * name,int buflength)127 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
128 {
129 	// below useless evaluation(s) to supress "not used" warnings
130 	Index;
131 	// return as much of our lexer name as will fit (what's up with Index?)
132 	if (buflength > 0) {
133 		buflength--;
134 		int n = strlen(LexerName);
135 		if (n > buflength)
136 			n = buflength;
137 		memcpy(name, LexerName, n), name[n] = '\0';
138 	}
139 }
140 
Lex(unsigned int lexer,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)141 void EXT_LEXER_DECL Lex(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
142 	int initStyle, char *words[], WindowID window, char *props)
143 {
144 	// below useless evaluation(s) to supress "not used" warnings
145 	lexer;
146 	// build expected data structures and do the Lex
147 	InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
148 }
149 
InternalLexOrFold(int foldOrLex,Sci_PositionU startPos,Sci_Position length,int initStyle,char * words[],WindowID window,char * props)150 static void InternalLexOrFold(int foldOrLex, Sci_PositionU startPos, Sci_Position length,
151 	int initStyle, char *words[], WindowID window, char *props)
152 {
153 	// create and initialize a WindowAccessor (including contained PropSet)
154 	PropSetSimple ps;
155 	ps.SetMultiple(props);
156 	WindowAccessor wa(window, ps);
157 	// create and initialize WordList(s)
158 	int nWL = 0;
159 	for (; words[nWL]; nWL++) ;	// count # of WordList PTRs needed
160 	WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
161 	int i = 0;
162 	for (; i < nWL; i++) {
163 		wl[i] = new WordList();	// (works or THROWS bad_alloc EXCEPTION)
164 		wl[i]->Set(words[i]);
165 	}
166 	wl[i] = 0;
167 	// call our "internal" folder/lexer (... then do Flush!)
168 	if (foldOrLex)
169 		FoldCamlDoc(startPos, length, initStyle, wl, wa);
170 	else
171 		ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
172 	wa.Flush();
173 	// clean up before leaving
174 	for (i = nWL - 1; i >= 0; i--)
175 		delete wl[i];
176 	delete [] wl;
177 }
178 
179 static
180 #endif	/* BUILD_AS_EXTERNAL_LEXER */
181 
ColouriseCamlDoc(Sci_PositionU startPos,Sci_Position length,int initStyle,WordList * keywordlists[],Accessor & styler)182 void ColouriseCamlDoc(
183 	Sci_PositionU startPos, Sci_Position length,
184 	int initStyle,
185 	WordList *keywordlists[],
186 	Accessor &styler)
187 {
188 	// initialize styler
189 	StyleContext sc(startPos, length, initStyle, styler);
190 
191 	Sci_PositionU chToken = 0;
192 	int chBase = 0, chLit = 0;
193 	WordList& keywords  = *keywordlists[0];
194 	WordList& keywords2 = *keywordlists[1];
195 	WordList& keywords3 = *keywordlists[2];
196 	const bool isSML = keywords.InList("andalso");
197 	const int useMagic = styler.GetPropertyInt("lexer.caml.magic", 0);
198 
199 	// set up [initial] state info (terminating states that shouldn't "bleed")
200 	const int state_ = sc.state & 0x0f;
201 	if (state_ <= SCE_CAML_CHAR
202 		|| (isSML && state_ == SCE_CAML_STRING))
203 		sc.state = SCE_CAML_DEFAULT;
204 	int nesting = (state_ >= SCE_CAML_COMMENT)? (state_ - SCE_CAML_COMMENT): 0;
205 
206 	// foreach char in range...
207 	while (sc.More()) {
208 		// set up [per-char] state info
209 		int state2 = -1;				// (ASSUME no state change)
210 		Sci_Position chColor = sc.currentPos - 1;// (ASSUME standard coloring range)
211 		bool advance = true;			// (ASSUME scanner "eats" 1 char)
212 
213 		// step state machine
214 		switch (sc.state & 0x0f) {
215 		case SCE_CAML_DEFAULT:
216 			chToken = sc.currentPos;	// save [possible] token start (JIC)
217 			// it's wide open; what do we have?
218 			if (iscamlf(sc.ch))
219 				state2 = SCE_CAML_IDENTIFIER;
220 			else if (!isSML && sc.Match('`') && iscamlf(sc.chNext))
221 				state2 = SCE_CAML_TAGNAME;
222 			else if (!isSML && sc.Match('#') && isdigit(sc.chNext))
223 				state2 = SCE_CAML_LINENUM;
224 			else if (isdigit(sc.ch)) {
225 				// it's a number, assume base 10
226 				state2 = SCE_CAML_NUMBER, chBase = 10;
227 				if (sc.Match('0')) {
228 					// there MAY be a base specified...
229 					const char* baseC = "bBoOxX";
230 					if (isSML) {
231 						if (sc.chNext == 'w')
232 							sc.Forward();	// (consume SML "word" indicator)
233 						baseC = "x";
234 					}
235 					// ... change to specified base AS REQUIRED
236 					if (strchr(baseC, sc.chNext))
237 						chBase = baseT[tolower(sc.chNext) - 'a'], sc.Forward();
238 				}
239 			} else if (!isSML && sc.Match('\''))	// (Caml char literal?)
240 				state2 = SCE_CAML_CHAR, chLit = 0;
241 			else if (isSML && sc.Match('#', '"'))	// (SML char literal?)
242 				state2 = SCE_CAML_CHAR, sc.Forward();
243 			else if (sc.Match('"'))
244 				state2 = SCE_CAML_STRING;
245 			else if (sc.Match('(', '*'))
246 				state2 = SCE_CAML_COMMENT, sc.Forward(), sc.ch = ' '; // (*)...
247 			else if (strchr("!?~"			/* Caml "prefix-symbol" */
248 					"=<>@^|&+-*/$%"			/* Caml "infix-symbol" */
249 					"()[]{};,:.#", sc.ch)	// Caml "bracket" or ;,:.#
250 											// SML "extra" ident chars
251 				|| (isSML && (sc.Match('\\') || sc.Match('`'))))
252 				state2 = SCE_CAML_OPERATOR;
253 			break;
254 
255 		case SCE_CAML_IDENTIFIER:
256 			// [try to] interpret as [additional] identifier char
257 			if (!(iscaml(sc.ch) || sc.Match('\''))) {
258 				const Sci_Position n = sc.currentPos - chToken;
259 				if (n < 24) {
260 					// length is believable as keyword, [re-]construct token
261 					char t[24];
262 					for (Sci_Position i = -n; i < 0; i++)
263 						t[n + i] = static_cast<char>(sc.GetRelative(i));
264 					t[n] = '\0';
265 					// special-case "_" token as KEYWORD
266 					if ((n == 1 && sc.chPrev == '_') || keywords.InList(t))
267 						sc.ChangeState(SCE_CAML_KEYWORD);
268 					else if (keywords2.InList(t))
269 						sc.ChangeState(SCE_CAML_KEYWORD2);
270 					else if (keywords3.InList(t))
271 						sc.ChangeState(SCE_CAML_KEYWORD3);
272 				}
273 				state2 = SCE_CAML_DEFAULT, advance = false;
274 			}
275 			break;
276 
277 		case SCE_CAML_TAGNAME:
278 			// [try to] interpret as [additional] tagname char
279 			if (!(iscaml(sc.ch) || sc.Match('\'')))
280 				state2 = SCE_CAML_DEFAULT, advance = false;
281 			break;
282 
283 		/*case SCE_CAML_KEYWORD:
284 		case SCE_CAML_KEYWORD2:
285 		case SCE_CAML_KEYWORD3:
286 			// [try to] interpret as [additional] keyword char
287 			if (!iscaml(ch))
288 				state2 = SCE_CAML_DEFAULT, advance = false;
289 			break;*/
290 
291 		case SCE_CAML_LINENUM:
292 			// [try to] interpret as [additional] linenum directive char
293 			if (!isdigit(sc.ch))
294 				state2 = SCE_CAML_DEFAULT, advance = false;
295 			break;
296 
297 		case SCE_CAML_OPERATOR: {
298 			// [try to] interpret as [additional] operator char
299 			const char* o = 0;
300 			if (iscaml(sc.ch) || isspace(sc.ch)			// ident or whitespace
301 				|| (o = strchr(")]};,\'\"#", sc.ch),o)	// "termination" chars
302 				|| (!isSML && sc.Match('`'))			// Caml extra term char
303 				|| (!strchr("!$%&*+-./:<=>?@^|~", sc.ch)// "operator" chars
304 														// SML extra ident chars
305 					&& !(isSML && (sc.Match('\\') || sc.Match('`'))))) {
306 				// check for INCLUSIVE termination
307 				if (o && strchr(")]};,", sc.ch)) {
308 					if ((sc.Match(')') && sc.chPrev == '(')
309 						|| (sc.Match(']') && sc.chPrev == '['))
310 						// special-case "()" and "[]" tokens as KEYWORDS
311 						sc.ChangeState(SCE_CAML_KEYWORD);
312 					chColor++;
313 				} else
314 					advance = false;
315 				state2 = SCE_CAML_DEFAULT;
316 			}
317 			break;
318 		}
319 
320 		case SCE_CAML_NUMBER:
321 			// [try to] interpret as [additional] numeric literal char
322 			if ((!isSML && sc.Match('_')) || IsADigit(sc.ch, chBase))
323 				break;
324 			// how about an integer suffix?
325 			if (!isSML && (sc.Match('l') || sc.Match('L') || sc.Match('n'))
326 				&& (sc.chPrev == '_' || IsADigit(sc.chPrev, chBase)))
327 				break;
328 			// or a floating-point literal?
329 			if (chBase == 10) {
330 				// with a decimal point?
331 				if (sc.Match('.')
332 					&& ((!isSML && sc.chPrev == '_')
333 						|| IsADigit(sc.chPrev, chBase)))
334 					break;
335 				// with an exponent? (I)
336 				if ((sc.Match('e') || sc.Match('E'))
337 					&& ((!isSML && (sc.chPrev == '.' || sc.chPrev == '_'))
338 						|| IsADigit(sc.chPrev, chBase)))
339 					break;
340 				// with an exponent? (II)
341 				if (((!isSML && (sc.Match('+') || sc.Match('-')))
342 						|| (isSML && sc.Match('~')))
343 					&& (sc.chPrev == 'e' || sc.chPrev == 'E'))
344 					break;
345 			}
346 			// it looks like we have run out of number
347 			state2 = SCE_CAML_DEFAULT, advance = false;
348 			break;
349 
350 		case SCE_CAML_CHAR:
351 			if (!isSML) {
352 				// [try to] interpret as [additional] char literal char
353 				if (sc.Match('\\')) {
354 					chLit = 1;	// (definitely IS a char literal)
355 					if (sc.chPrev == '\\')
356 						sc.ch = ' ';	// (...\\')
357 				// should we be terminating - one way or another?
358 				} else if ((sc.Match('\'') && sc.chPrev != '\\')
359 					|| sc.atLineEnd) {
360 					state2 = SCE_CAML_DEFAULT;
361 					if (sc.Match('\''))
362 						chColor++;
363 					else
364 						sc.ChangeState(SCE_CAML_IDENTIFIER);
365 				// ... maybe a char literal, maybe not
366 				} else if (chLit < 1 && sc.currentPos - chToken >= 2)
367 					sc.ChangeState(SCE_CAML_IDENTIFIER), advance = false;
368 				break;
369 			}/* else
370 				// fall through for SML char literal (handle like string) */
371 
372 		case SCE_CAML_STRING:
373 			// [try to] interpret as [additional] [SML char/] string literal char
374 			if (isSML && sc.Match('\\') && sc.chPrev != '\\' && isspace(sc.chNext))
375 				state2 = SCE_CAML_WHITE;
376 			else if (sc.Match('\\') && sc.chPrev == '\\')
377 				sc.ch = ' ';	// (...\\")
378 			// should we be terminating - one way or another?
379 			else if ((sc.Match('"') && sc.chPrev != '\\')
380 				|| (isSML && sc.atLineEnd)) {
381 				state2 = SCE_CAML_DEFAULT;
382 				if (sc.Match('"'))
383 					chColor++;
384 			}
385 			break;
386 
387 		case SCE_CAML_WHITE:
388 			// [try to] interpret as [additional] SML embedded whitespace char
389 			if (sc.Match('\\')) {
390 				// style this puppy NOW...
391 				state2 = SCE_CAML_STRING, sc.ch = ' ' /* (...\") */, chColor++,
392 					styler.ColourTo(chColor, SCE_CAML_WHITE), styler.Flush();
393 				// ... then backtrack to determine original SML literal type
394 				Sci_Position p = chColor - 2;
395 				for (; p >= 0 && styler.StyleAt(p) == SCE_CAML_WHITE; p--) ;
396 				if (p >= 0)
397 					state2 = static_cast<int>(styler.StyleAt(p));
398 				// take care of state change NOW
399 				sc.ChangeState(state2), state2 = -1;
400 			}
401 			break;
402 
403 		case SCE_CAML_COMMENT:
404 		case SCE_CAML_COMMENT1:
405 		case SCE_CAML_COMMENT2:
406 		case SCE_CAML_COMMENT3:
407 			// we're IN a comment - does this start a NESTED comment?
408 			if (sc.Match('(', '*'))
409 				state2 = sc.state + 1, chToken = sc.currentPos,
410 					sc.Forward(), sc.ch = ' ' /* (*)... */, nesting++;
411 			// [try to] interpret as [additional] comment char
412 			else if (sc.Match(')') && sc.chPrev == '*') {
413 				if (nesting)
414 					state2 = (sc.state & 0x0f) - 1, chToken = 0, nesting--;
415 				else
416 					state2 = SCE_CAML_DEFAULT;
417 				chColor++;
418 			// enable "magic" (read-only) comment AS REQUIRED
419 			} else if (useMagic && sc.currentPos - chToken == 4
420 				&& sc.Match('c') && sc.chPrev == 'r' && sc.GetRelative(-2) == '@')
421 				sc.state |= 0x10;	// (switch to read-only comment style)
422 			break;
423 		}
424 
425 		// handle state change and char coloring AS REQUIRED
426 		if (state2 >= 0)
427 			styler.ColourTo(chColor, sc.state), sc.ChangeState(state2);
428 		// move to next char UNLESS re-scanning current char
429 		if (advance)
430 			sc.Forward();
431 	}
432 
433 	// do any required terminal char coloring (JIC)
434 	sc.Complete();
435 }
436 
437 #ifdef BUILD_AS_EXTERNAL_LEXER
438 static
439 #endif	/* BUILD_AS_EXTERNAL_LEXER */
FoldCamlDoc(Sci_PositionU,Sci_Position,int,WordList * [],Accessor &)440 void FoldCamlDoc(
441 	Sci_PositionU, Sci_Position,
442 	int,
443 	WordList *[],
444 	Accessor &)
445 {
446 }
447 
448 static const char * const camlWordListDesc[] = {
449 	"Keywords",		// primary Objective Caml keywords
450 	"Keywords2",	// "optional" keywords (typically from Pervasives)
451 	"Keywords3",	// "optional" keywords (typically typenames)
452 	0
453 };
454 
455 #ifndef BUILD_AS_EXTERNAL_LEXER
456 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
457 #endif	/* BUILD_AS_EXTERNAL_LEXER */
458