1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3  ** Lexer for Objective Caml.
4  **/
5 // Copyright 2005-2009 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /*	Release History
8 	20050204 Initial release.
9 	20050205 Quick compiler standards/"cleanliness" adjustment.
10 	20050206 Added cast for IsLeadByte().
11 	20050209 Changes to "external" build support.
12 	20050306 Fix for 1st-char-in-doc "corner" case.
13 	20050502 Fix for [harmless] one-past-the-end coloring.
14 	20050515 Refined numeric token recognition logic.
15 	20051125 Added 2nd "optional" keywords class.
16 	20051129 Support "magic" (read-only) comments for RCaml.
17 	20051204 Swtich to using StyleContext infrastructure.
18 	20090629 Add full Standard ML '97 support.
19 */
20 
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdarg.h>
25 #include <assert.h>
26 #include <ctype.h>
27 
28 #include "ILexer.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
31 
32 #include "PropSetSimple.h"
33 #include "WordList.h"
34 #include "LexAccessor.h"
35 #include "Accessor.h"
36 #include "StyleContext.h"
37 #include "CharacterSet.h"
38 #include "LexerModule.h"
39 
40 //	Since the Microsoft __iscsym[f] funcs are not ANSI...
iscaml(int c)41 inline int  iscaml(int c) {return isalnum(c) || c == '_';}
iscamlf(int c)42 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
43 
44 static const int baseT[24] = {
45 	0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* A - L */
46 	0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,16	/* M - X */
47 };
48 
49 #ifdef SCI_NAMESPACE
50 using namespace Scintilla;
51 #endif
52 
53 #ifdef BUILD_AS_EXTERNAL_LEXER
54 /*
55 	(actually seems to work!)
56 */
57 #include <string>
58 #include "WindowAccessor.h"
59 #include "ExternalLexer.h"
60 
61 #undef EXT_LEXER_DECL
62 #define EXT_LEXER_DECL __declspec( dllexport ) __stdcall
63 
64 #if PLAT_WIN
65 #include <windows.h>
66 #endif
67 
68 static void ColouriseCamlDoc(
69 	unsigned int startPos, int length,
70 	int initStyle,
71 	WordList *keywordlists[],
72 	Accessor &styler);
73 
74 static void FoldCamlDoc(
75 	unsigned int startPos, int length,
76 	int initStyle,
77 	WordList *keywordlists[],
78 	Accessor &styler);
79 
80 static void InternalLexOrFold(int lexOrFold, unsigned int startPos, int length,
81 	int initStyle, char *words[], WindowID window, char *props);
82 
83 static const char* LexerName = "caml";
84 
85 #ifdef TRACE
DebugPrintf(const char * format,...)86 void Platform::DebugPrintf(const char *format, ...) {
87 	char buffer[2000];
88 	va_list pArguments;
89 	va_start(pArguments, format);
90 	vsprintf(buffer,format,pArguments);
91 	va_end(pArguments);
92 	Platform::DebugDisplay(buffer);
93 }
94 #else
DebugPrintf(const char *,...)95 void Platform::DebugPrintf(const char *, ...) {
96 }
97 #endif
98 
IsDBCSLeadByte(int codePage,char ch)99 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
100 	return ::IsDBCSLeadByteEx(codePage, ch) != 0;
101 }
102 
SendScintilla(WindowID w,unsigned int msg,unsigned long wParam,long lParam)103 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
104 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
105 }
106 
SendScintillaPointer(WindowID w,unsigned int msg,unsigned long wParam,void * lParam)107 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
108 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
109 		reinterpret_cast<LPARAM>(lParam));
110 }
111 
Fold(unsigned int lexer,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)112 void EXT_LEXER_DECL Fold(unsigned int lexer, unsigned int startPos, int length,
113 	int initStyle, char *words[], WindowID window, char *props)
114 {
115 	// below useless evaluation(s) to supress "not used" warnings
116 	lexer;
117 	// build expected data structures and do the Fold
118 	InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
119 
120 }
121 
GetLexerCount()122 int EXT_LEXER_DECL GetLexerCount()
123 {
124 	return 1;	// just us [Objective] Caml lexers here!
125 }
126 
GetLexerName(unsigned int Index,char * name,int buflength)127 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
128 {
129 	// below useless evaluation(s) to supress "not used" warnings
130 	Index;
131 	// return as much of our lexer name as will fit (what's up with Index?)
132 	if (buflength > 0) {
133 		buflength--;
134 		int n = strlen(LexerName);
135 		if (n > buflength)
136 			n = buflength;
137 		memcpy(name, LexerName, n), name[n] = '\0';
138 	}
139 }
140 
Lex(unsigned int lexer,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)141 void EXT_LEXER_DECL Lex(unsigned int lexer, unsigned int startPos, int length,
142 	int initStyle, char *words[], WindowID window, char *props)
143 {
144 	// below useless evaluation(s) to supress "not used" warnings
145 	lexer;
146 	// build expected data structures and do the Lex
147 	InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
148 }
149 
InternalLexOrFold(int foldOrLex,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)150 static void InternalLexOrFold(int foldOrLex, unsigned int startPos, int length,
151 	int initStyle, char *words[], WindowID window, char *props)
152 {
153 	// create and initialize a WindowAccessor (including contained PropSet)
154 	PropSetSimple ps;
155 	ps.SetMultiple(props);
156 	WindowAccessor wa(window, ps);
157 	// create and initialize WordList(s)
158 	int nWL = 0;
159 	for (; words[nWL]; nWL++) ;	// count # of WordList PTRs needed
160 	WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
161 	int i = 0;
162 	for (; i < nWL; i++) {
163 		wl[i] = new WordList();	// (works or THROWS bad_alloc EXCEPTION)
164 		wl[i]->Set(words[i]);
165 	}
166 	wl[i] = 0;
167 	// call our "internal" folder/lexer (... then do Flush!)
168 	if (foldOrLex)
169 		FoldCamlDoc(startPos, length, initStyle, wl, wa);
170 	else
171 		ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
172 	wa.Flush();
173 	// clean up before leaving
174 	for (i = nWL - 1; i >= 0; i--)
175 		delete wl[i];
176 	delete [] wl;
177 }
178 
179 static
180 #endif	/* BUILD_AS_EXTERNAL_LEXER */
181 
ColouriseCamlDoc(unsigned int startPos,int length,int initStyle,WordList * keywordlists[],Accessor & styler)182 void ColouriseCamlDoc(
183 	unsigned int startPos, int length,
184 	int initStyle,
185 	WordList *keywordlists[],
186 	Accessor &styler)
187 {
188 	// initialize styler
189 	StyleContext sc(startPos, length, initStyle, styler);
190 
191 	int chBase = 0, chToken = 0, chLit = 0;
192 	WordList& keywords  = *keywordlists[0];
193 	WordList& keywords2 = *keywordlists[1];
194 	WordList& keywords3 = *keywordlists[2];
195 	const bool isSML = keywords.InList("andalso");
196 	const int useMagic = styler.GetPropertyInt("lexer.caml.magic", 0);
197 
198 	// set up [initial] state info (terminating states that shouldn't "bleed")
199 	const int state_ = sc.state & 0x0f;
200 	if (state_ <= SCE_CAML_CHAR
201 		|| (isSML && state_ == SCE_CAML_STRING))
202 		sc.state = SCE_CAML_DEFAULT;
203 	int nesting = (state_ >= SCE_CAML_COMMENT)? (state_ - SCE_CAML_COMMENT): 0;
204 
205 	// foreach char in range...
206 	while (sc.More()) {
207 		// set up [per-char] state info
208 		int state2 = -1;				// (ASSUME no state change)
209 		int chColor = sc.currentPos - 1;// (ASSUME standard coloring range)
210 		bool advance = true;			// (ASSUME scanner "eats" 1 char)
211 
212 		// step state machine
213 		switch (sc.state & 0x0f) {
214 		case SCE_CAML_DEFAULT:
215 			chToken = sc.currentPos;	// save [possible] token start (JIC)
216 			// it's wide open; what do we have?
217 			if (iscamlf(sc.ch))
218 				state2 = SCE_CAML_IDENTIFIER;
219 			else if (!isSML && sc.Match('`') && iscamlf(sc.chNext))
220 				state2 = SCE_CAML_TAGNAME;
221 			else if (!isSML && sc.Match('#') && isdigit(sc.chNext))
222 				state2 = SCE_CAML_LINENUM;
223 			else if (isdigit(sc.ch)) {
224 				// it's a number, assume base 10
225 				state2 = SCE_CAML_NUMBER, chBase = 10;
226 				if (sc.Match('0')) {
227 					// there MAY be a base specified...
228 					const char* baseC = "bBoOxX";
229 					if (isSML) {
230 						if (sc.chNext == 'w')
231 							sc.Forward();	// (consume SML "word" indicator)
232 						baseC = "x";
233 					}
234 					// ... change to specified base AS REQUIRED
235 					if (strchr(baseC, sc.chNext))
236 						chBase = baseT[tolower(sc.chNext) - 'a'], sc.Forward();
237 				}
238 			} else if (!isSML && sc.Match('\''))	// (Caml char literal?)
239 				state2 = SCE_CAML_CHAR, chLit = 0;
240 			else if (isSML && sc.Match('#', '"'))	// (SML char literal?)
241 				state2 = SCE_CAML_CHAR, sc.Forward();
242 			else if (sc.Match('"'))
243 				state2 = SCE_CAML_STRING;
244 			else if (sc.Match('(', '*'))
245 				state2 = SCE_CAML_COMMENT, sc.Forward(), sc.ch = ' '; // (*)...
246 			else if (strchr("!?~"			/* Caml "prefix-symbol" */
247 					"=<>@^|&+-*/$%"			/* Caml "infix-symbol" */
248 					"()[]{};,:.#", sc.ch)	// Caml "bracket" or ;,:.#
249 											// SML "extra" ident chars
250 				|| (isSML && (sc.Match('\\') || sc.Match('`'))))
251 				state2 = SCE_CAML_OPERATOR;
252 			break;
253 
254 		case SCE_CAML_IDENTIFIER:
255 			// [try to] interpret as [additional] identifier char
256 			if (!(iscaml(sc.ch) || sc.Match('\''))) {
257 				const int n = sc.currentPos - chToken;
258 				if (n < 24) {
259 					// length is believable as keyword, [re-]construct token
260 					char t[24];
261 					for (int i = -n; i < 0; i++)
262 						t[n + i] = static_cast<char>(sc.GetRelative(i));
263 					t[n] = '\0';
264 					// special-case "_" token as KEYWORD
265 					if ((n == 1 && sc.chPrev == '_') || keywords.InList(t))
266 						sc.ChangeState(SCE_CAML_KEYWORD);
267 					else if (keywords2.InList(t))
268 						sc.ChangeState(SCE_CAML_KEYWORD2);
269 					else if (keywords3.InList(t))
270 						sc.ChangeState(SCE_CAML_KEYWORD3);
271 				}
272 				state2 = SCE_CAML_DEFAULT, advance = false;
273 			}
274 			break;
275 
276 		case SCE_CAML_TAGNAME:
277 			// [try to] interpret as [additional] tagname char
278 			if (!(iscaml(sc.ch) || sc.Match('\'')))
279 				state2 = SCE_CAML_DEFAULT, advance = false;
280 			break;
281 
282 		/*case SCE_CAML_KEYWORD:
283 		case SCE_CAML_KEYWORD2:
284 		case SCE_CAML_KEYWORD3:
285 			// [try to] interpret as [additional] keyword char
286 			if (!iscaml(ch))
287 				state2 = SCE_CAML_DEFAULT, advance = false;
288 			break;*/
289 
290 		case SCE_CAML_LINENUM:
291 			// [try to] interpret as [additional] linenum directive char
292 			if (!isdigit(sc.ch))
293 				state2 = SCE_CAML_DEFAULT, advance = false;
294 			break;
295 
296 		case SCE_CAML_OPERATOR: {
297 			// [try to] interpret as [additional] operator char
298 			const char* o = 0;
299 			if (iscaml(sc.ch) || isspace(sc.ch)			// ident or whitespace
300 				|| (o = strchr(")]};,\'\"#", sc.ch),o)	// "termination" chars
301 				|| (!isSML && sc.Match('`'))			// Caml extra term char
302 				|| (!strchr("!$%&*+-./:<=>?@^|~", sc.ch)// "operator" chars
303 														// SML extra ident chars
304 					&& !(isSML && (sc.Match('\\') || sc.Match('`'))))) {
305 				// check for INCLUSIVE termination
306 				if (o && strchr(")]};,", sc.ch)) {
307 					if ((sc.Match(')') && sc.chPrev == '(')
308 						|| (sc.Match(']') && sc.chPrev == '['))
309 						// special-case "()" and "[]" tokens as KEYWORDS
310 						sc.ChangeState(SCE_CAML_KEYWORD);
311 					chColor++;
312 				} else
313 					advance = false;
314 				state2 = SCE_CAML_DEFAULT;
315 			}
316 			break;
317 		}
318 
319 		case SCE_CAML_NUMBER:
320 			// [try to] interpret as [additional] numeric literal char
321 			if ((!isSML && sc.Match('_')) || IsADigit(sc.ch, chBase))
322 				break;
323 			// how about an integer suffix?
324 			if (!isSML && (sc.Match('l') || sc.Match('L') || sc.Match('n'))
325 				&& (sc.chPrev == '_' || IsADigit(sc.chPrev, chBase)))
326 				break;
327 			// or a floating-point literal?
328 			if (chBase == 10) {
329 				// with a decimal point?
330 				if (sc.Match('.')
331 					&& ((!isSML && sc.chPrev == '_')
332 						|| IsADigit(sc.chPrev, chBase)))
333 					break;
334 				// with an exponent? (I)
335 				if ((sc.Match('e') || sc.Match('E'))
336 					&& ((!isSML && (sc.chPrev == '.' || sc.chPrev == '_'))
337 						|| IsADigit(sc.chPrev, chBase)))
338 					break;
339 				// with an exponent? (II)
340 				if (((!isSML && (sc.Match('+') || sc.Match('-')))
341 						|| (isSML && sc.Match('~')))
342 					&& (sc.chPrev == 'e' || sc.chPrev == 'E'))
343 					break;
344 			}
345 			// it looks like we have run out of number
346 			state2 = SCE_CAML_DEFAULT, advance = false;
347 			break;
348 
349 		case SCE_CAML_CHAR:
350 			if (!isSML) {
351 				// [try to] interpret as [additional] char literal char
352 				if (sc.Match('\\')) {
353 					chLit = 1;	// (definitely IS a char literal)
354 					if (sc.chPrev == '\\')
355 						sc.ch = ' ';	// (...\\')
356 				// should we be terminating - one way or another?
357 				} else if ((sc.Match('\'') && sc.chPrev != '\\')
358 					|| sc.atLineEnd) {
359 					state2 = SCE_CAML_DEFAULT;
360 					if (sc.Match('\''))
361 						chColor++;
362 					else
363 						sc.ChangeState(SCE_CAML_IDENTIFIER);
364 				// ... maybe a char literal, maybe not
365 				} else if (chLit < 1 && sc.currentPos - chToken >= 2)
366 					sc.ChangeState(SCE_CAML_IDENTIFIER), advance = false;
367 				break;
368 			}/* else
369 				// fall through for SML char literal (handle like string) */
370 
371 		case SCE_CAML_STRING:
372 			// [try to] interpret as [additional] [SML char/] string literal char
373 			if (isSML && sc.Match('\\') && sc.chPrev != '\\' && isspace(sc.chNext))
374 				state2 = SCE_CAML_WHITE;
375 			else if (sc.Match('\\') && sc.chPrev == '\\')
376 				sc.ch = ' ';	// (...\\")
377 			// should we be terminating - one way or another?
378 			else if ((sc.Match('"') && sc.chPrev != '\\')
379 				|| (isSML && sc.atLineEnd)) {
380 				state2 = SCE_CAML_DEFAULT;
381 				if (sc.Match('"'))
382 					chColor++;
383 			}
384 			break;
385 
386 		case SCE_CAML_WHITE:
387 			// [try to] interpret as [additional] SML embedded whitespace char
388 			if (sc.Match('\\')) {
389 				// style this puppy NOW...
390 				state2 = SCE_CAML_STRING, sc.ch = ' ' /* (...\") */, chColor++,
391 					styler.ColourTo(chColor, SCE_CAML_WHITE), styler.Flush();
392 				// ... then backtrack to determine original SML literal type
393 				int p = chColor - 2;
394 				for (; p >= 0 && styler.StyleAt(p) == SCE_CAML_WHITE; p--) ;
395 				if (p >= 0)
396 					state2 = static_cast<int>(styler.StyleAt(p));
397 				// take care of state change NOW
398 				sc.ChangeState(state2), state2 = -1;
399 			}
400 			break;
401 
402 		case SCE_CAML_COMMENT:
403 		case SCE_CAML_COMMENT1:
404 		case SCE_CAML_COMMENT2:
405 		case SCE_CAML_COMMENT3:
406 			// we're IN a comment - does this start a NESTED comment?
407 			if (sc.Match('(', '*'))
408 				state2 = sc.state + 1, chToken = sc.currentPos,
409 					sc.Forward(), sc.ch = ' ' /* (*)... */, nesting++;
410 			// [try to] interpret as [additional] comment char
411 			else if (sc.Match(')') && sc.chPrev == '*') {
412 				if (nesting)
413 					state2 = (sc.state & 0x0f) - 1, chToken = 0, nesting--;
414 				else
415 					state2 = SCE_CAML_DEFAULT;
416 				chColor++;
417 			// enable "magic" (read-only) comment AS REQUIRED
418 			} else if (useMagic && sc.currentPos - chToken == 4
419 				&& sc.Match('c') && sc.chPrev == 'r' && sc.GetRelative(-2) == '@')
420 				sc.state |= 0x10;	// (switch to read-only comment style)
421 			break;
422 		}
423 
424 		// handle state change and char coloring AS REQUIRED
425 		if (state2 >= 0)
426 			styler.ColourTo(chColor, sc.state), sc.ChangeState(state2);
427 		// move to next char UNLESS re-scanning current char
428 		if (advance)
429 			sc.Forward();
430 	}
431 
432 	// do any required terminal char coloring (JIC)
433 	sc.Complete();
434 }
435 
436 #ifdef BUILD_AS_EXTERNAL_LEXER
437 static
438 #endif	/* BUILD_AS_EXTERNAL_LEXER */
FoldCamlDoc(unsigned int,int,int,WordList * [],Accessor &)439 void FoldCamlDoc(
440 	unsigned int, int,
441 	int,
442 	WordList *[],
443 	Accessor &)
444 {
445 }
446 
447 static const char * const camlWordListDesc[] = {
448 	"Keywords",		// primary Objective Caml keywords
449 	"Keywords2",	// "optional" keywords (typically from Pervasives)
450 	"Keywords3",	// "optional" keywords (typically typenames)
451 	0
452 };
453 
454 #ifndef BUILD_AS_EXTERNAL_LEXER
455 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
456 #endif	/* BUILD_AS_EXTERNAL_LEXER */
457