1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3  ** Lexer for Objective Caml.
4  **/
5 // Copyright 2005 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /*	Release History
8 	20050204 Initial release.
9 	20050205 Quick compiler standards/"cleanliness" adjustment.
10 	20050206 Added cast for IsLeadByte().
11 	20050209 Changes to "external" build support.
12 	20050306 Fix for 1st-char-in-doc "corner" case.
13 	20050502 Fix for [harmless] one-past-the-end coloring.
14 	20050515 Refined numeric token recognition logic.
15 */
16 
17 #include <stdlib.h>
18 #include <string.h>
19 #include <ctype.h>
20 #include <stdio.h>
21 #include <stdarg.h>
22 
23 #include "Platform.h"
24 
25 #include "PropSet.h"
26 #include "Accessor.h"
27 #include "StyleContext.h"
28 #include "KeyWords.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
31 
32 //	Since the Microsoft __iscsym[f] funcs are not ANSI...
iscaml(int c)33 inline int  iscaml(int c) {return isalnum(c) || c == '_';}
iscamlf(int c)34 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
iscamld(int c)35 inline int iscamld(int c) {return isdigit(c) || c == '_';}
36 
37 #ifdef BUILD_AS_EXTERNAL_LEXER
38 /*
39 	(actually seems to work!)
40 */
41 #include "WindowAccessor.h"
42 #include "ExternalLexer.h"
43 
44 #if PLAT_WIN
45 #include <windows.h>
46 #endif
47 
48 static void ColouriseCamlDoc(
49 	unsigned int startPos, int length,
50 	int initStyle,
51 	WordList *keywordlists[],
52 	Accessor &styler);
53 
54 static void FoldCamlDoc(
55 	unsigned int startPos, int length,
56 	int initStyle,
57 	WordList *keywordlists[],
58 	Accessor &styler);
59 
60 static void InternalLexOrFold(int lexOrFold, unsigned int startPos, int length,
61 	int initStyle, char *words[], WindowID window, char *props);
62 
63 static const char* LexerName = "caml";
64 
65 #ifdef TRACE
DebugPrintf(const char * format,...)66 void Platform::DebugPrintf(const char *format, ...) {
67 	char buffer[2000];
68 	va_list pArguments;
69 	va_start(pArguments, format);
70 	vsprintf(buffer,format,pArguments);
71 	va_end(pArguments);
72 	Platform::DebugDisplay(buffer);
73 }
74 #else
DebugPrintf(const char *,...)75 void Platform::DebugPrintf(const char *, ...) {
76 }
77 #endif
78 
IsDBCSLeadByte(int codePage,char ch)79 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
80 	return ::IsDBCSLeadByteEx(codePage, ch) != 0;
81 }
82 
SendScintilla(WindowID w,unsigned int msg,unsigned long wParam,long lParam)83 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
84 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
85 }
86 
SendScintillaPointer(WindowID w,unsigned int msg,unsigned long wParam,void * lParam)87 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
88 	return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
89 		reinterpret_cast<LPARAM>(lParam));
90 }
91 
Fold(unsigned int lexer,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)92 void EXT_LEXER_DECL Fold(unsigned int lexer, unsigned int startPos, int length,
93 	int initStyle, char *words[], WindowID window, char *props)
94 {
95 	// below useless evaluation(s) to supress "not used" warnings
96 	lexer;
97 	// build expected data structures and do the Fold
98 	InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
99 
100 }
101 
GetLexerCount()102 int EXT_LEXER_DECL GetLexerCount()
103 {
104 	return 1;	// just us [Objective] Caml lexers here!
105 }
106 
GetLexerName(unsigned int Index,char * name,int buflength)107 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
108 {
109 	// below useless evaluation(s) to supress "not used" warnings
110 	Index;
111 	// return as much of our lexer name as will fit (what's up with Index?)
112 	if (buflength > 0) {
113 		buflength--;
114 		int n = strlen(LexerName);
115 		if (n > buflength)
116 			n = buflength;
117 		memcpy(name, LexerName, n), name[n] = '\0';
118 	}
119 }
120 
Lex(unsigned int lexer,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)121 void EXT_LEXER_DECL Lex(unsigned int lexer, unsigned int startPos, int length,
122 	int initStyle, char *words[], WindowID window, char *props)
123 {
124 	// below useless evaluation(s) to supress "not used" warnings
125 	lexer;
126 	// build expected data structures and do the Lex
127 	InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
128 }
129 
InternalLexOrFold(int foldOrLex,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)130 static void InternalLexOrFold(int foldOrLex, unsigned int startPos, int length,
131 	int initStyle, char *words[], WindowID window, char *props)
132 {
133 	// create and initialize a WindowAccessor (including contained PropSet)
134 	PropSet ps;
135 	ps.SetMultiple(props);
136 	WindowAccessor wa(window, ps);
137 	// create and initialize WordList(s)
138 	int nWL = 0;
139 	for (; words[nWL]; nWL++) ;	// count # of WordList PTRs needed
140 	WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
141 	int i = 0;
142 	for (; i < nWL; i++) {
143 		wl[i] = new WordList();	// (works or THROWS bad_alloc EXCEPTION)
144 		wl[i]->Set(words[i]);
145 	}
146 	wl[i] = 0;
147 	// call our "internal" folder/lexer (... then do Flush!)
148 	if (foldOrLex)
149 		FoldCamlDoc(startPos, length, initStyle, wl, wa);
150 	else
151 		ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
152 	wa.Flush();
153 	// clean up before leaving
154 	for (i = nWL - 1; i >= 0; i--)
155 		delete wl[i];
156 	delete [] wl;
157 }
158 
159 static
160 #endif	/* BUILD_AS_EXTERNAL_LEXER */
161 
ColouriseCamlDoc(unsigned int startPos,int length,int initStyle,WordList * keywordlists[],Accessor & styler)162 void ColouriseCamlDoc(
163 	unsigned int startPos, int length,
164 	int initStyle,
165 	WordList *keywordlists[],
166 	Accessor &styler)
167 {
168 	// initialize styler
169 	styler.StartAt(startPos);
170 	styler.StartSegment(startPos);
171 	// set up [initial] state info (terminating states that shouldn't "bleed")
172 	int state = initStyle, nesting = 0;
173 	if (state < SCE_CAML_STRING)
174 		state = SCE_CAML_DEFAULT;
175 	if (state >= SCE_CAML_COMMENT)
176 		nesting = state - SCE_CAML_COMMENT;
177 	int chLast = startPos? static_cast<unsigned char>(styler[startPos - 1]): ' ';
178 	int chNext = static_cast<unsigned char>(styler[startPos]);
179 
180 	int chBase = 'd', chToken = 0, chLit = 0, chSkip;
181 	WordList& keywords = *keywordlists[0];
182 	WordList& keywords2 = *keywordlists[1];
183 
184 	// foreach char in range...
185 	unsigned int i = startPos;
186 	const unsigned int endPos = startPos + length;
187 	for (; i < endPos; i += chSkip) {
188 		// set up [per-char] state info
189 		int ch = chNext;
190 		chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 1));
191 		int state2 = -1;	// (ASSUME no state change)
192 		int chColor = i - 1;// (ASSUME standard coloring range)
193 		chSkip = 1;			// (ASSUME scanner "eats" 1 char)
194 
195 		// this may be the correct thing to do... or not
196 		if (styler.IsLeadByte(static_cast<char>(ch))) {
197 			chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
198 				chSkip++;
199 			continue;
200 		}
201 
202 		// step state machine
203 		switch (state) {
204 		case SCE_CAML_DEFAULT:
205 			// it's wide open; what do we have?
206 			if (iscamlf(ch))
207 				state2 = SCE_CAML_IDENTIFIER, chToken = i;
208 			else if (ch == '`')
209 				state2 = SCE_CAML_TAGNAME, chToken = i;
210 			else if (ch == '#' && isdigit(chNext))
211 				state2 = SCE_CAML_LINENUM, chToken = i;
212 			else if (isdigit(ch)) {
213 				state2 = SCE_CAML_NUMBER,
214 					chBase = strchr("xXoObB", chNext)? chNext: 'd';
215 				if (chBase != 'd')
216 					ch = chNext,
217 						chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
218 						chSkip++;
219 			} else if (ch == '\'')	/* (char literal?) */
220 				state2 = SCE_CAML_CHAR, chToken = i, chLit = 0;
221 			else if (ch == '\"')
222 				state2 = SCE_CAML_STRING;
223 			else if (ch == '(' && chNext == '*')
224 				state2 = SCE_CAML_COMMENT,
225 					ch = ' ',	// (make SURE "(*)" isn't seen as a closed comment)
226 					chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
227 					chSkip++, nesting = 0;
228 			else if (strchr("!?~"		/* Caml "prefix-symbol" */
229 					"=<>@^|&+-*/$%"		/* Caml "infix-symbol" */
230 					"()[]{};,:.#", ch))	/* Caml "bracket" or ;,:.# */
231 				state2 = SCE_CAML_OPERATOR, chToken = i;
232 			break;
233 
234 		case SCE_CAML_IDENTIFIER:
235 			// [try to] interpret as [additional] identifier char
236 			if (!(iscaml(ch) || ch == '\'')) {
237 				const int n = i - chToken;
238 				if (n < 24) {
239 					// length is believable as keyword, [re-]construct token
240 					char t[24];
241 					int p = 0;
242 					for (int q = chToken; p < n; p++, q++)
243 						t[p] = styler[q];
244 					t[p] = '\0';
245 					// special-case "_" token as KEYWORD
246 					if ((n == 1 && chLast == '_') || keywords.InList(t))
247 						state = SCE_CAML_KEYWORD;
248 					else if (keywords2.InList(t))
249 						state = SCE_CAML_KEYWORD2;
250 				}
251 				state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
252 			}
253 			break;
254 
255 		case SCE_CAML_TAGNAME:
256 			// [try to] interpret as [additional] tagname char
257 			if (!(iscaml(ch) || ch == '\''))
258 				state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
259 			break;
260 
261 		/*case SCE_CAML_KEYWORD:
262 		case SCE_CAML_KEYWORD2:
263 			// [try to] interpret as [additional] keyword char
264 			if (!iscaml(ch))
265 				state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
266 			break;*/
267 
268 		case SCE_CAML_LINENUM:
269 			// [try to] interpret as [additional] linenum directive char
270 			if (!isdigit(ch))
271 				state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
272 			break;
273 
274 		case SCE_CAML_OPERATOR: {
275 			// [try to] interpret as [additional] operator char
276 			const char* o = 0;
277 			if (iscaml(ch) || isspace(ch)			/* ident or whitespace */
278 				|| ((o = strchr(")]};,\'\"`#", ch)) != 0)/* "termination" chars */
279 				|| !strchr("!$%&*+-./:<=>?@^|~", ch)/* "operator" chars */) {
280 				// check for INCLUSIVE termination
281 				if (o && strchr(")]};,", ch)) {
282 					if ((ch == ')' && chLast == '(') || (ch == ']' && chLast == '['))
283 						// special-case "()" and "[]" tokens as KEYWORDS
284 						state = SCE_CAML_KEYWORD;
285 					chColor++;
286 				} else
287 					chNext = ch, chSkip--;
288 				state2 = SCE_CAML_DEFAULT;
289 			}
290 			break;
291 		}
292 
293 		case SCE_CAML_NUMBER:
294 			// [try to] interpret as [additional] numeric literal char
295 			// N.B. - improperly accepts "extra" digits in base 2 or 8 literals
296 			if (iscamld(ch) || ((chBase == 'x' || chBase == 'X') && isxdigit(ch)))
297 				break;
298 			// how about an integer suffix?
299 			if ((ch == 'l' || ch == 'L' || ch == 'n')&& (iscamld(chLast)
300 				|| ((chBase == 'x' || chBase == 'X') && isxdigit(chLast))))
301 				break;
302 			// or a floating-point literal?
303 			if (chBase == 'd') {
304 				// with a decimal point?
305 				if (ch == '.' && iscamld(chLast))
306 					break;
307 				// with an exponent? (I)
308 				if ((ch == 'e' || ch == 'E') && (iscamld(chLast) || chLast == '.'))
309 					break;
310 				// with an exponent? (II)
311 				if ((ch == '+' || ch == '-') && (chLast == 'e' || chLast == 'E'))
312 					break;
313 			}
314 			// it looks like we have run out of number
315 			state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
316 			break;
317 
318 		case SCE_CAML_CHAR:
319 			// [try to] interpret as [additional] char literal char
320 			if (ch == '\\') {
321 				chLit = 1;	// (definitely IS a char literal)
322 				if (chLast == '\\')
323 					ch = ' ';	// (so termination test isn't fooled)
324 			// should we be terminating - one way or another?
325 			} else if ((ch == '\'' && chLast != '\\') || ch == '\r' || ch == '\n') {
326 				state2 = SCE_CAML_DEFAULT;
327 				if (ch == '\'')
328 					chColor++;
329 				else
330 					state = SCE_CAML_IDENTIFIER;
331 			// ... maybe a char literal, maybe not
332 			} else if (chLit < 1 && i - chToken >= 2)
333 				state = SCE_CAML_IDENTIFIER, chNext = ch, chSkip--;
334 			break;
335 
336 		case SCE_CAML_STRING:
337 			// [try to] interpret as [additional] string literal char
338 			if (ch == '\\' && chLast == '\\')
339 				ch = ' ';	// (so '\\' doesn't cause us trouble)
340 			else if (ch == '\"' && chLast != '\\')
341 				state2 = SCE_CAML_DEFAULT, chColor++;
342 			break;
343 
344 		case SCE_CAML_COMMENT:
345 		case SCE_CAML_COMMENT+1:
346 		case SCE_CAML_COMMENT+2:
347 		case SCE_CAML_COMMENT+3:
348 			// we're IN a comment - does this start a NESTED comment?
349 			if (ch == '(' && chNext == '*')
350 				state2 = state + 1,
351 					ch = ' ',	// (make SURE "(*)" isn't seen as a closed comment)
352 					chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
353 					chSkip++, nesting++;
354 			// [try to] interpret as [additional] comment char
355 			else if (ch == ')' && chLast == '*')
356 				state2 = nesting? (state - 1): SCE_CAML_DEFAULT, chColor++, nesting--;
357 			break;
358 		}
359 
360 		// handle state change and char coloring as required
361 		if (state2 >= 0) {
362 			// (1st char will NOT be colored until AT LEAST 2nd char)
363 			if (chColor >= 0)
364 				styler.ColourTo(chColor, state);
365 			state = state2;
366 		}
367 		chLast = ch;
368 	}
369 
370 	// do any required terminal char coloring (JIC)
371 	if (i >= endPos)
372 		i = endPos - 1;
373 	styler.ColourTo(i, state);
374 //	styler.Flush();	// (is this always done by calling code?)
375 }
376 
377 #ifdef BUILD_AS_EXTERNAL_LEXER
378 static
379 #endif	/* BUILD_AS_EXTERNAL_LEXER */
FoldCamlDoc(unsigned int startPos,int length,int initStyle,WordList * keywordlists[],Accessor & styler)380 void FoldCamlDoc(
381 	unsigned int startPos, int length,
382 	int initStyle,
383 	WordList *keywordlists[],
384 	Accessor &styler)
385 {
386 	// below useless evaluation(s) to supress "not used" warnings
387 	startPos || length || initStyle || keywordlists[0] || styler.Length();
388 }
389 
390 static const char * const camlWordListDesc[] = {
391 	"Keywords",		// primary Objective Caml keywords
392 	"Keywords2",	// "optional" keywords (typically from Pervasives)
393 	0
394 };
395 
396 #ifndef BUILD_AS_EXTERNAL_LEXER
397 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
398 #endif	/* BUILD_AS_EXTERNAL_LEXER */
399