1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3 ** Lexer for Objective Caml.
4 **/
5 // Copyright 2005 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /* Release History
8 20050204 Initial release.
9 20050205 Quick compiler standards/"cleanliness" adjustment.
10 20050206 Added cast for IsLeadByte().
11 20050209 Changes to "external" build support.
12 20050306 Fix for 1st-char-in-doc "corner" case.
13 20050502 Fix for [harmless] one-past-the-end coloring.
14 20050515 Refined numeric token recognition logic.
15 */
16
17 #include <stdlib.h>
18 #include <string.h>
19 #include <ctype.h>
20 #include <stdio.h>
21 #include <stdarg.h>
22
23 #include "Platform.h"
24
25 #include "PropSet.h"
26 #include "Accessor.h"
27 #include "StyleContext.h"
28 #include "KeyWords.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
31
32 // Since the Microsoft __iscsym[f] funcs are not ANSI...
iscaml(int c)33 inline int iscaml(int c) {return isalnum(c) || c == '_';}
iscamlf(int c)34 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
iscamld(int c)35 inline int iscamld(int c) {return isdigit(c) || c == '_';}
36
37 #ifdef BUILD_AS_EXTERNAL_LEXER
38 /*
39 (actually seems to work!)
40 */
41 #include "WindowAccessor.h"
42 #include "ExternalLexer.h"
43
44 #if PLAT_WIN
45 #include <windows.h>
46 #endif
47
48 static void ColouriseCamlDoc(
49 unsigned int startPos, int length,
50 int initStyle,
51 WordList *keywordlists[],
52 Accessor &styler);
53
54 static void FoldCamlDoc(
55 unsigned int startPos, int length,
56 int initStyle,
57 WordList *keywordlists[],
58 Accessor &styler);
59
60 static void InternalLexOrFold(int lexOrFold, unsigned int startPos, int length,
61 int initStyle, char *words[], WindowID window, char *props);
62
63 static const char* LexerName = "caml";
64
65 #ifdef TRACE
DebugPrintf(const char * format,...)66 void Platform::DebugPrintf(const char *format, ...) {
67 char buffer[2000];
68 va_list pArguments;
69 va_start(pArguments, format);
70 vsprintf(buffer,format,pArguments);
71 va_end(pArguments);
72 Platform::DebugDisplay(buffer);
73 }
74 #else
DebugPrintf(const char *,...)75 void Platform::DebugPrintf(const char *, ...) {
76 }
77 #endif
78
IsDBCSLeadByte(int codePage,char ch)79 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
80 return ::IsDBCSLeadByteEx(codePage, ch) != 0;
81 }
82
SendScintilla(WindowID w,unsigned int msg,unsigned long wParam,long lParam)83 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
84 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
85 }
86
SendScintillaPointer(WindowID w,unsigned int msg,unsigned long wParam,void * lParam)87 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
88 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
89 reinterpret_cast<LPARAM>(lParam));
90 }
91
Fold(unsigned int lexer,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)92 void EXT_LEXER_DECL Fold(unsigned int lexer, unsigned int startPos, int length,
93 int initStyle, char *words[], WindowID window, char *props)
94 {
95 // below useless evaluation(s) to supress "not used" warnings
96 lexer;
97 // build expected data structures and do the Fold
98 InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
99
100 }
101
GetLexerCount()102 int EXT_LEXER_DECL GetLexerCount()
103 {
104 return 1; // just us [Objective] Caml lexers here!
105 }
106
GetLexerName(unsigned int Index,char * name,int buflength)107 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
108 {
109 // below useless evaluation(s) to supress "not used" warnings
110 Index;
111 // return as much of our lexer name as will fit (what's up with Index?)
112 if (buflength > 0) {
113 buflength--;
114 int n = strlen(LexerName);
115 if (n > buflength)
116 n = buflength;
117 memcpy(name, LexerName, n), name[n] = '\0';
118 }
119 }
120
Lex(unsigned int lexer,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)121 void EXT_LEXER_DECL Lex(unsigned int lexer, unsigned int startPos, int length,
122 int initStyle, char *words[], WindowID window, char *props)
123 {
124 // below useless evaluation(s) to supress "not used" warnings
125 lexer;
126 // build expected data structures and do the Lex
127 InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
128 }
129
InternalLexOrFold(int foldOrLex,unsigned int startPos,int length,int initStyle,char * words[],WindowID window,char * props)130 static void InternalLexOrFold(int foldOrLex, unsigned int startPos, int length,
131 int initStyle, char *words[], WindowID window, char *props)
132 {
133 // create and initialize a WindowAccessor (including contained PropSet)
134 PropSet ps;
135 ps.SetMultiple(props);
136 WindowAccessor wa(window, ps);
137 // create and initialize WordList(s)
138 int nWL = 0;
139 for (; words[nWL]; nWL++) ; // count # of WordList PTRs needed
140 WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
141 int i = 0;
142 for (; i < nWL; i++) {
143 wl[i] = new WordList(); // (works or THROWS bad_alloc EXCEPTION)
144 wl[i]->Set(words[i]);
145 }
146 wl[i] = 0;
147 // call our "internal" folder/lexer (... then do Flush!)
148 if (foldOrLex)
149 FoldCamlDoc(startPos, length, initStyle, wl, wa);
150 else
151 ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
152 wa.Flush();
153 // clean up before leaving
154 for (i = nWL - 1; i >= 0; i--)
155 delete wl[i];
156 delete [] wl;
157 }
158
159 static
160 #endif /* BUILD_AS_EXTERNAL_LEXER */
161
ColouriseCamlDoc(unsigned int startPos,int length,int initStyle,WordList * keywordlists[],Accessor & styler)162 void ColouriseCamlDoc(
163 unsigned int startPos, int length,
164 int initStyle,
165 WordList *keywordlists[],
166 Accessor &styler)
167 {
168 // initialize styler
169 styler.StartAt(startPos);
170 styler.StartSegment(startPos);
171 // set up [initial] state info (terminating states that shouldn't "bleed")
172 int state = initStyle, nesting = 0;
173 if (state < SCE_CAML_STRING)
174 state = SCE_CAML_DEFAULT;
175 if (state >= SCE_CAML_COMMENT)
176 nesting = state - SCE_CAML_COMMENT;
177 int chLast = startPos? static_cast<unsigned char>(styler[startPos - 1]): ' ';
178 int chNext = static_cast<unsigned char>(styler[startPos]);
179
180 int chBase = 'd', chToken = 0, chLit = 0, chSkip;
181 WordList& keywords = *keywordlists[0];
182 WordList& keywords2 = *keywordlists[1];
183
184 // foreach char in range...
185 unsigned int i = startPos;
186 const unsigned int endPos = startPos + length;
187 for (; i < endPos; i += chSkip) {
188 // set up [per-char] state info
189 int ch = chNext;
190 chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 1));
191 int state2 = -1; // (ASSUME no state change)
192 int chColor = i - 1;// (ASSUME standard coloring range)
193 chSkip = 1; // (ASSUME scanner "eats" 1 char)
194
195 // this may be the correct thing to do... or not
196 if (styler.IsLeadByte(static_cast<char>(ch))) {
197 chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
198 chSkip++;
199 continue;
200 }
201
202 // step state machine
203 switch (state) {
204 case SCE_CAML_DEFAULT:
205 // it's wide open; what do we have?
206 if (iscamlf(ch))
207 state2 = SCE_CAML_IDENTIFIER, chToken = i;
208 else if (ch == '`')
209 state2 = SCE_CAML_TAGNAME, chToken = i;
210 else if (ch == '#' && isdigit(chNext))
211 state2 = SCE_CAML_LINENUM, chToken = i;
212 else if (isdigit(ch)) {
213 state2 = SCE_CAML_NUMBER,
214 chBase = strchr("xXoObB", chNext)? chNext: 'd';
215 if (chBase != 'd')
216 ch = chNext,
217 chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
218 chSkip++;
219 } else if (ch == '\'') /* (char literal?) */
220 state2 = SCE_CAML_CHAR, chToken = i, chLit = 0;
221 else if (ch == '\"')
222 state2 = SCE_CAML_STRING;
223 else if (ch == '(' && chNext == '*')
224 state2 = SCE_CAML_COMMENT,
225 ch = ' ', // (make SURE "(*)" isn't seen as a closed comment)
226 chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
227 chSkip++, nesting = 0;
228 else if (strchr("!?~" /* Caml "prefix-symbol" */
229 "=<>@^|&+-*/$%" /* Caml "infix-symbol" */
230 "()[]{};,:.#", ch)) /* Caml "bracket" or ;,:.# */
231 state2 = SCE_CAML_OPERATOR, chToken = i;
232 break;
233
234 case SCE_CAML_IDENTIFIER:
235 // [try to] interpret as [additional] identifier char
236 if (!(iscaml(ch) || ch == '\'')) {
237 const int n = i - chToken;
238 if (n < 24) {
239 // length is believable as keyword, [re-]construct token
240 char t[24];
241 int p = 0;
242 for (int q = chToken; p < n; p++, q++)
243 t[p] = styler[q];
244 t[p] = '\0';
245 // special-case "_" token as KEYWORD
246 if ((n == 1 && chLast == '_') || keywords.InList(t))
247 state = SCE_CAML_KEYWORD;
248 else if (keywords2.InList(t))
249 state = SCE_CAML_KEYWORD2;
250 }
251 state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
252 }
253 break;
254
255 case SCE_CAML_TAGNAME:
256 // [try to] interpret as [additional] tagname char
257 if (!(iscaml(ch) || ch == '\''))
258 state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
259 break;
260
261 /*case SCE_CAML_KEYWORD:
262 case SCE_CAML_KEYWORD2:
263 // [try to] interpret as [additional] keyword char
264 if (!iscaml(ch))
265 state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
266 break;*/
267
268 case SCE_CAML_LINENUM:
269 // [try to] interpret as [additional] linenum directive char
270 if (!isdigit(ch))
271 state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
272 break;
273
274 case SCE_CAML_OPERATOR: {
275 // [try to] interpret as [additional] operator char
276 const char* o = 0;
277 if (iscaml(ch) || isspace(ch) /* ident or whitespace */
278 || ((o = strchr(")]};,\'\"`#", ch)) != 0)/* "termination" chars */
279 || !strchr("!$%&*+-./:<=>?@^|~", ch)/* "operator" chars */) {
280 // check for INCLUSIVE termination
281 if (o && strchr(")]};,", ch)) {
282 if ((ch == ')' && chLast == '(') || (ch == ']' && chLast == '['))
283 // special-case "()" and "[]" tokens as KEYWORDS
284 state = SCE_CAML_KEYWORD;
285 chColor++;
286 } else
287 chNext = ch, chSkip--;
288 state2 = SCE_CAML_DEFAULT;
289 }
290 break;
291 }
292
293 case SCE_CAML_NUMBER:
294 // [try to] interpret as [additional] numeric literal char
295 // N.B. - improperly accepts "extra" digits in base 2 or 8 literals
296 if (iscamld(ch) || ((chBase == 'x' || chBase == 'X') && isxdigit(ch)))
297 break;
298 // how about an integer suffix?
299 if ((ch == 'l' || ch == 'L' || ch == 'n')&& (iscamld(chLast)
300 || ((chBase == 'x' || chBase == 'X') && isxdigit(chLast))))
301 break;
302 // or a floating-point literal?
303 if (chBase == 'd') {
304 // with a decimal point?
305 if (ch == '.' && iscamld(chLast))
306 break;
307 // with an exponent? (I)
308 if ((ch == 'e' || ch == 'E') && (iscamld(chLast) || chLast == '.'))
309 break;
310 // with an exponent? (II)
311 if ((ch == '+' || ch == '-') && (chLast == 'e' || chLast == 'E'))
312 break;
313 }
314 // it looks like we have run out of number
315 state2 = SCE_CAML_DEFAULT, chNext = ch, chSkip--;
316 break;
317
318 case SCE_CAML_CHAR:
319 // [try to] interpret as [additional] char literal char
320 if (ch == '\\') {
321 chLit = 1; // (definitely IS a char literal)
322 if (chLast == '\\')
323 ch = ' '; // (so termination test isn't fooled)
324 // should we be terminating - one way or another?
325 } else if ((ch == '\'' && chLast != '\\') || ch == '\r' || ch == '\n') {
326 state2 = SCE_CAML_DEFAULT;
327 if (ch == '\'')
328 chColor++;
329 else
330 state = SCE_CAML_IDENTIFIER;
331 // ... maybe a char literal, maybe not
332 } else if (chLit < 1 && i - chToken >= 2)
333 state = SCE_CAML_IDENTIFIER, chNext = ch, chSkip--;
334 break;
335
336 case SCE_CAML_STRING:
337 // [try to] interpret as [additional] string literal char
338 if (ch == '\\' && chLast == '\\')
339 ch = ' '; // (so '\\' doesn't cause us trouble)
340 else if (ch == '\"' && chLast != '\\')
341 state2 = SCE_CAML_DEFAULT, chColor++;
342 break;
343
344 case SCE_CAML_COMMENT:
345 case SCE_CAML_COMMENT+1:
346 case SCE_CAML_COMMENT+2:
347 case SCE_CAML_COMMENT+3:
348 // we're IN a comment - does this start a NESTED comment?
349 if (ch == '(' && chNext == '*')
350 state2 = state + 1,
351 ch = ' ', // (make SURE "(*)" isn't seen as a closed comment)
352 chNext = static_cast<unsigned char>(styler.SafeGetCharAt(i + 2)),
353 chSkip++, nesting++;
354 // [try to] interpret as [additional] comment char
355 else if (ch == ')' && chLast == '*')
356 state2 = nesting? (state - 1): SCE_CAML_DEFAULT, chColor++, nesting--;
357 break;
358 }
359
360 // handle state change and char coloring as required
361 if (state2 >= 0) {
362 // (1st char will NOT be colored until AT LEAST 2nd char)
363 if (chColor >= 0)
364 styler.ColourTo(chColor, state);
365 state = state2;
366 }
367 chLast = ch;
368 }
369
370 // do any required terminal char coloring (JIC)
371 if (i >= endPos)
372 i = endPos - 1;
373 styler.ColourTo(i, state);
374 // styler.Flush(); // (is this always done by calling code?)
375 }
376
377 #ifdef BUILD_AS_EXTERNAL_LEXER
378 static
379 #endif /* BUILD_AS_EXTERNAL_LEXER */
FoldCamlDoc(unsigned int startPos,int length,int initStyle,WordList * keywordlists[],Accessor & styler)380 void FoldCamlDoc(
381 unsigned int startPos, int length,
382 int initStyle,
383 WordList *keywordlists[],
384 Accessor &styler)
385 {
386 // below useless evaluation(s) to supress "not used" warnings
387 startPos || length || initStyle || keywordlists[0] || styler.Length();
388 }
389
390 static const char * const camlWordListDesc[] = {
391 "Keywords", // primary Objective Caml keywords
392 "Keywords2", // "optional" keywords (typically from Pervasives)
393 0
394 };
395
396 #ifndef BUILD_AS_EXTERNAL_LEXER
397 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
398 #endif /* BUILD_AS_EXTERNAL_LEXER */
399