1 // Scintilla source code edit control
2 /** @file LexLaTeX.cxx
3  ** Lexer for LaTeX2e.
4   **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
// Modified by G. HU in 2013. Added folding, syntax highlighting inside math environments, and changed some minor behaviors.
9 
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdio.h>
13 #include <stdarg.h>
14 #include <assert.h>
15 #include <ctype.h>
16 #include <vector>
17 
18 #include "ILexer.h"
19 #include "Scintilla.h"
20 #include "SciLexer.h"
21 
22 #include "PropSetSimple.h"
23 #include "WordList.h"
24 #include "LexAccessor.h"
25 #include "Accessor.h"
26 #include "StyleContext.h"
27 #include "CharacterSet.h"
28 #include "LexerModule.h"
29 #include "LexerBase.h"
30 
31 using namespace Scintilla;
32 
33 using namespace std;
34 
35 struct latexFoldSave {
latexFoldSavelatexFoldSave36 	latexFoldSave() : structLev(0) {
37 		for (int i = 0; i < 8; ++i) openBegins[i] = 0;
38 	}
latexFoldSavelatexFoldSave39 	latexFoldSave(const latexFoldSave &save) : structLev(save.structLev) {
40 		for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
41 	}
operator =latexFoldSave42 	latexFoldSave &operator=(const latexFoldSave &save) {
43 		if (this != &save) {
44 			structLev = save.structLev;
45 			for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
46 		}
47 		return *this;
48 	}
49 	int openBegins[8];
50 	Sci_Position structLev;
51 };
52 
53 class LexerLaTeX : public LexerBase {
54 private:
55 	vector<int> modes;
setMode(Sci_Position line,int mode)56 	void setMode(Sci_Position line, int mode) {
57 		if (line >= static_cast<Sci_Position>(modes.size())) modes.resize(line + 1, 0);
58 		modes[line] = mode;
59 	}
getMode(Sci_Position line)60 	int getMode(Sci_Position line) {
61 		if (line >= 0 && line < static_cast<Sci_Position>(modes.size())) return modes[line];
62 		return 0;
63 	}
truncModes(Sci_Position numLines)64 	void truncModes(Sci_Position numLines) {
65 		if (static_cast<Sci_Position>(modes.size()) > numLines * 2 + 256)
66 			modes.resize(numLines + 128);
67 	}
68 
69 	vector<latexFoldSave> saves;
setSave(Sci_Position line,const latexFoldSave & save)70 	void setSave(Sci_Position line, const latexFoldSave &save) {
71 		if (line >= static_cast<Sci_Position>(saves.size())) saves.resize(line + 1);
72 		saves[line] = save;
73 	}
getSave(Sci_Position line,latexFoldSave & save)74 	void getSave(Sci_Position line, latexFoldSave &save) {
75 		if (line >= 0 && line < static_cast<Sci_Position>(saves.size())) save = saves[line];
76 		else {
77 			save.structLev = 0;
78 			for (int i = 0; i < 8; ++i) save.openBegins[i] = 0;
79 		}
80 	}
truncSaves(Sci_Position numLines)81 	void truncSaves(Sci_Position numLines) {
82 		if (static_cast<Sci_Position>(saves.size()) > numLines * 2 + 256)
83 			saves.resize(numLines + 128);
84 	}
85 public:
LexerFactoryLaTeX()86 	static ILexer *LexerFactoryLaTeX() {
87 		return new LexerLaTeX();
88 	}
89 	void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
90 	void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
91 };
92 
// True for the characters that, when preceded by a backslash, are styled
// as an escaped special character: # $ % & _ { } and space.
static bool latexIsSpecial(int ch) {
	switch (ch) {
	case '#':
	case '$':
	case '%':
	case '&':
	case '_':
	case '{':
	case '}':
	case ' ':
		return true;
	default:
		return false;
	}
}
97 
// True for a space or a horizontal tab.
static bool latexIsBlank(int ch) {
	switch (ch) {
	case ' ':
	case '\t':
		return true;
	default:
		return false;
	}
}
101 
// True for a space, a horizontal tab, or either line-end character.
static bool latexIsBlankAndNL(int ch) {
	switch (ch) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
105 
latexIsLetter(int ch)106 static bool latexIsLetter(int ch) {
107 	return IsASCII(ch) && isalpha(ch);
108 }
109 
latexIsTagValid(Sci_Position & i,Sci_Position l,Accessor & styler)110 static bool latexIsTagValid(Sci_Position &i, Sci_Position l, Accessor &styler) {
111 	while (i < l) {
112 		if (styler.SafeGetCharAt(i) == '{') {
113 			while (i < l) {
114 				i++;
115 				if (styler.SafeGetCharAt(i) == '}') {
116 					return true;
117 				}	else if (!latexIsLetter(styler.SafeGetCharAt(i)) &&
118                    styler.SafeGetCharAt(i)!='*') {
119 					return false;
120 				}
121 			}
122 		} else if (!latexIsBlank(styler.SafeGetCharAt(i))) {
123 			return false;
124 		}
125 		i++;
126 	}
127 	return false;
128 }
129 
latexNextNotBlankIs(Sci_Position i,Accessor & styler,char needle)130 static bool latexNextNotBlankIs(Sci_Position i, Accessor &styler, char needle) {
131   char ch;
132 	while (i < styler.Length()) {
133     ch = styler.SafeGetCharAt(i);
134 		if (!latexIsBlankAndNL(ch) && ch != '*') {
135       if (ch == needle)
136         return true;
137       else
138         return false;
139 		}
140 		i++;
141 	}
142 	return false;
143 }
144 
latexLastWordIs(Sci_Position start,Accessor & styler,const char * needle)145 static bool latexLastWordIs(Sci_Position start, Accessor &styler, const char *needle) {
146 	Sci_PositionU i = 0;
147 	Sci_PositionU l = static_cast<Sci_PositionU>(strlen(needle));
148 	Sci_Position ini = start-l+1;
149 	char s[32];
150 
151 	while (i < l && i < 31) {
152 		s[i] = styler.SafeGetCharAt(ini + i);
153 		i++;
154 	}
155 	s[i] = '\0';
156 
157 	return (strcmp(s, needle) == 0);
158 }
159 
latexLastWordIsMathEnv(Sci_Position pos,Accessor & styler)160 static bool latexLastWordIsMathEnv(Sci_Position pos, Accessor &styler) {
161 	Sci_Position i, j;
162 	char s[32];
163 	const char *mathEnvs[] = { "align", "alignat", "flalign", "gather",
164 		"multiline", "displaymath", "eqnarray", "equation" };
165 	if (styler.SafeGetCharAt(pos) != '}') return false;
166 	for (i = pos - 1; i >= 0; --i) {
167 		if (styler.SafeGetCharAt(i) == '{') break;
168 		if (pos - i >= 20) return false;
169 	}
170 	if (i < 0 || i == pos - 1) return false;
171 	++i;
172 	for (j = 0; i + j < pos; ++j)
173 		s[j] = styler.SafeGetCharAt(i + j);
174 	s[j] = '\0';
175 	if (j == 0) return false;
176 	if (s[j - 1] == '*') s[--j] = '\0';
177 	for (i = 0; i < static_cast<int>(sizeof(mathEnvs) / sizeof(const char *)); ++i)
178 		if (strcmp(s, mathEnvs[i]) == 0) return true;
179 	return false;
180 }
181 
latexStateReset(int & mode,int & state)182 static inline void latexStateReset(int &mode, int &state) {
183 	switch (mode) {
184 	case 1:     state = SCE_L_MATH; break;
185 	case 2:     state = SCE_L_MATH2; break;
186 	default:    state = SCE_L_DEFAULT; break;
187 	}
188 }
189 
190 // There are cases not handled correctly, like $abcd\textrm{what is $x+y$}z+w$.
191 // But I think it's already good enough.
Lex(Sci_PositionU startPos,Sci_Position length,int initStyle,IDocument * pAccess)192 void SCI_METHOD LexerLaTeX::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
193 	// startPos is assumed to be the first character of a line
194 	Accessor styler(pAccess, &props);
195 	styler.StartAt(startPos);
196 	int mode = getMode(styler.GetLine(startPos) - 1);
197 	int state = initStyle;
198 	if (state == SCE_L_ERROR || state == SCE_L_SHORTCMD || state == SCE_L_SPECIAL)   // should not happen
199 		latexStateReset(mode, state);
200 
201 	char chNext = styler.SafeGetCharAt(startPos);
202 	char chVerbatimDelim = '\0';
203 	styler.StartSegment(startPos);
204 	Sci_Position lengthDoc = startPos + length;
205 
206 	for (Sci_Position i = startPos; i < lengthDoc; i++) {
207 		char ch = chNext;
208 		chNext = styler.SafeGetCharAt(i + 1);
209 
210 		if (styler.IsLeadByte(ch)) {
211 			i++;
212 			chNext = styler.SafeGetCharAt(i + 1);
213 			continue;
214 		}
215 
216 		if (ch == '\r' || ch == '\n')
217 			setMode(styler.GetLine(i), mode);
218 
219 		switch (state) {
220 		case SCE_L_DEFAULT :
221 			switch (ch) {
222 			case '\\' :
223 				styler.ColourTo(i - 1, state);
224 				if (latexIsLetter(chNext)) {
225 					state = SCE_L_COMMAND;
226 				} else if (latexIsSpecial(chNext)) {
227 					styler.ColourTo(i + 1, SCE_L_SPECIAL);
228 					i++;
229 					chNext = styler.SafeGetCharAt(i + 1);
230 				} else if (chNext == '\r' || chNext == '\n') {
231 					styler.ColourTo(i, SCE_L_ERROR);
232 				} else if (IsASCII(chNext)) {
233 					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
234 					if (chNext == '(') {
235 						mode = 1;
236 						state = SCE_L_MATH;
237 					} else if (chNext == '[') {
238 						mode = 2;
239 						state = SCE_L_MATH2;
240 					}
241 					i++;
242 					chNext = styler.SafeGetCharAt(i + 1);
243 				}
244 				break;
245 			case '$' :
246 				styler.ColourTo(i - 1, state);
247 				if (chNext == '$') {
248 					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
249 					mode = 2;
250 					state = SCE_L_MATH2;
251 					i++;
252 					chNext = styler.SafeGetCharAt(i + 1);
253 				} else {
254 					styler.ColourTo(i, SCE_L_SHORTCMD);
255 					mode = 1;
256 					state = SCE_L_MATH;
257 				}
258 				break;
259 			case '%' :
260 				styler.ColourTo(i - 1, state);
261 				state = SCE_L_COMMENT;
262 				break;
263 			}
264 			break;
265 		// These 3 will never be reached.
266 		case SCE_L_ERROR:
267 		case SCE_L_SPECIAL:
268 		case SCE_L_SHORTCMD:
269 			break;
270 		case SCE_L_COMMAND :
271 			if (!latexIsLetter(chNext)) {
272 				styler.ColourTo(i, state);
273 				if (latexNextNotBlankIs(i + 1, styler, '[' )) {
274 					state = SCE_L_CMDOPT;
275 				} else if (latexLastWordIs(i, styler, "\\begin")) {
276 					state = SCE_L_TAG;
277 				} else if (latexLastWordIs(i, styler, "\\end")) {
278 					state = SCE_L_TAG2;
279 				} else if (latexLastWordIs(i, styler, "\\verb") && chNext != '*' && chNext != ' ') {
280 					chVerbatimDelim = chNext;
281 					state = SCE_L_VERBATIM;
282 				} else {
283 					latexStateReset(mode, state);
284 				}
285 			}
286 			break;
287 		case SCE_L_CMDOPT :
288 			if (ch == ']') {
289 				styler.ColourTo(i, state);
290 				latexStateReset(mode, state);
291 			}
292 			break;
293 		case SCE_L_TAG :
294 			if (latexIsTagValid(i, lengthDoc, styler)) {
295 				styler.ColourTo(i, state);
296 				latexStateReset(mode, state);
297 				if (latexLastWordIs(i, styler, "{verbatim}")) {
298 					state = SCE_L_VERBATIM;
299 				} else if (latexLastWordIs(i, styler, "{comment}")) {
300 					state = SCE_L_COMMENT2;
301 				} else if (latexLastWordIs(i, styler, "{math}") && mode == 0) {
302 					mode = 1;
303 					state = SCE_L_MATH;
304 				} else if (latexLastWordIsMathEnv(i, styler) && mode == 0) {
305 					mode = 2;
306 					state = SCE_L_MATH2;
307 				}
308 			} else {
309 				styler.ColourTo(i, SCE_L_ERROR);
310 				latexStateReset(mode, state);
311 				ch = styler.SafeGetCharAt(i);
312 				if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
313 			}
314 			chNext = styler.SafeGetCharAt(i+1);
315 			break;
316 		case SCE_L_TAG2 :
317 			if (latexIsTagValid(i, lengthDoc, styler)) {
318 				styler.ColourTo(i, state);
319 				latexStateReset(mode, state);
320 			} else {
321 				styler.ColourTo(i, SCE_L_ERROR);
322 				latexStateReset(mode, state);
323 				ch = styler.SafeGetCharAt(i);
324 				if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
325 			}
326 			chNext = styler.SafeGetCharAt(i+1);
327 			break;
328 		case SCE_L_MATH :
329 			switch (ch) {
330 			case '\\' :
331 				styler.ColourTo(i - 1, state);
332 				if (latexIsLetter(chNext)) {
333 					Sci_Position match = i + 3;
334 					if (latexLastWordIs(match, styler, "\\end")) {
335 						match++;
336 						if (latexIsTagValid(match, lengthDoc, styler)) {
337 							if (latexLastWordIs(match, styler, "{math}"))
338 								mode = 0;
339 						}
340 					}
341 					state = SCE_L_COMMAND;
342 				} else if (latexIsSpecial(chNext)) {
343 					styler.ColourTo(i + 1, SCE_L_SPECIAL);
344 					i++;
345 					chNext = styler.SafeGetCharAt(i + 1);
346 				} else if (chNext == '\r' || chNext == '\n') {
347 					styler.ColourTo(i, SCE_L_ERROR);
348 				} else if (IsASCII(chNext)) {
349 					if (chNext == ')') {
350 						mode = 0;
351 						state = SCE_L_DEFAULT;
352 					}
353 					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
354 					i++;
355 					chNext = styler.SafeGetCharAt(i + 1);
356 				}
357 				break;
358 			case '$' :
359 				styler.ColourTo(i - 1, state);
360 				styler.ColourTo(i, SCE_L_SHORTCMD);
361 				mode = 0;
362 				state = SCE_L_DEFAULT;
363 				break;
364 			case '%' :
365 				styler.ColourTo(i - 1, state);
366 				state = SCE_L_COMMENT;
367 				break;
368 			}
369 			break;
370 		case SCE_L_MATH2 :
371 			switch (ch) {
372 			case '\\' :
373 				styler.ColourTo(i - 1, state);
374 				if (latexIsLetter(chNext)) {
375 					Sci_Position match = i + 3;
376 					if (latexLastWordIs(match, styler, "\\end")) {
377 						match++;
378 						if (latexIsTagValid(match, lengthDoc, styler)) {
379 							if (latexLastWordIsMathEnv(match, styler))
380 								mode = 0;
381 						}
382 					}
383 					state = SCE_L_COMMAND;
384 				} else if (latexIsSpecial(chNext)) {
385 					styler.ColourTo(i + 1, SCE_L_SPECIAL);
386 					i++;
387 					chNext = styler.SafeGetCharAt(i + 1);
388 				} else if (chNext == '\r' || chNext == '\n') {
389 					styler.ColourTo(i, SCE_L_ERROR);
390 				} else if (IsASCII(chNext)) {
391 					if (chNext == ']') {
392 						mode = 0;
393 						state = SCE_L_DEFAULT;
394 					}
395 					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
396 					i++;
397 					chNext = styler.SafeGetCharAt(i + 1);
398 				}
399 				break;
400 			case '$' :
401 				styler.ColourTo(i - 1, state);
402 				if (chNext == '$') {
403 					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
404 					i++;
405 					chNext = styler.SafeGetCharAt(i + 1);
406 					mode = 0;
407 					state = SCE_L_DEFAULT;
408 				} else { // This may not be an error, e.g. \begin{equation}\text{$a$}\end{equation}
409 					styler.ColourTo(i, SCE_L_SHORTCMD);
410 				}
411 				break;
412 			case '%' :
413 				styler.ColourTo(i - 1, state);
414 				state = SCE_L_COMMENT;
415 				break;
416 			}
417 			break;
418 		case SCE_L_COMMENT :
419 			if (ch == '\r' || ch == '\n') {
420 				styler.ColourTo(i - 1, state);
421 				latexStateReset(mode, state);
422 			}
423 			break;
424 		case SCE_L_COMMENT2 :
425 			if (ch == '\\') {
426 				Sci_Position match = i + 3;
427 				if (latexLastWordIs(match, styler, "\\end")) {
428 					match++;
429 					if (latexIsTagValid(match, lengthDoc, styler)) {
430 						if (latexLastWordIs(match, styler, "{comment}")) {
431 							styler.ColourTo(i - 1, state);
432 							state = SCE_L_COMMAND;
433 						}
434 					}
435 				}
436 			}
437 			break;
438 		case SCE_L_VERBATIM :
439 			if (ch == '\\') {
440 				Sci_Position match = i + 3;
441 				if (latexLastWordIs(match, styler, "\\end")) {
442 					match++;
443 					if (latexIsTagValid(match, lengthDoc, styler)) {
444 						if (latexLastWordIs(match, styler, "{verbatim}")) {
445 							styler.ColourTo(i - 1, state);
446 							state = SCE_L_COMMAND;
447 						}
448 					}
449 				}
450 			} else if (chNext == chVerbatimDelim) {
451 				styler.ColourTo(i + 1, state);
452 				latexStateReset(mode, state);
453 				chVerbatimDelim = '\0';
454 				i++;
455 				chNext = styler.SafeGetCharAt(i + 1);
456 			} else if (chVerbatimDelim != '\0' && (ch == '\n' || ch == '\r')) {
457 				styler.ColourTo(i, SCE_L_ERROR);
458 				latexStateReset(mode, state);
459 				chVerbatimDelim = '\0';
460 			}
461 			break;
462 		}
463 	}
464 	if (lengthDoc == styler.Length()) truncModes(styler.GetLine(lengthDoc - 1));
465 	styler.ColourTo(lengthDoc - 1, state);
466 	styler.Flush();
467 }
468 
latexFoldSaveToInt(const latexFoldSave & save)469 static int latexFoldSaveToInt(const latexFoldSave &save) {
470 	int sum = 0;
471 	for (int i = 0; i <= save.structLev; ++i)
472 		sum += save.openBegins[i];
473 	return ((sum + save.structLev + SC_FOLDLEVELBASE) & SC_FOLDLEVELNUMBERMASK);
474 }
475 
476 // Change folding state while processing a line
477 // Return the level before the first relevant command
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)478 void SCI_METHOD LexerLaTeX::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) {
479 	const char *structWords[7] = {"part", "chapter", "section", "subsection",
480 		"subsubsection", "paragraph", "subparagraph"};
481 	Accessor styler(pAccess, &props);
482 	Sci_PositionU endPos = startPos + length;
483 	Sci_Position curLine = styler.GetLine(startPos);
484 	latexFoldSave save;
485 	getSave(curLine - 1, save);
486 	do {
487 		char ch, buf[16];
488 		Sci_Position i, j;
489 		int lev = -1;
490 		bool needFold = false;
491 		for (i = static_cast<Sci_Position>(startPos); i < static_cast<Sci_Position>(endPos); ++i) {
492 			ch = styler.SafeGetCharAt(i);
493 			if (ch == '\r' || ch == '\n') break;
494 			if (ch != '\\' || styler.StyleAt(i) != SCE_L_COMMAND) continue;
495 			for (j = 0; j < 15 && i + 1 < static_cast<Sci_Position>(endPos); ++j, ++i) {
496 				buf[j] = styler.SafeGetCharAt(i + 1);
497 				if (!latexIsLetter(buf[j])) break;
498 			}
499 			buf[j] = '\0';
500 			if (strcmp(buf, "begin") == 0) {
501 				if (lev < 0) lev = latexFoldSaveToInt(save);
502 				++save.openBegins[save.structLev];
503 				needFold = true;
504 			}
505 			else if (strcmp(buf, "end") == 0) {
506 				while (save.structLev > 0 && save.openBegins[save.structLev] == 0)
507 					--save.structLev;
508 				if (lev < 0) lev = latexFoldSaveToInt(save);
509 				if (save.openBegins[save.structLev] > 0) --save.openBegins[save.structLev];
510 			}
511 			else {
512 				for (j = 0; j < 7; ++j)
513 					if (strcmp(buf, structWords[j]) == 0) break;
514 				if (j >= 7) continue;
515 				save.structLev = j;   // level before the command
516 				for (j = save.structLev + 1; j < 8; ++j) {
517 					save.openBegins[save.structLev] += save.openBegins[j];
518 					save.openBegins[j] = 0;
519 				}
520 				if (lev < 0) lev = latexFoldSaveToInt(save);
521 				++save.structLev;   // level after the command
522 				needFold = true;
523 			}
524 		}
525 		if (lev < 0) lev = latexFoldSaveToInt(save);
526 		if (needFold) lev |= SC_FOLDLEVELHEADERFLAG;
527 		styler.SetLevel(curLine, lev);
528 		setSave(curLine, save);
529 		++curLine;
530 		startPos = styler.LineStart(curLine);
531 		if (static_cast<Sci_Position>(startPos) == styler.Length()) {
532 			lev = latexFoldSaveToInt(save);
533 			styler.SetLevel(curLine, lev);
534 			setSave(curLine, save);
535 			truncSaves(curLine);
536 		}
537 	} while (startPos < endPos);
538 	styler.Flush();
539 }
540 
541 static const char *const emptyWordListDesc[] = {
542 	0
543 };
544 
545 LexerModule lmLatex(SCLEX_LATEX, LexerLaTeX::LexerFactoryLaTeX, "latex", emptyWordListDesc);
546