1 // Scintilla source code edit control
2 /** @file LexLaTeX.cxx
3  ** Lexer for LaTeX2e.
4   **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
// Modified by G. HU in 2013. Added folding, syntax highlighting inside math environments, and changed some minor behaviors.
9 
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdio.h>
13 #include <stdarg.h>
14 #include <assert.h>
15 #include <ctype.h>
16 #include <vector>
17 
18 #include "ILexer.h"
19 #include "Scintilla.h"
20 #include "SciLexer.h"
21 
22 #include "PropSetSimple.h"
23 #include "WordList.h"
24 #include "LexAccessor.h"
25 #include "Accessor.h"
26 #include "StyleContext.h"
27 #include "CharacterSet.h"
28 #include "LexerModule.h"
29 #include "DefaultLexer.h"
30 #include "LexerBase.h"
31 
32 using namespace Scintilla;
33 
34 using namespace std;
35 
36 struct latexFoldSave {
latexFoldSavelatexFoldSave37 	latexFoldSave() : structLev(0) {
38 		for (int i = 0; i < 8; ++i) openBegins[i] = 0;
39 	}
latexFoldSavelatexFoldSave40 	latexFoldSave(const latexFoldSave &save) : structLev(save.structLev) {
41 		for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
42 	}
operator =latexFoldSave43 	latexFoldSave &operator=(const latexFoldSave &save) {
44 		if (this != &save) {
45 			structLev = save.structLev;
46 			for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
47 		}
48 		return *this;
49 	}
50 	int openBegins[8];
51 	Sci_Position structLev;
52 };
53 
54 class LexerLaTeX : public LexerBase {
55 private:
56 	vector<int> modes;
setMode(Sci_Position line,int mode)57 	void setMode(Sci_Position line, int mode) {
58 		if (line >= static_cast<Sci_Position>(modes.size())) modes.resize(line + 1, 0);
59 		modes[line] = mode;
60 	}
getMode(Sci_Position line)61 	int getMode(Sci_Position line) {
62 		if (line >= 0 && line < static_cast<Sci_Position>(modes.size())) return modes[line];
63 		return 0;
64 	}
truncModes(Sci_Position numLines)65 	void truncModes(Sci_Position numLines) {
66 		if (static_cast<Sci_Position>(modes.size()) > numLines * 2 + 256)
67 			modes.resize(numLines + 128);
68 	}
69 
70 	vector<latexFoldSave> saves;
setSave(Sci_Position line,const latexFoldSave & save)71 	void setSave(Sci_Position line, const latexFoldSave &save) {
72 		if (line >= static_cast<Sci_Position>(saves.size())) saves.resize(line + 1);
73 		saves[line] = save;
74 	}
getSave(Sci_Position line,latexFoldSave & save)75 	void getSave(Sci_Position line, latexFoldSave &save) {
76 		if (line >= 0 && line < static_cast<Sci_Position>(saves.size())) save = saves[line];
77 		else {
78 			save.structLev = 0;
79 			for (int i = 0; i < 8; ++i) save.openBegins[i] = 0;
80 		}
81 	}
truncSaves(Sci_Position numLines)82 	void truncSaves(Sci_Position numLines) {
83 		if (static_cast<Sci_Position>(saves.size()) > numLines * 2 + 256)
84 			saves.resize(numLines + 128);
85 	}
86 public:
LexerFactoryLaTeX()87 	static ILexer5 *LexerFactoryLaTeX() {
88 		return new LexerLaTeX();
89 	}
90 	void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
91 	void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
92 
93 	// ILexer5 methods
GetName()94 	const char * SCI_METHOD GetName() override {
95 		return "latex";
96 	}
GetIdentifier()97 	int SCI_METHOD  GetIdentifier() override {
98 		return SCLEX_LATEX;
99 	}
100 };
101 
/// True when @a ch is one of the characters that form an escaped special
/// after a backslash: # $ % & _ { } or a space.
static bool latexIsSpecial(int ch) {
	switch (ch) {
	case '#':
	case '$':
	case '%':
	case '&':
	case '_':
	case '{':
	case '}':
	case ' ':
		return true;
	default:
		return false;
	}
}
106 
/// True for a space or a horizontal tab.
static bool latexIsBlank(int ch) {
	if (ch == '\t') return true;
	return ch == ' ';
}
110 
/// True for a space, a horizontal tab, a carriage return or a line feed.
static bool latexIsBlankAndNL(int ch) {
	switch (ch) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
114 
latexIsLetter(int ch)115 static bool latexIsLetter(int ch) {
116 	return IsASCII(ch) && isalpha(ch);
117 }
118 
latexIsTagValid(Sci_Position & i,Sci_Position l,Accessor & styler)119 static bool latexIsTagValid(Sci_Position &i, Sci_Position l, Accessor &styler) {
120 	while (i < l) {
121 		if (styler.SafeGetCharAt(i) == '{') {
122 			while (i < l) {
123 				i++;
124 				if (styler.SafeGetCharAt(i) == '}') {
125 					return true;
126 				}	else if (!latexIsLetter(styler.SafeGetCharAt(i)) &&
127                    styler.SafeGetCharAt(i)!='*') {
128 					return false;
129 				}
130 			}
131 		} else if (!latexIsBlank(styler.SafeGetCharAt(i))) {
132 			return false;
133 		}
134 		i++;
135 	}
136 	return false;
137 }
138 
latexNextNotBlankIs(Sci_Position i,Accessor & styler,char needle)139 static bool latexNextNotBlankIs(Sci_Position i, Accessor &styler, char needle) {
140   char ch;
141 	while (i < styler.Length()) {
142     ch = styler.SafeGetCharAt(i);
143 		if (!latexIsBlankAndNL(ch) && ch != '*') {
144       if (ch == needle)
145         return true;
146       else
147         return false;
148 		}
149 		i++;
150 	}
151 	return false;
152 }
153 
latexLastWordIs(Sci_Position start,Accessor & styler,const char * needle)154 static bool latexLastWordIs(Sci_Position start, Accessor &styler, const char *needle) {
155 	Sci_PositionU i = 0;
156 	Sci_PositionU l = static_cast<Sci_PositionU>(strlen(needle));
157 	Sci_Position ini = start-l+1;
158 	char s[32];
159 
160 	while (i < l && i < 31) {
161 		s[i] = styler.SafeGetCharAt(ini + i);
162 		i++;
163 	}
164 	s[i] = '\0';
165 
166 	return (strcmp(s, needle) == 0);
167 }
168 
latexLastWordIsMathEnv(Sci_Position pos,Accessor & styler)169 static bool latexLastWordIsMathEnv(Sci_Position pos, Accessor &styler) {
170 	Sci_Position i, j;
171 	char s[32];
172 	const char *mathEnvs[] = { "align", "alignat", "flalign", "gather",
173 		"multiline", "displaymath", "eqnarray", "equation" };
174 	if (styler.SafeGetCharAt(pos) != '}') return false;
175 	for (i = pos - 1; i >= 0; --i) {
176 		if (styler.SafeGetCharAt(i) == '{') break;
177 		if (pos - i >= 20) return false;
178 	}
179 	if (i < 0 || i == pos - 1) return false;
180 	++i;
181 	for (j = 0; i + j < pos; ++j)
182 		s[j] = styler.SafeGetCharAt(i + j);
183 	s[j] = '\0';
184 	if (j == 0) return false;
185 	if (s[j - 1] == '*') s[--j] = '\0';
186 	for (i = 0; i < static_cast<int>(sizeof(mathEnvs) / sizeof(const char *)); ++i)
187 		if (strcmp(s, mathEnvs[i]) == 0) return true;
188 	return false;
189 }
190 
latexStateReset(int & mode,int & state)191 static inline void latexStateReset(int &mode, int &state) {
192 	switch (mode) {
193 	case 1:     state = SCE_L_MATH; break;
194 	case 2:     state = SCE_L_MATH2; break;
195 	default:    state = SCE_L_DEFAULT; break;
196 	}
197 }
198 
199 // There are cases not handled correctly, like $abcd\textrm{what is $x+y$}z+w$.
200 // But I think it's already good enough.
/**
 * Style the document range [startPos, startPos + length).
 *
 * The lexer is a character-driven state machine whose state is the current
 * style. Alongside the state it tracks a math mode (0, 1 or 2, mapped by
 * latexStateReset() to SCE_L_DEFAULT, SCE_L_MATH and SCE_L_MATH2). The mode in
 * effect at every line end is stored with setMode() so that a later call can
 * resume from the line preceding startPos.
 *
 * @param startPos  first position to style.
 * @param length    number of positions to style.
 * @param initStyle style to start in.
 * @param pAccess   document to read from and style.
 */
void SCI_METHOD LexerLaTeX::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
	// startPos is assumed to be the first character of a line
	Accessor styler(pAccess, &props);
	styler.StartAt(startPos);
	// Resume with the mode recorded for the previous line (0 when nothing is stored).
	int mode = getMode(styler.GetLine(startPos) - 1);
	int state = initStyle;
	if (state == SCE_L_ERROR || state == SCE_L_SHORTCMD || state == SCE_L_SPECIAL)   // should not happen
		latexStateReset(mode, state);

	char chNext = styler.SafeGetCharAt(startPos);
	// Closing delimiter captured after \verb; '\0' when no such delimiter is pending.
	char chVerbatimDelim = '\0';
	styler.StartSegment(startPos);
	Sci_Position lengthDoc = startPos + length;

	for (Sci_Position i = startPos; i < lengthDoc; i++) {
		char ch = chNext;
		chNext = styler.SafeGetCharAt(i + 1);

		// Skip the byte following a lead byte; neither byte is examined further.
		if (styler.IsLeadByte(ch)) {
			i++;
			chNext = styler.SafeGetCharAt(i + 1);
			continue;
		}

		// Remember the mode in effect at each line-end character.
		if (ch == '\r' || ch == '\n')
			setMode(styler.GetLine(i), mode);

		switch (state) {
		case SCE_L_DEFAULT :
			switch (ch) {
			case '\\' :
				styler.ColourTo(i - 1, state);
				if (latexIsLetter(chNext)) {
					state = SCE_L_COMMAND;
				} else if (latexIsSpecial(chNext)) {
					styler.ColourTo(i + 1, SCE_L_SPECIAL);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else if (chNext == '\r' || chNext == '\n') {
					// A backslash directly before a line end is flagged as an error.
					styler.ColourTo(i, SCE_L_ERROR);
				} else if (IsASCII(chNext)) {
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					// \( and \[ switch to the two math modes.
					if (chNext == '(') {
						mode = 1;
						state = SCE_L_MATH;
					} else if (chNext == '[') {
						mode = 2;
						state = SCE_L_MATH2;
					}
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				}
				break;
			case '$' :
				styler.ColourTo(i - 1, state);
				// "$$" enters mode 2, a single "$" enters mode 1.
				if (chNext == '$') {
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					mode = 2;
					state = SCE_L_MATH2;
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else {
					styler.ColourTo(i, SCE_L_SHORTCMD);
					mode = 1;
					state = SCE_L_MATH;
				}
				break;
			case '%' :
				styler.ColourTo(i - 1, state);
				state = SCE_L_COMMENT;
				break;
			}
			break;
		// These 3 will never be reached.
		case SCE_L_ERROR:
		case SCE_L_SPECIAL:
		case SCE_L_SHORTCMD:
			break;
		case SCE_L_COMMAND :
			// The command name ends at i when the next character is not a letter.
			if (!latexIsLetter(chNext)) {
				styler.ColourTo(i, state);
				if (latexNextNotBlankIs(i + 1, styler, '[' )) {
					state = SCE_L_CMDOPT;
				} else if (latexLastWordIs(i, styler, "\\begin")) {
					state = SCE_L_TAG;
				} else if (latexLastWordIs(i, styler, "\\end")) {
					state = SCE_L_TAG2;
				} else if (latexLastWordIs(i, styler, "\\verb") && chNext != '*' && chNext != ' ') {
					// The character right after \verb is taken as the delimiter.
					chVerbatimDelim = chNext;
					state = SCE_L_VERBATIM;
				} else {
					latexStateReset(mode, state);
				}
			}
			break;
		case SCE_L_CMDOPT :
			if (ch == ']') {
				styler.ColourTo(i, state);
				latexStateReset(mode, state);
			}
			break;
		case SCE_L_TAG :
			// latexIsTagValid() moves i; on success i is on the closing '}'.
			if (latexIsTagValid(i, lengthDoc, styler)) {
				styler.ColourTo(i, state);
				latexStateReset(mode, state);
				// Some environments switch to a dedicated state after \begin{...}.
				if (latexLastWordIs(i, styler, "{verbatim}")) {
					state = SCE_L_VERBATIM;
				} else if (latexLastWordIs(i, styler, "{lstlisting}")) {
					state = SCE_L_VERBATIM;
				} else if (latexLastWordIs(i, styler, "{comment}")) {
					state = SCE_L_COMMENT2;
				} else if (latexLastWordIs(i, styler, "{math}") && mode == 0) {
					mode = 1;
					state = SCE_L_MATH;
				} else if (latexLastWordIsMathEnv(i, styler) && mode == 0) {
					mode = 2;
					state = SCE_L_MATH2;
				}
			} else {
				styler.ColourTo(i, SCE_L_ERROR);
				latexStateReset(mode, state);
				// i was moved by latexIsTagValid(); record the mode if it now sits on a line end.
				ch = styler.SafeGetCharAt(i);
				if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
			}
			// Re-synchronise the look-ahead with the moved cursor.
			chNext = styler.SafeGetCharAt(i+1);
			break;
		case SCE_L_TAG2 :
			// Tag following \end: same validation as above, without environment-specific states.
			if (latexIsTagValid(i, lengthDoc, styler)) {
				styler.ColourTo(i, state);
				latexStateReset(mode, state);
			} else {
				styler.ColourTo(i, SCE_L_ERROR);
				latexStateReset(mode, state);
				ch = styler.SafeGetCharAt(i);
				if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
			}
			chNext = styler.SafeGetCharAt(i+1);
			break;
		case SCE_L_MATH :
			switch (ch) {
			case '\\' :
				styler.ColourTo(i - 1, state);
				if (latexIsLetter(chNext)) {
					// Look ahead for "\end{math}": i + 3 is where a 4-character "\end" would finish.
					// When found, the mode is cleared so the state resets to SCE_L_DEFAULT
					// once the command and its tag have been styled.
					Sci_Position match = i + 3;
					if (latexLastWordIs(match, styler, "\\end")) {
						match++;
						if (latexIsTagValid(match, lengthDoc, styler)) {
							if (latexLastWordIs(match, styler, "{math}"))
								mode = 0;
						}
					}
					state = SCE_L_COMMAND;
				} else if (latexIsSpecial(chNext)) {
					styler.ColourTo(i + 1, SCE_L_SPECIAL);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else if (chNext == '\r' || chNext == '\n') {
					styler.ColourTo(i, SCE_L_ERROR);
				} else if (IsASCII(chNext)) {
					// \) leaves this math mode.
					if (chNext == ')') {
						mode = 0;
						state = SCE_L_DEFAULT;
					}
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				}
				break;
			case '$' :
				// A "$" leaves this math mode.
				styler.ColourTo(i - 1, state);
				styler.ColourTo(i, SCE_L_SHORTCMD);
				mode = 0;
				state = SCE_L_DEFAULT;
				break;
			case '%' :
				styler.ColourTo(i - 1, state);
				state = SCE_L_COMMENT;
				break;
			}
			break;
		case SCE_L_MATH2 :
			switch (ch) {
			case '\\' :
				styler.ColourTo(i - 1, state);
				if (latexIsLetter(chNext)) {
					// Look ahead for "\end{<math environment>}" and clear the mode when found.
					Sci_Position match = i + 3;
					if (latexLastWordIs(match, styler, "\\end")) {
						match++;
						if (latexIsTagValid(match, lengthDoc, styler)) {
							if (latexLastWordIsMathEnv(match, styler))
								mode = 0;
						}
					}
					state = SCE_L_COMMAND;
				} else if (latexIsSpecial(chNext)) {
					styler.ColourTo(i + 1, SCE_L_SPECIAL);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else if (chNext == '\r' || chNext == '\n') {
					styler.ColourTo(i, SCE_L_ERROR);
				} else if (IsASCII(chNext)) {
					// \] leaves this math mode.
					if (chNext == ']') {
						mode = 0;
						state = SCE_L_DEFAULT;
					}
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				}
				break;
			case '$' :
				styler.ColourTo(i - 1, state);
				// Only "$$" leaves this math mode.
				if (chNext == '$') {
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
					mode = 0;
					state = SCE_L_DEFAULT;
				} else { // This may not be an error, e.g. \begin{equation}\text{$a$}\end{equation}
					styler.ColourTo(i, SCE_L_SHORTCMD);
				}
				break;
			case '%' :
				styler.ColourTo(i - 1, state);
				state = SCE_L_COMMENT;
				break;
			}
			break;
		case SCE_L_COMMENT :
			// A '%' comment runs up to, but not including, the line end.
			if (ch == '\r' || ch == '\n') {
				styler.ColourTo(i - 1, state);
				latexStateReset(mode, state);
			}
			break;
		case SCE_L_COMMENT2 :
			// Comment environment: only "\end{comment}" terminates it.
			if (ch == '\\') {
				Sci_Position match = i + 3;
				if (latexLastWordIs(match, styler, "\\end")) {
					match++;
					if (latexIsTagValid(match, lengthDoc, styler)) {
						if (latexLastWordIs(match, styler, "{comment}")) {
							styler.ColourTo(i - 1, state);
							state = SCE_L_COMMAND;
						}
					}
				}
			}
			break;
		case SCE_L_VERBATIM :
			if (ch == '\\') {
				// "\end{verbatim}" or "\end{lstlisting}" terminates the verbatim text.
				Sci_Position match = i + 3;
				if (latexLastWordIs(match, styler, "\\end")) {
					match++;
					if (latexIsTagValid(match, lengthDoc, styler)) {
						if (latexLastWordIs(match, styler, "{verbatim}")) {
							styler.ColourTo(i - 1, state);
							state = SCE_L_COMMAND;
						} else if (latexLastWordIs(match, styler, "{lstlisting}")) {
							styler.ColourTo(i - 1, state);
							state = SCE_L_COMMAND;
						}
					}
				}
			} else if (chNext == chVerbatimDelim) {
				// The next character matches the stored delimiter: include it and leave verbatim.
				styler.ColourTo(i + 1, state);
				latexStateReset(mode, state);
				chVerbatimDelim = '\0';
				i++;
				chNext = styler.SafeGetCharAt(i + 1);
			} else if (chVerbatimDelim != '\0' && (ch == '\n' || ch == '\r')) {
				// A line end arrived while a \verb delimiter was still pending.
				styler.ColourTo(i, SCE_L_ERROR);
				latexStateReset(mode, state);
				chVerbatimDelim = '\0';
			}
			break;
		}
	}
	// When the range reaches the end of the document, let the mode cache shrink.
	if (lengthDoc == styler.Length()) truncModes(styler.GetLine(lengthDoc - 1));
	styler.ColourTo(lengthDoc - 1, state);
	styler.Flush();
}
482 
latexFoldSaveToInt(const latexFoldSave & save)483 static int latexFoldSaveToInt(const latexFoldSave &save) {
484 	int sum = 0;
485 	for (int i = 0; i <= save.structLev; ++i)
486 		sum += save.openBegins[i];
487 	return ((sum + save.structLev + SC_FOLDLEVELBASE) & SC_FOLDLEVELNUMBERMASK);
488 }
489 
/**
 * Compute fold levels for the lines covering [startPos, startPos + length).
 *
 * The folding state is carried from line to line in a latexFoldSave: the state
 * stored for the line before the first processed line is loaded with getSave(),
 * updated while each line is scanned, and stored again with setSave().
 * The level assigned to a line is the level in effect before the first
 * relevant command (\begin, \end or a sectioning command) on that line;
 * lines containing \begin or a sectioning command get the header flag.
 *
 * @param startPos first position to fold.
 * @param length   number of positions to fold.
 * @param pAccess  document to read from and set levels on.
 */
void SCI_METHOD LexerLaTeX::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) {
	// Sectioning commands; the array index is the structural level of the command.
	const char *structWords[7] = {"part", "chapter", "section", "subsection",
		"subsubsection", "paragraph", "subparagraph"};
	Accessor styler(pAccess, &props);
	Sci_PositionU endPos = startPos + length;
	Sci_Position curLine = styler.GetLine(startPos);
	latexFoldSave save;
	getSave(curLine - 1, save);
	// One iteration per line.
	do {
		char ch, buf[16];
		Sci_Position i, j;
		int lev = -1;            // -1 until the level for this line has been fixed
		bool needFold = false;   // set when the line should be a fold header
		for (i = static_cast<Sci_Position>(startPos); i < static_cast<Sci_Position>(endPos); ++i) {
			ch = styler.SafeGetCharAt(i);
			if (ch == '\r' || ch == '\n') break;
			// Only backslashes styled as SCE_L_COMMAND are considered.
			if (ch != '\\' || styler.StyleAt(i) != SCE_L_COMMAND) continue;
			// Copy the letters following the backslash (at most 15) into buf.
			for (j = 0; j < 15 && i + 1 < static_cast<Sci_Position>(endPos); ++j, ++i) {
				buf[j] = styler.SafeGetCharAt(i + 1);
				if (!latexIsLetter(buf[j])) break;
			}
			buf[j] = '\0';
			if (strcmp(buf, "begin") == 0) {
				if (lev < 0) lev = latexFoldSaveToInt(save);
				++save.openBegins[save.structLev];
				needFold = true;
			}
			else if (strcmp(buf, "end") == 0) {
				// Drop to the nearest structural level that still has an open \begin.
				while (save.structLev > 0 && save.openBegins[save.structLev] == 0)
					--save.structLev;
				if (lev < 0) lev = latexFoldSaveToInt(save);
				if (save.openBegins[save.structLev] > 0) --save.openBegins[save.structLev];
			}
			else {
				for (j = 0; j < 7; ++j)
					if (strcmp(buf, structWords[j]) == 0) break;
				if (j >= 7) continue;   // not a sectioning command
				save.structLev = j;   // level before the command
				// Fold the open-\begin counts of all deeper levels into this level.
				for (j = save.structLev + 1; j < 8; ++j) {
					save.openBegins[save.structLev] += save.openBegins[j];
					save.openBegins[j] = 0;
				}
				if (lev < 0) lev = latexFoldSaveToInt(save);
				++save.structLev;   // level after the command
				needFold = true;
			}
		}
		// No relevant command on this line: use the level of the carried state.
		if (lev < 0) lev = latexFoldSaveToInt(save);
		if (needFold) lev |= SC_FOLDLEVELHEADERFLAG;
		styler.SetLevel(curLine, lev);
		setSave(curLine, save);
		++curLine;
		startPos = styler.LineStart(curLine);
		// The next line starts at the document end: set its level too and
		// let the fold-state cache shrink.
		if (static_cast<Sci_Position>(startPos) == styler.Length()) {
			lev = latexFoldSaveToInt(save);
			styler.SetLevel(curLine, lev);
			setSave(curLine, save);
			truncSaves(curLine);
		}
	} while (startPos < endPos);
	styler.Flush();
}
554 
555 static const char *const emptyWordListDesc[] = {
556 	0
557 };
558 
559 LexerModule lmLatex(SCLEX_LATEX, LexerLaTeX::LexerFactoryLaTeX, "latex", emptyWordListDesc);
560