1 // Scintilla source code edit control
2 /** @file LexLaTeX.cxx
3  ** Lexer for LaTeX2e.
4   **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
// Modified by G. HU in 2013. Added folding, syntax highlighting inside math environments, and changed some minor behaviors.
9 
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdio.h>
13 #include <stdarg.h>
14 #include <assert.h>
15 #include <ctype.h>
16 #include <vector>
17 
18 #include "ILexer.h"
19 #include "Scintilla.h"
20 #include "SciLexer.h"
21 
22 #include "PropSetSimple.h"
23 #include "WordList.h"
24 #include "LexAccessor.h"
25 #include "Accessor.h"
26 #include "StyleContext.h"
27 #include "CharacterSet.h"
28 #include "LexerModule.h"
29 #include "DefaultLexer.h"
30 #include "LexerBase.h"
31 
32 using namespace Scintilla;
33 
34 using namespace std;
35 
36 struct latexFoldSave {
latexFoldSavelatexFoldSave37 	latexFoldSave() : structLev(0) {
38 		for (int i = 0; i < 8; ++i) openBegins[i] = 0;
39 	}
latexFoldSavelatexFoldSave40 	latexFoldSave(const latexFoldSave &save) : structLev(save.structLev) {
41 		for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
42 	}
operator =latexFoldSave43 	latexFoldSave &operator=(const latexFoldSave &save) {
44 		if (this != &save) {
45 			structLev = save.structLev;
46 			for (int i = 0; i < 8; ++i) openBegins[i] = save.openBegins[i];
47 		}
48 		return *this;
49 	}
50 	int openBegins[8];
51 	Sci_Position structLev;
52 };
53 
54 class LexerLaTeX : public LexerBase {
55 private:
56 	vector<int> modes;
setMode(Sci_Position line,int mode)57 	void setMode(Sci_Position line, int mode) {
58 		if (line >= static_cast<Sci_Position>(modes.size())) modes.resize(line + 1, 0);
59 		modes[line] = mode;
60 	}
getMode(Sci_Position line)61 	int getMode(Sci_Position line) {
62 		if (line >= 0 && line < static_cast<Sci_Position>(modes.size())) return modes[line];
63 		return 0;
64 	}
truncModes(Sci_Position numLines)65 	void truncModes(Sci_Position numLines) {
66 		if (static_cast<Sci_Position>(modes.size()) > numLines * 2 + 256)
67 			modes.resize(numLines + 128);
68 	}
69 
70 	vector<latexFoldSave> saves;
setSave(Sci_Position line,const latexFoldSave & save)71 	void setSave(Sci_Position line, const latexFoldSave &save) {
72 		if (line >= static_cast<Sci_Position>(saves.size())) saves.resize(line + 1);
73 		saves[line] = save;
74 	}
getSave(Sci_Position line,latexFoldSave & save)75 	void getSave(Sci_Position line, latexFoldSave &save) {
76 		if (line >= 0 && line < static_cast<Sci_Position>(saves.size())) save = saves[line];
77 		else {
78 			save.structLev = 0;
79 			for (int i = 0; i < 8; ++i) save.openBegins[i] = 0;
80 		}
81 	}
truncSaves(Sci_Position numLines)82 	void truncSaves(Sci_Position numLines) {
83 		if (static_cast<Sci_Position>(saves.size()) > numLines * 2 + 256)
84 			saves.resize(numLines + 128);
85 	}
86 public:
LexerFactoryLaTeX()87 	static ILexer5 *LexerFactoryLaTeX() {
88 		return new LexerLaTeX();
89 	}
90 	void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
91 	void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
92 
93 	// ILexer5 methods
GetName()94 	const char * SCI_METHOD GetName() override {
95 		return "latex";
96 	}
GetIdentifier()97 	int SCI_METHOD  GetIdentifier() override {
98 		return SCLEX_LATEX;
99 	}
100 };
101 
// True for the characters that the lexer styles as a "special" when they
// directly follow a backslash: # $ % & _ { } and space.
static bool latexIsSpecial(int ch) {
	switch (ch) {
	case '#':
	case '$':
	case '%':
	case '&':
	case '_':
	case '{':
	case '}':
	case ' ':
		return true;
	default:
		return false;
	}
}
106 
// True for horizontal white space only: a space or a tab.
static bool latexIsBlank(int ch) {
	if (ch == '\t')
		return true;
	return ch == ' ';
}
110 
// True for a space, a tab, or either end-of-line character (CR or LF).
static bool latexIsBlankAndNL(int ch) {
	switch (ch) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
114 
latexIsLetter(int ch)115 static bool latexIsLetter(int ch) {
116 	return IsASCII(ch) && isalpha(ch);
117 }
118 
latexIsTagValid(Sci_Position & i,Sci_Position l,Accessor & styler)119 static bool latexIsTagValid(Sci_Position &i, Sci_Position l, Accessor &styler) {
120 	while (i < l) {
121 		if (styler.SafeGetCharAt(i) == '{') {
122 			while (i < l) {
123 				i++;
124 				if (styler.SafeGetCharAt(i) == '}') {
125 					return true;
126 				}	else if (!latexIsLetter(styler.SafeGetCharAt(i)) &&
127                    styler.SafeGetCharAt(i)!='*') {
128 					return false;
129 				}
130 			}
131 		} else if (!latexIsBlank(styler.SafeGetCharAt(i))) {
132 			return false;
133 		}
134 		i++;
135 	}
136 	return false;
137 }
138 
latexNextNotBlankIs(Sci_Position i,Accessor & styler,char needle)139 static bool latexNextNotBlankIs(Sci_Position i, Accessor &styler, char needle) {
140   char ch;
141 	while (i < styler.Length()) {
142     ch = styler.SafeGetCharAt(i);
143 		if (!latexIsBlankAndNL(ch) && ch != '*') {
144       if (ch == needle)
145         return true;
146       else
147         return false;
148 		}
149 		i++;
150 	}
151 	return false;
152 }
153 
latexLastWordIs(Sci_Position start,Accessor & styler,const char * needle)154 static bool latexLastWordIs(Sci_Position start, Accessor &styler, const char *needle) {
155 	Sci_PositionU i = 0;
156 	Sci_PositionU l = static_cast<Sci_PositionU>(strlen(needle));
157 	Sci_Position ini = start-l+1;
158 	char s[32];
159 
160 	while (i < l && i < 31) {
161 		s[i] = styler.SafeGetCharAt(ini + i);
162 		i++;
163 	}
164 	s[i] = '\0';
165 
166 	return (strcmp(s, needle) == 0);
167 }
168 
latexLastWordIsMathEnv(Sci_Position pos,Accessor & styler)169 static bool latexLastWordIsMathEnv(Sci_Position pos, Accessor &styler) {
170 	Sci_Position i, j;
171 	char s[32];
172 	const char *mathEnvs[] = { "align", "alignat", "flalign", "gather",
173 		"multiline", "displaymath", "eqnarray", "equation" };
174 	if (styler.SafeGetCharAt(pos) != '}') return false;
175 	for (i = pos - 1; i >= 0; --i) {
176 		if (styler.SafeGetCharAt(i) == '{') break;
177 		if (pos - i >= 20) return false;
178 	}
179 	if (i < 0 || i == pos - 1) return false;
180 	++i;
181 	for (j = 0; i + j < pos; ++j)
182 		s[j] = styler.SafeGetCharAt(i + j);
183 	s[j] = '\0';
184 	if (j == 0) return false;
185 	if (s[j - 1] == '*') s[--j] = '\0';
186 	for (i = 0; i < static_cast<int>(sizeof(mathEnvs) / sizeof(const char *)); ++i)
187 		if (strcmp(s, mathEnvs[i]) == 0) return true;
188 	return false;
189 }
190 
latexStateReset(int & mode,int & state)191 static inline void latexStateReset(int &mode, int &state) {
192 	switch (mode) {
193 	case 1:     state = SCE_L_MATH; break;
194 	case 2:     state = SCE_L_MATH2; break;
195 	default:    state = SCE_L_DEFAULT; break;
196 	}
197 }
198 
199 // There are cases not handled correctly, like $abcd\textrm{what is $x+y$}z+w$.
200 // But I think it's already good enough.
/**
 * Style the characters in [startPos, startPos + length).
 *
 * The function is a character-by-character state machine. `state` is the
 * style currently being accumulated and `mode` remembers which math mode
 * surrounds the current position (0 = none, 1 = the SCE_L_MATH mode opened by
 * "$", "\(" or \begin{math}, 2 = the SCE_L_MATH2 mode opened by "$$", "\[" or
 * a math environment), so that latexStateReset() can return to the right base
 * style after a command, comment, tag, ... ends.
 *
 * @param startPos  First position to style; assumed to be a line start.
 * @param length    Number of characters to style.
 * @param initStyle Style in effect just before startPos; used as the initial state.
 * @param pAccess   Document being styled.
 */
void SCI_METHOD LexerLaTeX::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
	// startPos is assumed to be the first character of a line
	Accessor styler(pAccess, &props);
	styler.StartAt(startPos);
	// Resume with the mode cached for the previous line (0 if there is none).
	int mode = getMode(styler.GetLine(startPos) - 1);
	int state = initStyle;
	if (state == SCE_L_ERROR || state == SCE_L_SHORTCMD || state == SCE_L_SPECIAL)   // should not happen
		latexStateReset(mode, state);

	char chNext = styler.SafeGetCharAt(startPos);
	// Delimiter of a \verb command in progress; '\0' when the verbatim state
	// was not entered through \verb (e.g. through a verbatim environment).
	char chVerbatimDelim = '\0';
	styler.StartSegment(startPos);
	Sci_Position lengthDoc = startPos + length;

	for (Sci_Position i = startPos; i < lengthDoc; i++) {
		char ch = chNext;
		chNext = styler.SafeGetCharAt(i + 1);

		// When the accessor reports ch as a lead byte, skip it together with
		// the byte that follows it.
		if (styler.IsLeadByte(ch)) {
			i++;
			chNext = styler.SafeGetCharAt(i + 1);
			continue;
		}

		// Cache the mode in effect at each end-of-line character.
		if (ch == '\r' || ch == '\n')
			setMode(styler.GetLine(i), mode);

		switch (state) {
		case SCE_L_DEFAULT :
			switch (ch) {
			case '\\' :
				styler.ColourTo(i - 1, state);
				if (latexIsLetter(chNext)) {
					// Start of a command word.
					state = SCE_L_COMMAND;
				} else if (latexIsSpecial(chNext)) {
					// Backslash + special character: styled as a two-character unit.
					styler.ColourTo(i + 1, SCE_L_SPECIAL);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else if (chNext == '\r' || chNext == '\n') {
					// Backslash directly before a line end.
					styler.ColourTo(i, SCE_L_ERROR);
				} else if (IsASCII(chNext)) {
					// Backslash + any other ASCII character; "\(" and "\[" open math.
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					if (chNext == '(') {
						mode = 1;
						state = SCE_L_MATH;
					} else if (chNext == '[') {
						mode = 2;
						state = SCE_L_MATH2;
					}
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				}
				break;
			case '$' :
				styler.ColourTo(i - 1, state);
				if (chNext == '$') {
					// "$$" opens mode 2.
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					mode = 2;
					state = SCE_L_MATH2;
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else {
					// A single "$" opens mode 1.
					styler.ColourTo(i, SCE_L_SHORTCMD);
					mode = 1;
					state = SCE_L_MATH;
				}
				break;
			case '%' :
				styler.ColourTo(i - 1, state);
				state = SCE_L_COMMENT;
				break;
			}
			break;
		// These 3 will never be reached.
		case SCE_L_ERROR:
		case SCE_L_SPECIAL:
		case SCE_L_SHORTCMD:
			break;
		case SCE_L_COMMAND :
			// The command word ends at the last letter; decide what follows it.
			if (!latexIsLetter(chNext)) {
				styler.ColourTo(i, state);
				if (latexNextNotBlankIs(i + 1, styler, '[' )) {
					state = SCE_L_CMDOPT;
				} else if (latexLastWordIs(i, styler, "\\begin")) {
					state = SCE_L_TAG;
				} else if (latexLastWordIs(i, styler, "\\end")) {
					state = SCE_L_TAG2;
				} else if (latexLastWordIs(i, styler, "\\verb") && chNext != '*' && chNext != ' ') {
					// The character right after \verb is its delimiter.
					chVerbatimDelim = chNext;
					state = SCE_L_VERBATIM;
				} else {
					latexStateReset(mode, state);
				}
			}
			break;
		case SCE_L_CMDOPT :
			// Everything up to and including the next ']' is the option.
			if (ch == ']') {
				styler.ColourTo(i, state);
				latexStateReset(mode, state);
			}
			break;
		case SCE_L_TAG :
			// Text after \begin. latexIsTagValid() advances i itself.
			if (latexIsTagValid(i, lengthDoc, styler)) {
				styler.ColourTo(i, state);
				latexStateReset(mode, state);
				// Some environments switch the state or the math mode.
				if (latexLastWordIs(i, styler, "{verbatim}")) {
					state = SCE_L_VERBATIM;
				} else if (latexLastWordIs(i, styler, "{comment}")) {
					state = SCE_L_COMMENT2;
				} else if (latexLastWordIs(i, styler, "{math}") && mode == 0) {
					mode = 1;
					state = SCE_L_MATH;
				} else if (latexLastWordIsMathEnv(i, styler) && mode == 0) {
					mode = 2;
					state = SCE_L_MATH2;
				}
			} else {
				styler.ColourTo(i, SCE_L_ERROR);
				latexStateReset(mode, state);
				// i may have moved onto a line end that the top of the loop
				// will not see, so cache the mode for that line here.
				ch = styler.SafeGetCharAt(i);
				if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
			}
			// i was changed by latexIsTagValid(): refresh the look-ahead.
			chNext = styler.SafeGetCharAt(i+1);
			break;
		case SCE_L_TAG2 :
			// Text after \end; same handling as SCE_L_TAG without the
			// environment-specific switches.
			if (latexIsTagValid(i, lengthDoc, styler)) {
				styler.ColourTo(i, state);
				latexStateReset(mode, state);
			} else {
				styler.ColourTo(i, SCE_L_ERROR);
				latexStateReset(mode, state);
				ch = styler.SafeGetCharAt(i);
				if (ch == '\r' || ch == '\n') setMode(styler.GetLine(i), mode);
			}
			chNext = styler.SafeGetCharAt(i+1);
			break;
		case SCE_L_MATH :
			switch (ch) {
			case '\\' :
				styler.ColourTo(i - 1, state);
				if (latexIsLetter(chNext)) {
					// Look ahead for \end{math}: if found, clear the mode now so
					// that the state falls back to default once the command and
					// its tag have been styled.
					Sci_Position match = i + 3;
					if (latexLastWordIs(match, styler, "\\end")) {
						match++;
						if (latexIsTagValid(match, lengthDoc, styler)) {
							if (latexLastWordIs(match, styler, "{math}"))
								mode = 0;
						}
					}
					state = SCE_L_COMMAND;
				} else if (latexIsSpecial(chNext)) {
					styler.ColourTo(i + 1, SCE_L_SPECIAL);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else if (chNext == '\r' || chNext == '\n') {
					styler.ColourTo(i, SCE_L_ERROR);
				} else if (IsASCII(chNext)) {
					// "\)" closes the math.
					if (chNext == ')') {
						mode = 0;
						state = SCE_L_DEFAULT;
					}
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				}
				break;
			case '$' :
				// A "$" closes the math.
				styler.ColourTo(i - 1, state);
				styler.ColourTo(i, SCE_L_SHORTCMD);
				mode = 0;
				state = SCE_L_DEFAULT;
				break;
			case '%' :
				styler.ColourTo(i - 1, state);
				state = SCE_L_COMMENT;
				break;
			}
			break;
		case SCE_L_MATH2 :
			switch (ch) {
			case '\\' :
				styler.ColourTo(i - 1, state);
				if (latexIsLetter(chNext)) {
					// Look ahead for \end{<math environment>} and clear the mode
					// if found (see the SCE_L_MATH case).
					Sci_Position match = i + 3;
					if (latexLastWordIs(match, styler, "\\end")) {
						match++;
						if (latexIsTagValid(match, lengthDoc, styler)) {
							if (latexLastWordIsMathEnv(match, styler))
								mode = 0;
						}
					}
					state = SCE_L_COMMAND;
				} else if (latexIsSpecial(chNext)) {
					styler.ColourTo(i + 1, SCE_L_SPECIAL);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				} else if (chNext == '\r' || chNext == '\n') {
					styler.ColourTo(i, SCE_L_ERROR);
				} else if (IsASCII(chNext)) {
					// "\]" closes the math.
					if (chNext == ']') {
						mode = 0;
						state = SCE_L_DEFAULT;
					}
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
				}
				break;
			case '$' :
				styler.ColourTo(i - 1, state);
				if (chNext == '$') {
					// "$$" closes the math.
					styler.ColourTo(i + 1, SCE_L_SHORTCMD);
					i++;
					chNext = styler.SafeGetCharAt(i + 1);
					mode = 0;
					state = SCE_L_DEFAULT;
				} else { // This may not be an error, e.g. \begin{equation}\text{$a$}\end{equation}
					styler.ColourTo(i, SCE_L_SHORTCMD);
				}
				break;
			case '%' :
				styler.ColourTo(i - 1, state);
				state = SCE_L_COMMENT;
				break;
			}
			break;
		case SCE_L_COMMENT :
			// A '%' comment runs up to, but not including, the line end.
			if (ch == '\r' || ch == '\n') {
				styler.ColourTo(i - 1, state);
				latexStateReset(mode, state);
			}
			break;
		case SCE_L_COMMENT2 :
			// Inside a comment environment: only \end{comment} leaves it.
			if (ch == '\\') {
				Sci_Position match = i + 3;
				if (latexLastWordIs(match, styler, "\\end")) {
					match++;
					if (latexIsTagValid(match, lengthDoc, styler)) {
						if (latexLastWordIs(match, styler, "{comment}")) {
							styler.ColourTo(i - 1, state);
							state = SCE_L_COMMAND;
						}
					}
				}
			}
			break;
		case SCE_L_VERBATIM :
			if (ch == '\\') {
				// \end{verbatim} leaves the verbatim state.
				Sci_Position match = i + 3;
				if (latexLastWordIs(match, styler, "\\end")) {
					match++;
					if (latexIsTagValid(match, lengthDoc, styler)) {
						if (latexLastWordIs(match, styler, "{verbatim}")) {
							styler.ColourTo(i - 1, state);
							state = SCE_L_COMMAND;
						}
					}
				}
			} else if (chNext == chVerbatimDelim) {
				// The next character is the recorded delimiter: include it and stop.
				styler.ColourTo(i + 1, state);
				latexStateReset(mode, state);
				chVerbatimDelim = '\0';
				i++;
				chNext = styler.SafeGetCharAt(i + 1);
			} else if (chVerbatimDelim != '\0' && (ch == '\n' || ch == '\r')) {
				// A \verb that reaches the line end without its delimiter.
				styler.ColourTo(i, SCE_L_ERROR);
				latexStateReset(mode, state);
				chVerbatimDelim = '\0';
			}
			break;
		}
	}
	// When the end of the document was reached, let the mode cache shrink.
	if (lengthDoc == styler.Length()) truncModes(styler.GetLine(lengthDoc - 1));
	styler.ColourTo(lengthDoc - 1, state);
	styler.Flush();
}
477 
latexFoldSaveToInt(const latexFoldSave & save)478 static int latexFoldSaveToInt(const latexFoldSave &save) {
479 	int sum = 0;
480 	for (int i = 0; i <= save.structLev; ++i)
481 		sum += save.openBegins[i];
482 	return ((sum + save.structLev + SC_FOLDLEVELBASE) & SC_FOLDLEVELNUMBERMASK);
483 }
484 
// Compute and store the fold level of every line that starts inside
// [startPos, startPos + length).
//
// The fold state is updated while each line is processed. The level stored
// for a line is the level in effect just before the first relevant command
// on it (\begin, \end or a sectioning command); a line without such a command
// gets the level of the current state. Lines containing \begin or a
// sectioning command are flagged as fold headers.
//
// The third parameter (initial style) is unused.
void SCI_METHOD LexerLaTeX::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess) {
	// Sectioning commands; the array index is used as the structure level.
	const char *structWords[7] = {"part", "chapter", "section", "subsection",
		"subsubsection", "paragraph", "subparagraph"};
	Accessor styler(pAccess, &props);
	Sci_PositionU endPos = startPos + length;
	Sci_Position curLine = styler.GetLine(startPos);
	// Resume from the state cached for the previous line (all zero if none).
	latexFoldSave save;
	getSave(curLine - 1, save);
	// One iteration per line.
	do {
		char ch, buf[16];
		Sci_Position i, j;
		int lev = -1;   // -1: level for this line not captured yet
		bool needFold = false;
		for (i = static_cast<Sci_Position>(startPos); i < static_cast<Sci_Position>(endPos); ++i) {
			ch = styler.SafeGetCharAt(i);
			if (ch == '\r' || ch == '\n') break;
			// Only backslashes that the lexer styled as a command are of interest.
			if (ch != '\\' || styler.StyleAt(i) != SCE_L_COMMAND) continue;
			// Copy the letters after the backslash (at most 15) into buf,
			// moving i along with them.
			for (j = 0; j < 15 && i + 1 < static_cast<Sci_Position>(endPos); ++j, ++i) {
				buf[j] = styler.SafeGetCharAt(i + 1);
				if (!latexIsLetter(buf[j])) break;
			}
			buf[j] = '\0';
			if (strcmp(buf, "begin") == 0) {
				// Capture the level before counting the new \begin.
				if (lev < 0) lev = latexFoldSaveToInt(save);
				++save.openBegins[save.structLev];
				needFold = true;
			}
			else if (strcmp(buf, "end") == 0) {
				// Drop back to the nearest level that still has an open \begin
				// (or to level 0), then capture the level and close one \begin.
				while (save.structLev > 0 && save.openBegins[save.structLev] == 0)
					--save.structLev;
				if (lev < 0) lev = latexFoldSaveToInt(save);
				if (save.openBegins[save.structLev] > 0) --save.openBegins[save.structLev];
			}
			else {
				for (j = 0; j < 7; ++j)
					if (strcmp(buf, structWords[j]) == 0) break;
				if (j >= 7) continue;   // not a sectioning command
				save.structLev = j;   // level before the command
				// Fold the open-begin counters of all deeper levels into this one.
				for (j = save.structLev + 1; j < 8; ++j) {
					save.openBegins[save.structLev] += save.openBegins[j];
					save.openBegins[j] = 0;
				}
				if (lev < 0) lev = latexFoldSaveToInt(save);
				++save.structLev;   // level after the command
				needFold = true;
			}
		}
		// No relevant command on this line: use the level of the current state.
		if (lev < 0) lev = latexFoldSaveToInt(save);
		if (needFold) lev |= SC_FOLDLEVELHEADERFLAG;
		styler.SetLevel(curLine, lev);
		setSave(curLine, save);
		++curLine;
		startPos = styler.LineStart(curLine);
		// The next line starts at the very end of the document: give it a
		// level too, and let the fold-state cache shrink.
		if (static_cast<Sci_Position>(startPos) == styler.Length()) {
			lev = latexFoldSaveToInt(save);
			styler.SetLevel(curLine, lev);
			setSave(curLine, save);
			truncSaves(curLine);
		}
	} while (startPos < endPos);
	styler.Flush();
}
549 
550 static const char *const emptyWordListDesc[] = {
551 	0
552 };
553 
554 LexerModule lmLatex(SCLEX_LATEX, LexerLaTeX::LexerFactoryLaTeX, "latex", emptyWordListDesc);
555