1 /******************************************************************
2 * LexMarkdown.cxx
3 *
4 * A simple Markdown lexer for scintilla.
5 *
6 * Includes highlighting for some extra features from the
7 * Pandoc implementation; strikeout, using '#.' as a default
8 * ordered list item marker, and delimited code blocks.
9 *
10 * Limitations:
11 *
12 * Standard indented code blocks are not highlighted at all,
13 * as it would conflict with other indentation schemes. Use
14 * delimited code blocks for blanket highlighting of an
15 * entire code block. Embedded HTML is not highlighted either.
16 * Blanket HTML highlighting has issues, because some Markdown
17 * implementations allow Markdown markup inside of the HTML. Also,
18 * there is a following blank line issue that can't be ignored,
19 * explained in the next paragraph. Embedded HTML and code
20 * blocks would be better supported with language specific
21 * highlighting.
22 *
23 * The highlighting aims to accurately reflect correct syntax,
24 * but a few restrictions are relaxed. Delimited code blocks are
25 * highlighted, even if the line following the code block is not blank.
 * Requiring a blank line after a block breaks the highlighting
27 * in certain cases, because of the way Scintilla ends up calling
28 * the lexer.
29 *
30 * Written by Jon Strait - jstrait@moonloop.net
31 *
32 * The License.txt file describes the conditions under which this
33 * software may be distributed.
34 *
35 *****************************************************************/
36
37 #include <stdlib.h>
38 #include <string.h>
39 #include <stdio.h>
40 #include <stdarg.h>
41 #include <assert.h>
42
43 #include "ILexer.h"
44 #include "Scintilla.h"
45 #include "SciLexer.h"
46
47 #include "WordList.h"
48 #include "LexAccessor.h"
49 #include "Accessor.h"
50 #include "StyleContext.h"
51 #include "CharacterSet.h"
52 #include "LexerModule.h"
53
54 using namespace Scintilla;
55
// True when ch is a line-feed or a carriage-return character.
static inline bool IsNewline(const int ch) {
    switch (ch) {
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
59
60 // True if can follow ch down to the end with possibly trailing whitespace
FollowToLineEnd(const int ch,const int state,const Sci_PositionU endPos,StyleContext & sc)61 static bool FollowToLineEnd(const int ch, const int state, const Sci_PositionU endPos, StyleContext &sc) {
62 Sci_PositionU i = 0;
63 while (sc.GetRelative(++i) == ch)
64 ;
65 // Skip over whitespace
66 while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos)
67 ++i;
68 if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) {
69 sc.Forward(i);
70 sc.ChangeState(state);
71 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
72 return true;
73 }
74 else return false;
75 }
76
77 // Set the state on text section from current to length characters,
78 // then set the rest until the newline to default, except for any characters matching token
SetStateAndZoom(const int state,const Sci_Position length,const int token,StyleContext & sc)79 static void SetStateAndZoom(const int state, const Sci_Position length, const int token, StyleContext &sc) {
80 sc.SetState(state);
81 sc.Forward(length);
82 sc.SetState(SCE_MARKDOWN_DEFAULT);
83 sc.Forward();
84 bool started = false;
85 while (sc.More() && !IsNewline(sc.ch)) {
86 if (sc.ch == token && !started) {
87 sc.SetState(state);
88 started = true;
89 }
90 else if (sc.ch != token) {
91 sc.SetState(SCE_MARKDOWN_DEFAULT);
92 started = false;
93 }
94 sc.Forward();
95 }
96 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
97 }
98
99 // Does the previous line have more than spaces and tabs?
HasPrevLineContent(StyleContext & sc)100 static bool HasPrevLineContent(StyleContext &sc) {
101 Sci_Position i = 0;
102 // Go back to the previous newline
103 while ((--i + (Sci_Position)sc.currentPos) >= 0 && !IsNewline(sc.GetRelative(i)))
104 ;
105 while ((--i + (Sci_Position)sc.currentPos) >= 0) {
106 if (IsNewline(sc.GetRelative(i)))
107 break;
108 if (!IsASpaceOrTab(sc.GetRelative(i)))
109 return true;
110 }
111 return false;
112 }
113
AtTermStart(StyleContext & sc)114 static bool AtTermStart(StyleContext &sc) {
115 return sc.currentPos == 0 || sc.chPrev == 0 || isspacechar(sc.chPrev);
116 }
117
IsValidHrule(const Sci_PositionU endPos,StyleContext & sc)118 static bool IsValidHrule(const Sci_PositionU endPos, StyleContext &sc) {
119 int count = 1;
120 Sci_PositionU i = 0;
121 for (;;) {
122 ++i;
123 int c = sc.GetRelative(i);
124 if (c == sc.ch)
125 ++count;
126 // hit a terminating character
127 else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) {
128 // Are we a valid HRULE
129 if ((IsNewline(c) || sc.currentPos + i == endPos) &&
130 count >= 3 && !HasPrevLineContent(sc)) {
131 sc.SetState(SCE_MARKDOWN_HRULE);
132 sc.Forward(i);
133 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
134 return true;
135 }
136 else {
137 sc.SetState(SCE_MARKDOWN_DEFAULT);
138 return false;
139 }
140 }
141 }
142 }
143
// Lexer entry point: walks the range [startPos, startPos + length) with a
// StyleContext that starts in initStyle, and assigns SCE_MARKDOWN_* states.
//
// Each loop iteration examines the current character in up to four stages:
//   1. state-based actions (closing inline spans, line-begin constructs,
//      headers),
//   2. SCE_MARKDOWN_PRECHAR handling of constructs that may follow leading
//      spaces (blockquote, hrule, list items),
//   3. SCE_MARKDOWN_LINK termination,
//   4. SCE_MARKDOWN_DEFAULT detection of new links and inline spans.
// A stage may change sc.state, so a later stage in the same iteration can act
// on the state chosen by an earlier one; the statement order is significant.
//
// The unnamed WordList ** parameter is not used.
static void ColorizeMarkdownDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
                                WordList **, Accessor &styler) {
    Sci_PositionU endPos = startPos + length;
    // Number of leading spaces counted so far while in SCE_MARKDOWN_PRECHAR.
    int precharCount = 0;
    // Set once "](" has been seen inside a link; cleared by the closing ')'.
    bool isLinkNameDetecting = false;
    // Don't advance on a new loop iteration and retry at the same position.
    // Useful in the corner case of having to start at the beginning file position
    // in the default state.
    bool freezeCursor = false;

    StyleContext sc(startPos, length, initStyle, styler);

    while (sc.More()) {
        // Skip past escaped characters
        // (moves past the backslash only; the loop restarts on the following
        // character without the Forward() at the bottom of the loop)
        if (sc.ch == '\\') {
            sc.Forward();
            continue;
        }

        // A blockquotes resets the line semantics
        if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
            sc.SetState(SCE_MARKDOWN_LINE_BEGIN);

        // Conditional state-based actions
        if (sc.state == SCE_MARKDOWN_CODE2) {
            // Closing "``": the character two positions back must not be a space.
            // NOTE(review): presumably -2 rather than -1 to allow the alternate
            // inside spacing mentioned where CODE2 is opened - confirm.
            if (sc.Match("``") && sc.GetRelative(-2) != ' ') {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        else if (sc.state == SCE_MARKDOWN_CODE) {
            // Closing '`' must not directly follow a space.
            if (sc.ch == '`' && sc.chPrev != ' ')
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
        }
        /* De-activated because it gets in the way of other valid indentation
         * schemes, for example multiple paragraphs inside a list item.
        // Code block
        else if (sc.state == SCE_MARKDOWN_CODEBK) {
            bool d = true;
            if (IsNewline(sc.ch)) {
                if (sc.chNext != '\t') {
                    for (int c = 1; c < 5; ++c) {
                        if (sc.GetRelative(c) != ' ')
                            d = false;
                    }
                }
            }
            else if (sc.atLineStart) {
                if (sc.ch != '\t' ) {
                    for (int i = 0; i < 4; ++i) {
                        if (sc.GetRelative(i) != ' ')
                            d = false;
                    }
                }
            }
            if (!d)
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
        }
        */
        // Strong
        else if (sc.state == SCE_MARKDOWN_STRONG1) {
            // Closing "**" must not directly follow a space.
            if (sc.Match("**") && sc.chPrev != ' ') {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        else if (sc.state == SCE_MARKDOWN_STRONG2) {
            // Closing "__" must not directly follow a space.
            if (sc.Match("__") && sc.chPrev != ' ') {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        // Emphasis
        else if (sc.state == SCE_MARKDOWN_EM1) {
            if (sc.ch == '*' && sc.chPrev != ' ')
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
        }
        else if (sc.state == SCE_MARKDOWN_EM2) {
            if (sc.ch == '_' && sc.chPrev != ' ')
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
        }
        // Delimited code block: ends on a line that starts with "~~~"
        else if (sc.state == SCE_MARKDOWN_CODEBK) {
            if (sc.atLineStart && sc.Match("~~~")) {
                // Include the rest of the closing delimiter line (up to the
                // newline or endPos) in the code block style.
                Sci_Position i = 1;
                while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
                    i++;
                sc.Forward(i);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
            // Closing "~~" must not directly follow a space.
            if (sc.Match("~~") && sc.chPrev != ' ') {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
            // Header
            // (longest run of '#' is tested first so each level matches exactly)
            if (sc.Match("######"))
                SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
            else if (sc.Match("#####"))
                SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
            else if (sc.Match("####"))
                SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
            else if (sc.Match("###"))
                SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
            else if (sc.Match("##"))
                SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
            else if (sc.Match("#")) {
                // Catch the special case of an ordered list using the "#."
                // marker; the PRECHAR stage below styles it as a list item.
                if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
                    precharCount = 0;
                    sc.SetState(SCE_MARKDOWN_PRECHAR);
                }
                else
                    SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
            }
            // Code block
            // (an opening "~~~" only counts when the previous line is blank)
            else if (sc.Match("~~~")) {
                if (!HasPrevLineContent(sc))
                    sc.SetState(SCE_MARKDOWN_CODEBK);
                else
                    sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            // Underline of '=' below a line with content: level 1 header
            else if (sc.ch == '=') {
                if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc))
                    ;
                else
                    sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            // Underline of '-' below a line with content: level 2 header.
            // Otherwise the PRECHAR stage decides between hrule and list item.
            else if (sc.ch == '-') {
                if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc))
                    ;
                else {
                    precharCount = 0;
                    sc.SetState(SCE_MARKDOWN_PRECHAR);
                }
            }
            else if (IsNewline(sc.ch))
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
            else {
                precharCount = 0;
                sc.SetState(SCE_MARKDOWN_PRECHAR);
            }
        }

        // The header lasts until the newline
        // NOTE(review): SetStateAndZoom and FollowToLineEnd both leave the
        // context in LINE_BEGIN, so these states look reachable only through
        // initStyle - confirm.
        else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
                 sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
                 sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
            if (IsNewline(sc.ch))
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
        }

        // New state only within the initial whitespace
        // (a plain `if`: also runs in the iteration that just entered PRECHAR)
        if (sc.state == SCE_MARKDOWN_PRECHAR) {
            // Blockquote
            // NOTE(review): precharCount is only incremented while it is <= 2
            // (see the last branch), so the "< 5" bound appears to be always
            // satisfied - confirm.
            if (sc.ch == '>' && precharCount < 5)
                sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
            /*
            // Begin of code block
            else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
                sc.SetState(SCE_MARKDOWN_CODEBK);
            */
            // HRule - Total of three or more hyphens, asterisks, or underscores
            // on a line by themselves
            // (IsValidHrule sets DEFAULT itself when it returns false, which
            // takes the context out of PRECHAR)
            else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
                ;
            // Unordered list
            else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
                sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
            }
            // Ordered list
            else if (IsADigit(sc.ch)) {
                // Count the run of digits, then require ". " (or ".\t") after it.
                int digitCount = 0;
                while (IsADigit(sc.GetRelative(++digitCount)))
                    ;
                if (sc.GetRelative(digitCount) == '.' &&
                        IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
                    sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
                    sc.Forward(digitCount + 1);
                    sc.SetState(SCE_MARKDOWN_DEFAULT);
                }
            }
            // Alternate Ordered list
            else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
                sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            // Any other non-space character, or a space once three have been
            // counted, ends the leading-space phase.
            else if (sc.ch != ' ' || precharCount > 2)
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            else
                ++precharCount;
        }

        // Any link
        if (sc.state == SCE_MARKDOWN_LINK) {
            // "](" : the link continues with a parenthesised part
            if (sc.Match("](") && sc.GetRelative(-1) != '\\') {
                sc.Forward(2);
                isLinkNameDetecting = true;
            }
            // "]:" : the link ends after the colon
            else if (sc.Match("]:") && sc.GetRelative(-1) != '\\') {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            // plain ']' outside a parenthesised part ends the link
            else if (!isLinkNameDetecting && sc.ch == ']' && sc.GetRelative(-1) != '\\') {
                sc.Forward();
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            // ')' closes the parenthesised part and the link
            else if (isLinkNameDetecting && sc.ch == ')' && sc.GetRelative(-1) != '\\') {
                sc.Forward();
                sc.SetState(SCE_MARKDOWN_DEFAULT);
                isLinkNameDetecting = false;
            }
        }

        // New state anywhere in doc
        if (sc.state == SCE_MARKDOWN_DEFAULT) {
            // A '#' at a line start while in DEFAULT: switch to LINE_BEGIN and
            // retry this same position on the next iteration.
            if (sc.atLineStart && sc.ch == '#') {
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
                freezeCursor = true;
            }
            // Links and Images
            if (sc.Match("![")) {
                sc.SetState(SCE_MARKDOWN_LINK);
                sc.Forward(2);
            }
            else if (sc.ch == '[' && sc.GetRelative(-1) != '\\') {
                sc.SetState(SCE_MARKDOWN_LINK);
                sc.Forward();
            }
            // Code - also a special case for alternate inside spacing
            else if (sc.Match("``") && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
                sc.SetState(SCE_MARKDOWN_CODE2);
                sc.Forward();
            }
            else if (sc.ch == '`' && sc.chNext != ' ' && AtTermStart(sc)) {
                sc.SetState(SCE_MARKDOWN_CODE);
            }
            // Strong
            else if (sc.Match("**") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
                sc.SetState(SCE_MARKDOWN_STRONG1);
                sc.Forward();
            }
            else if (sc.Match("__") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
                sc.SetState(SCE_MARKDOWN_STRONG2);
                sc.Forward();
            }
            // Emphasis
            else if (sc.ch == '*' && sc.chNext != ' ' && AtTermStart(sc)) {
                sc.SetState(SCE_MARKDOWN_EM1);
            }
            else if (sc.ch == '_' && sc.chNext != ' ' && AtTermStart(sc)) {
                sc.SetState(SCE_MARKDOWN_EM2);
            }
            // Strikeout
            else if (sc.Match("~~") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
                sc.SetState(SCE_MARKDOWN_STRIKEOUT);
                sc.Forward();
            }
            // Beginning of line
            else if (IsNewline(sc.ch)) {
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
            }
        }
        // Advance if not holding back the cursor for this iteration.
        if (!freezeCursor)
            sc.Forward();
        freezeCursor = false;
    }
    sc.Complete();
}
418
// Lexer module object tying the SCLEX_MARKDOWN identifier and the name
// "markdown" to the ColorizeMarkdownDoc colourising function.
LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown");
420