1 /******************************************************************
2  *  LexMarkdown.cxx
3  *
4  *  A simple Markdown lexer for scintilla.
5  *
6  *  Includes highlighting for some extra features from the
7  *  Pandoc implementation; strikeout, using '#.' as a default
8  *  ordered list item marker, and delimited code blocks.
9  *
10  *  Limitations:
11  *
12  *  Standard indented code blocks are not highlighted at all,
13  *  as it would conflict with other indentation schemes. Use
14  *  delimited code blocks for blanket highlighting of an
15  *  entire code block.  Embedded HTML is not highlighted either.
16  *  Blanket HTML highlighting has issues, because some Markdown
17  *  implementations allow Markdown markup inside of the HTML. Also,
18  *  there is a following blank line issue that can't be ignored,
19  *  explained in the next paragraph. Embedded HTML and code
20  *  blocks would be better supported with language specific
21  *  highlighting.
22  *
23  *  The highlighting aims to accurately reflect correct syntax,
24  *  but a few restrictions are relaxed. Delimited code blocks are
25  *  highlighted, even if the line following the code block is not blank.
26  *  Requiring a blank line after a block, breaks the highlighting
27  *  in certain cases, because of the way Scintilla ends up calling
28  *  the lexer.
29  *
30  *  Written by Jon Strait - jstrait@moonloop.net
31  *
32  *  The License.txt file describes the conditions under which this
33  *  software may be distributed.
34  *
35  *****************************************************************/
36 
37 #include <stdlib.h>
38 #include <string.h>
39 #include <stdio.h>
40 #include <stdarg.h>
41 #include <assert.h>
42 
43 #include "ILexer.h"
44 #include "Scintilla.h"
45 #include "SciLexer.h"
46 
47 #include "WordList.h"
48 #include "LexAccessor.h"
49 #include "Accessor.h"
50 #include "StyleContext.h"
51 #include "CharacterSet.h"
52 #include "LexerModule.h"
53 
54 #ifdef SCI_NAMESPACE
55 using namespace Scintilla;
56 #endif
57 
// True when ch is a line terminator character: line feed or carriage return.
static inline bool IsNewline(const int ch) {
    switch (ch) {
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
61 
62 // True if can follow ch down to the end with possibly trailing whitespace
FollowToLineEnd(const int ch,const int state,const unsigned int endPos,StyleContext & sc)63 static bool FollowToLineEnd(const int ch, const int state, const unsigned int endPos, StyleContext &sc) {
64     unsigned int i = 0;
65     while (sc.GetRelative(++i) == ch)
66         ;
67     // Skip over whitespace
68     while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos)
69         ++i;
70     if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) {
71         sc.Forward(i);
72         sc.ChangeState(state);
73         sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
74         return true;
75     }
76     else return false;
77 }
78 
79 // Set the state on text section from current to length characters,
80 // then set the rest until the newline to default, except for any characters matching token
SetStateAndZoom(const int state,const int length,const int token,StyleContext & sc)81 static void SetStateAndZoom(const int state, const int length, const int token, StyleContext &sc) {
82     sc.SetState(state);
83     sc.Forward(length);
84     sc.SetState(SCE_MARKDOWN_DEFAULT);
85     sc.Forward();
86     bool started = false;
87     while (sc.More() && !IsNewline(sc.ch)) {
88         if (sc.ch == token && !started) {
89             sc.SetState(state);
90             started = true;
91         }
92         else if (sc.ch != token) {
93             sc.SetState(SCE_MARKDOWN_DEFAULT);
94             started = false;
95         }
96         sc.Forward();
97     }
98     sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
99 }
100 
101 // Does the previous line have more than spaces and tabs?
HasPrevLineContent(StyleContext & sc)102 static bool HasPrevLineContent(StyleContext &sc) {
103     int i = 0;
104     // Go back to the previous newline
105     while ((--i + (int)sc.currentPos) >= 0 && !IsNewline(sc.GetRelative(i)))
106         ;
107     while ((--i + (int)sc.currentPos) >= 0) {
108         if (IsNewline(sc.GetRelative(i)))
109             break;
110         if (!IsASpaceOrTab(sc.GetRelative(i)))
111             return true;
112     }
113     return false;
114 }
115 
AtTermStart(StyleContext & sc)116 static bool AtTermStart(StyleContext &sc) {
117     return sc.currentPos == 0 || isspacechar(sc.chPrev);
118 }
119 
IsValidHrule(const unsigned int endPos,StyleContext & sc)120 static bool IsValidHrule(const unsigned int endPos, StyleContext &sc) {
121     int count = 1;
122     unsigned int i = 0;
123     for (;;) {
124         ++i;
125         int c = sc.GetRelative(i);
126         if (c == sc.ch)
127             ++count;
128         // hit a terminating character
129         else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) {
130             // Are we a valid HRULE
131             if ((IsNewline(c) || sc.currentPos + i == endPos) &&
132                     count >= 3 && !HasPrevLineContent(sc)) {
133                 sc.SetState(SCE_MARKDOWN_HRULE);
134                 sc.Forward(i);
135                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
136                 return true;
137             }
138             else {
139                 sc.SetState(SCE_MARKDOWN_DEFAULT);
140 		return false;
141             }
142         }
143     }
144 }
145 
ColorizeMarkdownDoc(unsigned int startPos,int length,int initStyle,WordList **,Accessor & styler)146 static void ColorizeMarkdownDoc(unsigned int startPos, int length, int initStyle,
147                                WordList **, Accessor &styler) {
148     unsigned int endPos = startPos + length;
149     int precharCount = 0;
150     // Don't advance on a new loop iteration and retry at the same position.
151     // Useful in the corner case of having to start at the beginning file position
152     // in the default state.
153     bool freezeCursor = false;
154 
155     StyleContext sc(startPos, length, initStyle, styler);
156 
157     while (sc.More()) {
158         // Skip past escaped characters
159         if (sc.ch == '\\') {
160             sc.Forward();
161             continue;
162         }
163 
164         // A blockquotes resets the line semantics
165         if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
166             sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
167 
168         // Conditional state-based actions
169         if (sc.state == SCE_MARKDOWN_CODE2) {
170             if (sc.Match("``") && sc.GetRelative(-2) != ' ') {
171                 sc.Forward(2);
172                 sc.SetState(SCE_MARKDOWN_DEFAULT);
173             }
174         }
175         else if (sc.state == SCE_MARKDOWN_CODE) {
176             if (sc.ch == '`' && sc.chPrev != ' ')
177                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
178         }
179         /* De-activated because it gets in the way of other valid indentation
180          * schemes, for example multiple paragraphs inside a list item.
181         // Code block
182         else if (sc.state == SCE_MARKDOWN_CODEBK) {
183             bool d = true;
184             if (IsNewline(sc.ch)) {
185                 if (sc.chNext != '\t') {
186                     for (int c = 1; c < 5; ++c) {
187                         if (sc.GetRelative(c) != ' ')
188                             d = false;
189                     }
190                 }
191             }
192             else if (sc.atLineStart) {
193                 if (sc.ch != '\t' ) {
194                     for (int i = 0; i < 4; ++i) {
195                         if (sc.GetRelative(i) != ' ')
196                             d = false;
197                     }
198                 }
199             }
200             if (!d)
201                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
202         }
203         */
204         // Strong
205         else if (sc.state == SCE_MARKDOWN_STRONG1) {
206             if (sc.Match("**") && sc.chPrev != ' ') {
207                 sc.Forward(2);
208                 sc.SetState(SCE_MARKDOWN_DEFAULT);
209             }
210         }
211         else if (sc.state == SCE_MARKDOWN_STRONG2) {
212             if (sc.Match("__") && sc.chPrev != ' ') {
213                 sc.Forward(2);
214                 sc.SetState(SCE_MARKDOWN_DEFAULT);
215             }
216         }
217         // Emphasis
218         else if (sc.state == SCE_MARKDOWN_EM1) {
219             if (sc.ch == '*' && sc.chPrev != ' ')
220                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
221         }
222         else if (sc.state == SCE_MARKDOWN_EM2) {
223             if (sc.ch == '_' && sc.chPrev != ' ')
224                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
225         }
226         else if (sc.state == SCE_MARKDOWN_CODEBK) {
227             if (sc.atLineStart && sc.Match("~~~")) {
228                 int i = 1;
229                 while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
230                     i++;
231                 sc.Forward(i);
232                 sc.SetState(SCE_MARKDOWN_DEFAULT);
233             }
234         }
235         else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
236             if (sc.Match("~~") && sc.chPrev != ' ') {
237                 sc.Forward(2);
238                 sc.SetState(SCE_MARKDOWN_DEFAULT);
239             }
240         }
241         else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
242             // Header
243             if (sc.Match("######"))
244                 SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
245             else if (sc.Match("#####"))
246                 SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
247             else if (sc.Match("####"))
248                 SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
249             else if (sc.Match("###"))
250                 SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
251             else if (sc.Match("##"))
252                 SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
253             else if (sc.Match("#")) {
254                 // Catch the special case of an unordered list
255                 if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
256                     precharCount = 0;
257                     sc.SetState(SCE_MARKDOWN_PRECHAR);
258                 }
259                 else
260                     SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
261             }
262             // Code block
263             else if (sc.Match("~~~")) {
264                 if (!HasPrevLineContent(sc))
265                     sc.SetState(SCE_MARKDOWN_CODEBK);
266                 else
267                     sc.SetState(SCE_MARKDOWN_DEFAULT);
268             }
269             else if (sc.ch == '=') {
270                 if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc))
271                     ;
272                 else
273                     sc.SetState(SCE_MARKDOWN_DEFAULT);
274             }
275             else if (sc.ch == '-') {
276                 if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc))
277                     ;
278                 else {
279                     precharCount = 0;
280                     sc.SetState(SCE_MARKDOWN_PRECHAR);
281                 }
282             }
283             else if (IsNewline(sc.ch))
284                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
285             else {
286                 precharCount = 0;
287                 sc.SetState(SCE_MARKDOWN_PRECHAR);
288             }
289         }
290 
291         // The header lasts until the newline
292         else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
293                 sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
294                 sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
295             if (IsNewline(sc.ch))
296                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
297         }
298 
299         // New state only within the initial whitespace
300         if (sc.state == SCE_MARKDOWN_PRECHAR) {
301             // Blockquote
302             if (sc.ch == '>' && precharCount < 5)
303                 sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
304             /*
305             // Begin of code block
306             else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
307                 sc.SetState(SCE_MARKDOWN_CODEBK);
308             */
309             // HRule - Total of three or more hyphens, asterisks, or underscores
310             // on a line by themselves
311             else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
312                 ;
313             // Unordered list
314             else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
315                 sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
316                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
317             }
318             // Ordered list
319             else if (IsADigit(sc.ch)) {
320                 int digitCount = 0;
321                 while (IsADigit(sc.GetRelative(++digitCount)))
322                     ;
323                 if (sc.GetRelative(digitCount) == '.' &&
324                         IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
325                     sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
326                     sc.Forward(digitCount + 1);
327                     sc.SetState(SCE_MARKDOWN_DEFAULT);
328                 }
329             }
330             // Alternate Ordered list
331             else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
332                 sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
333                 sc.Forward(2);
334                 sc.SetState(SCE_MARKDOWN_DEFAULT);
335             }
336             else if (sc.ch != ' ' || precharCount > 2)
337                 sc.SetState(SCE_MARKDOWN_DEFAULT);
338             else
339                 ++precharCount;
340         }
341 
342         // New state anywhere in doc
343         if (sc.state == SCE_MARKDOWN_DEFAULT) {
344             if (sc.atLineStart && sc.ch == '#') {
345                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
346                 freezeCursor = true;
347             }
348             // Links and Images
349             if (sc.Match("![") || sc.ch == '[') {
350                 int i = 0, j = 0, k = 0;
351                 int len = endPos - sc.currentPos;
352                 while (i < len && (sc.GetRelative(++i) != ']' || sc.GetRelative(i - 1) == '\\'))
353                     ;
354                 if (sc.GetRelative(i) == ']') {
355                     j = i;
356                     if (sc.GetRelative(++i) == '(') {
357                         while (i < len && (sc.GetRelative(++i) != ')' || sc.GetRelative(i - 1) == '\\'))
358                             ;
359                         if (sc.GetRelative(i) == ')')
360                             k = i;
361                     }
362                     else if (sc.GetRelative(i) == '[' || sc.GetRelative(++i) == '[') {
363                         while (i < len && (sc.GetRelative(++i) != ']' || sc.GetRelative(i - 1) == '\\'))
364                             ;
365                         if (sc.GetRelative(i) == ']')
366                             k = i;
367                     }
368                 }
369                 // At least a link text
370                 if (j) {
371                     sc.SetState(SCE_MARKDOWN_LINK);
372                     sc.Forward(j);
373                     // Also has a URL or reference portion
374                     if (k)
375                         sc.Forward(k - j);
376                     sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
377                 }
378             }
379             // Code - also a special case for alternate inside spacing
380             if (sc.Match("``") && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
381                 sc.SetState(SCE_MARKDOWN_CODE2);
382                 sc.Forward();
383             }
384             else if (sc.ch == '`' && sc.chNext != ' ' && AtTermStart(sc)) {
385                 sc.SetState(SCE_MARKDOWN_CODE);
386             }
387             // Strong
388             else if (sc.Match("**") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
389                 sc.SetState(SCE_MARKDOWN_STRONG1);
390                 sc.Forward();
391            }
392             else if (sc.Match("__") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
393                 sc.SetState(SCE_MARKDOWN_STRONG2);
394                 sc.Forward();
395             }
396             // Emphasis
397             else if (sc.ch == '*' && sc.chNext != ' ' && AtTermStart(sc)) {
398                 sc.SetState(SCE_MARKDOWN_EM1);
399             }
400             else if (sc.ch == '_' && sc.chNext != ' ' && AtTermStart(sc)) {
401                 sc.SetState(SCE_MARKDOWN_EM2);
402             }
403             // Strikeout
404             else if (sc.Match("~~") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
405                 sc.SetState(SCE_MARKDOWN_STRIKEOUT);
406                 sc.Forward();
407             }
408             // Beginning of line
409             else if (IsNewline(sc.ch)) {
410                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
411             }
412         }
413         // Advance if not holding back the cursor for this iteration.
414         if (!freezeCursor)
415             sc.Forward();
416         freezeCursor = false;
417     }
418     sc.Complete();
419 }
420 
// Lexer module object tying the SCLEX_MARKDOWN identifier and the name
// "markdown" to the ColorizeMarkdownDoc colourising function.
LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown");
422