/******************************************************************
 *  LexMarkdown.cxx
 *
 *  A simple Markdown lexer for Scintilla.
 *
 *  Includes highlighting for some extra features from the
 *  Pandoc implementation: strikeout, using '#.' as a default
 *  ordered list item marker, and delimited code blocks.
 *
 *  Limitations:
 *
 *  Standard indented code blocks are not highlighted at all,
 *  as that would conflict with other indentation schemes. Use
 *  delimited code blocks for blanket highlighting of an
 *  entire code block.  Embedded HTML is not highlighted either.
 *  Blanket HTML highlighting has issues, because some Markdown
 *  implementations allow Markdown markup inside of the HTML. Also,
 *  there is a following blank line issue that can't be ignored,
 *  explained in the next paragraph. Embedded HTML and code
 *  blocks would be better supported with language specific
 *  highlighting.
 *
 *  The highlighting aims to accurately reflect correct syntax,
 *  but a few restrictions are relaxed. Delimited code blocks are
 *  highlighted, even if the line following the code block is not blank.
 *  Requiring a blank line after a block breaks the highlighting
 *  in certain cases, because of the way Scintilla ends up calling
 *  the lexer.
 *
 *  Written by Jon Strait - jstrait@moonloop.net
 *
 *  The License.txt file describes the conditions under which this
 *  software may be distributed.
 *
 *****************************************************************/
36 
37 #include <stdlib.h>
38 #include <string.h>
39 #include <stdio.h>
40 #include <stdarg.h>
41 #include <assert.h>
42 
43 #include "ILexer.h"
44 #include "Scintilla.h"
45 #include "SciLexer.h"
46 
47 #include "WordList.h"
48 #include "LexAccessor.h"
49 #include "Accessor.h"
50 #include "StyleContext.h"
51 #include "CharacterSet.h"
52 #include "LexerModule.h"
53 
54 using namespace Scintilla;
55 
// True when ch is a line feed or a carriage return.
static inline bool IsNewline(const int ch) {
    switch (ch) {
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
59 
60 // True if can follow ch down to the end with possibly trailing whitespace
FollowToLineEnd(const int ch,const int state,const Sci_PositionU endPos,StyleContext & sc)61 static bool FollowToLineEnd(const int ch, const int state, const Sci_PositionU endPos, StyleContext &sc) {
62     Sci_PositionU i = 0;
63     while (sc.GetRelative(++i) == ch)
64         ;
65     // Skip over whitespace
66     while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos)
67         ++i;
68     if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) {
69         sc.Forward(i);
70         sc.ChangeState(state);
71         sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
72         return true;
73     }
74     else return false;
75 }
76 
77 // Set the state on text section from current to length characters,
78 // then set the rest until the newline to default, except for any characters matching token
SetStateAndZoom(const int state,const Sci_Position length,const int token,StyleContext & sc)79 static void SetStateAndZoom(const int state, const Sci_Position length, const int token, StyleContext &sc) {
80     sc.SetState(state);
81     sc.Forward(length);
82     sc.SetState(SCE_MARKDOWN_DEFAULT);
83     sc.Forward();
84     bool started = false;
85     while (sc.More() && !IsNewline(sc.ch)) {
86         if (sc.ch == token && !started) {
87             sc.SetState(state);
88             started = true;
89         }
90         else if (sc.ch != token) {
91             sc.SetState(SCE_MARKDOWN_DEFAULT);
92             started = false;
93         }
94         sc.Forward();
95     }
96     sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
97 }
98 
99 // Does the previous line have more than spaces and tabs?
HasPrevLineContent(StyleContext & sc)100 static bool HasPrevLineContent(StyleContext &sc) {
101     Sci_Position i = 0;
102     // Go back to the previous newline
103     while ((--i + (Sci_Position)sc.currentPos) >= 0 && !IsNewline(sc.GetRelative(i)))
104         ;
105     while ((--i + (Sci_Position)sc.currentPos) >= 0) {
106         if (IsNewline(sc.GetRelative(i)))
107             break;
108         if (!IsASpaceOrTab(sc.GetRelative(i)))
109             return true;
110     }
111     return false;
112 }
113 
AtTermStart(StyleContext & sc)114 static bool AtTermStart(StyleContext &sc) {
115     return sc.currentPos == 0 || sc.chPrev == 0 || isspacechar(sc.chPrev);
116 }
117 
IsValidHrule(const Sci_PositionU endPos,StyleContext & sc)118 static bool IsValidHrule(const Sci_PositionU endPos, StyleContext &sc) {
119     int count = 1;
120     Sci_PositionU i = 0;
121     for (;;) {
122         ++i;
123         int c = sc.GetRelative(i);
124         if (c == sc.ch)
125             ++count;
126         // hit a terminating character
127         else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) {
128             // Are we a valid HRULE
129             if ((IsNewline(c) || sc.currentPos + i == endPos) &&
130                     count >= 3 && !HasPrevLineContent(sc)) {
131                 sc.SetState(SCE_MARKDOWN_HRULE);
132                 sc.Forward(i);
133                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
134                 return true;
135             }
136             else {
137                 sc.SetState(SCE_MARKDOWN_DEFAULT);
138 		return false;
139             }
140         }
141     }
142 }
143 
ColorizeMarkdownDoc(Sci_PositionU startPos,Sci_Position length,int initStyle,WordList **,Accessor & styler)144 static void ColorizeMarkdownDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
145                                WordList **, Accessor &styler) {
146     Sci_PositionU endPos = startPos + length;
147     int precharCount = 0;
148     bool isLinkNameDetecting = false;
149     // Don't advance on a new loop iteration and retry at the same position.
150     // Useful in the corner case of having to start at the beginning file position
151     // in the default state.
152     bool freezeCursor = false;
153 
154     StyleContext sc(startPos, length, initStyle, styler);
155 
156     while (sc.More()) {
157         // Skip past escaped characters
158         if (sc.ch == '\\') {
159             sc.Forward();
160             continue;
161         }
162 
163         // A blockquotes resets the line semantics
164         if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
165             sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
166 
167         // Conditional state-based actions
168         if (sc.state == SCE_MARKDOWN_CODE2) {
169             if (sc.Match("``") && sc.GetRelative(-2) != ' ') {
170                 sc.Forward(2);
171                 sc.SetState(SCE_MARKDOWN_DEFAULT);
172             }
173         }
174         else if (sc.state == SCE_MARKDOWN_CODE) {
175             if (sc.ch == '`' && sc.chPrev != ' ')
176                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
177         }
178         /* De-activated because it gets in the way of other valid indentation
179          * schemes, for example multiple paragraphs inside a list item.
180         // Code block
181         else if (sc.state == SCE_MARKDOWN_CODEBK) {
182             bool d = true;
183             if (IsNewline(sc.ch)) {
184                 if (sc.chNext != '\t') {
185                     for (int c = 1; c < 5; ++c) {
186                         if (sc.GetRelative(c) != ' ')
187                             d = false;
188                     }
189                 }
190             }
191             else if (sc.atLineStart) {
192                 if (sc.ch != '\t' ) {
193                     for (int i = 0; i < 4; ++i) {
194                         if (sc.GetRelative(i) != ' ')
195                             d = false;
196                     }
197                 }
198             }
199             if (!d)
200                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
201         }
202         */
203         // Strong
204         else if (sc.state == SCE_MARKDOWN_STRONG1) {
205             if (sc.Match("**") && sc.chPrev != ' ') {
206                 sc.Forward(2);
207                 sc.SetState(SCE_MARKDOWN_DEFAULT);
208             }
209         }
210         else if (sc.state == SCE_MARKDOWN_STRONG2) {
211             if (sc.Match("__") && sc.chPrev != ' ') {
212                 sc.Forward(2);
213                 sc.SetState(SCE_MARKDOWN_DEFAULT);
214             }
215         }
216         // Emphasis
217         else if (sc.state == SCE_MARKDOWN_EM1) {
218             if (sc.ch == '*' && sc.chPrev != ' ')
219                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
220         }
221         else if (sc.state == SCE_MARKDOWN_EM2) {
222             if (sc.ch == '_' && sc.chPrev != ' ')
223                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
224         }
225         else if (sc.state == SCE_MARKDOWN_CODEBK) {
226             if (sc.atLineStart && sc.Match("~~~")) {
227                 Sci_Position i = 1;
228                 while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
229                     i++;
230                 sc.Forward(i);
231                 sc.SetState(SCE_MARKDOWN_DEFAULT);
232             }
233         }
234         else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
235             if (sc.Match("~~") && sc.chPrev != ' ') {
236                 sc.Forward(2);
237                 sc.SetState(SCE_MARKDOWN_DEFAULT);
238             }
239         }
240         else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
241             // Header
242             if (sc.Match("######"))
243                 SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
244             else if (sc.Match("#####"))
245                 SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
246             else if (sc.Match("####"))
247                 SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
248             else if (sc.Match("###"))
249                 SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
250             else if (sc.Match("##"))
251                 SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
252             else if (sc.Match("#")) {
253                 // Catch the special case of an unordered list
254                 if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
255                     precharCount = 0;
256                     sc.SetState(SCE_MARKDOWN_PRECHAR);
257                 }
258                 else
259                     SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
260             }
261             // Code block
262             else if (sc.Match("~~~")) {
263                 if (!HasPrevLineContent(sc))
264                     sc.SetState(SCE_MARKDOWN_CODEBK);
265                 else
266                     sc.SetState(SCE_MARKDOWN_DEFAULT);
267             }
268             else if (sc.ch == '=') {
269                 if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc))
270                     ;
271                 else
272                     sc.SetState(SCE_MARKDOWN_DEFAULT);
273             }
274             else if (sc.ch == '-') {
275                 if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc))
276                     ;
277                 else {
278                     precharCount = 0;
279                     sc.SetState(SCE_MARKDOWN_PRECHAR);
280                 }
281             }
282             else if (IsNewline(sc.ch))
283                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
284             else {
285                 precharCount = 0;
286                 sc.SetState(SCE_MARKDOWN_PRECHAR);
287             }
288         }
289 
290         // The header lasts until the newline
291         else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
292                 sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
293                 sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
294             if (IsNewline(sc.ch))
295                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
296         }
297 
298         // New state only within the initial whitespace
299         if (sc.state == SCE_MARKDOWN_PRECHAR) {
300             // Blockquote
301             if (sc.ch == '>' && precharCount < 5)
302                 sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
303             /*
304             // Begin of code block
305             else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
306                 sc.SetState(SCE_MARKDOWN_CODEBK);
307             */
308             // HRule - Total of three or more hyphens, asterisks, or underscores
309             // on a line by themselves
310             else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
311                 ;
312             // Unordered list
313             else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
314                 sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
315                 sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
316             }
317             // Ordered list
318             else if (IsADigit(sc.ch)) {
319                 int digitCount = 0;
320                 while (IsADigit(sc.GetRelative(++digitCount)))
321                     ;
322                 if (sc.GetRelative(digitCount) == '.' &&
323                         IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
324                     sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
325                     sc.Forward(digitCount + 1);
326                     sc.SetState(SCE_MARKDOWN_DEFAULT);
327                 }
328             }
329             // Alternate Ordered list
330             else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
331                 sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
332                 sc.Forward(2);
333                 sc.SetState(SCE_MARKDOWN_DEFAULT);
334             }
335             else if (sc.ch != ' ' || precharCount > 2)
336                 sc.SetState(SCE_MARKDOWN_DEFAULT);
337             else
338                 ++precharCount;
339         }
340 
341         // Any link
342         if (sc.state == SCE_MARKDOWN_LINK) {
343             if (sc.Match("](") && sc.GetRelative(-1) != '\\') {
344               sc.Forward(2);
345               isLinkNameDetecting = true;
346             }
347             else if (sc.Match("]:") && sc.GetRelative(-1) != '\\') {
348               sc.Forward(2);
349               sc.SetState(SCE_MARKDOWN_DEFAULT);
350             }
351             else if (!isLinkNameDetecting && sc.ch == ']' && sc.GetRelative(-1) != '\\') {
352               sc.Forward();
353               sc.SetState(SCE_MARKDOWN_DEFAULT);
354             }
355             else if (isLinkNameDetecting && sc.ch == ')' && sc.GetRelative(-1) != '\\') {
356               sc.Forward();
357               sc.SetState(SCE_MARKDOWN_DEFAULT);
358               isLinkNameDetecting = false;
359             }
360         }
361 
362         // New state anywhere in doc
363         if (sc.state == SCE_MARKDOWN_DEFAULT) {
364             if (sc.atLineStart && sc.ch == '#') {
365                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
366                 freezeCursor = true;
367             }
368             // Links and Images
369             if (sc.Match("![")) {
370               sc.SetState(SCE_MARKDOWN_LINK);
371               sc.Forward(2);
372             }
373             else if (sc.ch == '[' && sc.GetRelative(-1) != '\\') {
374               sc.SetState(SCE_MARKDOWN_LINK);
375               sc.Forward();
376             }
377             // Code - also a special case for alternate inside spacing
378             else if (sc.Match("``") && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
379                 sc.SetState(SCE_MARKDOWN_CODE2);
380                 sc.Forward();
381             }
382             else if (sc.ch == '`' && sc.chNext != ' ' && AtTermStart(sc)) {
383                 sc.SetState(SCE_MARKDOWN_CODE);
384             }
385             // Strong
386             else if (sc.Match("**") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
387                 sc.SetState(SCE_MARKDOWN_STRONG1);
388                 sc.Forward();
389            }
390             else if (sc.Match("__") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
391                 sc.SetState(SCE_MARKDOWN_STRONG2);
392                 sc.Forward();
393             }
394             // Emphasis
395             else if (sc.ch == '*' && sc.chNext != ' ' && AtTermStart(sc)) {
396                 sc.SetState(SCE_MARKDOWN_EM1);
397             }
398             else if (sc.ch == '_' && sc.chNext != ' ' && AtTermStart(sc)) {
399                 sc.SetState(SCE_MARKDOWN_EM2);
400             }
401             // Strikeout
402             else if (sc.Match("~~") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
403                 sc.SetState(SCE_MARKDOWN_STRIKEOUT);
404                 sc.Forward();
405             }
406             // Beginning of line
407             else if (IsNewline(sc.ch)) {
408                 sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
409             }
410         }
411         // Advance if not holding back the cursor for this iteration.
412         if (!freezeCursor)
413             sc.Forward();
414         freezeCursor = false;
415     }
416     sc.Complete();
417 }
418 
// Lexer module object for Markdown: constructed with the SCLEX_MARKDOWN
// identifier, ColorizeMarkdownDoc as its colourising function and the
// name "markdown".
LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown");
420