1 // Scintilla source code edit control
2 /**
3  * @file LexJSON.cxx
4  * @date February 19, 2016
5  * @brief Lexer for JSON and JSON-LD formats
6  * @author nkmathew
7  *
8  * The License.txt file describes the conditions under which this software may
9  * be distributed.
10  *
11  */
12 
13 #include <cstdlib>
14 #include <cassert>
15 #include <cctype>
16 #include <cstdio>
17 #include <string>
18 #include <vector>
19 #include <map>
20 
21 #include "ILexer.h"
22 #include "Scintilla.h"
23 #include "SciLexer.h"
24 #include "WordList.h"
25 #include "LexAccessor.h"
26 #include "StyleContext.h"
27 #include "CharacterSet.h"
28 #include "LexerModule.h"
29 #include "OptionSet.h"
30 #include "DefaultLexer.h"
31 
32 using namespace Scintilla;
33 
// Descriptions of the word lists this lexer accepts, in word-list index order.
// The array is terminated by a null entry.
static const char *const JSONWordListDesc[] = {
	"JSON Keywords",
	"JSON-LD Keywords",
	nullptr
};
39 
40 /**
41  * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the
42  * colon separating the prefix and suffix
43  *
44  * https://www.w3.org/TR/json-ld/#dfn-compact-iri
45  */
46 struct CompactIRI {
47 	int colonCount;
48 	bool foundInvalidChar;
49 	CharacterSet setCompactIRI;
CompactIRICompactIRI50 	CompactIRI() {
51 		colonCount = 0;
52 		foundInvalidChar = false;
53 		setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-");
54 	}
resetStateCompactIRI55 	void resetState() {
56 		colonCount = 0;
57 		foundInvalidChar = false;
58 	}
checkCharCompactIRI59 	void checkChar(int ch) {
60 		if (ch == ':') {
61 			colonCount++;
62 		} else {
63 			foundInvalidChar |= !setCompactIRI.Contains(ch);
64 		}
65 	}
shouldHighlightCompactIRI66 	bool shouldHighlight() const {
67 		return !foundInvalidChar && colonCount == 1;
68 	}
69 };
70 
71 /**
72  * Keeps track of escaped characters in strings as per:
73  *
74  * https://tools.ietf.org/html/rfc7159#section-7
75  */
76 struct EscapeSequence {
77 	int digitsLeft;
78 	CharacterSet setHexDigits;
79 	CharacterSet setEscapeChars;
EscapeSequenceEscapeSequence80 	EscapeSequence() {
81 		digitsLeft = 0;
82 		setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef");
83 		setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/");
84 	}
85 	// Returns true if the following character is a valid escaped character
newSequenceEscapeSequence86 	bool newSequence(int nextChar) {
87 		digitsLeft = 0;
88 		if (nextChar == 'u') {
89 			digitsLeft = 5;
90 		} else if (!setEscapeChars.Contains(nextChar)) {
91 			return false;
92 		}
93 		return true;
94 	}
atEscapeEndEscapeSequence95 	bool atEscapeEnd() const {
96 		return digitsLeft <= 0;
97 	}
isInvalidCharEscapeSequence98 	bool isInvalidChar(int currChar) const {
99 		return !setHexDigits.Contains(currChar);
100 	}
101 };
102 
// User-settable lexer options; every option starts out disabled.
struct OptionsJSON {
	bool foldCompact;    // bound to the "fold.compact" property
	bool fold;           // bound to the "fold" property
	bool allowComments;  // bound to "lexer.json.allow.comments"
	bool escapeSequence; // bound to "lexer.json.escape.sequence"

	OptionsJSON() :
		foldCompact(false),
		fold(false),
		allowComments(false),
		escapeSequence(false) {
	}
};
115 
116 struct OptionSetJSON : public OptionSet<OptionsJSON> {
OptionSetJSONOptionSetJSON117 	OptionSetJSON() {
118 		DefineProperty("lexer.json.escape.sequence", &OptionsJSON::escapeSequence,
119 					   "Set to 1 to enable highlighting of escape sequences in strings");
120 
121 		DefineProperty("lexer.json.allow.comments", &OptionsJSON::allowComments,
122 					   "Set to 1 to enable highlighting of line/block comments in JSON");
123 
124 		DefineProperty("fold.compact", &OptionsJSON::foldCompact);
125 		DefineProperty("fold", &OptionsJSON::fold);
126 		DefineWordListSets(JSONWordListDesc);
127 	}
128 };
129 
130 class LexerJSON : public DefaultLexer {
131 	OptionsJSON options;
132 	OptionSetJSON optSetJSON;
133 	EscapeSequence escapeSeq;
134 	WordList keywordsJSON;
135 	WordList keywordsJSONLD;
136 	CharacterSet setOperators;
137 	CharacterSet setURL;
138 	CharacterSet setKeywordJSONLD;
139 	CharacterSet setKeywordJSON;
140 	CompactIRI compactIRI;
141 
IsNextNonWhitespace(LexAccessor & styler,Sci_Position start,char ch)142 	static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) {
143 		Sci_Position i = 0;
144 		while (i < 50) {
145 			i++;
146 			char curr = styler.SafeGetCharAt(start+i, '\0');
147 			char next = styler.SafeGetCharAt(start+i+1, '\0');
148 			bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
149 			if (curr == ch) {
150 				return true;
151 			} else if (!isspacechar(curr) || atEOL) {
152 				return false;
153 			}
154 		}
155 		return false;
156 	}
157 
158 	/**
159 	 * Looks for the colon following the end quote
160 	 *
161 	 * Assumes property names of lengths no longer than a 100 characters.
162 	 * The colon is also expected to be less than 50 spaces after the end
163 	 * quote for the string to be considered a property name
164 	 */
AtPropertyName(LexAccessor & styler,Sci_Position start)165 	static bool AtPropertyName(LexAccessor &styler, Sci_Position start) {
166 		Sci_Position i = 0;
167 		bool escaped = false;
168 		while (i < 100) {
169 			i++;
170 			char curr = styler.SafeGetCharAt(start+i, '\0');
171 			if (escaped) {
172 				escaped = false;
173 				continue;
174 			}
175 			escaped = curr == '\\';
176 			if (curr == '"') {
177 				return IsNextNonWhitespace(styler, start+i, ':');
178 			} else if (!curr) {
179 				return false;
180 			}
181 		}
182 		return false;
183 	}
184 
IsNextWordInList(WordList & keywordList,CharacterSet wordSet,StyleContext & context,LexAccessor & styler)185 	static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet,
186 								 StyleContext &context, LexAccessor &styler) {
187 		char word[51];
188 		Sci_Position currPos = (Sci_Position) context.currentPos;
189 		int i = 0;
190 		while (i < 50) {
191 			char ch = styler.SafeGetCharAt(currPos + i);
192 			if (!wordSet.Contains(ch)) {
193 				break;
194 			}
195 			word[i] = ch;
196 			i++;
197 		}
198 		word[i] = '\0';
199 		return keywordList.InList(word);
200 	}
201 
202 	public:
LexerJSON()203 	LexerJSON() :
204 		setOperators(CharacterSet::setNone, "[{}]:,"),
205 		setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
206 		setKeywordJSONLD(CharacterSet::setAlpha, ":@"),
207 		setKeywordJSON(CharacterSet::setAlpha, "$_") {
208 	}
~LexerJSON()209 	virtual ~LexerJSON() {}
Version() const210 	int SCI_METHOD Version() const override {
211 		return lvOriginal;
212 	}
Release()213 	void SCI_METHOD Release() override {
214 		delete this;
215 	}
PropertyNames()216 	const char *SCI_METHOD PropertyNames() override {
217 		return optSetJSON.PropertyNames();
218 	}
PropertyType(const char * name)219 	int SCI_METHOD PropertyType(const char *name) override {
220 		return optSetJSON.PropertyType(name);
221 	}
DescribeProperty(const char * name)222 	const char *SCI_METHOD DescribeProperty(const char *name) override {
223 		return optSetJSON.DescribeProperty(name);
224 	}
PropertySet(const char * key,const char * val)225 	Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override {
226 		if (optSetJSON.PropertySet(&options, key, val)) {
227 			return 0;
228 		}
229 		return -1;
230 	}
WordListSet(int n,const char * wl)231 	Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override {
232 		WordList *wordListN = 0;
233 		switch (n) {
234 			case 0:
235 				wordListN = &keywordsJSON;
236 				break;
237 			case 1:
238 				wordListN = &keywordsJSONLD;
239 				break;
240 		}
241 		Sci_Position firstModification = -1;
242 		if (wordListN) {
243 			WordList wlNew;
244 			wlNew.Set(wl);
245 			if (*wordListN != wlNew) {
246 				wordListN->Set(wl);
247 				firstModification = 0;
248 			}
249 		}
250 		return firstModification;
251 	}
PrivateCall(int,void *)252 	void *SCI_METHOD PrivateCall(int, void *) override {
253 		return 0;
254 	}
LexerFactoryJSON()255 	static ILexer *LexerFactoryJSON() {
256 		return new LexerJSON;
257 	}
DescribeWordListSets()258 	const char *SCI_METHOD DescribeWordListSets() override {
259 		return optSetJSON.DescribeWordListSets();
260 	}
261 	void SCI_METHOD Lex(Sci_PositionU startPos,
262 								Sci_Position length,
263 								int initStyle,
264 								IDocument *pAccess) override;
265 	void SCI_METHOD Fold(Sci_PositionU startPos,
266 								 Sci_Position length,
267 								 int initStyle,
268 								 IDocument *pAccess) override;
269 };
270 
Lex(Sci_PositionU startPos,Sci_Position length,int initStyle,IDocument * pAccess)271 void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos,
272 							   Sci_Position length,
273 							   int initStyle,
274 							   IDocument *pAccess) {
275 	LexAccessor styler(pAccess);
276 	StyleContext context(startPos, length, initStyle, styler);
277 	int stringStyleBefore = SCE_JSON_STRING;
278 	while (context.More()) {
279 		switch (context.state) {
280 			case SCE_JSON_BLOCKCOMMENT:
281 				if (context.Match("*/")) {
282 					context.Forward();
283 					context.ForwardSetState(SCE_JSON_DEFAULT);
284 				}
285 				break;
286 			case SCE_JSON_LINECOMMENT:
287 				if (context.atLineEnd) {
288 					context.SetState(SCE_JSON_DEFAULT);
289 				}
290 				break;
291 			case SCE_JSON_STRINGEOL:
292 				if (context.atLineStart) {
293 					context.SetState(SCE_JSON_DEFAULT);
294 				}
295 				break;
296 			case SCE_JSON_ESCAPESEQUENCE:
297 				escapeSeq.digitsLeft--;
298 				if (!escapeSeq.atEscapeEnd()) {
299 					if (escapeSeq.isInvalidChar(context.ch)) {
300 						context.SetState(SCE_JSON_ERROR);
301 					}
302 					break;
303 				}
304 				if (context.ch == '"') {
305 					context.SetState(stringStyleBefore);
306 					context.ForwardSetState(SCE_C_DEFAULT);
307 				} else if (context.ch == '\\') {
308 					if (!escapeSeq.newSequence(context.chNext)) {
309 						context.SetState(SCE_JSON_ERROR);
310 					}
311 					context.Forward();
312 				} else {
313 					context.SetState(stringStyleBefore);
314 					if (context.atLineEnd) {
315 						context.ChangeState(SCE_JSON_STRINGEOL);
316 					}
317 				}
318 				break;
319 			case SCE_JSON_PROPERTYNAME:
320 			case SCE_JSON_STRING:
321 				if (context.ch == '"') {
322 					if (compactIRI.shouldHighlight()) {
323 						context.ChangeState(SCE_JSON_COMPACTIRI);
324 						context.ForwardSetState(SCE_JSON_DEFAULT);
325 						compactIRI.resetState();
326 					} else {
327 						context.ForwardSetState(SCE_JSON_DEFAULT);
328 					}
329 				} else if (context.atLineEnd) {
330 					context.ChangeState(SCE_JSON_STRINGEOL);
331 				} else if (context.ch == '\\') {
332 					stringStyleBefore = context.state;
333 					if (options.escapeSequence) {
334 						context.SetState(SCE_JSON_ESCAPESEQUENCE);
335 						if (!escapeSeq.newSequence(context.chNext)) {
336 							context.SetState(SCE_JSON_ERROR);
337 						}
338 					}
339 					context.Forward();
340 				} else if (context.Match("https://") ||
341 						   context.Match("http://") ||
342 						   context.Match("ssh://") ||
343 						   context.Match("git://") ||
344 						   context.Match("svn://") ||
345 						   context.Match("ftp://") ||
346 						   context.Match("mailto:")) {
347 					// Handle most common URI schemes only
348 					stringStyleBefore = context.state;
349 					context.SetState(SCE_JSON_URI);
350 				} else if (context.ch == '@') {
351 					// https://www.w3.org/TR/json-ld/#dfn-keyword
352 					if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) {
353 						stringStyleBefore = context.state;
354 						context.SetState(SCE_JSON_LDKEYWORD);
355 					}
356 				} else {
357 					compactIRI.checkChar(context.ch);
358 				}
359 				break;
360 			case SCE_JSON_LDKEYWORD:
361 			case SCE_JSON_URI:
362 				if ((!setKeywordJSONLD.Contains(context.ch) &&
363 					 (context.state == SCE_JSON_LDKEYWORD)) ||
364 					(!setURL.Contains(context.ch))) {
365 					context.SetState(stringStyleBefore);
366 				}
367 				if (context.ch == '"') {
368 					context.ForwardSetState(SCE_JSON_DEFAULT);
369 				} else if (context.atLineEnd) {
370 					context.ChangeState(SCE_JSON_STRINGEOL);
371 				}
372 				break;
373 			case SCE_JSON_OPERATOR:
374 			case SCE_JSON_NUMBER:
375 				context.SetState(SCE_JSON_DEFAULT);
376 				break;
377 			case SCE_JSON_ERROR:
378 				if (context.atLineEnd) {
379 					context.SetState(SCE_JSON_DEFAULT);
380 				}
381 				break;
382 			case SCE_JSON_KEYWORD:
383 				if (!setKeywordJSON.Contains(context.ch)) {
384 					context.SetState(SCE_JSON_DEFAULT);
385 				}
386 				break;
387 		}
388 		if (context.state == SCE_JSON_DEFAULT) {
389 			if (context.ch == '"') {
390 				compactIRI.resetState();
391 				context.SetState(SCE_JSON_STRING);
392 				Sci_Position currPos = static_cast<Sci_Position>(context.currentPos);
393 				if (AtPropertyName(styler, currPos)) {
394 					context.SetState(SCE_JSON_PROPERTYNAME);
395 				}
396 			} else if (setOperators.Contains(context.ch)) {
397 				context.SetState(SCE_JSON_OPERATOR);
398 			} else if (options.allowComments && context.Match("/*")) {
399 				context.SetState(SCE_JSON_BLOCKCOMMENT);
400 				context.Forward();
401 			} else if (options.allowComments && context.Match("//")) {
402 				context.SetState(SCE_JSON_LINECOMMENT);
403 			} else if (setKeywordJSON.Contains(context.ch)) {
404 				if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) {
405 					context.SetState(SCE_JSON_KEYWORD);
406 				}
407 			}
408 			bool numberStart =
409 				IsADigit(context.ch) && (context.chPrev == '+'||
410 										 context.chPrev == '-' ||
411 										 context.atLineStart ||
412 										 IsASpace(context.chPrev) ||
413 										 setOperators.Contains(context.chPrev));
414 			bool exponentPart =
415 				tolower(context.ch) == 'e' &&
416 				IsADigit(context.chPrev) &&
417 				(IsADigit(context.chNext) ||
418 				 context.chNext == '+' ||
419 				 context.chNext == '-');
420 			bool signPart =
421 				(context.ch == '-' || context.ch == '+') &&
422 				((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) ||
423 				 ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev))
424 				  && IsADigit(context.chNext)));
425 			bool adjacentDigit =
426 				IsADigit(context.ch) && IsADigit(context.chPrev);
427 			bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e';
428 			bool dotPart = context.ch == '.' &&
429 				IsADigit(context.chPrev) &&
430 				IsADigit(context.chNext);
431 			bool afterDot = IsADigit(context.ch) && context.chPrev == '.';
432 			if (numberStart ||
433 				exponentPart ||
434 				signPart ||
435 				adjacentDigit ||
436 				dotPart ||
437 				afterExponent ||
438 				afterDot) {
439 				context.SetState(SCE_JSON_NUMBER);
440 			} else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) {
441 				context.SetState(SCE_JSON_ERROR);
442 			}
443 		}
444 		context.Forward();
445 	}
446 	context.Complete();
447 }
448 
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)449 void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos,
450 								Sci_Position length,
451 								int,
452 								IDocument *pAccess) {
453 	if (!options.fold) {
454 		return;
455 	}
456 	LexAccessor styler(pAccess);
457 	Sci_PositionU currLine = styler.GetLine(startPos);
458 	Sci_PositionU endPos = startPos + length;
459 	int currLevel = SC_FOLDLEVELBASE;
460 	if (currLine > 0)
461 		currLevel = styler.LevelAt(currLine - 1) >> 16;
462 	int nextLevel = currLevel;
463 	int visibleChars = 0;
464 	for (Sci_PositionU i = startPos; i < endPos; i++) {
465 		char curr = styler.SafeGetCharAt(i);
466 		char next = styler.SafeGetCharAt(i+1);
467 		bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
468 		if (styler.StyleAt(i) == SCE_JSON_OPERATOR) {
469 			if (curr == '{' || curr == '[') {
470 				nextLevel++;
471 			} else if (curr == '}' || curr == ']') {
472 				nextLevel--;
473 			}
474 		}
475 		if (atEOL || i == (endPos-1)) {
476 			int level = currLevel | nextLevel << 16;
477 			if (!visibleChars && options.foldCompact) {
478 				level |= SC_FOLDLEVELWHITEFLAG;
479 			} else if (nextLevel > currLevel) {
480 				level |= SC_FOLDLEVELHEADERFLAG;
481 			}
482 			if (level != styler.LevelAt(currLine)) {
483 				styler.SetLevel(currLine, level);
484 			}
485 			currLine++;
486 			currLevel = nextLevel;
487 			visibleChars = 0;
488 		}
489 		if (!isspacechar(curr)) {
490 			visibleChars++;
491 		}
492 	}
493 }
494 
495 LexerModule lmJSON(SCLEX_JSON,
496 				   LexerJSON::LexerFactoryJSON,
497 				   "json",
498 				   JSONWordListDesc);
499