1 // Scintilla source code edit control
2 /**
3  * @file LexJSON.cxx
4  * @date February 19, 2016
5  * @brief Lexer for JSON and JSON-LD formats
6  * @author nkmathew
7  *
8  * The License.txt file describes the conditions under which this software may
9  * be distributed.
10  *
11  */
12 
13 #include <cstdlib>
14 #include <cassert>
15 #include <cctype>
16 #include <cstdio>
17 #include <string>
18 #include <vector>
19 #include <map>
20 
21 #include "ILexer.h"
22 #include "Scintilla.h"
23 #include "SciLexer.h"
24 #include "WordList.h"
25 #include "LexAccessor.h"
26 #include "StyleContext.h"
27 #include "CharacterSet.h"
28 #include "LexerModule.h"
29 #include "OptionSet.h"
30 #include "DefaultLexer.h"
31 
32 using namespace Scintilla;
33 
// Descriptions of the keyword lists this lexer accepts, listed in the same
// order as the indices handled by LexerJSON::WordListSet(). The array ends
// with a null pointer.
static const char *const JSONWordListDesc[] = {
	"JSON Keywords",
	"JSON-LD Keywords",
	nullptr
};
39 
40 /**
41  * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the
42  * colon separating the prefix and suffix
43  *
44  * https://www.w3.org/TR/json-ld/#dfn-compact-iri
45  */
46 struct CompactIRI {
47 	int colonCount;
48 	bool foundInvalidChar;
49 	CharacterSet setCompactIRI;
CompactIRICompactIRI50 	CompactIRI() {
51 		colonCount = 0;
52 		foundInvalidChar = false;
53 		setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-");
54 	}
resetStateCompactIRI55 	void resetState() {
56 		colonCount = 0;
57 		foundInvalidChar = false;
58 	}
checkCharCompactIRI59 	void checkChar(int ch) {
60 		if (ch == ':') {
61 			colonCount++;
62 		} else {
63 			foundInvalidChar |= !setCompactIRI.Contains(ch);
64 		}
65 	}
shouldHighlightCompactIRI66 	bool shouldHighlight() const {
67 		return !foundInvalidChar && colonCount == 1;
68 	}
69 };
70 
71 /**
72  * Keeps track of escaped characters in strings as per:
73  *
74  * https://tools.ietf.org/html/rfc7159#section-7
75  */
76 struct EscapeSequence {
77 	int digitsLeft;
78 	CharacterSet setHexDigits;
79 	CharacterSet setEscapeChars;
EscapeSequenceEscapeSequence80 	EscapeSequence() {
81 		digitsLeft = 0;
82 		setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef");
83 		setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/");
84 	}
85 	// Returns true if the following character is a valid escaped character
newSequenceEscapeSequence86 	bool newSequence(int nextChar) {
87 		digitsLeft = 0;
88 		if (nextChar == 'u') {
89 			digitsLeft = 5;
90 		} else if (!setEscapeChars.Contains(nextChar)) {
91 			return false;
92 		}
93 		return true;
94 	}
atEscapeEndEscapeSequence95 	bool atEscapeEnd() const {
96 		return digitsLeft <= 0;
97 	}
isInvalidCharEscapeSequence98 	bool isInvalidChar(int currChar) const {
99 		return !setHexDigits.Contains(currChar);
100 	}
101 };
102 
// User-configurable switches for the JSON lexer; every option starts disabled
struct OptionsJSON {
	bool foldCompact;
	bool fold;
	bool allowComments;
	bool escapeSequence;

	OptionsJSON() :
		foldCompact(false),
		fold(false),
		allowComments(false),
		escapeSequence(false) {
	}
};
115 
116 struct OptionSetJSON : public OptionSet<OptionsJSON> {
OptionSetJSONOptionSetJSON117 	OptionSetJSON() {
118 		DefineProperty("lexer.json.escape.sequence", &OptionsJSON::escapeSequence,
119 					   "Set to 1 to enable highlighting of escape sequences in strings");
120 
121 		DefineProperty("lexer.json.allow.comments", &OptionsJSON::allowComments,
122 					   "Set to 1 to enable highlighting of line/block comments in JSON");
123 
124 		DefineProperty("fold.compact", &OptionsJSON::foldCompact);
125 		DefineProperty("fold", &OptionsJSON::fold);
126 		DefineWordListSets(JSONWordListDesc);
127 	}
128 };
129 
130 class LexerJSON : public DefaultLexer {
131 	OptionsJSON options;
132 	OptionSetJSON optSetJSON;
133 	EscapeSequence escapeSeq;
134 	WordList keywordsJSON;
135 	WordList keywordsJSONLD;
136 	CharacterSet setOperators;
137 	CharacterSet setURL;
138 	CharacterSet setKeywordJSONLD;
139 	CharacterSet setKeywordJSON;
140 	CompactIRI compactIRI;
141 
IsNextNonWhitespace(LexAccessor & styler,Sci_Position start,char ch)142 	static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) {
143 		Sci_Position i = 0;
144 		while (i < 50) {
145 			i++;
146 			char curr = styler.SafeGetCharAt(start+i, '\0');
147 			char next = styler.SafeGetCharAt(start+i+1, '\0');
148 			bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
149 			if (curr == ch) {
150 				return true;
151 			} else if (!isspacechar(curr) || atEOL) {
152 				return false;
153 			}
154 		}
155 		return false;
156 	}
157 
158 	/**
159 	 * Looks for the colon following the end quote
160 	 *
161 	 * Assumes property names of lengths no longer than a 100 characters.
162 	 * The colon is also expected to be less than 50 spaces after the end
163 	 * quote for the string to be considered a property name
164 	 */
AtPropertyName(LexAccessor & styler,Sci_Position start)165 	static bool AtPropertyName(LexAccessor &styler, Sci_Position start) {
166 		Sci_Position i = 0;
167 		bool escaped = false;
168 		while (i < 100) {
169 			i++;
170 			char curr = styler.SafeGetCharAt(start+i, '\0');
171 			if (escaped) {
172 				escaped = false;
173 				continue;
174 			}
175 			escaped = curr == '\\';
176 			if (curr == '"') {
177 				return IsNextNonWhitespace(styler, start+i, ':');
178 			} else if (!curr) {
179 				return false;
180 			}
181 		}
182 		return false;
183 	}
184 
IsNextWordInList(WordList & keywordList,CharacterSet wordSet,StyleContext & context,LexAccessor & styler)185 	static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet,
186 								 StyleContext &context, LexAccessor &styler) {
187 		char word[51];
188 		Sci_Position currPos = (Sci_Position) context.currentPos;
189 		int i = 0;
190 		while (i < 50) {
191 			char ch = styler.SafeGetCharAt(currPos + i);
192 			if (!wordSet.Contains(ch)) {
193 				break;
194 			}
195 			word[i] = ch;
196 			i++;
197 		}
198 		word[i] = '\0';
199 		return keywordList.InList(word);
200 	}
201 
202 	public:
LexerJSON()203 	LexerJSON() :
204 		DefaultLexer("json", SCLEX_JSON),
205 		setOperators(CharacterSet::setNone, "[{}]:,"),
206 		setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
207 		setKeywordJSONLD(CharacterSet::setAlpha, ":@"),
208 		setKeywordJSON(CharacterSet::setAlpha, "$_") {
209 	}
~LexerJSON()210 	virtual ~LexerJSON() {}
Version() const211 	int SCI_METHOD Version() const override {
212 		return lvRelease5;
213 	}
Release()214 	void SCI_METHOD Release() override {
215 		delete this;
216 	}
PropertyNames()217 	const char *SCI_METHOD PropertyNames() override {
218 		return optSetJSON.PropertyNames();
219 	}
PropertyType(const char * name)220 	int SCI_METHOD PropertyType(const char *name) override {
221 		return optSetJSON.PropertyType(name);
222 	}
DescribeProperty(const char * name)223 	const char *SCI_METHOD DescribeProperty(const char *name) override {
224 		return optSetJSON.DescribeProperty(name);
225 	}
PropertySet(const char * key,const char * val)226 	Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override {
227 		if (optSetJSON.PropertySet(&options, key, val)) {
228 			return 0;
229 		}
230 		return -1;
231 	}
PropertyGet(const char * key)232 	const char * SCI_METHOD PropertyGet(const char *key) override {
233 		return optSetJSON.PropertyGet(key);
234 	}
WordListSet(int n,const char * wl)235 	Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override {
236 		WordList *wordListN = 0;
237 		switch (n) {
238 			case 0:
239 				wordListN = &keywordsJSON;
240 				break;
241 			case 1:
242 				wordListN = &keywordsJSONLD;
243 				break;
244 		}
245 		Sci_Position firstModification = -1;
246 		if (wordListN) {
247 			WordList wlNew;
248 			wlNew.Set(wl);
249 			if (*wordListN != wlNew) {
250 				wordListN->Set(wl);
251 				firstModification = 0;
252 			}
253 		}
254 		return firstModification;
255 	}
PrivateCall(int,void *)256 	void *SCI_METHOD PrivateCall(int, void *) override {
257 		return 0;
258 	}
LexerFactoryJSON()259 	static ILexer5 *LexerFactoryJSON() {
260 		return new LexerJSON;
261 	}
DescribeWordListSets()262 	const char *SCI_METHOD DescribeWordListSets() override {
263 		return optSetJSON.DescribeWordListSets();
264 	}
265 	void SCI_METHOD Lex(Sci_PositionU startPos,
266 								Sci_Position length,
267 								int initStyle,
268 								IDocument *pAccess) override;
269 	void SCI_METHOD Fold(Sci_PositionU startPos,
270 								 Sci_Position length,
271 								 int initStyle,
272 								 IDocument *pAccess) override;
273 };
274 
Lex(Sci_PositionU startPos,Sci_Position length,int initStyle,IDocument * pAccess)275 void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos,
276 							   Sci_Position length,
277 							   int initStyle,
278 							   IDocument *pAccess) {
279 	LexAccessor styler(pAccess);
280 	StyleContext context(startPos, length, initStyle, styler);
281 	int stringStyleBefore = SCE_JSON_STRING;
282 	while (context.More()) {
283 		switch (context.state) {
284 			case SCE_JSON_BLOCKCOMMENT:
285 				if (context.Match("*/")) {
286 					context.Forward();
287 					context.ForwardSetState(SCE_JSON_DEFAULT);
288 				}
289 				break;
290 			case SCE_JSON_LINECOMMENT:
291 				if (context.atLineEnd) {
292 					context.SetState(SCE_JSON_DEFAULT);
293 				}
294 				break;
295 			case SCE_JSON_STRINGEOL:
296 				if (context.atLineStart) {
297 					context.SetState(SCE_JSON_DEFAULT);
298 				}
299 				break;
300 			case SCE_JSON_ESCAPESEQUENCE:
301 				escapeSeq.digitsLeft--;
302 				if (!escapeSeq.atEscapeEnd()) {
303 					if (escapeSeq.isInvalidChar(context.ch)) {
304 						context.SetState(SCE_JSON_ERROR);
305 					}
306 					break;
307 				}
308 				if (context.ch == '"') {
309 					context.SetState(stringStyleBefore);
310 					context.ForwardSetState(SCE_C_DEFAULT);
311 				} else if (context.ch == '\\') {
312 					if (!escapeSeq.newSequence(context.chNext)) {
313 						context.SetState(SCE_JSON_ERROR);
314 					}
315 					context.Forward();
316 				} else {
317 					context.SetState(stringStyleBefore);
318 					if (context.atLineEnd) {
319 						context.ChangeState(SCE_JSON_STRINGEOL);
320 					}
321 				}
322 				break;
323 			case SCE_JSON_PROPERTYNAME:
324 			case SCE_JSON_STRING:
325 				if (context.ch == '"') {
326 					if (compactIRI.shouldHighlight()) {
327 						context.ChangeState(SCE_JSON_COMPACTIRI);
328 						context.ForwardSetState(SCE_JSON_DEFAULT);
329 						compactIRI.resetState();
330 					} else {
331 						context.ForwardSetState(SCE_JSON_DEFAULT);
332 					}
333 				} else if (context.atLineEnd) {
334 					context.ChangeState(SCE_JSON_STRINGEOL);
335 				} else if (context.ch == '\\') {
336 					stringStyleBefore = context.state;
337 					if (options.escapeSequence) {
338 						context.SetState(SCE_JSON_ESCAPESEQUENCE);
339 						if (!escapeSeq.newSequence(context.chNext)) {
340 							context.SetState(SCE_JSON_ERROR);
341 						}
342 					}
343 					context.Forward();
344 				} else if (context.Match("https://") ||
345 						   context.Match("http://") ||
346 						   context.Match("ssh://") ||
347 						   context.Match("git://") ||
348 						   context.Match("svn://") ||
349 						   context.Match("ftp://") ||
350 						   context.Match("mailto:")) {
351 					// Handle most common URI schemes only
352 					stringStyleBefore = context.state;
353 					context.SetState(SCE_JSON_URI);
354 				} else if (context.ch == '@') {
355 					// https://www.w3.org/TR/json-ld/#dfn-keyword
356 					if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) {
357 						stringStyleBefore = context.state;
358 						context.SetState(SCE_JSON_LDKEYWORD);
359 					}
360 				} else {
361 					compactIRI.checkChar(context.ch);
362 				}
363 				break;
364 			case SCE_JSON_LDKEYWORD:
365 			case SCE_JSON_URI:
366 				if ((!setKeywordJSONLD.Contains(context.ch) &&
367 					 (context.state == SCE_JSON_LDKEYWORD)) ||
368 					(!setURL.Contains(context.ch))) {
369 					context.SetState(stringStyleBefore);
370 				}
371 				if (context.ch == '"') {
372 					context.ForwardSetState(SCE_JSON_DEFAULT);
373 				} else if (context.atLineEnd) {
374 					context.ChangeState(SCE_JSON_STRINGEOL);
375 				}
376 				break;
377 			case SCE_JSON_OPERATOR:
378 			case SCE_JSON_NUMBER:
379 				context.SetState(SCE_JSON_DEFAULT);
380 				break;
381 			case SCE_JSON_ERROR:
382 				if (context.atLineEnd) {
383 					context.SetState(SCE_JSON_DEFAULT);
384 				}
385 				break;
386 			case SCE_JSON_KEYWORD:
387 				if (!setKeywordJSON.Contains(context.ch)) {
388 					context.SetState(SCE_JSON_DEFAULT);
389 				}
390 				break;
391 		}
392 		if (context.state == SCE_JSON_DEFAULT) {
393 			if (context.ch == '"') {
394 				compactIRI.resetState();
395 				context.SetState(SCE_JSON_STRING);
396 				Sci_Position currPos = static_cast<Sci_Position>(context.currentPos);
397 				if (AtPropertyName(styler, currPos)) {
398 					context.SetState(SCE_JSON_PROPERTYNAME);
399 				}
400 			} else if (setOperators.Contains(context.ch)) {
401 				context.SetState(SCE_JSON_OPERATOR);
402 			} else if (options.allowComments && context.Match("/*")) {
403 				context.SetState(SCE_JSON_BLOCKCOMMENT);
404 				context.Forward();
405 			} else if (options.allowComments && context.Match("//")) {
406 				context.SetState(SCE_JSON_LINECOMMENT);
407 			} else if (setKeywordJSON.Contains(context.ch)) {
408 				if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) {
409 					context.SetState(SCE_JSON_KEYWORD);
410 				}
411 			}
412 			bool numberStart =
413 				IsADigit(context.ch) && (context.chPrev == '+'||
414 										 context.chPrev == '-' ||
415 										 context.atLineStart ||
416 										 IsASpace(context.chPrev) ||
417 										 setOperators.Contains(context.chPrev));
418 			bool exponentPart =
419 				tolower(context.ch) == 'e' &&
420 				IsADigit(context.chPrev) &&
421 				(IsADigit(context.chNext) ||
422 				 context.chNext == '+' ||
423 				 context.chNext == '-');
424 			bool signPart =
425 				(context.ch == '-' || context.ch == '+') &&
426 				((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) ||
427 				 ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev))
428 				  && IsADigit(context.chNext)));
429 			bool adjacentDigit =
430 				IsADigit(context.ch) && IsADigit(context.chPrev);
431 			bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e';
432 			bool dotPart = context.ch == '.' &&
433 				IsADigit(context.chPrev) &&
434 				IsADigit(context.chNext);
435 			bool afterDot = IsADigit(context.ch) && context.chPrev == '.';
436 			if (numberStart ||
437 				exponentPart ||
438 				signPart ||
439 				adjacentDigit ||
440 				dotPart ||
441 				afterExponent ||
442 				afterDot) {
443 				context.SetState(SCE_JSON_NUMBER);
444 			} else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) {
445 				context.SetState(SCE_JSON_ERROR);
446 			}
447 		}
448 		context.Forward();
449 	}
450 	context.Complete();
451 }
452 
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)453 void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos,
454 								Sci_Position length,
455 								int,
456 								IDocument *pAccess) {
457 	if (!options.fold) {
458 		return;
459 	}
460 	LexAccessor styler(pAccess);
461 	Sci_PositionU currLine = styler.GetLine(startPos);
462 	Sci_PositionU endPos = startPos + length;
463 	int currLevel = SC_FOLDLEVELBASE;
464 	if (currLine > 0)
465 		currLevel = styler.LevelAt(currLine - 1) >> 16;
466 	int nextLevel = currLevel;
467 	int visibleChars = 0;
468 	for (Sci_PositionU i = startPos; i < endPos; i++) {
469 		char curr = styler.SafeGetCharAt(i);
470 		char next = styler.SafeGetCharAt(i+1);
471 		bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
472 		if (styler.StyleAt(i) == SCE_JSON_OPERATOR) {
473 			if (curr == '{' || curr == '[') {
474 				nextLevel++;
475 			} else if (curr == '}' || curr == ']') {
476 				nextLevel--;
477 			}
478 		}
479 		if (atEOL || i == (endPos-1)) {
480 			int level = currLevel | nextLevel << 16;
481 			if (!visibleChars && options.foldCompact) {
482 				level |= SC_FOLDLEVELWHITEFLAG;
483 			} else if (nextLevel > currLevel) {
484 				level |= SC_FOLDLEVELHEADERFLAG;
485 			}
486 			if (level != styler.LevelAt(currLine)) {
487 				styler.SetLevel(currLine, level);
488 			}
489 			currLine++;
490 			currLevel = nextLevel;
491 			visibleChars = 0;
492 		}
493 		if (!isspacechar(curr)) {
494 			visibleChars++;
495 		}
496 	}
497 }
498 
499 LexerModule lmJSON(SCLEX_JSON,
500 				   LexerJSON::LexerFactoryJSON,
501 				   "json",
502 				   JSONWordListDesc);
503