1 // Scintilla source code edit control
2 /**
3 * @file LexJSON.cxx
4 * @date February 19, 2016
5 * @brief Lexer for JSON and JSON-LD formats
6 * @author nkmathew
7 *
8 * The License.txt file describes the conditions under which this software may
9 * be distributed.
10 *
11 */
12
13 #include <cstdlib>
14 #include <cassert>
15 #include <cctype>
16 #include <cstdio>
17 #include <string>
18 #include <vector>
19 #include <map>
20
21 #include "ILexer.h"
22 #include "Scintilla.h"
23 #include "SciLexer.h"
24 #include "WordList.h"
25 #include "LexAccessor.h"
26 #include "StyleContext.h"
27 #include "CharacterSet.h"
28 #include "LexerModule.h"
29 #include "OptionSet.h"
30 #include "DefaultLexer.h"
31
32 using namespace Scintilla;
33
// Descriptions of the keyword lists this lexer accepts, listed in the same
// order as the indices handled by LexerJSON::WordListSet(). Null-terminated.
static const char *const JSONWordListDesc[] = {
	"JSON Keywords",
	"JSON-LD Keywords",
	nullptr
};
39
40 /**
41 * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the
42 * colon separating the prefix and suffix
43 *
44 * https://www.w3.org/TR/json-ld/#dfn-compact-iri
45 */
46 struct CompactIRI {
47 int colonCount;
48 bool foundInvalidChar;
49 CharacterSet setCompactIRI;
CompactIRICompactIRI50 CompactIRI() {
51 colonCount = 0;
52 foundInvalidChar = false;
53 setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-");
54 }
resetStateCompactIRI55 void resetState() {
56 colonCount = 0;
57 foundInvalidChar = false;
58 }
checkCharCompactIRI59 void checkChar(int ch) {
60 if (ch == ':') {
61 colonCount++;
62 } else {
63 foundInvalidChar |= !setCompactIRI.Contains(ch);
64 }
65 }
shouldHighlightCompactIRI66 bool shouldHighlight() const {
67 return !foundInvalidChar && colonCount == 1;
68 }
69 };
70
71 /**
72 * Keeps track of escaped characters in strings as per:
73 *
74 * https://tools.ietf.org/html/rfc7159#section-7
75 */
76 struct EscapeSequence {
77 int digitsLeft;
78 CharacterSet setHexDigits;
79 CharacterSet setEscapeChars;
EscapeSequenceEscapeSequence80 EscapeSequence() {
81 digitsLeft = 0;
82 setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef");
83 setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/");
84 }
85 // Returns true if the following character is a valid escaped character
newSequenceEscapeSequence86 bool newSequence(int nextChar) {
87 digitsLeft = 0;
88 if (nextChar == 'u') {
89 digitsLeft = 5;
90 } else if (!setEscapeChars.Contains(nextChar)) {
91 return false;
92 }
93 return true;
94 }
atEscapeEndEscapeSequence95 bool atEscapeEnd() const {
96 return digitsLeft <= 0;
97 }
isInvalidCharEscapeSequence98 bool isInvalidChar(int currChar) const {
99 return !setHexDigits.Contains(currChar);
100 }
101 };
102
// Values of the properties understood by the JSON lexer; every option
// starts out disabled.
struct OptionsJSON {
	bool foldCompact;     // "fold.compact"
	bool fold;            // "fold"
	bool allowComments;   // "lexer.json.allow.comments"
	bool escapeSequence;  // "lexer.json.escape.sequence"

	OptionsJSON() :
		foldCompact(false),
		fold(false),
		allowComments(false),
		escapeSequence(false) {
	}
};
115
116 struct OptionSetJSON : public OptionSet<OptionsJSON> {
OptionSetJSONOptionSetJSON117 OptionSetJSON() {
118 DefineProperty("lexer.json.escape.sequence", &OptionsJSON::escapeSequence,
119 "Set to 1 to enable highlighting of escape sequences in strings");
120
121 DefineProperty("lexer.json.allow.comments", &OptionsJSON::allowComments,
122 "Set to 1 to enable highlighting of line/block comments in JSON");
123
124 DefineProperty("fold.compact", &OptionsJSON::foldCompact);
125 DefineProperty("fold", &OptionsJSON::fold);
126 DefineWordListSets(JSONWordListDesc);
127 }
128 };
129
130 class LexerJSON : public DefaultLexer {
131 OptionsJSON options;
132 OptionSetJSON optSetJSON;
133 EscapeSequence escapeSeq;
134 WordList keywordsJSON;
135 WordList keywordsJSONLD;
136 CharacterSet setOperators;
137 CharacterSet setURL;
138 CharacterSet setKeywordJSONLD;
139 CharacterSet setKeywordJSON;
140 CompactIRI compactIRI;
141
IsNextNonWhitespace(LexAccessor & styler,Sci_Position start,char ch)142 static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) {
143 Sci_Position i = 0;
144 while (i < 50) {
145 i++;
146 char curr = styler.SafeGetCharAt(start+i, '\0');
147 char next = styler.SafeGetCharAt(start+i+1, '\0');
148 bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
149 if (curr == ch) {
150 return true;
151 } else if (!isspacechar(curr) || atEOL) {
152 return false;
153 }
154 }
155 return false;
156 }
157
158 /**
159 * Looks for the colon following the end quote
160 *
161 * Assumes property names of lengths no longer than a 100 characters.
162 * The colon is also expected to be less than 50 spaces after the end
163 * quote for the string to be considered a property name
164 */
AtPropertyName(LexAccessor & styler,Sci_Position start)165 static bool AtPropertyName(LexAccessor &styler, Sci_Position start) {
166 Sci_Position i = 0;
167 bool escaped = false;
168 while (i < 100) {
169 i++;
170 char curr = styler.SafeGetCharAt(start+i, '\0');
171 if (escaped) {
172 escaped = false;
173 continue;
174 }
175 escaped = curr == '\\';
176 if (curr == '"') {
177 return IsNextNonWhitespace(styler, start+i, ':');
178 } else if (!curr) {
179 return false;
180 }
181 }
182 return false;
183 }
184
IsNextWordInList(WordList & keywordList,CharacterSet wordSet,StyleContext & context,LexAccessor & styler)185 static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet,
186 StyleContext &context, LexAccessor &styler) {
187 char word[51];
188 Sci_Position currPos = (Sci_Position) context.currentPos;
189 int i = 0;
190 while (i < 50) {
191 char ch = styler.SafeGetCharAt(currPos + i);
192 if (!wordSet.Contains(ch)) {
193 break;
194 }
195 word[i] = ch;
196 i++;
197 }
198 word[i] = '\0';
199 return keywordList.InList(word);
200 }
201
202 public:
LexerJSON()203 LexerJSON() :
204 DefaultLexer("json", SCLEX_JSON),
205 setOperators(CharacterSet::setNone, "[{}]:,"),
206 setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
207 setKeywordJSONLD(CharacterSet::setAlpha, ":@"),
208 setKeywordJSON(CharacterSet::setAlpha, "$_") {
209 }
~LexerJSON()210 virtual ~LexerJSON() {}
Version() const211 int SCI_METHOD Version() const override {
212 return lvRelease5;
213 }
Release()214 void SCI_METHOD Release() override {
215 delete this;
216 }
PropertyNames()217 const char *SCI_METHOD PropertyNames() override {
218 return optSetJSON.PropertyNames();
219 }
PropertyType(const char * name)220 int SCI_METHOD PropertyType(const char *name) override {
221 return optSetJSON.PropertyType(name);
222 }
DescribeProperty(const char * name)223 const char *SCI_METHOD DescribeProperty(const char *name) override {
224 return optSetJSON.DescribeProperty(name);
225 }
PropertySet(const char * key,const char * val)226 Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override {
227 if (optSetJSON.PropertySet(&options, key, val)) {
228 return 0;
229 }
230 return -1;
231 }
PropertyGet(const char * key)232 const char * SCI_METHOD PropertyGet(const char *key) override {
233 return optSetJSON.PropertyGet(key);
234 }
WordListSet(int n,const char * wl)235 Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override {
236 WordList *wordListN = 0;
237 switch (n) {
238 case 0:
239 wordListN = &keywordsJSON;
240 break;
241 case 1:
242 wordListN = &keywordsJSONLD;
243 break;
244 }
245 Sci_Position firstModification = -1;
246 if (wordListN) {
247 WordList wlNew;
248 wlNew.Set(wl);
249 if (*wordListN != wlNew) {
250 wordListN->Set(wl);
251 firstModification = 0;
252 }
253 }
254 return firstModification;
255 }
PrivateCall(int,void *)256 void *SCI_METHOD PrivateCall(int, void *) override {
257 return 0;
258 }
LexerFactoryJSON()259 static ILexer5 *LexerFactoryJSON() {
260 return new LexerJSON;
261 }
DescribeWordListSets()262 const char *SCI_METHOD DescribeWordListSets() override {
263 return optSetJSON.DescribeWordListSets();
264 }
265 void SCI_METHOD Lex(Sci_PositionU startPos,
266 Sci_Position length,
267 int initStyle,
268 IDocument *pAccess) override;
269 void SCI_METHOD Fold(Sci_PositionU startPos,
270 Sci_Position length,
271 int initStyle,
272 IDocument *pAccess) override;
273 };
274
Lex(Sci_PositionU startPos,Sci_Position length,int initStyle,IDocument * pAccess)275 void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos,
276 Sci_Position length,
277 int initStyle,
278 IDocument *pAccess) {
279 LexAccessor styler(pAccess);
280 StyleContext context(startPos, length, initStyle, styler);
281 int stringStyleBefore = SCE_JSON_STRING;
282 while (context.More()) {
283 switch (context.state) {
284 case SCE_JSON_BLOCKCOMMENT:
285 if (context.Match("*/")) {
286 context.Forward();
287 context.ForwardSetState(SCE_JSON_DEFAULT);
288 }
289 break;
290 case SCE_JSON_LINECOMMENT:
291 if (context.atLineEnd) {
292 context.SetState(SCE_JSON_DEFAULT);
293 }
294 break;
295 case SCE_JSON_STRINGEOL:
296 if (context.atLineStart) {
297 context.SetState(SCE_JSON_DEFAULT);
298 }
299 break;
300 case SCE_JSON_ESCAPESEQUENCE:
301 escapeSeq.digitsLeft--;
302 if (!escapeSeq.atEscapeEnd()) {
303 if (escapeSeq.isInvalidChar(context.ch)) {
304 context.SetState(SCE_JSON_ERROR);
305 }
306 break;
307 }
308 if (context.ch == '"') {
309 context.SetState(stringStyleBefore);
310 context.ForwardSetState(SCE_C_DEFAULT);
311 } else if (context.ch == '\\') {
312 if (!escapeSeq.newSequence(context.chNext)) {
313 context.SetState(SCE_JSON_ERROR);
314 }
315 context.Forward();
316 } else {
317 context.SetState(stringStyleBefore);
318 if (context.atLineEnd) {
319 context.ChangeState(SCE_JSON_STRINGEOL);
320 }
321 }
322 break;
323 case SCE_JSON_PROPERTYNAME:
324 case SCE_JSON_STRING:
325 if (context.ch == '"') {
326 if (compactIRI.shouldHighlight()) {
327 context.ChangeState(SCE_JSON_COMPACTIRI);
328 context.ForwardSetState(SCE_JSON_DEFAULT);
329 compactIRI.resetState();
330 } else {
331 context.ForwardSetState(SCE_JSON_DEFAULT);
332 }
333 } else if (context.atLineEnd) {
334 context.ChangeState(SCE_JSON_STRINGEOL);
335 } else if (context.ch == '\\') {
336 stringStyleBefore = context.state;
337 if (options.escapeSequence) {
338 context.SetState(SCE_JSON_ESCAPESEQUENCE);
339 if (!escapeSeq.newSequence(context.chNext)) {
340 context.SetState(SCE_JSON_ERROR);
341 }
342 }
343 context.Forward();
344 } else if (context.Match("https://") ||
345 context.Match("http://") ||
346 context.Match("ssh://") ||
347 context.Match("git://") ||
348 context.Match("svn://") ||
349 context.Match("ftp://") ||
350 context.Match("mailto:")) {
351 // Handle most common URI schemes only
352 stringStyleBefore = context.state;
353 context.SetState(SCE_JSON_URI);
354 } else if (context.ch == '@') {
355 // https://www.w3.org/TR/json-ld/#dfn-keyword
356 if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) {
357 stringStyleBefore = context.state;
358 context.SetState(SCE_JSON_LDKEYWORD);
359 }
360 } else {
361 compactIRI.checkChar(context.ch);
362 }
363 break;
364 case SCE_JSON_LDKEYWORD:
365 case SCE_JSON_URI:
366 if ((!setKeywordJSONLD.Contains(context.ch) &&
367 (context.state == SCE_JSON_LDKEYWORD)) ||
368 (!setURL.Contains(context.ch))) {
369 context.SetState(stringStyleBefore);
370 }
371 if (context.ch == '"') {
372 context.ForwardSetState(SCE_JSON_DEFAULT);
373 } else if (context.atLineEnd) {
374 context.ChangeState(SCE_JSON_STRINGEOL);
375 }
376 break;
377 case SCE_JSON_OPERATOR:
378 case SCE_JSON_NUMBER:
379 context.SetState(SCE_JSON_DEFAULT);
380 break;
381 case SCE_JSON_ERROR:
382 if (context.atLineEnd) {
383 context.SetState(SCE_JSON_DEFAULT);
384 }
385 break;
386 case SCE_JSON_KEYWORD:
387 if (!setKeywordJSON.Contains(context.ch)) {
388 context.SetState(SCE_JSON_DEFAULT);
389 }
390 break;
391 }
392 if (context.state == SCE_JSON_DEFAULT) {
393 if (context.ch == '"') {
394 compactIRI.resetState();
395 context.SetState(SCE_JSON_STRING);
396 Sci_Position currPos = static_cast<Sci_Position>(context.currentPos);
397 if (AtPropertyName(styler, currPos)) {
398 context.SetState(SCE_JSON_PROPERTYNAME);
399 }
400 } else if (setOperators.Contains(context.ch)) {
401 context.SetState(SCE_JSON_OPERATOR);
402 } else if (options.allowComments && context.Match("/*")) {
403 context.SetState(SCE_JSON_BLOCKCOMMENT);
404 context.Forward();
405 } else if (options.allowComments && context.Match("//")) {
406 context.SetState(SCE_JSON_LINECOMMENT);
407 } else if (setKeywordJSON.Contains(context.ch)) {
408 if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) {
409 context.SetState(SCE_JSON_KEYWORD);
410 }
411 }
412 bool numberStart =
413 IsADigit(context.ch) && (context.chPrev == '+'||
414 context.chPrev == '-' ||
415 context.atLineStart ||
416 IsASpace(context.chPrev) ||
417 setOperators.Contains(context.chPrev));
418 bool exponentPart =
419 tolower(context.ch) == 'e' &&
420 IsADigit(context.chPrev) &&
421 (IsADigit(context.chNext) ||
422 context.chNext == '+' ||
423 context.chNext == '-');
424 bool signPart =
425 (context.ch == '-' || context.ch == '+') &&
426 ((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) ||
427 ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev))
428 && IsADigit(context.chNext)));
429 bool adjacentDigit =
430 IsADigit(context.ch) && IsADigit(context.chPrev);
431 bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e';
432 bool dotPart = context.ch == '.' &&
433 IsADigit(context.chPrev) &&
434 IsADigit(context.chNext);
435 bool afterDot = IsADigit(context.ch) && context.chPrev == '.';
436 if (numberStart ||
437 exponentPart ||
438 signPart ||
439 adjacentDigit ||
440 dotPart ||
441 afterExponent ||
442 afterDot) {
443 context.SetState(SCE_JSON_NUMBER);
444 } else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) {
445 context.SetState(SCE_JSON_ERROR);
446 }
447 }
448 context.Forward();
449 }
450 context.Complete();
451 }
452
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)453 void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos,
454 Sci_Position length,
455 int,
456 IDocument *pAccess) {
457 if (!options.fold) {
458 return;
459 }
460 LexAccessor styler(pAccess);
461 Sci_PositionU currLine = styler.GetLine(startPos);
462 Sci_PositionU endPos = startPos + length;
463 int currLevel = SC_FOLDLEVELBASE;
464 if (currLine > 0)
465 currLevel = styler.LevelAt(currLine - 1) >> 16;
466 int nextLevel = currLevel;
467 int visibleChars = 0;
468 for (Sci_PositionU i = startPos; i < endPos; i++) {
469 char curr = styler.SafeGetCharAt(i);
470 char next = styler.SafeGetCharAt(i+1);
471 bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
472 if (styler.StyleAt(i) == SCE_JSON_OPERATOR) {
473 if (curr == '{' || curr == '[') {
474 nextLevel++;
475 } else if (curr == '}' || curr == ']') {
476 nextLevel--;
477 }
478 }
479 if (atEOL || i == (endPos-1)) {
480 int level = currLevel | nextLevel << 16;
481 if (!visibleChars && options.foldCompact) {
482 level |= SC_FOLDLEVELWHITEFLAG;
483 } else if (nextLevel > currLevel) {
484 level |= SC_FOLDLEVELHEADERFLAG;
485 }
486 if (level != styler.LevelAt(currLine)) {
487 styler.SetLevel(currLine, level);
488 }
489 currLine++;
490 currLevel = nextLevel;
491 visibleChars = 0;
492 }
493 if (!isspacechar(curr)) {
494 visibleChars++;
495 }
496 }
497 }
498
499 LexerModule lmJSON(SCLEX_JSON,
500 LexerJSON::LexerFactoryJSON,
501 "json",
502 JSONWordListDesc);
503