1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package json
6
7// JSON value parser state machine.
8// Just about at the limit of what is reasonable to write by hand.
9// Some parts are a bit tedious, but overall it nicely factors out the
10// otherwise common code from the multiple scanning functions
11// in this package (Compact, Indent, checkValid, etc).
12//
13// This file starts with two simple examples using the scanner
14// before diving into the scanner itself.
15
16import (
17	"strconv"
18	"sync"
19)
20
21// Valid reports whether data is a valid JSON encoding.
22func Valid(data []byte) bool {
23	scan := newScanner()
24	defer freeScanner(scan)
25	return checkValid(data, scan) == nil
26}
27
28// checkValid verifies that data is valid JSON-encoded data.
29// scan is passed in for use by checkValid to avoid an allocation.
30func checkValid(data []byte, scan *scanner) error {
31	scan.reset()
32	for _, c := range data {
33		scan.bytes++
34		if scan.step(scan, c) == scanError {
35			return scan.err
36		}
37	}
38	if scan.eof() == scanError {
39		return scan.err
40	}
41	return nil
42}
43
44// A SyntaxError is a description of a JSON syntax error.
45type SyntaxError struct {
46	msg    string // description of error
47	Offset int64  // error occurred after reading Offset bytes
48}
49
50func (e *SyntaxError) Error() string { return e.msg }
51
52// A scanner is a JSON scanning state machine.
53// Callers call scan.reset and then pass bytes in one at a time
54// by calling scan.step(&scan, c) for each byte.
55// The return value, referred to as an opcode, tells the
56// caller about significant parsing events like beginning
57// and ending literals, objects, and arrays, so that the
58// caller can follow along if it wishes.
59// The return value scanEnd indicates that a single top-level
60// JSON value has been completed, *before* the byte that
61// just got passed in.  (The indication must be delayed in order
62// to recognize the end of numbers: is 123 a whole value or
63// the beginning of 12345e+6?).
64type scanner struct {
65	// The step is a func to be called to execute the next transition.
66	// Also tried using an integer constant and a single func
67	// with a switch, but using the func directly was 10% faster
68	// on a 64-bit Mac Mini, and it's nicer to read.
69	step func(*scanner, byte) int
70
71	// Reached end of top-level value.
72	endTop bool
73
74	// Stack of what we're in the middle of - array values, object keys, object values.
75	parseState []int
76
77	// Error that happened, if any.
78	err error
79
80	// total bytes consumed, updated by decoder.Decode (and deliberately
81	// not set to zero by scan.reset)
82	bytes int64
83}
84
85var scannerPool = sync.Pool{
86	New: func() interface{} {
87		return &scanner{}
88	},
89}
90
91func newScanner() *scanner {
92	scan := scannerPool.Get().(*scanner)
93	// scan.reset by design doesn't set bytes to zero
94	scan.bytes = 0
95	scan.reset()
96	return scan
97}
98
99func freeScanner(scan *scanner) {
100	// Avoid hanging on to too much memory in extreme cases.
101	if len(scan.parseState) > 1024 {
102		scan.parseState = nil
103	}
104	scannerPool.Put(scan)
105}
106
107// These values are returned by the state transition functions
108// assigned to scanner.state and the method scanner.eof.
109// They give details about the current state of the scan that
110// callers might be interested to know about.
111// It is okay to ignore the return value of any particular
112// call to scanner.state: if one call returns scanError,
113// every subsequent call will return scanError too.
114const (
115	// Continue.
116	scanContinue     = iota // uninteresting byte
117	scanBeginLiteral        // end implied by next result != scanContinue
118	scanBeginObject         // begin object
119	scanObjectKey           // just finished object key (string)
120	scanObjectValue         // just finished non-last object value
121	scanEndObject           // end object (implies scanObjectValue if possible)
122	scanBeginArray          // begin array
123	scanArrayValue          // just finished array value
124	scanEndArray            // end array (implies scanArrayValue if possible)
125	scanSkipSpace           // space byte; can skip; known to be last "continue" result
126
127	// Stop.
128	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
129	scanError // hit an error, scanner.err.
130)
131
132// These values are stored in the parseState stack.
133// They give the current state of a composite value
134// being scanned. If the parser is inside a nested value
135// the parseState describes the nested state, outermost at entry 0.
136const (
137	parseObjectKey   = iota // parsing object key (before colon)
138	parseObjectValue        // parsing object value (after colon)
139	parseArrayValue         // parsing array value
140)
141
142// This limits the max nesting depth to prevent stack overflow.
143// This is permitted by https://tools.ietf.org/html/rfc7159#section-9
144const maxNestingDepth = 10000
145
146// reset prepares the scanner for use.
147// It must be called before calling s.step.
148func (s *scanner) reset() {
149	s.step = stateBeginValue
150	s.parseState = s.parseState[0:0]
151	s.err = nil
152	s.endTop = false
153}
154
155// eof tells the scanner that the end of input has been reached.
156// It returns a scan status just as s.step does.
157func (s *scanner) eof() int {
158	if s.err != nil {
159		return scanError
160	}
161	if s.endTop {
162		return scanEnd
163	}
164	s.step(s, ' ')
165	if s.endTop {
166		return scanEnd
167	}
168	if s.err == nil {
169		s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
170	}
171	return scanError
172}
173
174// pushParseState pushes a new parse state p onto the parse stack.
175// an error state is returned if maxNestingDepth was exceeded, otherwise successState is returned.
176func (s *scanner) pushParseState(c byte, newParseState int, successState int) int {
177	s.parseState = append(s.parseState, newParseState)
178	if len(s.parseState) <= maxNestingDepth {
179		return successState
180	}
181	return s.error(c, "exceeded max depth")
182}
183
184// popParseState pops a parse state (already obtained) off the stack
185// and updates s.step accordingly.
186func (s *scanner) popParseState() {
187	n := len(s.parseState) - 1
188	s.parseState = s.parseState[0:n]
189	if n == 0 {
190		s.step = stateEndTop
191		s.endTop = true
192	} else {
193		s.step = stateEndValue
194	}
195}
196
197func isSpace(c byte) bool {
198	return c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
199}
200
201// stateBeginValueOrEmpty is the state after reading `[`.
202func stateBeginValueOrEmpty(s *scanner, c byte) int {
203	if isSpace(c) {
204		return scanSkipSpace
205	}
206	if c == ']' {
207		return stateEndValue(s, c)
208	}
209	return stateBeginValue(s, c)
210}
211
212// stateBeginValue is the state at the beginning of the input.
213func stateBeginValue(s *scanner, c byte) int {
214	if isSpace(c) {
215		return scanSkipSpace
216	}
217	switch c {
218	case '{':
219		s.step = stateBeginStringOrEmpty
220		return s.pushParseState(c, parseObjectKey, scanBeginObject)
221	case '[':
222		s.step = stateBeginValueOrEmpty
223		return s.pushParseState(c, parseArrayValue, scanBeginArray)
224	case '"':
225		s.step = stateInString
226		return scanBeginLiteral
227	case '-':
228		s.step = stateNeg
229		return scanBeginLiteral
230	case '0': // beginning of 0.123
231		s.step = state0
232		return scanBeginLiteral
233	case 't': // beginning of true
234		s.step = stateT
235		return scanBeginLiteral
236	case 'f': // beginning of false
237		s.step = stateF
238		return scanBeginLiteral
239	case 'n': // beginning of null
240		s.step = stateN
241		return scanBeginLiteral
242	}
243	if '1' <= c && c <= '9' { // beginning of 1234.5
244		s.step = state1
245		return scanBeginLiteral
246	}
247	return s.error(c, "looking for beginning of value")
248}
249
250// stateBeginStringOrEmpty is the state after reading `{`.
251func stateBeginStringOrEmpty(s *scanner, c byte) int {
252	if isSpace(c) {
253		return scanSkipSpace
254	}
255	if c == '}' {
256		n := len(s.parseState)
257		s.parseState[n-1] = parseObjectValue
258		return stateEndValue(s, c)
259	}
260	return stateBeginString(s, c)
261}
262
263// stateBeginString is the state after reading `{"key": value,`.
264func stateBeginString(s *scanner, c byte) int {
265	if isSpace(c) {
266		return scanSkipSpace
267	}
268	if c == '"' {
269		s.step = stateInString
270		return scanBeginLiteral
271	}
272	return s.error(c, "looking for beginning of object key string")
273}
274
275// stateEndValue is the state after completing a value,
276// such as after reading `{}` or `true` or `["x"`.
277func stateEndValue(s *scanner, c byte) int {
278	n := len(s.parseState)
279	if n == 0 {
280		// Completed top-level before the current byte.
281		s.step = stateEndTop
282		s.endTop = true
283		return stateEndTop(s, c)
284	}
285	if isSpace(c) {
286		s.step = stateEndValue
287		return scanSkipSpace
288	}
289	ps := s.parseState[n-1]
290	switch ps {
291	case parseObjectKey:
292		if c == ':' {
293			s.parseState[n-1] = parseObjectValue
294			s.step = stateBeginValue
295			return scanObjectKey
296		}
297		return s.error(c, "after object key")
298	case parseObjectValue:
299		if c == ',' {
300			s.parseState[n-1] = parseObjectKey
301			s.step = stateBeginString
302			return scanObjectValue
303		}
304		if c == '}' {
305			s.popParseState()
306			return scanEndObject
307		}
308		return s.error(c, "after object key:value pair")
309	case parseArrayValue:
310		if c == ',' {
311			s.step = stateBeginValue
312			return scanArrayValue
313		}
314		if c == ']' {
315			s.popParseState()
316			return scanEndArray
317		}
318		return s.error(c, "after array element")
319	}
320	return s.error(c, "")
321}
322
323// stateEndTop is the state after finishing the top-level value,
324// such as after reading `{}` or `[1,2,3]`.
325// Only space characters should be seen now.
326func stateEndTop(s *scanner, c byte) int {
327	if !isSpace(c) {
328		// Complain about non-space byte on next call.
329		s.error(c, "after top-level value")
330	}
331	return scanEnd
332}
333
334// stateInString is the state after reading `"`.
335func stateInString(s *scanner, c byte) int {
336	if c == '"' {
337		s.step = stateEndValue
338		return scanContinue
339	}
340	if c == '\\' {
341		s.step = stateInStringEsc
342		return scanContinue
343	}
344	if c < 0x20 {
345		return s.error(c, "in string literal")
346	}
347	return scanContinue
348}
349
350// stateInStringEsc is the state after reading `"\` during a quoted string.
351func stateInStringEsc(s *scanner, c byte) int {
352	switch c {
353	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
354		s.step = stateInString
355		return scanContinue
356	case 'u':
357		s.step = stateInStringEscU
358		return scanContinue
359	}
360	return s.error(c, "in string escape code")
361}
362
363// stateInStringEscU is the state after reading `"\u` during a quoted string.
364func stateInStringEscU(s *scanner, c byte) int {
365	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
366		s.step = stateInStringEscU1
367		return scanContinue
368	}
369	// numbers
370	return s.error(c, "in \\u hexadecimal character escape")
371}
372
373// stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
374func stateInStringEscU1(s *scanner, c byte) int {
375	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
376		s.step = stateInStringEscU12
377		return scanContinue
378	}
379	// numbers
380	return s.error(c, "in \\u hexadecimal character escape")
381}
382
383// stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
384func stateInStringEscU12(s *scanner, c byte) int {
385	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
386		s.step = stateInStringEscU123
387		return scanContinue
388	}
389	// numbers
390	return s.error(c, "in \\u hexadecimal character escape")
391}
392
393// stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
394func stateInStringEscU123(s *scanner, c byte) int {
395	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
396		s.step = stateInString
397		return scanContinue
398	}
399	// numbers
400	return s.error(c, "in \\u hexadecimal character escape")
401}
402
403// stateNeg is the state after reading `-` during a number.
404func stateNeg(s *scanner, c byte) int {
405	if c == '0' {
406		s.step = state0
407		return scanContinue
408	}
409	if '1' <= c && c <= '9' {
410		s.step = state1
411		return scanContinue
412	}
413	return s.error(c, "in numeric literal")
414}
415
416// state1 is the state after reading a non-zero integer during a number,
417// such as after reading `1` or `100` but not `0`.
418func state1(s *scanner, c byte) int {
419	if '0' <= c && c <= '9' {
420		s.step = state1
421		return scanContinue
422	}
423	return state0(s, c)
424}
425
426// state0 is the state after reading `0` during a number.
427func state0(s *scanner, c byte) int {
428	if c == '.' {
429		s.step = stateDot
430		return scanContinue
431	}
432	if c == 'e' || c == 'E' {
433		s.step = stateE
434		return scanContinue
435	}
436	return stateEndValue(s, c)
437}
438
439// stateDot is the state after reading the integer and decimal point in a number,
440// such as after reading `1.`.
441func stateDot(s *scanner, c byte) int {
442	if '0' <= c && c <= '9' {
443		s.step = stateDot0
444		return scanContinue
445	}
446	return s.error(c, "after decimal point in numeric literal")
447}
448
449// stateDot0 is the state after reading the integer, decimal point, and subsequent
450// digits of a number, such as after reading `3.14`.
451func stateDot0(s *scanner, c byte) int {
452	if '0' <= c && c <= '9' {
453		return scanContinue
454	}
455	if c == 'e' || c == 'E' {
456		s.step = stateE
457		return scanContinue
458	}
459	return stateEndValue(s, c)
460}
461
462// stateE is the state after reading the mantissa and e in a number,
463// such as after reading `314e` or `0.314e`.
464func stateE(s *scanner, c byte) int {
465	if c == '+' || c == '-' {
466		s.step = stateESign
467		return scanContinue
468	}
469	return stateESign(s, c)
470}
471
472// stateESign is the state after reading the mantissa, e, and sign in a number,
473// such as after reading `314e-` or `0.314e+`.
474func stateESign(s *scanner, c byte) int {
475	if '0' <= c && c <= '9' {
476		s.step = stateE0
477		return scanContinue
478	}
479	return s.error(c, "in exponent of numeric literal")
480}
481
482// stateE0 is the state after reading the mantissa, e, optional sign,
483// and at least one digit of the exponent in a number,
484// such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
485func stateE0(s *scanner, c byte) int {
486	if '0' <= c && c <= '9' {
487		return scanContinue
488	}
489	return stateEndValue(s, c)
490}
491
492// stateT is the state after reading `t`.
493func stateT(s *scanner, c byte) int {
494	if c == 'r' {
495		s.step = stateTr
496		return scanContinue
497	}
498	return s.error(c, "in literal true (expecting 'r')")
499}
500
501// stateTr is the state after reading `tr`.
502func stateTr(s *scanner, c byte) int {
503	if c == 'u' {
504		s.step = stateTru
505		return scanContinue
506	}
507	return s.error(c, "in literal true (expecting 'u')")
508}
509
510// stateTru is the state after reading `tru`.
511func stateTru(s *scanner, c byte) int {
512	if c == 'e' {
513		s.step = stateEndValue
514		return scanContinue
515	}
516	return s.error(c, "in literal true (expecting 'e')")
517}
518
519// stateF is the state after reading `f`.
520func stateF(s *scanner, c byte) int {
521	if c == 'a' {
522		s.step = stateFa
523		return scanContinue
524	}
525	return s.error(c, "in literal false (expecting 'a')")
526}
527
528// stateFa is the state after reading `fa`.
529func stateFa(s *scanner, c byte) int {
530	if c == 'l' {
531		s.step = stateFal
532		return scanContinue
533	}
534	return s.error(c, "in literal false (expecting 'l')")
535}
536
537// stateFal is the state after reading `fal`.
538func stateFal(s *scanner, c byte) int {
539	if c == 's' {
540		s.step = stateFals
541		return scanContinue
542	}
543	return s.error(c, "in literal false (expecting 's')")
544}
545
546// stateFals is the state after reading `fals`.
547func stateFals(s *scanner, c byte) int {
548	if c == 'e' {
549		s.step = stateEndValue
550		return scanContinue
551	}
552	return s.error(c, "in literal false (expecting 'e')")
553}
554
555// stateN is the state after reading `n`.
556func stateN(s *scanner, c byte) int {
557	if c == 'u' {
558		s.step = stateNu
559		return scanContinue
560	}
561	return s.error(c, "in literal null (expecting 'u')")
562}
563
564// stateNu is the state after reading `nu`.
565func stateNu(s *scanner, c byte) int {
566	if c == 'l' {
567		s.step = stateNul
568		return scanContinue
569	}
570	return s.error(c, "in literal null (expecting 'l')")
571}
572
573// stateNul is the state after reading `nul`.
574func stateNul(s *scanner, c byte) int {
575	if c == 'l' {
576		s.step = stateEndValue
577		return scanContinue
578	}
579	return s.error(c, "in literal null (expecting 'l')")
580}
581
582// stateError is the state after reaching a syntax error,
583// such as after reading `[1}` or `5.1.2`.
584func stateError(s *scanner, c byte) int {
585	return scanError
586}
587
588// error records an error and switches to the error state.
589func (s *scanner) error(c byte, context string) int {
590	s.step = stateError
591	s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
592	return scanError
593}
594
595// quoteChar formats c as a quoted character literal
596func quoteChar(c byte) string {
597	// special cases - different from quoted strings
598	if c == '\'' {
599		return `'\''`
600	}
601	if c == '"' {
602		return `'"'`
603	}
604
605	// use quoted string with different quotation marks
606	s := strconv.Quote(string(c))
607	return "'" + s[1:len(s)-1] + "'"
608}
609