1// Copyright (c) 2021, Peter Ohler, All rights reserved.
2
3package oj
4
5import (
6	"bytes"
7	"encoding/json"
8	"fmt"
9	"io"
10	"math"
11	"unicode/utf8"
12
13	"github.com/ohler55/ojg/gen"
14)
15
// Markers pushed onto the Tokenizer starts stack to remember which kind of
// container is currently open, so a closing byte can be matched against it.
const (
	// arrayStart marks an open JSON array.
	arrayStart = '['
	// objectStart marks an open JSON object.
	objectStart = '{'
)
20
// Tokenizer is used to tokenize a JSON document. It walks the input byte by
// byte with a table-driven state machine and reports each token to a
// TokenHandler. All parsing state lives in the struct so a document can be
// fed in several chunks (see Load) and so a Tokenizer can be reused for
// multiple documents without reallocating its scratch buffers.
type Tokenizer struct {
	// NOTE(review): noff, line, OnlyOne, newError and byteError used by the
	// methods in this file are not declared here, so they are presumably
	// promoted from the embedded tracker - confirm against its definition.
	tracker
	// tmp collects the bytes of a string or key that cannot be sliced
	// directly out of the input (it contains escapes or spans chunks).
	tmp       []byte
	// runeBytes is scratch space for UTF-8 encoding a \u escaped rune.
	runeBytes []byte
	// starts is a stack of objectStart/arrayStart markers, one per open
	// container.
	starts    []byte
	// handler receives the callbacks for each token.
	handler   TokenHandler
	// ri is the read index while matching null, false, and true one byte at
	// a time; it also counts the hex digits consumed of a \u escape.
	ri        int
	// mi is only ever reset to zero in the visible code; its purpose is not
	// evident from this file - TODO confirm.
	mi        int
	// num accumulates the number currently being read.
	num       gen.Number
	// rn is the rune being assembled from the hex digits of a \u escape.
	rn        rune
	// mode is the current state: a lookup table indexed by the input byte
	// that yields the action to take for that byte.
	mode      string
	// nextMode is the mode to switch to once the closing quote of a string
	// collected in tmp is reached (colonMap after a key, afterMap after a
	// value).
	nextMode  string
}
35
36// TokenizeString the provided JSON and call the handler functions for each
37// token in the JSON.
38func TokenizeString(data string, handler TokenHandler) error {
39	t := Tokenizer{}
40	return t.Parse([]byte(data), handler)
41}
42
43// Tokenize the provided JSON and call the TokenHandler functions for each
44// token in the JSON.
45func Tokenize(data []byte, handler TokenHandler) error {
46	t := Tokenizer{}
47	return t.Parse(data, handler)
48}
49
50// TokenizeLoad JSON from a io.Reader and call the TokenHandler functions for
51// each token in the JSON.
52func TokenizeLoad(r io.Reader, handler TokenHandler) error {
53	t := Tokenizer{}
54	return t.Load(r, handler)
55}
56
57// Parse the JSON and call the handler functions for each token in the JSON.
58func (t *Tokenizer) Parse(buf []byte, handler TokenHandler) (err error) {
59	t.handler = handler
60	if t.starts == nil {
61		t.tmp = make([]byte, 0, tmpInitSize)
62		t.starts = make([]byte, 0, 16)
63	} else {
64		t.tmp = t.tmp[:0]
65		t.starts = t.starts[:0]
66	}
67	t.noff = -1
68	t.line = 1
69	t.mode = valueMap
70	t.mi = 0
71	// Skip BOM if present.
72	if 3 < len(buf) && buf[0] == 0xEF {
73		if buf[1] == 0xBB && buf[2] == 0xBF {
74			err = t.tokenizeBuffer(buf[3:], true)
75		} else {
76			err = fmt.Errorf("expected BOM at 1:3")
77		}
78	} else {
79		err = t.tokenizeBuffer(buf, true)
80	}
81	return
82}
83
84// Load aand parse the JSON and call the handler functions for each token in
85// the JSON.
86func (t *Tokenizer) Load(r io.Reader, handler TokenHandler) (err error) {
87	t.handler = handler
88	if t.starts == nil {
89		t.tmp = make([]byte, 0, tmpInitSize)
90		t.starts = make([]byte, 0, 16)
91	} else {
92		t.tmp = t.tmp[:0]
93		t.starts = t.starts[:0]
94	}
95	t.noff = -1
96	t.line = 1
97	t.mi = 0
98	buf := make([]byte, readBufSize)
99	eof := false
100	var cnt int
101	cnt, err = r.Read(buf)
102	buf = buf[:cnt]
103	t.mode = valueMap
104	if err != nil {
105		if err != io.EOF {
106			return
107		}
108		eof = true
109	}
110	var skip int
111	// Skip BOM if present.
112	if 3 < len(buf) && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF {
113		skip = 3
114	}
115	for {
116		if 0 < skip {
117			err = t.tokenizeBuffer(buf[skip:], eof)
118		} else {
119			err = t.tokenizeBuffer(buf, eof)
120		}
121		if err != nil {
122			return
123		}
124		skip = 0
125		if eof {
126			break
127		}
128		buf = buf[:cap(buf)]
129		cnt, err = r.Read(buf)
130		buf = buf[:cnt]
131		if err != nil {
132			if err != io.EOF {
133				return
134			}
135			eof = true
136		}
137	}
138	return
139}
140
141func (t *Tokenizer) tokenizeBuffer(buf []byte, last bool) error {
142	var b byte
143	var i int
144	var off int
145	depth := len(t.starts)
146	for off = 0; off < len(buf); off++ {
147		b = buf[off]
148		switch t.mode[b] {
149		case skipNewline:
150			t.line++
151			t.noff = off
152			for i, b = range buf[off+1:] {
153				if spaceMap[b] != skipChar {
154					break
155				}
156			}
157			off += i
158			continue
159		case colonColon:
160			t.mode = valueMap
161			continue
162		case skipChar: // skip and continue
163			continue
164		case strOk:
165			t.tmp = append(t.tmp, b)
166		case keyQuote:
167			start := off + 1
168			if len(buf) <= start {
169				t.tmp = t.tmp[:0]
170				t.mode = stringMap
171				t.nextMode = colonMap
172				continue
173			}
174			for i, b = range buf[off+1:] {
175				if stringMap[b] != strOk {
176					break
177				}
178			}
179			off += i
180			if b == '"' {
181				off++
182				t.handler.Key(string(buf[start:off]))
183				t.mode = colonMap
184			} else {
185				t.tmp = t.tmp[:0]
186				t.tmp = append(t.tmp, buf[start:off+1]...)
187				t.mode = stringMap
188				t.nextMode = colonMap
189			}
190			continue
191		case afterComma:
192			if 0 < len(t.starts) && t.starts[len(t.starts)-1] == '{' {
193				t.mode = keyMap
194			} else {
195				t.mode = commaMap
196			}
197			continue
198		case valQuote:
199			start := off + 1
200			if len(buf) <= start {
201				t.tmp = t.tmp[:0]
202				t.mode = stringMap
203				t.nextMode = afterMap
204				continue
205			}
206			for i, b = range buf[off+1:] {
207				if stringMap[b] != strOk {
208					break
209				}
210			}
211			off += i
212			if b == '"' {
213				off++
214				t.handler.String(string(buf[start:off]))
215				t.mode = afterMap
216			} else {
217				t.tmp = t.tmp[:0]
218				t.tmp = append(t.tmp, buf[start:off+1]...)
219				t.mode = stringMap
220				t.nextMode = afterMap
221				continue
222			}
223		case numComma:
224			t.handleNum()
225			if 0 < len(t.starts) && t.starts[len(t.starts)-1] == '{' {
226				t.mode = keyMap
227			} else {
228				t.mode = commaMap
229			}
230		case strSlash:
231			t.mode = escMap
232			continue
233		case escOk:
234			t.tmp = append(t.tmp, escByteMap[b])
235			t.mode = stringMap
236			continue
237		case openObject:
238			t.starts = append(t.starts, objectStart)
239			t.handler.ObjectStart()
240			t.mode = key1Map
241			depth++
242			continue
243		case closeObject:
244			depth--
245			if depth < 0 || t.starts[depth] != objectStart {
246				return t.newError(off, "unexpected object close")
247			}
248			if 256 < len(t.mode) && t.mode[256] == 'n' {
249				t.handleNum()
250			}
251			t.starts = t.starts[0:depth]
252			t.handler.ObjectEnd()
253			t.mode = afterMap
254		case val0:
255			t.mode = zeroMap
256			t.num.Reset()
257		case valDigit:
258			t.num.Reset()
259			t.mode = digitMap
260			t.num.I = uint64(b - '0')
261			for i, b = range buf[off+1:] {
262				if digitMap[b] != numDigit {
263					break
264				}
265				t.num.I = t.num.I*10 + uint64(b-'0')
266				if math.MaxInt64 < t.num.I {
267					t.num.FillBig()
268					break
269				}
270			}
271			if digitMap[b] == numDigit {
272				off++
273			}
274			off += i
275		case valNeg:
276			t.mode = negMap
277			t.num.Reset()
278			t.num.Neg = true
279			continue
280		case escU:
281			t.mode = uMap
282			t.rn = 0
283			t.ri = 0
284			continue
285		case openArray:
286			t.starts = append(t.starts, arrayStart)
287			t.handler.ArrayStart()
288			t.mode = valueMap
289			depth++
290			continue
291		case closeArray:
292			depth--
293			if depth < 0 || t.starts[depth] != arrayStart {
294				return t.newError(off, "unexpected array close")
295			}
296			// Only modes with a close array are value, after, and numbers
297			// which are all over 256 long.
298			if t.mode[256] == 'n' {
299				t.handleNum()
300			}
301			t.starts = t.starts[:len(t.starts)-1]
302			t.handler.ArrayEnd()
303			t.mode = afterMap
304		case valNull:
305			if off+4 <= len(buf) && string(buf[off:off+4]) == "null" {
306				off += 3
307				t.mode = afterMap
308				t.handler.Null()
309			} else {
310				t.mode = nullMap
311				t.ri = 0
312			}
313		case valTrue:
314			if off+4 <= len(buf) && string(buf[off:off+4]) == "true" {
315				off += 3
316				t.mode = afterMap
317				t.handler.Bool(true)
318			} else {
319				t.mode = trueMap
320				t.ri = 0
321			}
322		case valFalse:
323			if off+5 <= len(buf) && string(buf[off:off+5]) == "false" {
324				off += 4
325				t.mode = afterMap
326				t.handler.Bool(false)
327			} else {
328				t.mode = falseMap
329				t.ri = 0
330			}
331		case numDot:
332			if 0 < len(t.num.BigBuf) {
333				t.num.BigBuf = append(t.num.BigBuf, b)
334				t.mode = dotMap
335				continue
336			}
337			for i, b = range buf[off+1:] {
338				if digitMap[b] != numDigit {
339					break
340				}
341				t.num.Frac = t.num.Frac*10 + uint64(b-'0')
342				t.num.Div *= 10.0
343				if math.MaxInt64 < t.num.Frac {
344					t.num.FillBig()
345					break
346				}
347			}
348			off += i
349			if digitMap[b] == numDigit {
350				off++
351			}
352			t.mode = fracMap
353		case numFrac:
354			t.num.AddFrac(b)
355			t.mode = fracMap
356		case fracE:
357			if 0 < len(t.num.BigBuf) {
358				t.num.BigBuf = append(t.num.BigBuf, b)
359			}
360			t.mode = expSignMap
361			continue
362		case strQuote:
363			t.mode = t.nextMode
364			if t.nextMode == colonMap {
365				t.handler.Key(string(t.tmp))
366			} else {
367				t.handler.String(string(t.tmp))
368			}
369		case numZero:
370			t.mode = zeroMap
371		case numDigit:
372			t.num.AddDigit(b)
373		case negDigit:
374			t.num.AddDigit(b)
375			t.mode = digitMap
376		case numSpc:
377			t.handleNum()
378			t.mode = afterMap
379		case numNewline:
380			t.handleNum()
381			t.line++
382			t.noff = off
383			t.mode = afterMap
384			for i, b = range buf[off+1:] {
385				if spaceMap[b] != skipChar {
386					break
387				}
388			}
389			off += i
390		case expSign:
391			t.mode = expZeroMap
392			if b == '-' {
393				t.num.NegExp = true
394			}
395			continue
396		case expDigit:
397			t.num.AddExp(b)
398			t.mode = expMap
399		case uOk:
400			t.ri++
401			switch b {
402			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
403				t.rn = t.rn<<4 | rune(b-'0')
404			case 'a', 'b', 'c', 'd', 'e', 'f':
405				t.rn = t.rn<<4 | rune(b-'a'+10)
406			case 'A', 'B', 'C', 'D', 'E', 'F':
407				t.rn = t.rn<<4 | rune(b-'A'+10)
408			}
409			if t.ri == 4 {
410				if len(t.runeBytes) < 6 {
411					t.runeBytes = make([]byte, 6)
412				}
413				n := utf8.EncodeRune(t.runeBytes, t.rn)
414				t.tmp = append(t.tmp, t.runeBytes[:n]...)
415				t.mode = stringMap
416			}
417			continue
418		case tokenOk:
419			switch {
420			case t.mode['r'] == tokenOk:
421				t.ri++
422				if "true"[t.ri] != b {
423					return t.newError(off, "expected true")
424				}
425				if 3 <= t.ri {
426					t.handler.Bool(true)
427					t.mode = afterMap
428				}
429			case t.mode['a'] == tokenOk:
430				t.ri++
431				if "false"[t.ri] != b {
432					return t.newError(off, "expected false")
433				}
434				if 4 <= t.ri {
435					t.handler.Bool(false)
436					t.mode = afterMap
437				}
438			case t.mode['u'] == tokenOk && t.mode['l'] == tokenOk:
439				t.ri++
440				if "null"[t.ri] != b {
441					return t.newError(off, "expected null")
442				}
443				if 3 <= t.ri {
444					t.handler.Null()
445					t.mode = afterMap
446				}
447			}
448		case charErr:
449			return t.byteError(off, t.mode, b, bytes.Runes(buf[off:])[0])
450		}
451		if depth == 0 && 256 < len(t.mode) && t.mode[256] == 'a' {
452			t.mi = 0
453			if t.OnlyOne {
454				t.mode = spaceMap
455			} else {
456				t.mode = valueMap
457			}
458		}
459	}
460	if last {
461		if len(t.mode) == 256 { // valid finishing maps are one byte longer
462			return t.newError(off, "incomplete JSON")
463		}
464		if t.mode[256] == 'n' {
465			t.handleNum()
466		}
467	}
468	return nil
469}
470
471func (t *Tokenizer) handleNum() {
472	switch tn := t.num.AsNum().(type) {
473	case int64:
474		t.handler.Int(tn)
475	case float64:
476		t.handler.Float(tn)
477	case json.Number:
478		t.handler.Number(string(tn))
479	}
480}
481