1// Copyright (c) 2021, Peter Ohler, All rights reserved. 2 3package oj 4 5import ( 6 "bytes" 7 "encoding/json" 8 "fmt" 9 "io" 10 "math" 11 "unicode/utf8" 12 13 "github.com/ohler55/ojg/gen" 14) 15 16const ( 17 objectStart = '{' 18 arrayStart = '[' 19) 20 21// Tokenizer is used to tokenize a JSON document. 22type Tokenizer struct { 23 tracker 24 tmp []byte // used for numbers and strings 25 runeBytes []byte 26 starts []byte 27 handler TokenHandler 28 ri int // read index for null, false, and true 29 mi int 30 num gen.Number 31 rn rune 32 mode string 33 nextMode string 34} 35 36// TokenizeString the provided JSON and call the handler functions for each 37// token in the JSON. 38func TokenizeString(data string, handler TokenHandler) error { 39 t := Tokenizer{} 40 return t.Parse([]byte(data), handler) 41} 42 43// Tokenize the provided JSON and call the TokenHandler functions for each 44// token in the JSON. 45func Tokenize(data []byte, handler TokenHandler) error { 46 t := Tokenizer{} 47 return t.Parse(data, handler) 48} 49 50// TokenizeLoad JSON from a io.Reader and call the TokenHandler functions for 51// each token in the JSON. 52func TokenizeLoad(r io.Reader, handler TokenHandler) error { 53 t := Tokenizer{} 54 return t.Load(r, handler) 55} 56 57// Parse the JSON and call the handler functions for each token in the JSON. 58func (t *Tokenizer) Parse(buf []byte, handler TokenHandler) (err error) { 59 t.handler = handler 60 if t.starts == nil { 61 t.tmp = make([]byte, 0, tmpInitSize) 62 t.starts = make([]byte, 0, 16) 63 } else { 64 t.tmp = t.tmp[:0] 65 t.starts = t.starts[:0] 66 } 67 t.noff = -1 68 t.line = 1 69 t.mode = valueMap 70 t.mi = 0 71 // Skip BOM if present. 72 if 3 < len(buf) && buf[0] == 0xEF { 73 if buf[1] == 0xBB && buf[2] == 0xBF { 74 err = t.tokenizeBuffer(buf[3:], true) 75 } else { 76 err = fmt.Errorf("expected BOM at 1:3") 77 } 78 } else { 79 err = t.tokenizeBuffer(buf, true) 80 } 81 return 82} 83 84// Load aand parse the JSON and call the handler functions for each token in 85// the JSON. 86func (t *Tokenizer) Load(r io.Reader, handler TokenHandler) (err error) { 87 t.handler = handler 88 if t.starts == nil { 89 t.tmp = make([]byte, 0, tmpInitSize) 90 t.starts = make([]byte, 0, 16) 91 } else { 92 t.tmp = t.tmp[:0] 93 t.starts = t.starts[:0] 94 } 95 t.noff = -1 96 t.line = 1 97 t.mi = 0 98 buf := make([]byte, readBufSize) 99 eof := false 100 var cnt int 101 cnt, err = r.Read(buf) 102 buf = buf[:cnt] 103 t.mode = valueMap 104 if err != nil { 105 if err != io.EOF { 106 return 107 } 108 eof = true 109 } 110 var skip int 111 // Skip BOM if present. 112 if 3 < len(buf) && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF { 113 skip = 3 114 } 115 for { 116 if 0 < skip { 117 err = t.tokenizeBuffer(buf[skip:], eof) 118 } else { 119 err = t.tokenizeBuffer(buf, eof) 120 } 121 if err != nil { 122 return 123 } 124 skip = 0 125 if eof { 126 break 127 } 128 buf = buf[:cap(buf)] 129 cnt, err = r.Read(buf) 130 buf = buf[:cnt] 131 if err != nil { 132 if err != io.EOF { 133 return 134 } 135 eof = true 136 } 137 } 138 return 139} 140 141func (t *Tokenizer) tokenizeBuffer(buf []byte, last bool) error { 142 var b byte 143 var i int 144 var off int 145 depth := len(t.starts) 146 for off = 0; off < len(buf); off++ { 147 b = buf[off] 148 switch t.mode[b] { 149 case skipNewline: 150 t.line++ 151 t.noff = off 152 for i, b = range buf[off+1:] { 153 if spaceMap[b] != skipChar { 154 break 155 } 156 } 157 off += i 158 continue 159 case colonColon: 160 t.mode = valueMap 161 continue 162 case skipChar: // skip and continue 163 continue 164 case strOk: 165 t.tmp = append(t.tmp, b) 166 case keyQuote: 167 start := off + 1 168 if len(buf) <= start { 169 t.tmp = t.tmp[:0] 170 t.mode = stringMap 171 t.nextMode = colonMap 172 continue 173 } 174 for i, b = range buf[off+1:] { 175 if stringMap[b] != strOk { 176 break 177 } 178 } 179 off += i 180 if b == '"' { 181 off++ 182 t.handler.Key(string(buf[start:off])) 183 t.mode = colonMap 184 } else { 185 t.tmp = t.tmp[:0] 186 t.tmp = append(t.tmp, buf[start:off+1]...) 187 t.mode = stringMap 188 t.nextMode = colonMap 189 } 190 continue 191 case afterComma: 192 if 0 < len(t.starts) && t.starts[len(t.starts)-1] == '{' { 193 t.mode = keyMap 194 } else { 195 t.mode = commaMap 196 } 197 continue 198 case valQuote: 199 start := off + 1 200 if len(buf) <= start { 201 t.tmp = t.tmp[:0] 202 t.mode = stringMap 203 t.nextMode = afterMap 204 continue 205 } 206 for i, b = range buf[off+1:] { 207 if stringMap[b] != strOk { 208 break 209 } 210 } 211 off += i 212 if b == '"' { 213 off++ 214 t.handler.String(string(buf[start:off])) 215 t.mode = afterMap 216 } else { 217 t.tmp = t.tmp[:0] 218 t.tmp = append(t.tmp, buf[start:off+1]...) 219 t.mode = stringMap 220 t.nextMode = afterMap 221 continue 222 } 223 case numComma: 224 t.handleNum() 225 if 0 < len(t.starts) && t.starts[len(t.starts)-1] == '{' { 226 t.mode = keyMap 227 } else { 228 t.mode = commaMap 229 } 230 case strSlash: 231 t.mode = escMap 232 continue 233 case escOk: 234 t.tmp = append(t.tmp, escByteMap[b]) 235 t.mode = stringMap 236 continue 237 case openObject: 238 t.starts = append(t.starts, objectStart) 239 t.handler.ObjectStart() 240 t.mode = key1Map 241 depth++ 242 continue 243 case closeObject: 244 depth-- 245 if depth < 0 || t.starts[depth] != objectStart { 246 return t.newError(off, "unexpected object close") 247 } 248 if 256 < len(t.mode) && t.mode[256] == 'n' { 249 t.handleNum() 250 } 251 t.starts = t.starts[0:depth] 252 t.handler.ObjectEnd() 253 t.mode = afterMap 254 case val0: 255 t.mode = zeroMap 256 t.num.Reset() 257 case valDigit: 258 t.num.Reset() 259 t.mode = digitMap 260 t.num.I = uint64(b - '0') 261 for i, b = range buf[off+1:] { 262 if digitMap[b] != numDigit { 263 break 264 } 265 t.num.I = t.num.I*10 + uint64(b-'0') 266 if math.MaxInt64 < t.num.I { 267 t.num.FillBig() 268 break 269 } 270 } 271 if digitMap[b] == numDigit { 272 off++ 273 } 274 off += i 275 case valNeg: 276 t.mode = negMap 277 t.num.Reset() 278 t.num.Neg = true 279 continue 280 case escU: 281 t.mode = uMap 282 t.rn = 0 283 t.ri = 0 284 continue 285 case openArray: 286 t.starts = append(t.starts, arrayStart) 287 t.handler.ArrayStart() 288 t.mode = valueMap 289 depth++ 290 continue 291 case closeArray: 292 depth-- 293 if depth < 0 || t.starts[depth] != arrayStart { 294 return t.newError(off, "unexpected array close") 295 } 296 // Only modes with a close array are value, after, and numbers 297 // which are all over 256 long. 298 if t.mode[256] == 'n' { 299 t.handleNum() 300 } 301 t.starts = t.starts[:len(t.starts)-1] 302 t.handler.ArrayEnd() 303 t.mode = afterMap 304 case valNull: 305 if off+4 <= len(buf) && string(buf[off:off+4]) == "null" { 306 off += 3 307 t.mode = afterMap 308 t.handler.Null() 309 } else { 310 t.mode = nullMap 311 t.ri = 0 312 } 313 case valTrue: 314 if off+4 <= len(buf) && string(buf[off:off+4]) == "true" { 315 off += 3 316 t.mode = afterMap 317 t.handler.Bool(true) 318 } else { 319 t.mode = trueMap 320 t.ri = 0 321 } 322 case valFalse: 323 if off+5 <= len(buf) && string(buf[off:off+5]) == "false" { 324 off += 4 325 t.mode = afterMap 326 t.handler.Bool(false) 327 } else { 328 t.mode = falseMap 329 t.ri = 0 330 } 331 case numDot: 332 if 0 < len(t.num.BigBuf) { 333 t.num.BigBuf = append(t.num.BigBuf, b) 334 t.mode = dotMap 335 continue 336 } 337 for i, b = range buf[off+1:] { 338 if digitMap[b] != numDigit { 339 break 340 } 341 t.num.Frac = t.num.Frac*10 + uint64(b-'0') 342 t.num.Div *= 10.0 343 if math.MaxInt64 < t.num.Frac { 344 t.num.FillBig() 345 break 346 } 347 } 348 off += i 349 if digitMap[b] == numDigit { 350 off++ 351 } 352 t.mode = fracMap 353 case numFrac: 354 t.num.AddFrac(b) 355 t.mode = fracMap 356 case fracE: 357 if 0 < len(t.num.BigBuf) { 358 t.num.BigBuf = append(t.num.BigBuf, b) 359 } 360 t.mode = expSignMap 361 continue 362 case strQuote: 363 t.mode = t.nextMode 364 if t.nextMode == colonMap { 365 t.handler.Key(string(t.tmp)) 366 } else { 367 t.handler.String(string(t.tmp)) 368 } 369 case numZero: 370 t.mode = zeroMap 371 case numDigit: 372 t.num.AddDigit(b) 373 case negDigit: 374 t.num.AddDigit(b) 375 t.mode = digitMap 376 case numSpc: 377 t.handleNum() 378 t.mode = afterMap 379 case numNewline: 380 t.handleNum() 381 t.line++ 382 t.noff = off 383 t.mode = afterMap 384 for i, b = range buf[off+1:] { 385 if spaceMap[b] != skipChar { 386 break 387 } 388 } 389 off += i 390 case expSign: 391 t.mode = expZeroMap 392 if b == '-' { 393 t.num.NegExp = true 394 } 395 continue 396 case expDigit: 397 t.num.AddExp(b) 398 t.mode = expMap 399 case uOk: 400 t.ri++ 401 switch b { 402 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 403 t.rn = t.rn<<4 | rune(b-'0') 404 case 'a', 'b', 'c', 'd', 'e', 'f': 405 t.rn = t.rn<<4 | rune(b-'a'+10) 406 case 'A', 'B', 'C', 'D', 'E', 'F': 407 t.rn = t.rn<<4 | rune(b-'A'+10) 408 } 409 if t.ri == 4 { 410 if len(t.runeBytes) < 6 { 411 t.runeBytes = make([]byte, 6) 412 } 413 n := utf8.EncodeRune(t.runeBytes, t.rn) 414 t.tmp = append(t.tmp, t.runeBytes[:n]...) 415 t.mode = stringMap 416 } 417 continue 418 case tokenOk: 419 switch { 420 case t.mode['r'] == tokenOk: 421 t.ri++ 422 if "true"[t.ri] != b { 423 return t.newError(off, "expected true") 424 } 425 if 3 <= t.ri { 426 t.handler.Bool(true) 427 t.mode = afterMap 428 } 429 case t.mode['a'] == tokenOk: 430 t.ri++ 431 if "false"[t.ri] != b { 432 return t.newError(off, "expected false") 433 } 434 if 4 <= t.ri { 435 t.handler.Bool(false) 436 t.mode = afterMap 437 } 438 case t.mode['u'] == tokenOk && t.mode['l'] == tokenOk: 439 t.ri++ 440 if "null"[t.ri] != b { 441 return t.newError(off, "expected null") 442 } 443 if 3 <= t.ri { 444 t.handler.Null() 445 t.mode = afterMap 446 } 447 } 448 case charErr: 449 return t.byteError(off, t.mode, b, bytes.Runes(buf[off:])[0]) 450 } 451 if depth == 0 && 256 < len(t.mode) && t.mode[256] == 'a' { 452 t.mi = 0 453 if t.OnlyOne { 454 t.mode = spaceMap 455 } else { 456 t.mode = valueMap 457 } 458 } 459 } 460 if last { 461 if len(t.mode) == 256 { // valid finishing maps are one byte longer 462 return t.newError(off, "incomplete JSON") 463 } 464 if t.mode[256] == 'n' { 465 t.handleNum() 466 } 467 } 468 return nil 469} 470 471func (t *Tokenizer) handleNum() { 472 switch tn := t.num.AsNum().(type) { 473 case int64: 474 t.handler.Int(tn) 475 case float64: 476 t.handler.Float(tn) 477 case json.Number: 478 t.handler.Number(string(tn)) 479 } 480} 481