// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE

package toml

import (
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

// dateRegexp matches an RFC 3339-style datetime at the start of a
// string; compiled once in init().
var dateRegexp *regexp.Regexp

// Define state functions.
// A state function lexes one construct and returns the state function
// to run next, or nil to stop the lexer.
type tomlLexStateFn func() tomlLexStateFn

// Define lexer.
type tomlLexer struct {
	inputIdx          int
	input             []rune // Textual source
	currentTokenStart int    // rune offset where the token being built starts
	currentTokenStop  int    // one past the last rune of the token being built
	tokens            []token
	depth             int // bracket nesting depth while lexing an rvalue
	line              int // position of the current token start (1-based)
	col               int
	endbufferLine     int // position of the read cursor (1-based)
	endbufferCol      int
}

// Basic read operations on input

// read consumes one rune and updates the read-cursor line/column
// bookkeeping.
func (l *tomlLexer) read() rune {
	r := l.peek()
	if r == '\n' {
		l.endbufferLine++
		l.endbufferCol = 1
	} else {
		l.endbufferCol++
	}
	l.inputIdx++
	return r
}

// next consumes one rune and extends the current token over it.
func (l *tomlLexer) next() rune {
	r := l.read()

	if r != eof {
		l.currentTokenStop++
	}
	return r
}

// ignore discards the runes accumulated so far and syncs the token
// start position with the read cursor.
func (l *tomlLexer) ignore() {
	l.currentTokenStart = l.currentTokenStop
	l.line = l.endbufferLine
	l.col = l.endbufferCol
}

// skip consumes one rune without adding it to the current token.
func (l *tomlLexer) skip() {
	l.next()
	l.ignore()
}

// fastForward consumes the next n runes into the current token.
func (l *tomlLexer) fastForward(n int) {
	for i := 0; i < n; i++ {
		l.next()
	}
}

// emitWithValue appends a token of type t carrying the given value,
// then resets the current token.
func (l *tomlLexer) emitWithValue(t tokenType, value string) {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      value,
	})
	l.ignore()
}

// emit appends a token of type t whose value is the raw text
// accumulated since the last ignore/emit.
func (l *tomlLexer) emit(t tokenType) {
	l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
}

// peek returns the next rune without consuming it, or eof at end of
// input.
func (l *tomlLexer) peek() rune {
	if l.inputIdx >= len(l.input) {
		return eof
	}
	return l.input[l.inputIdx]
}

// peekString returns up to size upcoming runes as a string, without
// consuming them (shorter near end of input).
func (l *tomlLexer) peekString(size int) string {
	maxIdx := len(l.input)
	upperIdx := l.inputIdx + size // FIXME: potential overflow
	if upperIdx > maxIdx {
		upperIdx = maxIdx
	}
	return string(l.input[l.inputIdx:upperIdx])
}

// follow reports whether the upcoming input starts with next.
func (l *tomlLexer) follow(next string) bool {
	return next == l.peekString(len(next))
}

// Error management

// errorf emits a tokenError carrying the formatted message and stops
// the lexer by returning a nil state.
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	})
	return nil
}

// State functions

// lexVoid is the top-level state, between table headers and key/value
// pairs.
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}

		// Note: deliberately no continue after skipping a space — a
		// space presumably matches none of the checks below, so the
		// loop simply comes around again with a fresh peek.
		if isSpace(next) {
			l.skip()
		}

		if l.depth > 0 {
			return l.lexRvalue
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		if next == eof {
			l.next()
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}

// lexRvalue lexes the right-hand side of an assignment, including the
// interior of arrays and inline tables.
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			l.depth++
			return l.lexLeftBracket
		case ']':
			l.depth--
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			// A newline at depth 0 ends the rvalue; inside brackets it
			// is just whitespace.
			if l.depth == 0 {
				return l.lexVoid
			}
			return l.lexRvalue
		case '_':
			return l.errorf("cannot start number with underscore")
		}

		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if l.follow("inf") {
			return l.lexInf
		}

		if l.follow("nan") {
			return l.lexNan
		}

		if isSpace(next) {
			l.skip()
			continue
		}

		if next == eof {
			l.next()
			break
		}

		// Try a datetime before a plain number: both start with
		// digits. 35 runes is enough lookahead for the longest form
		// the regexp accepts.
		possibleDate := l.peekString(35)
		dateMatch := dateRegexp.FindString(possibleDate)
		if dateMatch != "" {
			l.fastForward(len(dateMatch))
			return l.lexDate
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		if isAlphanumeric(next) {
			return l.lexKey
		}

		return l.errorf("no value can start with %c", next)
	}

	l.emit(tokenEOF)
	return nil
}

// lexLeftCurlyBrace emits a '{' token (inline table start).
func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftCurlyBrace)
	return l.lexRvalue
}

// lexRightCurlyBrace emits a '}' token (inline table end).
func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenRightCurlyBrace)
	return l.lexRvalue
}

// lexDate emits the datetime runes already consumed by lexRvalue's
// fastForward over the regexp match.
func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}

// lexTrue consumes the literal "true".
func (l *tomlLexer) lexTrue() tomlLexStateFn {
	l.fastForward(4)
	l.emit(tokenTrue)
	return l.lexRvalue
}

// lexFalse consumes the literal "false".
func (l *tomlLexer) lexFalse() tomlLexStateFn {
	l.fastForward(5)
	l.emit(tokenFalse)
	return l.lexRvalue
}

// lexInf consumes the literal "inf".
func (l *tomlLexer) lexInf() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenInf)
	return l.lexRvalue
}

// lexNan consumes the literal "nan".
func (l *tomlLexer) lexNan() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenNan)
	return l.lexRvalue
}

// lexEqual emits an '=' token.
func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}

// lexComma emits a ',' token.
func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.next()
	l.emit(tokenComma)
	return l.lexRvalue
}

// Parse the key and emits its value without escape sequences.
// bare keys, basic string keys and literal string keys are supported.
302func (l *tomlLexer) lexKey() tomlLexStateFn { 303 growingString := "" 304 305 for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { 306 if r == '"' { 307 l.next() 308 str, err := l.lexStringAsString(`"`, false, true) 309 if err != nil { 310 return l.errorf(err.Error()) 311 } 312 growingString += str 313 l.next() 314 continue 315 } else if r == '\'' { 316 l.next() 317 str, err := l.lexLiteralStringAsString(`'`, false) 318 if err != nil { 319 return l.errorf(err.Error()) 320 } 321 growingString += str 322 l.next() 323 continue 324 } else if r == '\n' { 325 return l.errorf("keys cannot contain new lines") 326 } else if isSpace(r) { 327 break 328 } else if !isValidBareChar(r) { 329 return l.errorf("keys cannot contain %c character", r) 330 } 331 growingString += string(r) 332 l.next() 333 } 334 l.emitWithValue(tokenKey, growingString) 335 return l.lexVoid 336} 337 338func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { 339 return func() tomlLexStateFn { 340 for next := l.peek(); next != '\n' && next != eof; next = l.peek() { 341 if next == '\r' && l.follow("\r\n") { 342 break 343 } 344 l.next() 345 } 346 l.ignore() 347 return previousState 348 } 349} 350 351func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { 352 l.next() 353 l.emit(tokenLeftBracket) 354 return l.lexRvalue 355} 356 357func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { 358 growingString := "" 359 360 if discardLeadingNewLine { 361 if l.follow("\r\n") { 362 l.skip() 363 l.skip() 364 } else if l.peek() == '\n' { 365 l.skip() 366 } 367 } 368 369 // find end of string 370 for { 371 if l.follow(terminator) { 372 return growingString, nil 373 } 374 375 next := l.peek() 376 if next == eof { 377 break 378 } 379 growingString += string(l.next()) 380 } 381 382 return "", errors.New("unclosed string") 383} 384 385func (l *tomlLexer) lexLiteralString() tomlLexStateFn { 386 l.skip() 387 388 // handle special 
case for triple-quote 389 terminator := "'" 390 discardLeadingNewLine := false 391 if l.follow("''") { 392 l.skip() 393 l.skip() 394 terminator = "'''" 395 discardLeadingNewLine = true 396 } 397 398 str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) 399 if err != nil { 400 return l.errorf(err.Error()) 401 } 402 403 l.emitWithValue(tokenString, str) 404 l.fastForward(len(terminator)) 405 l.ignore() 406 return l.lexRvalue 407} 408 409// Lex a string and return the results as a string. 410// Terminator is the substring indicating the end of the token. 411// The resulting string does not include the terminator. 412func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { 413 growingString := "" 414 415 if discardLeadingNewLine { 416 if l.follow("\r\n") { 417 l.skip() 418 l.skip() 419 } else if l.peek() == '\n' { 420 l.skip() 421 } 422 } 423 424 for { 425 if l.follow(terminator) { 426 return growingString, nil 427 } 428 429 if l.follow("\\") { 430 l.next() 431 switch l.peek() { 432 case '\r': 433 fallthrough 434 case '\n': 435 fallthrough 436 case '\t': 437 fallthrough 438 case ' ': 439 // skip all whitespace chars following backslash 440 for strings.ContainsRune("\r\n\t ", l.peek()) { 441 l.next() 442 } 443 case '"': 444 growingString += "\"" 445 l.next() 446 case 'n': 447 growingString += "\n" 448 l.next() 449 case 'b': 450 growingString += "\b" 451 l.next() 452 case 'f': 453 growingString += "\f" 454 l.next() 455 case '/': 456 growingString += "/" 457 l.next() 458 case 't': 459 growingString += "\t" 460 l.next() 461 case 'r': 462 growingString += "\r" 463 l.next() 464 case '\\': 465 growingString += "\\" 466 l.next() 467 case 'u': 468 l.next() 469 code := "" 470 for i := 0; i < 4; i++ { 471 c := l.peek() 472 if !isHexDigit(c) { 473 return "", errors.New("unfinished unicode escape") 474 } 475 l.next() 476 code = code + string(c) 477 } 478 intcode, err := strconv.ParseInt(code, 16, 32) 
479 if err != nil { 480 return "", errors.New("invalid unicode escape: \\u" + code) 481 } 482 growingString += string(rune(intcode)) 483 case 'U': 484 l.next() 485 code := "" 486 for i := 0; i < 8; i++ { 487 c := l.peek() 488 if !isHexDigit(c) { 489 return "", errors.New("unfinished unicode escape") 490 } 491 l.next() 492 code = code + string(c) 493 } 494 intcode, err := strconv.ParseInt(code, 16, 64) 495 if err != nil { 496 return "", errors.New("invalid unicode escape: \\U" + code) 497 } 498 growingString += string(rune(intcode)) 499 default: 500 return "", errors.New("invalid escape sequence: \\" + string(l.peek())) 501 } 502 } else { 503 r := l.peek() 504 505 if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) { 506 return "", fmt.Errorf("unescaped control character %U", r) 507 } 508 l.next() 509 growingString += string(r) 510 } 511 512 if l.peek() == eof { 513 break 514 } 515 } 516 517 return "", errors.New("unclosed string") 518} 519 520func (l *tomlLexer) lexString() tomlLexStateFn { 521 l.skip() 522 523 // handle special case for triple-quote 524 terminator := `"` 525 discardLeadingNewLine := false 526 acceptNewLines := false 527 if l.follow(`""`) { 528 l.skip() 529 l.skip() 530 terminator = `"""` 531 discardLeadingNewLine = true 532 acceptNewLines = true 533 } 534 535 str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) 536 537 if err != nil { 538 return l.errorf(err.Error()) 539 } 540 541 l.emitWithValue(tokenString, str) 542 l.fastForward(len(terminator)) 543 l.ignore() 544 return l.lexRvalue 545} 546 547func (l *tomlLexer) lexTableKey() tomlLexStateFn { 548 l.next() 549 550 if l.peek() == '[' { 551 // token '[[' signifies an array of tables 552 l.next() 553 l.emit(tokenDoubleLeftBracket) 554 return l.lexInsideTableArrayKey 555 } 556 // vanilla table key 557 l.emit(tokenLeftBracket) 558 return l.lexInsideTableKey 559} 560 561// Parse the key till "]]", but only bare keys are supported 562func (l 
*tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { 563 for r := l.peek(); r != eof; r = l.peek() { 564 switch r { 565 case ']': 566 if l.currentTokenStop > l.currentTokenStart { 567 l.emit(tokenKeyGroupArray) 568 } 569 l.next() 570 if l.peek() != ']' { 571 break 572 } 573 l.next() 574 l.emit(tokenDoubleRightBracket) 575 return l.lexVoid 576 case '[': 577 return l.errorf("table array key cannot contain ']'") 578 default: 579 l.next() 580 } 581 } 582 return l.errorf("unclosed table array key") 583} 584 585// Parse the key till "]" but only bare keys are supported 586func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { 587 for r := l.peek(); r != eof; r = l.peek() { 588 switch r { 589 case ']': 590 if l.currentTokenStop > l.currentTokenStart { 591 l.emit(tokenKeyGroup) 592 } 593 l.next() 594 l.emit(tokenRightBracket) 595 return l.lexVoid 596 case '[': 597 return l.errorf("table key cannot contain ']'") 598 default: 599 l.next() 600 } 601 } 602 return l.errorf("unclosed table key") 603} 604 605func (l *tomlLexer) lexRightBracket() tomlLexStateFn { 606 l.next() 607 l.emit(tokenRightBracket) 608 return l.lexRvalue 609} 610 611type validRuneFn func(r rune) bool 612 613func isValidHexRune(r rune) bool { 614 return r >= 'a' && r <= 'f' || 615 r >= 'A' && r <= 'F' || 616 r >= '0' && r <= '9' || 617 r == '_' 618} 619 620func isValidOctalRune(r rune) bool { 621 return r >= '0' && r <= '7' || r == '_' 622} 623 624func isValidBinaryRune(r rune) bool { 625 return r == '0' || r == '1' || r == '_' 626} 627 628func (l *tomlLexer) lexNumber() tomlLexStateFn { 629 r := l.peek() 630 631 if r == '0' { 632 follow := l.peekString(2) 633 if len(follow) == 2 { 634 var isValidRune validRuneFn 635 switch follow[1] { 636 case 'x': 637 isValidRune = isValidHexRune 638 case 'o': 639 isValidRune = isValidOctalRune 640 case 'b': 641 isValidRune = isValidBinaryRune 642 default: 643 if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' { 644 return l.errorf("unknown 
number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1])) 645 } 646 } 647 648 if isValidRune != nil { 649 l.next() 650 l.next() 651 digitSeen := false 652 for { 653 next := l.peek() 654 if !isValidRune(next) { 655 break 656 } 657 digitSeen = true 658 l.next() 659 } 660 661 if !digitSeen { 662 return l.errorf("number needs at least one digit") 663 } 664 665 l.emit(tokenInteger) 666 667 return l.lexRvalue 668 } 669 } 670 } 671 672 if r == '+' || r == '-' { 673 l.next() 674 if l.follow("inf") { 675 return l.lexInf 676 } 677 if l.follow("nan") { 678 return l.lexNan 679 } 680 } 681 682 pointSeen := false 683 expSeen := false 684 digitSeen := false 685 for { 686 next := l.peek() 687 if next == '.' { 688 if pointSeen { 689 return l.errorf("cannot have two dots in one float") 690 } 691 l.next() 692 if !isDigit(l.peek()) { 693 return l.errorf("float cannot end with a dot") 694 } 695 pointSeen = true 696 } else if next == 'e' || next == 'E' { 697 expSeen = true 698 l.next() 699 r := l.peek() 700 if r == '+' || r == '-' { 701 l.next() 702 } 703 } else if isDigit(next) { 704 digitSeen = true 705 l.next() 706 } else if next == '_' { 707 l.next() 708 } else { 709 break 710 } 711 if pointSeen && !digitSeen { 712 return l.errorf("cannot start float with a dot") 713 } 714 } 715 716 if !digitSeen { 717 return l.errorf("no digit in that number") 718 } 719 if pointSeen || expSeen { 720 l.emit(tokenFloat) 721 } else { 722 l.emit(tokenInteger) 723 } 724 return l.lexRvalue 725} 726 727func (l *tomlLexer) run() { 728 for state := l.lexVoid; state != nil; { 729 state = state() 730 } 731} 732 733func init() { 734 dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`) 735} 736 737// Entry point 738func lexToml(inputBytes []byte) []token { 739 runes := bytes.Runes(inputBytes) 740 l := &tomlLexer{ 741 input: runes, 742 tokens: make([]token, 0, 256), 743 line: 1, 744 col: 1, 745 endbufferLine: 1, 746 endbufferCol: 
1, 747 } 748 l.run() 749 return l.tokens 750} 751