1// TOML lexer. 2// 3// Written using the principles developed by Rob Pike in 4// http://www.youtube.com/watch?v=HxaD_trXwRE 5 6package toml 7 8import ( 9 "bytes" 10 "errors" 11 "fmt" 12 "regexp" 13 "strconv" 14 "strings" 15) 16 17var dateRegexp *regexp.Regexp 18 19// Define state functions 20type tomlLexStateFn func() tomlLexStateFn 21 22// Define lexer 23type tomlLexer struct { 24 inputIdx int 25 input []rune // Textual source 26 currentTokenStart int 27 currentTokenStop int 28 tokens []token 29 brackets []rune 30 line int 31 col int 32 endbufferLine int 33 endbufferCol int 34} 35 36// Basic read operations on input 37 38func (l *tomlLexer) read() rune { 39 r := l.peek() 40 if r == '\n' { 41 l.endbufferLine++ 42 l.endbufferCol = 1 43 } else { 44 l.endbufferCol++ 45 } 46 l.inputIdx++ 47 return r 48} 49 50func (l *tomlLexer) next() rune { 51 r := l.read() 52 53 if r != eof { 54 l.currentTokenStop++ 55 } 56 return r 57} 58 59func (l *tomlLexer) ignore() { 60 l.currentTokenStart = l.currentTokenStop 61 l.line = l.endbufferLine 62 l.col = l.endbufferCol 63} 64 65func (l *tomlLexer) skip() { 66 l.next() 67 l.ignore() 68} 69 70func (l *tomlLexer) fastForward(n int) { 71 for i := 0; i < n; i++ { 72 l.next() 73 } 74} 75 76func (l *tomlLexer) emitWithValue(t tokenType, value string) { 77 l.tokens = append(l.tokens, token{ 78 Position: Position{l.line, l.col}, 79 typ: t, 80 val: value, 81 }) 82 l.ignore() 83} 84 85func (l *tomlLexer) emit(t tokenType) { 86 l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop])) 87} 88 89func (l *tomlLexer) peek() rune { 90 if l.inputIdx >= len(l.input) { 91 return eof 92 } 93 return l.input[l.inputIdx] 94} 95 96func (l *tomlLexer) peekString(size int) string { 97 maxIdx := len(l.input) 98 upperIdx := l.inputIdx + size // FIXME: potential overflow 99 if upperIdx > maxIdx { 100 upperIdx = maxIdx 101 } 102 return string(l.input[l.inputIdx:upperIdx]) 103} 104 105func (l *tomlLexer) follow(next string) bool { 106 return next == l.peekString(len(next)) 107} 108 109// Error management 110 111func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { 112 l.tokens = append(l.tokens, token{ 113 Position: Position{l.line, l.col}, 114 typ: tokenError, 115 val: fmt.Sprintf(format, args...), 116 }) 117 return nil 118} 119 120// State functions 121 122func (l *tomlLexer) lexVoid() tomlLexStateFn { 123 for { 124 next := l.peek() 125 switch next { 126 case '}': // after '{' 127 return l.lexRightCurlyBrace 128 case '[': 129 return l.lexTableKey 130 case '#': 131 return l.lexComment(l.lexVoid) 132 case '=': 133 return l.lexEqual 134 case '\r': 135 fallthrough 136 case '\n': 137 l.skip() 138 continue 139 } 140 141 if isSpace(next) { 142 l.skip() 143 } 144 145 if isKeyStartChar(next) { 146 return l.lexKey 147 } 148 149 if next == eof { 150 l.next() 151 break 152 } 153 } 154 155 l.emit(tokenEOF) 156 return nil 157} 158 159func (l *tomlLexer) lexRvalue() tomlLexStateFn { 160 for { 161 next := l.peek() 162 switch next { 163 case '.': 164 return l.errorf("cannot start float with a dot") 165 case '=': 166 return l.lexEqual 167 case '[': 168 return l.lexLeftBracket 169 case ']': 170 return l.lexRightBracket 171 case '{': 172 return l.lexLeftCurlyBrace 173 case '}': 174 return l.lexRightCurlyBrace 175 case '#': 176 return l.lexComment(l.lexRvalue) 177 case '"': 178 return l.lexString 179 case '\'': 180 return l.lexLiteralString 181 case ',': 182 return l.lexComma 183 case '\r': 184 fallthrough 185 case '\n': 186 l.skip() 187 if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '[' { 188 return l.lexRvalue 189 } 190 return l.lexVoid 191 } 192 193 if l.follow("true") { 194 return l.lexTrue 195 } 196 197 if l.follow("false") { 198 return l.lexFalse 199 } 200 201 if l.follow("inf") { 202 return l.lexInf 203 } 204 205 if l.follow("nan") { 206 return l.lexNan 207 } 208 209 if isSpace(next) { 210 l.skip() 211 continue 212 } 213 214 if next == eof { 215 l.next() 216 break 217 } 218 219 possibleDate := l.peekString(35) 220 dateSubmatches := dateRegexp.FindStringSubmatch(possibleDate) 221 if dateSubmatches != nil && dateSubmatches[0] != "" { 222 l.fastForward(len(dateSubmatches[0])) 223 if dateSubmatches[2] == "" { // no timezone information => local date 224 return l.lexLocalDate 225 } 226 return l.lexDate 227 } 228 229 if next == '+' || next == '-' || isDigit(next) { 230 return l.lexNumber 231 } 232 233 return l.errorf("no value can start with %c", next) 234 } 235 236 l.emit(tokenEOF) 237 return nil 238} 239 240func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { 241 l.next() 242 l.emit(tokenLeftCurlyBrace) 243 l.brackets = append(l.brackets, '{') 244 return l.lexVoid 245} 246 247func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { 248 l.next() 249 l.emit(tokenRightCurlyBrace) 250 if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '{' { 251 return l.errorf("cannot have '}' here") 252 } 253 l.brackets = l.brackets[:len(l.brackets)-1] 254 return l.lexRvalue 255} 256 257func (l *tomlLexer) lexDate() tomlLexStateFn { 258 l.emit(tokenDate) 259 return l.lexRvalue 260} 261 262func (l *tomlLexer) lexLocalDate() tomlLexStateFn { 263 l.emit(tokenLocalDate) 264 return l.lexRvalue 265} 266 267func (l *tomlLexer) lexTrue() tomlLexStateFn { 268 l.fastForward(4) 269 l.emit(tokenTrue) 270 return l.lexRvalue 271} 272 273func (l *tomlLexer) lexFalse() tomlLexStateFn { 274 l.fastForward(5) 275 l.emit(tokenFalse) 276 return l.lexRvalue 277} 278 279func (l *tomlLexer) lexInf() tomlLexStateFn { 280 l.fastForward(3) 281 l.emit(tokenInf) 282 return l.lexRvalue 283} 284 285func (l *tomlLexer) lexNan() tomlLexStateFn { 286 l.fastForward(3) 287 l.emit(tokenNan) 288 return l.lexRvalue 289} 290 291func (l *tomlLexer) lexEqual() tomlLexStateFn { 292 l.next() 293 l.emit(tokenEqual) 294 return l.lexRvalue 295} 296 297func (l *tomlLexer) lexComma() tomlLexStateFn { 298 l.next() 299 l.emit(tokenComma) 300 if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '{' { 301 return l.lexVoid 302 } 303 return l.lexRvalue 304} 305 306// Parse the key and emits its value without escape sequences. 307// bare keys, basic string keys and literal string keys are supported. 308func (l *tomlLexer) lexKey() tomlLexStateFn { 309 growingString := "" 310 311 for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { 312 if r == '"' { 313 l.next() 314 str, err := l.lexStringAsString(`"`, false, true) 315 if err != nil { 316 return l.errorf(err.Error()) 317 } 318 growingString += "\"" + str + "\"" 319 l.next() 320 continue 321 } else if r == '\'' { 322 l.next() 323 str, err := l.lexLiteralStringAsString(`'`, false) 324 if err != nil { 325 return l.errorf(err.Error()) 326 } 327 growingString += "'" + str + "'" 328 l.next() 329 continue 330 } else if r == '\n' { 331 return l.errorf("keys cannot contain new lines") 332 } else if isSpace(r) { 333 str := " " 334 // skip trailing whitespace 335 l.next() 336 for r = l.peek(); isSpace(r); r = l.peek() { 337 str += string(r) 338 l.next() 339 } 340 // break loop if not a dot 341 if r != '.' { 342 break 343 } 344 str += "." 345 // skip trailing whitespace after dot 346 l.next() 347 for r = l.peek(); isSpace(r); r = l.peek() { 348 str += string(r) 349 l.next() 350 } 351 growingString += str 352 continue 353 } else if r == '.' { 354 // skip 355 } else if !isValidBareChar(r) { 356 return l.errorf("keys cannot contain %c character", r) 357 } 358 growingString += string(r) 359 l.next() 360 } 361 l.emitWithValue(tokenKey, growingString) 362 return l.lexVoid 363} 364 365func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { 366 return func() tomlLexStateFn { 367 for next := l.peek(); next != '\n' && next != eof; next = l.peek() { 368 if next == '\r' && l.follow("\r\n") { 369 break 370 } 371 l.next() 372 } 373 l.ignore() 374 return previousState 375 } 376} 377 378func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { 379 l.next() 380 l.emit(tokenLeftBracket) 381 l.brackets = append(l.brackets, '[') 382 return l.lexRvalue 383} 384 385func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { 386 growingString := "" 387 388 if discardLeadingNewLine { 389 if l.follow("\r\n") { 390 l.skip() 391 l.skip() 392 } else if l.peek() == '\n' { 393 l.skip() 394 } 395 } 396 397 // find end of string 398 for { 399 if l.follow(terminator) { 400 return growingString, nil 401 } 402 403 next := l.peek() 404 if next == eof { 405 break 406 } 407 growingString += string(l.next()) 408 } 409 410 return "", errors.New("unclosed string") 411} 412 413func (l *tomlLexer) lexLiteralString() tomlLexStateFn { 414 l.skip() 415 416 // handle special case for triple-quote 417 terminator := "'" 418 discardLeadingNewLine := false 419 if l.follow("''") { 420 l.skip() 421 l.skip() 422 terminator = "'''" 423 discardLeadingNewLine = true 424 } 425 426 str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) 427 if err != nil { 428 return l.errorf(err.Error()) 429 } 430 431 l.emitWithValue(tokenString, str) 432 l.fastForward(len(terminator)) 433 l.ignore() 434 return l.lexRvalue 435} 436 437// Lex a string and return the results as a string. 438// Terminator is the substring indicating the end of the token. 439// The resulting string does not include the terminator. 440func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { 441 growingString := "" 442 443 if discardLeadingNewLine { 444 if l.follow("\r\n") { 445 l.skip() 446 l.skip() 447 } else if l.peek() == '\n' { 448 l.skip() 449 } 450 } 451 452 for { 453 if l.follow(terminator) { 454 return growingString, nil 455 } 456 457 if l.follow("\\") { 458 l.next() 459 switch l.peek() { 460 case '\r': 461 fallthrough 462 case '\n': 463 fallthrough 464 case '\t': 465 fallthrough 466 case ' ': 467 // skip all whitespace chars following backslash 468 for strings.ContainsRune("\r\n\t ", l.peek()) { 469 l.next() 470 } 471 case '"': 472 growingString += "\"" 473 l.next() 474 case 'n': 475 growingString += "\n" 476 l.next() 477 case 'b': 478 growingString += "\b" 479 l.next() 480 case 'f': 481 growingString += "\f" 482 l.next() 483 case '/': 484 growingString += "/" 485 l.next() 486 case 't': 487 growingString += "\t" 488 l.next() 489 case 'r': 490 growingString += "\r" 491 l.next() 492 case '\\': 493 growingString += "\\" 494 l.next() 495 case 'u': 496 l.next() 497 code := "" 498 for i := 0; i < 4; i++ { 499 c := l.peek() 500 if !isHexDigit(c) { 501 return "", errors.New("unfinished unicode escape") 502 } 503 l.next() 504 code = code + string(c) 505 } 506 intcode, err := strconv.ParseInt(code, 16, 32) 507 if err != nil { 508 return "", errors.New("invalid unicode escape: \\u" + code) 509 } 510 growingString += string(rune(intcode)) 511 case 'U': 512 l.next() 513 code := "" 514 for i := 0; i < 8; i++ { 515 c := l.peek() 516 if !isHexDigit(c) { 517 return "", errors.New("unfinished unicode escape") 518 } 519 l.next() 520 code = code + string(c) 521 } 522 intcode, err := strconv.ParseInt(code, 16, 64) 523 if err != nil { 524 return "", errors.New("invalid unicode escape: \\U" + code) 525 } 526 growingString += string(rune(intcode)) 527 default: 528 return "", errors.New("invalid escape sequence: \\" + string(l.peek())) 529 } 530 } else { 531 r := l.peek() 532 533 if 0x00 <= r && r <= 0x1F && r != '\t' && !(acceptNewLines && (r == '\n' || r == '\r')) { 534 return "", fmt.Errorf("unescaped control character %U", r) 535 } 536 l.next() 537 growingString += string(r) 538 } 539 540 if l.peek() == eof { 541 break 542 } 543 } 544 545 return "", errors.New("unclosed string") 546} 547 548func (l *tomlLexer) lexString() tomlLexStateFn { 549 l.skip() 550 551 // handle special case for triple-quote 552 terminator := `"` 553 discardLeadingNewLine := false 554 acceptNewLines := false 555 if l.follow(`""`) { 556 l.skip() 557 l.skip() 558 terminator = `"""` 559 discardLeadingNewLine = true 560 acceptNewLines = true 561 } 562 563 str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) 564 if err != nil { 565 return l.errorf(err.Error()) 566 } 567 568 l.emitWithValue(tokenString, str) 569 l.fastForward(len(terminator)) 570 l.ignore() 571 return l.lexRvalue 572} 573 574func (l *tomlLexer) lexTableKey() tomlLexStateFn { 575 l.next() 576 577 if l.peek() == '[' { 578 // token '[[' signifies an array of tables 579 l.next() 580 l.emit(tokenDoubleLeftBracket) 581 return l.lexInsideTableArrayKey 582 } 583 // vanilla table key 584 l.emit(tokenLeftBracket) 585 return l.lexInsideTableKey 586} 587 588// Parse the key till "]]", but only bare keys are supported 589func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { 590 for r := l.peek(); r != eof; r = l.peek() { 591 switch r { 592 case ']': 593 if l.currentTokenStop > l.currentTokenStart { 594 l.emit(tokenKeyGroupArray) 595 } 596 l.next() 597 if l.peek() != ']' { 598 break 599 } 600 l.next() 601 l.emit(tokenDoubleRightBracket) 602 return l.lexVoid 603 case '[': 604 return l.errorf("table array key cannot contain ']'") 605 default: 606 l.next() 607 } 608 } 609 return l.errorf("unclosed table array key") 610} 611 612// Parse the key till "]" but only bare keys are supported 613func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { 614 for r := l.peek(); r != eof; r = l.peek() { 615 switch r { 616 case ']': 617 if l.currentTokenStop > l.currentTokenStart { 618 l.emit(tokenKeyGroup) 619 } 620 l.next() 621 l.emit(tokenRightBracket) 622 return l.lexVoid 623 case '[': 624 return l.errorf("table key cannot contain ']'") 625 default: 626 l.next() 627 } 628 } 629 return l.errorf("unclosed table key") 630} 631 632func (l *tomlLexer) lexRightBracket() tomlLexStateFn { 633 l.next() 634 l.emit(tokenRightBracket) 635 if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '[' { 636 return l.errorf("cannot have ']' here") 637 } 638 l.brackets = l.brackets[:len(l.brackets)-1] 639 return l.lexRvalue 640} 641 642type validRuneFn func(r rune) bool 643 644func isValidHexRune(r rune) bool { 645 return r >= 'a' && r <= 'f' || 646 r >= 'A' && r <= 'F' || 647 r >= '0' && r <= '9' || 648 r == '_' 649} 650 651func isValidOctalRune(r rune) bool { 652 return r >= '0' && r <= '7' || r == '_' 653} 654 655func isValidBinaryRune(r rune) bool { 656 return r == '0' || r == '1' || r == '_' 657} 658 659func (l *tomlLexer) lexNumber() tomlLexStateFn { 660 r := l.peek() 661 662 if r == '0' { 663 follow := l.peekString(2) 664 if len(follow) == 2 { 665 var isValidRune validRuneFn 666 switch follow[1] { 667 case 'x': 668 isValidRune = isValidHexRune 669 case 'o': 670 isValidRune = isValidOctalRune 671 case 'b': 672 isValidRune = isValidBinaryRune 673 default: 674 if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' { 675 return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1])) 676 } 677 } 678 679 if isValidRune != nil { 680 l.next() 681 l.next() 682 digitSeen := false 683 for { 684 next := l.peek() 685 if !isValidRune(next) { 686 break 687 } 688 digitSeen = true 689 l.next() 690 } 691 692 if !digitSeen { 693 return l.errorf("number needs at least one digit") 694 } 695 696 l.emit(tokenInteger) 697 698 return l.lexRvalue 699 } 700 } 701 } 702 703 if r == '+' || r == '-' { 704 l.next() 705 if l.follow("inf") { 706 return l.lexInf 707 } 708 if l.follow("nan") { 709 return l.lexNan 710 } 711 } 712 713 pointSeen := false 714 expSeen := false 715 digitSeen := false 716 for { 717 next := l.peek() 718 if next == '.' { 719 if pointSeen { 720 return l.errorf("cannot have two dots in one float") 721 } 722 l.next() 723 if !isDigit(l.peek()) { 724 return l.errorf("float cannot end with a dot") 725 } 726 pointSeen = true 727 } else if next == 'e' || next == 'E' { 728 expSeen = true 729 l.next() 730 r := l.peek() 731 if r == '+' || r == '-' { 732 l.next() 733 } 734 } else if isDigit(next) { 735 digitSeen = true 736 l.next() 737 } else if next == '_' { 738 l.next() 739 } else { 740 break 741 } 742 if pointSeen && !digitSeen { 743 return l.errorf("cannot start float with a dot") 744 } 745 } 746 747 if !digitSeen { 748 return l.errorf("no digit in that number") 749 } 750 if pointSeen || expSeen { 751 l.emit(tokenFloat) 752 } else { 753 l.emit(tokenInteger) 754 } 755 return l.lexRvalue 756} 757 758func (l *tomlLexer) run() { 759 for state := l.lexVoid; state != nil; { 760 state = state() 761 } 762} 763 764func init() { 765 // Regexp for all date/time formats supported by TOML. 766 // Group 1: nano precision 767 // Group 2: timezone 768 // 769 // /!\ also matches the empty string 770 // 771 // Example matches: 772 //1979-05-27T07:32:00Z 773 //1979-05-27T00:32:00-07:00 774 //1979-05-27T00:32:00.999999-07:00 775 //1979-05-27 07:32:00Z 776 //1979-05-27 00:32:00-07:00 777 //1979-05-27 00:32:00.999999-07:00 778 //1979-05-27T07:32:00 779 //1979-05-27T00:32:00.999999 780 //1979-05-27 07:32:00 781 //1979-05-27 00:32:00.999999 782 //1979-05-27 783 //07:32:00 784 //00:32:00.999999 785 dateRegexp = regexp.MustCompile(`^(?:\d{1,4}-\d{2}-\d{2})?(?:[T ]?\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})?)?`) 786} 787 788// Entry point 789func lexToml(inputBytes []byte) []token { 790 runes := bytes.Runes(inputBytes) 791 l := &tomlLexer{ 792 input: runes, 793 tokens: make([]token, 0, 256), 794 line: 1, 795 col: 1, 796 endbufferLine: 1, 797 endbufferCol: 1, 798 } 799 l.run() 800 return l.tokens 801} 802