// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE

package toml

import (
	"errors"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"

	"github.com/pelletier/go-buffruneio"
)

// dateRegexp matches an RFC 3339-style datetime at the start of the
// input; it is compiled once in init() (see end of file).
var dateRegexp *regexp.Regexp

// Define state functions.
// A state function lexes one construct and returns the next state,
// or nil to stop the state machine.
type tomlLexStateFn func() tomlLexStateFn

// Define lexer.
type tomlLexer struct {
	input         *buffruneio.Reader // Textual source
	buffer        []rune             // Runes composing the current token
	tokens        chan token         // Output stream consumed by the parser
	depth         int                // Current '[' ... ']' nesting level while lexing rvalues
	line          int                // Line where the current token started (1-based)
	col           int                // Column where the current token started (1-based)
	endbufferLine int                // Line of the next rune to be read
	endbufferCol  int                // Column of the next rune to be read
}

// Basic read operations on input

// read consumes one rune from the input and keeps the end-of-buffer
// line/column counters in sync. It panics on reader errors.
// NOTE(review): the state functions test for the eof sentinel rune,
// so buffruneio is presumably expected to return eof rather than an
// error at end of input — confirm against the buffruneio API.
func (l *tomlLexer) read() rune {
	r, _, err := l.input.ReadRune()
	if err != nil {
		panic(err)
	}
	if r == '\n' {
		l.endbufferLine++
		l.endbufferCol = 1
	} else {
		l.endbufferCol++
	}
	return r
}

// next consumes one rune and appends it to the current token buffer
// (eof is consumed but never buffered).
func (l *tomlLexer) next() rune {
	r := l.read()

	if r != eof {
		l.buffer = append(l.buffer, r)
	}
	return r
}

// ignore discards the current token buffer and moves the token start
// position up to the current read position.
func (l *tomlLexer) ignore() {
	l.buffer = make([]rune, 0)
	l.line = l.endbufferLine
	l.col = l.endbufferCol
}

// skip consumes one rune and discards it.
func (l *tomlLexer) skip() {
	l.next()
	l.ignore()
}

// fastForward consumes the next n runes into the token buffer.
func (l *tomlLexer) fastForward(n int) {
	for i := 0; i < n; i++ {
		l.next()
	}
}

// emitWithValue sends a token with an explicit value, stamped with
// the position where the current token started, then resets the buffer.
func (l *tomlLexer) emitWithValue(t tokenType, value string) {
	l.tokens <- token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      value,
	}
	l.ignore()
}

// emit sends a token whose value is the accumulated buffer contents.
func (l *tomlLexer) emit(t tokenType) {
	l.emitWithValue(t, string(l.buffer))
}

// peek returns the next rune without consuming it.
func (l *tomlLexer) peek() rune {
	r, _, err := l.input.ReadRune()
	if err != nil {
		panic(err)
	}
	l.input.UnreadRune()
	return r
}

// follow reports whether the upcoming input starts with the string
// next, without consuming it: every rune read here is unread again by
// the deferred calls when the function returns (LIFO order restores
// the original read position).
func (l *tomlLexer) follow(next string) bool {
	for _, expectedRune := range next {
		r, _, err := l.input.ReadRune()
		defer l.input.UnreadRune()
		if err != nil {
			panic(err)
		}
		if expectedRune != r {
			return false
		}
	}
	return true
}
113 114// Error management 115 116func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { 117 l.tokens <- token{ 118 Position: Position{l.line, l.col}, 119 typ: tokenError, 120 val: fmt.Sprintf(format, args...), 121 } 122 return nil 123} 124 125// State functions 126 127func (l *tomlLexer) lexVoid() tomlLexStateFn { 128 for { 129 next := l.peek() 130 switch next { 131 case '[': 132 return l.lexTableKey 133 case '#': 134 return l.lexComment(l.lexVoid) 135 case '=': 136 return l.lexEqual 137 case '\r': 138 fallthrough 139 case '\n': 140 l.skip() 141 continue 142 } 143 144 if isSpace(next) { 145 l.skip() 146 } 147 148 if l.depth > 0 { 149 return l.lexRvalue 150 } 151 152 if isKeyStartChar(next) { 153 return l.lexKey 154 } 155 156 if next == eof { 157 l.next() 158 break 159 } 160 } 161 162 l.emit(tokenEOF) 163 return nil 164} 165 166func (l *tomlLexer) lexRvalue() tomlLexStateFn { 167 for { 168 next := l.peek() 169 switch next { 170 case '.': 171 return l.errorf("cannot start float with a dot") 172 case '=': 173 return l.lexEqual 174 case '[': 175 l.depth++ 176 return l.lexLeftBracket 177 case ']': 178 l.depth-- 179 return l.lexRightBracket 180 case '{': 181 return l.lexLeftCurlyBrace 182 case '}': 183 return l.lexRightCurlyBrace 184 case '#': 185 return l.lexComment(l.lexRvalue) 186 case '"': 187 return l.lexString 188 case '\'': 189 return l.lexLiteralString 190 case ',': 191 return l.lexComma 192 case '\r': 193 fallthrough 194 case '\n': 195 l.skip() 196 if l.depth == 0 { 197 return l.lexVoid 198 } 199 return l.lexRvalue 200 case '_': 201 return l.errorf("cannot start number with underscore") 202 } 203 204 if l.follow("true") { 205 return l.lexTrue 206 } 207 208 if l.follow("false") { 209 return l.lexFalse 210 } 211 212 if isSpace(next) { 213 l.skip() 214 continue 215 } 216 217 if next == eof { 218 l.next() 219 break 220 } 221 222 possibleDate := string(l.input.PeekRunes(35)) 223 dateMatch := dateRegexp.FindString(possibleDate) 224 if 
dateMatch != "" { 225 l.fastForward(len(dateMatch)) 226 return l.lexDate 227 } 228 229 if next == '+' || next == '-' || isDigit(next) { 230 return l.lexNumber 231 } 232 233 if isAlphanumeric(next) { 234 return l.lexKey 235 } 236 237 return l.errorf("no value can start with %c", next) 238 } 239 240 l.emit(tokenEOF) 241 return nil 242} 243 244func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { 245 l.next() 246 l.emit(tokenLeftCurlyBrace) 247 return l.lexRvalue 248} 249 250func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { 251 l.next() 252 l.emit(tokenRightCurlyBrace) 253 return l.lexRvalue 254} 255 256func (l *tomlLexer) lexDate() tomlLexStateFn { 257 l.emit(tokenDate) 258 return l.lexRvalue 259} 260 261func (l *tomlLexer) lexTrue() tomlLexStateFn { 262 l.fastForward(4) 263 l.emit(tokenTrue) 264 return l.lexRvalue 265} 266 267func (l *tomlLexer) lexFalse() tomlLexStateFn { 268 l.fastForward(5) 269 l.emit(tokenFalse) 270 return l.lexRvalue 271} 272 273func (l *tomlLexer) lexEqual() tomlLexStateFn { 274 l.next() 275 l.emit(tokenEqual) 276 return l.lexRvalue 277} 278 279func (l *tomlLexer) lexComma() tomlLexStateFn { 280 l.next() 281 l.emit(tokenComma) 282 return l.lexRvalue 283} 284 285func (l *tomlLexer) lexKey() tomlLexStateFn { 286 growingString := "" 287 288 for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { 289 if r == '"' { 290 l.next() 291 str, err := l.lexStringAsString(`"`, false, true) 292 if err != nil { 293 return l.errorf(err.Error()) 294 } 295 growingString += `"` + str + `"` 296 l.next() 297 continue 298 } else if r == '\n' { 299 return l.errorf("keys cannot contain new lines") 300 } else if isSpace(r) { 301 break 302 } else if !isValidBareChar(r) { 303 return l.errorf("keys cannot contain %c character", r) 304 } 305 growingString += string(r) 306 l.next() 307 } 308 l.emitWithValue(tokenKey, growingString) 309 return l.lexVoid 310} 311 312func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { 313 
return func() tomlLexStateFn { 314 for next := l.peek(); next != '\n' && next != eof; next = l.peek() { 315 if next == '\r' && l.follow("\r\n") { 316 break 317 } 318 l.next() 319 } 320 l.ignore() 321 return previousState 322 } 323} 324 325func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { 326 l.next() 327 l.emit(tokenLeftBracket) 328 return l.lexRvalue 329} 330 331func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { 332 growingString := "" 333 334 if discardLeadingNewLine { 335 if l.follow("\r\n") { 336 l.skip() 337 l.skip() 338 } else if l.peek() == '\n' { 339 l.skip() 340 } 341 } 342 343 // find end of string 344 for { 345 if l.follow(terminator) { 346 return growingString, nil 347 } 348 349 next := l.peek() 350 if next == eof { 351 break 352 } 353 growingString += string(l.next()) 354 } 355 356 return "", errors.New("unclosed string") 357} 358 359func (l *tomlLexer) lexLiteralString() tomlLexStateFn { 360 l.skip() 361 362 // handle special case for triple-quote 363 terminator := "'" 364 discardLeadingNewLine := false 365 if l.follow("''") { 366 l.skip() 367 l.skip() 368 terminator = "'''" 369 discardLeadingNewLine = true 370 } 371 372 str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) 373 if err != nil { 374 return l.errorf(err.Error()) 375 } 376 377 l.emitWithValue(tokenString, str) 378 l.fastForward(len(terminator)) 379 l.ignore() 380 return l.lexRvalue 381} 382 383// Lex a string and return the results as a string. 384// Terminator is the substring indicating the end of the token. 385// The resulting string does not include the terminator. 
386func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { 387 growingString := "" 388 389 if discardLeadingNewLine { 390 if l.follow("\r\n") { 391 l.skip() 392 l.skip() 393 } else if l.peek() == '\n' { 394 l.skip() 395 } 396 } 397 398 for { 399 if l.follow(terminator) { 400 return growingString, nil 401 } 402 403 if l.follow("\\") { 404 l.next() 405 switch l.peek() { 406 case '\r': 407 fallthrough 408 case '\n': 409 fallthrough 410 case '\t': 411 fallthrough 412 case ' ': 413 // skip all whitespace chars following backslash 414 for strings.ContainsRune("\r\n\t ", l.peek()) { 415 l.next() 416 } 417 case '"': 418 growingString += "\"" 419 l.next() 420 case 'n': 421 growingString += "\n" 422 l.next() 423 case 'b': 424 growingString += "\b" 425 l.next() 426 case 'f': 427 growingString += "\f" 428 l.next() 429 case '/': 430 growingString += "/" 431 l.next() 432 case 't': 433 growingString += "\t" 434 l.next() 435 case 'r': 436 growingString += "\r" 437 l.next() 438 case '\\': 439 growingString += "\\" 440 l.next() 441 case 'u': 442 l.next() 443 code := "" 444 for i := 0; i < 4; i++ { 445 c := l.peek() 446 if !isHexDigit(c) { 447 return "", errors.New("unfinished unicode escape") 448 } 449 l.next() 450 code = code + string(c) 451 } 452 intcode, err := strconv.ParseInt(code, 16, 32) 453 if err != nil { 454 return "", errors.New("invalid unicode escape: \\u" + code) 455 } 456 growingString += string(rune(intcode)) 457 case 'U': 458 l.next() 459 code := "" 460 for i := 0; i < 8; i++ { 461 c := l.peek() 462 if !isHexDigit(c) { 463 return "", errors.New("unfinished unicode escape") 464 } 465 l.next() 466 code = code + string(c) 467 } 468 intcode, err := strconv.ParseInt(code, 16, 64) 469 if err != nil { 470 return "", errors.New("invalid unicode escape: \\U" + code) 471 } 472 growingString += string(rune(intcode)) 473 default: 474 return "", errors.New("invalid escape sequence: \\" + string(l.peek())) 475 } 476 } 
else { 477 r := l.peek() 478 479 if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) { 480 return "", fmt.Errorf("unescaped control character %U", r) 481 } 482 l.next() 483 growingString += string(r) 484 } 485 486 if l.peek() == eof { 487 break 488 } 489 } 490 491 return "", errors.New("unclosed string") 492} 493 494func (l *tomlLexer) lexString() tomlLexStateFn { 495 l.skip() 496 497 // handle special case for triple-quote 498 terminator := `"` 499 discardLeadingNewLine := false 500 acceptNewLines := false 501 if l.follow(`""`) { 502 l.skip() 503 l.skip() 504 terminator = `"""` 505 discardLeadingNewLine = true 506 acceptNewLines = true 507 } 508 509 str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) 510 511 if err != nil { 512 return l.errorf(err.Error()) 513 } 514 515 l.emitWithValue(tokenString, str) 516 l.fastForward(len(terminator)) 517 l.ignore() 518 return l.lexRvalue 519} 520 521func (l *tomlLexer) lexTableKey() tomlLexStateFn { 522 l.next() 523 524 if l.peek() == '[' { 525 // token '[[' signifies an array of tables 526 l.next() 527 l.emit(tokenDoubleLeftBracket) 528 return l.lexInsideTableArrayKey 529 } 530 // vanilla table key 531 l.emit(tokenLeftBracket) 532 return l.lexInsideTableKey 533} 534 535func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { 536 for r := l.peek(); r != eof; r = l.peek() { 537 switch r { 538 case ']': 539 if len(l.buffer) > 0 { 540 l.emit(tokenKeyGroupArray) 541 } 542 l.next() 543 if l.peek() != ']' { 544 break 545 } 546 l.next() 547 l.emit(tokenDoubleRightBracket) 548 return l.lexVoid 549 case '[': 550 return l.errorf("table array key cannot contain ']'") 551 default: 552 l.next() 553 } 554 } 555 return l.errorf("unclosed table array key") 556} 557 558func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { 559 for r := l.peek(); r != eof; r = l.peek() { 560 switch r { 561 case ']': 562 if len(l.buffer) > 0 { 563 l.emit(tokenKeyGroup) 564 } 565 l.next() 566 
l.emit(tokenRightBracket) 567 return l.lexVoid 568 case '[': 569 return l.errorf("table key cannot contain ']'") 570 default: 571 l.next() 572 } 573 } 574 return l.errorf("unclosed table key") 575} 576 577func (l *tomlLexer) lexRightBracket() tomlLexStateFn { 578 l.next() 579 l.emit(tokenRightBracket) 580 return l.lexRvalue 581} 582 583func (l *tomlLexer) lexNumber() tomlLexStateFn { 584 r := l.peek() 585 if r == '+' || r == '-' { 586 l.next() 587 } 588 pointSeen := false 589 expSeen := false 590 digitSeen := false 591 for { 592 next := l.peek() 593 if next == '.' { 594 if pointSeen { 595 return l.errorf("cannot have two dots in one float") 596 } 597 l.next() 598 if !isDigit(l.peek()) { 599 return l.errorf("float cannot end with a dot") 600 } 601 pointSeen = true 602 } else if next == 'e' || next == 'E' { 603 expSeen = true 604 l.next() 605 r := l.peek() 606 if r == '+' || r == '-' { 607 l.next() 608 } 609 } else if isDigit(next) { 610 digitSeen = true 611 l.next() 612 } else if next == '_' { 613 l.next() 614 } else { 615 break 616 } 617 if pointSeen && !digitSeen { 618 return l.errorf("cannot start float with a dot") 619 } 620 } 621 622 if !digitSeen { 623 return l.errorf("no digit in that number") 624 } 625 if pointSeen || expSeen { 626 l.emit(tokenFloat) 627 } else { 628 l.emit(tokenInteger) 629 } 630 return l.lexRvalue 631} 632 633func (l *tomlLexer) run() { 634 for state := l.lexVoid; state != nil; { 635 state = state() 636 } 637 close(l.tokens) 638} 639 640func init() { 641 dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`) 642} 643 644// Entry point 645func lexToml(input io.Reader) chan token { 646 bufferedInput := buffruneio.NewReader(input) 647 l := &tomlLexer{ 648 input: bufferedInput, 649 tokens: make(chan token), 650 line: 1, 651 col: 1, 652 endbufferLine: 1, 653 endbufferCol: 1, 654 } 655 go l.run() 656 return l.tokens 657} 658