1// TOML lexer.
2//
3// Written using the principles developed by Rob Pike in
4// http://www.youtube.com/watch?v=HxaD_trXwRE
5
6package toml
7
8import (
9	"bytes"
10	"errors"
11	"fmt"
12	"regexp"
13	"strconv"
14	"strings"
15)
16
// dateRegexp matches every date/time layout supported by TOML; it is
// compiled once in init (see the comment there for the full list).
var dateRegexp *regexp.Regexp
18
// Define state functions

// tomlLexStateFn is one state of the lexer: it consumes input, possibly
// emits tokens, and returns the next state to run — or nil to stop the
// state machine (see run).
type tomlLexStateFn func() tomlLexStateFn
21
// Define lexer

// tomlLexer walks the input rune slice and accumulates the tokens that
// make up the document. Token boundaries are tracked as half-open index
// ranges into input; line/col track human-readable positions.
type tomlLexer struct {
	inputIdx          int    // index of the next rune to read
	input             []rune // Textual source
	currentTokenStart int    // input index where the pending token begins
	currentTokenStop  int    // input index one past the pending token's end
	tokens            []token
	brackets          []rune // stack of currently open '[' / '{' delimiters
	line              int    // line of the pending token's start
	col               int    // column of the pending token's start
	endbufferLine     int    // line just past the last rune read
	endbufferCol      int    // column just past the last rune read
}
35
36// Basic read operations on input
37
38func (l *tomlLexer) read() rune {
39	r := l.peek()
40	if r == '\n' {
41		l.endbufferLine++
42		l.endbufferCol = 1
43	} else {
44		l.endbufferCol++
45	}
46	l.inputIdx++
47	return r
48}
49
50func (l *tomlLexer) next() rune {
51	r := l.read()
52
53	if r != eof {
54		l.currentTokenStop++
55	}
56	return r
57}
58
59func (l *tomlLexer) ignore() {
60	l.currentTokenStart = l.currentTokenStop
61	l.line = l.endbufferLine
62	l.col = l.endbufferCol
63}
64
65func (l *tomlLexer) skip() {
66	l.next()
67	l.ignore()
68}
69
70func (l *tomlLexer) fastForward(n int) {
71	for i := 0; i < n; i++ {
72		l.next()
73	}
74}
75
76func (l *tomlLexer) emitWithValue(t tokenType, value string) {
77	l.tokens = append(l.tokens, token{
78		Position: Position{l.line, l.col},
79		typ:      t,
80		val:      value,
81	})
82	l.ignore()
83}
84
85func (l *tomlLexer) emit(t tokenType) {
86	l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
87}
88
89func (l *tomlLexer) peek() rune {
90	if l.inputIdx >= len(l.input) {
91		return eof
92	}
93	return l.input[l.inputIdx]
94}
95
96func (l *tomlLexer) peekString(size int) string {
97	maxIdx := len(l.input)
98	upperIdx := l.inputIdx + size // FIXME: potential overflow
99	if upperIdx > maxIdx {
100		upperIdx = maxIdx
101	}
102	return string(l.input[l.inputIdx:upperIdx])
103}
104
105func (l *tomlLexer) follow(next string) bool {
106	return next == l.peekString(len(next))
107}
108
109// Error management
110
111func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
112	l.tokens = append(l.tokens, token{
113		Position: Position{l.line, l.col},
114		typ:      tokenError,
115		val:      fmt.Sprintf(format, args...),
116	})
117	return nil
118}
119
120// State functions
121
// lexVoid is the top-level state, active outside of values: it skips
// whitespace and newlines, and dispatches to the key, table-header,
// comment or assignment states based on the next rune.
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '}': // after '{'
			return l.lexRightCurlyBrace
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}

		// NOTE(review): `next` is not re-read after this skip, so the
		// checks below still see the space rune and (assuming a space is
		// neither a key-start char nor eof) simply fall through to the
		// next loop iteration.
		if isSpace(next) {
			l.skip()
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		if next == eof {
			l.next()
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}
158
// lexRvalue lexes a value position (right of '=' or an element inside an
// array/inline table) and dispatches to the specialized state for the
// value found. Dispatch order matters: punctuation first, then keywords,
// then date/time, then numbers.
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			return l.lexLeftBracket
		case ']':
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			// Newlines are allowed between array elements; elsewhere a
			// newline ends the value position.
			if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '[' {
				return l.lexRvalue
			}
			return l.lexVoid
		}

		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if l.follow("inf") {
			return l.lexInf
		}

		if l.follow("nan") {
			return l.lexNan
		}

		if isSpace(next) {
			l.skip()
			continue
		}

		if next == eof {
			l.next()
			break
		}

		// Probe a window large enough for the longest date/time layout
		// accepted by dateRegexp (see init).
		possibleDate := l.peekString(35)
		dateSubmatches := dateRegexp.FindStringSubmatch(possibleDate)
		if dateSubmatches != nil && dateSubmatches[0] != "" {
			l.fastForward(len(dateSubmatches[0]))
			if dateSubmatches[2] == "" { // no timezone information => local date
				return l.lexLocalDate
			}
			return l.lexDate
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		return l.errorf("no value can start with %c", next)
	}

	l.emit(tokenEOF)
	return nil
}
239
240func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
241	l.next()
242	l.emit(tokenLeftCurlyBrace)
243	l.brackets = append(l.brackets, '{')
244	return l.lexVoid
245}
246
247func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
248	l.next()
249	l.emit(tokenRightCurlyBrace)
250	if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '{' {
251		return l.errorf("cannot have '}' here")
252	}
253	l.brackets = l.brackets[:len(l.brackets)-1]
254	return l.lexRvalue
255}
256
// lexDate emits the already-consumed date/time (with timezone) as a
// tokenDate. The text was fast-forwarded over by lexRvalue.
func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}
261
// lexLocalDate emits the already-consumed timezone-less date/time as a
// tokenLocalDate. The text was fast-forwarded over by lexRvalue.
func (l *tomlLexer) lexLocalDate() tomlLexStateFn {
	l.emit(tokenLocalDate)
	return l.lexRvalue
}
266
267func (l *tomlLexer) lexTrue() tomlLexStateFn {
268	l.fastForward(4)
269	l.emit(tokenTrue)
270	return l.lexRvalue
271}
272
273func (l *tomlLexer) lexFalse() tomlLexStateFn {
274	l.fastForward(5)
275	l.emit(tokenFalse)
276	return l.lexRvalue
277}
278
279func (l *tomlLexer) lexInf() tomlLexStateFn {
280	l.fastForward(3)
281	l.emit(tokenInf)
282	return l.lexRvalue
283}
284
285func (l *tomlLexer) lexNan() tomlLexStateFn {
286	l.fastForward(3)
287	l.emit(tokenNan)
288	return l.lexRvalue
289}
290
// lexEqual emits '=' and switches to value position.
func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}
296
297func (l *tomlLexer) lexComma() tomlLexStateFn {
298	l.next()
299	l.emit(tokenComma)
300	if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '{' {
301		return l.lexVoid
302	}
303	return l.lexRvalue
304}
305
306// Parse the key and emits its value without escape sequences.
307// bare keys, basic string keys and literal string keys are supported.
308func (l *tomlLexer) lexKey() tomlLexStateFn {
309	growingString := ""
310
311	for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
312		if r == '"' {
313			l.next()
314			str, err := l.lexStringAsString(`"`, false, true)
315			if err != nil {
316				return l.errorf(err.Error())
317			}
318			growingString += "\"" + str + "\""
319			l.next()
320			continue
321		} else if r == '\'' {
322			l.next()
323			str, err := l.lexLiteralStringAsString(`'`, false)
324			if err != nil {
325				return l.errorf(err.Error())
326			}
327			growingString += "'" + str + "'"
328			l.next()
329			continue
330		} else if r == '\n' {
331			return l.errorf("keys cannot contain new lines")
332		} else if isSpace(r) {
333			str := " "
334			// skip trailing whitespace
335			l.next()
336			for r = l.peek(); isSpace(r); r = l.peek() {
337				str += string(r)
338				l.next()
339			}
340			// break loop if not a dot
341			if r != '.' {
342				break
343			}
344			str += "."
345			// skip trailing whitespace after dot
346			l.next()
347			for r = l.peek(); isSpace(r); r = l.peek() {
348				str += string(r)
349				l.next()
350			}
351			growingString += str
352			continue
353		} else if r == '.' {
354			// skip
355		} else if !isValidBareChar(r) {
356			return l.errorf("keys cannot contain %c character", r)
357		}
358		growingString += string(r)
359		l.next()
360	}
361	l.emitWithValue(tokenKey, growingString)
362	return l.lexVoid
363}
364
365func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
366	return func() tomlLexStateFn {
367		for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
368			if next == '\r' && l.follow("\r\n") {
369				break
370			}
371			l.next()
372		}
373		l.ignore()
374		return previousState
375	}
376}
377
378func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
379	l.next()
380	l.emit(tokenLeftBracket)
381	l.brackets = append(l.brackets, '[')
382	return l.lexRvalue
383}
384
385func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
386	growingString := ""
387
388	if discardLeadingNewLine {
389		if l.follow("\r\n") {
390			l.skip()
391			l.skip()
392		} else if l.peek() == '\n' {
393			l.skip()
394		}
395	}
396
397	// find end of string
398	for {
399		if l.follow(terminator) {
400			return growingString, nil
401		}
402
403		next := l.peek()
404		if next == eof {
405			break
406		}
407		growingString += string(l.next())
408	}
409
410	return "", errors.New("unclosed string")
411}
412
413func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
414	l.skip()
415
416	// handle special case for triple-quote
417	terminator := "'"
418	discardLeadingNewLine := false
419	if l.follow("''") {
420		l.skip()
421		l.skip()
422		terminator = "'''"
423		discardLeadingNewLine = true
424	}
425
426	str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
427	if err != nil {
428		return l.errorf(err.Error())
429	}
430
431	l.emitWithValue(tokenString, str)
432	l.fastForward(len(terminator))
433	l.ignore()
434	return l.lexRvalue
435}
436
// Lex a string and return the results as a string.
// Terminator is the substring indicating the end of the token.
// The resulting string does not include the terminator.
//
// discardLeadingNewLine drops a newline (LF or CRLF) directly after the
// opening delimiter (TOML multi-line rule); acceptNewLines permits raw
// CR/LF inside the string (otherwise they count as unescaped control
// characters). Escape sequences are decoded into the result.
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
	growingString := ""

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	for {
		if l.follow(terminator) {
			return growingString, nil
		}

		if l.follow("\\") {
			l.next()
			switch l.peek() {
			case '\r':
				fallthrough
			case '\n':
				fallthrough
			case '\t':
				fallthrough
			case ' ':
				// skip all whitespace chars following backslash
				// (TOML "line-ending backslash" in multi-line strings)
				for strings.ContainsRune("\r\n\t ", l.peek()) {
					l.next()
				}
			case '"':
				growingString += "\""
				l.next()
			case 'n':
				growingString += "\n"
				l.next()
			case 'b':
				growingString += "\b"
				l.next()
			case 'f':
				growingString += "\f"
				l.next()
			case '/':
				growingString += "/"
				l.next()
			case 't':
				growingString += "\t"
				l.next()
			case 'r':
				growingString += "\r"
				l.next()
			case '\\':
				growingString += "\\"
				l.next()
			case 'u':
				// \uXXXX: exactly four hex digits.
				l.next()
				code := ""
				for i := 0; i < 4; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 32)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\u" + code)
				}
				growingString += string(rune(intcode))
			case 'U':
				// \UXXXXXXXX: exactly eight hex digits.
				l.next()
				code := ""
				for i := 0; i < 8; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 64)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\U" + code)
				}
				growingString += string(rune(intcode))
			default:
				return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
			}
		} else {
			r := l.peek()

			// Raw control characters (except tab, and CR/LF when
			// acceptNewLines is set) must be escaped.
			if 0x00 <= r && r <= 0x1F && r != '\t' && !(acceptNewLines && (r == '\n' || r == '\r')) {
				return "", fmt.Errorf("unescaped control character %U", r)
			}
			l.next()
			growingString += string(r)
		}

		if l.peek() == eof {
			break
		}
	}

	return "", errors.New("unclosed string")
}
547
548func (l *tomlLexer) lexString() tomlLexStateFn {
549	l.skip()
550
551	// handle special case for triple-quote
552	terminator := `"`
553	discardLeadingNewLine := false
554	acceptNewLines := false
555	if l.follow(`""`) {
556		l.skip()
557		l.skip()
558		terminator = `"""`
559		discardLeadingNewLine = true
560		acceptNewLines = true
561	}
562
563	str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
564	if err != nil {
565		return l.errorf(err.Error())
566	}
567
568	l.emitWithValue(tokenString, str)
569	l.fastForward(len(terminator))
570	l.ignore()
571	return l.lexRvalue
572}
573
574func (l *tomlLexer) lexTableKey() tomlLexStateFn {
575	l.next()
576
577	if l.peek() == '[' {
578		// token '[[' signifies an array of tables
579		l.next()
580		l.emit(tokenDoubleLeftBracket)
581		return l.lexInsideTableArrayKey
582	}
583	// vanilla table key
584	l.emit(tokenLeftBracket)
585	return l.lexInsideTableKey
586}
587
588// Parse the key till "]]", but only bare keys are supported
589func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
590	for r := l.peek(); r != eof; r = l.peek() {
591		switch r {
592		case ']':
593			if l.currentTokenStop > l.currentTokenStart {
594				l.emit(tokenKeyGroupArray)
595			}
596			l.next()
597			if l.peek() != ']' {
598				break
599			}
600			l.next()
601			l.emit(tokenDoubleRightBracket)
602			return l.lexVoid
603		case '[':
604			return l.errorf("table array key cannot contain ']'")
605		default:
606			l.next()
607		}
608	}
609	return l.errorf("unclosed table array key")
610}
611
612// Parse the key till "]" but only bare keys are supported
613func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
614	for r := l.peek(); r != eof; r = l.peek() {
615		switch r {
616		case ']':
617			if l.currentTokenStop > l.currentTokenStart {
618				l.emit(tokenKeyGroup)
619			}
620			l.next()
621			l.emit(tokenRightBracket)
622			return l.lexVoid
623		case '[':
624			return l.errorf("table key cannot contain ']'")
625		default:
626			l.next()
627		}
628	}
629	return l.errorf("unclosed table key")
630}
631
632func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
633	l.next()
634	l.emit(tokenRightBracket)
635	if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '[' {
636		return l.errorf("cannot have ']' here")
637	}
638	l.brackets = l.brackets[:len(l.brackets)-1]
639	return l.lexRvalue
640}
641
// validRuneFn reports whether a rune may appear in the digits of a given
// number base (see lexNumber).
type validRuneFn func(r rune) bool
643
// isValidHexRune reports whether r is a hexadecimal digit or the '_'
// separator.
func isValidHexRune(r rune) bool {
	switch {
	case r >= '0' && r <= '9':
		return true
	case r >= 'a' && r <= 'f':
		return true
	case r >= 'A' && r <= 'F':
		return true
	case r == '_':
		return true
	default:
		return false
	}
}
650
// isValidOctalRune reports whether r is an octal digit or the '_'
// separator.
func isValidOctalRune(r rune) bool {
	return r == '_' || ('0' <= r && r <= '7')
}
654
// isValidBinaryRune reports whether r is a binary digit or the '_'
// separator.
func isValidBinaryRune(r rune) bool {
	switch r {
	case '0', '1', '_':
		return true
	default:
		return false
	}
}
658
// lexNumber lexes an integer or float. It handles the 0x/0o/0b prefixed
// integer bases, signed inf/nan keywords, and decimal numbers with
// optional fraction, exponent and '_' digit separators. It emits
// tokenInteger or tokenFloat (float iff a dot or exponent was seen).
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()

	// A leading '0' may introduce a base prefix (0x, 0o, 0b).
	if r == '0' {
		follow := l.peekString(2)
		if len(follow) == 2 {
			var isValidRune validRuneFn
			switch follow[1] {
			case 'x':
				isValidRune = isValidHexRune
			case 'o':
				isValidRune = isValidOctalRune
			case 'b':
				isValidRune = isValidBinaryRune
			default:
				// Any other letter after '0' is a bogus base prefix;
				// non-letters fall through to decimal scanning below.
				if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' {
					return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1]))
				}
			}

			if isValidRune != nil {
				// Consume the "0x"/"0o"/"0b" prefix, then the digits.
				l.next()
				l.next()
				digitSeen := false
				for {
					next := l.peek()
					if !isValidRune(next) {
						break
					}
					digitSeen = true
					l.next()
				}

				if !digitSeen {
					return l.errorf("number needs at least one digit")
				}

				l.emit(tokenInteger)

				return l.lexRvalue
			}
		}
	}

	// A sign may precede a number or the inf/nan keywords.
	if r == '+' || r == '-' {
		l.next()
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
	}

	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			// '_' digit separators are consumed but not validated here.
			l.next()
		} else {
			break
		}
		// A dot before any digit means the float started with '.'.
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}
757
758func (l *tomlLexer) run() {
759	for state := l.lexVoid; state != nil; {
760		state = state()
761	}
762}
763
// init compiles dateRegexp once at package load time.
func init() {
	// Regexp for all date/time formats supported by TOML.
	// Group 1: nano precision
	// Group 2: timezone
	//
	// /!\ also matches the empty string
	//
	// Example matches:
	//1979-05-27T07:32:00Z
	//1979-05-27T00:32:00-07:00
	//1979-05-27T00:32:00.999999-07:00
	//1979-05-27 07:32:00Z
	//1979-05-27 00:32:00-07:00
	//1979-05-27 00:32:00.999999-07:00
	//1979-05-27T07:32:00
	//1979-05-27T00:32:00.999999
	//1979-05-27 07:32:00
	//1979-05-27 00:32:00.999999
	//1979-05-27
	//07:32:00
	//00:32:00.999999
	dateRegexp = regexp.MustCompile(`^(?:\d{1,4}-\d{2}-\d{2})?(?:[T ]?\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})?)?`)
}
787
788// Entry point
789func lexToml(inputBytes []byte) []token {
790	runes := bytes.Runes(inputBytes)
791	l := &tomlLexer{
792		input:         runes,
793		tokens:        make([]token, 0, 256),
794		line:          1,
795		col:           1,
796		endbufferLine: 1,
797		endbufferCol:  1,
798	}
799	l.run()
800	return l.tokens
801}
802