// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE

package toml

import (
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

var dateRegexp *regexp.Regexp

// tomlLexStateFn is a state function: it lexes a piece of input and returns
// the next state function to run, or nil when lexing is complete.
type tomlLexStateFn func() tomlLexStateFn

// tomlLexer holds the running state of the lexer.
type tomlLexer struct {
	inputIdx          int
	input             []rune // Textual source
	currentTokenStart int
	currentTokenStop  int
	tokens            []token
	depth             int
	line              int
	col               int
	endbufferLine     int
	endbufferCol      int
}
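
// currentTokenStart and currentTokenStop delimit the token being built
// inside input, while inputIdx is the read cursor. line/col record where the
// current token starts (used when emitting), and endbufferLine/endbufferCol
// track the read cursor, so every emitted token carries the position at
// which it began.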

// Basic read operations on input

// read consumes one rune, keeping the end-of-buffer line/column trackers up
// to date.
func (l *tomlLexer) read() rune {
	r := l.peek()
	if r == '\n' {
		l.endbufferLine++
		l.endbufferCol = 1
	} else {
		l.endbufferCol++
	}
	l.inputIdx++
	return r
}

// next consumes one rune and extends the current token to include it.
func (l *tomlLexer) next() rune {
	r := l.read()

	if r != eof {
		l.currentTokenStop++
	}
	return r
}

// ignore discards the runes read so far and resets the token start to the
// current read position.
func (l *tomlLexer) ignore() {
	l.currentTokenStart = l.currentTokenStop
	l.line = l.endbufferLine
	l.col = l.endbufferCol
}

// skip consumes the next rune without adding it to the current token.
func (l *tomlLexer) skip() {
	l.next()
	l.ignore()
}

// fastForward consumes the next n runes.
func (l *tomlLexer) fastForward(n int) {
	for i := 0; i < n; i++ {
		l.next()
	}
}

// emitWithValue appends a token of the given type and value at the current
// token position, then resets the token window.
func (l *tomlLexer) emitWithValue(t tokenType, value string) {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      value,
	})
	l.ignore()
}

// emit emits the runes accumulated since the last ignore as a token of type t.
func (l *tomlLexer) emit(t tokenType) {
	l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
}

// peek returns the next rune without consuming it, or eof.
func (l *tomlLexer) peek() rune {
	if l.inputIdx >= len(l.input) {
		return eof
	}
	return l.input[l.inputIdx]
}

// peekString returns up to size upcoming runes as a string, without
// consuming them.
func (l *tomlLexer) peekString(size int) string {
	maxIdx := len(l.input)
	upperIdx := l.inputIdx + size // FIXME: potential overflow
	if upperIdx > maxIdx {
		upperIdx = maxIdx
	}
	return string(l.input[l.inputIdx:upperIdx])
}

// follow reports whether the upcoming input starts with next.
func (l *tomlLexer) follow(next string) bool {
	return next == l.peekString(len(next))
}
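
// For example, with input "true]" and the read cursor at 't',
// l.peekString(3) returns "tru" and l.follow("true") reports true; neither
// call moves the cursor.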

// Error management

// errorf emits an error token carrying the formatted message and halts the
// state machine by returning a nil state function.
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	})
	return nil
}

// State functions

// lexVoid scans top-level content: table headers, keys, comments, and the
// whitespace and newlines between them.
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}

		if isSpace(next) {
			l.skip()
		}

		if l.depth > 0 {
			return l.lexRvalue
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		if next == eof {
			l.next()
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}
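
// For example, lexing the document `key = "value"` drives the states
// lexVoid -> lexKey -> lexVoid -> lexEqual -> lexRvalue -> lexString ->
// lexRvalue, emitting tokenKey("key"), tokenEqual("="),
// tokenString("value") and finally tokenEOF (a sketch of the flow, not
// verified output).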

// lexRvalue scans the value side of an assignment, dispatching on the next
// rune or keyword.
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			l.depth++
			return l.lexLeftBracket
		case ']':
			l.depth--
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			if l.depth == 0 {
				return l.lexVoid
			}
			return l.lexRvalue
		case '_':
			return l.errorf("cannot start number with underscore")
		}

		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if l.follow("inf") {
			return l.lexInf
		}

		if l.follow("nan") {
			return l.lexNan
		}

		if isSpace(next) {
			l.skip()
			continue
		}

		if next == eof {
			l.next()
			break
		}

		possibleDate := l.peekString(35)
		dateMatch := dateRegexp.FindString(possibleDate)
		if dateMatch != "" {
			l.fastForward(len(dateMatch))
			return l.lexDate
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		if isAlphanumeric(next) {
			return l.lexKey
		}

		return l.errorf("no value can start with %c", next)
	}

	l.emit(tokenEOF)
	return nil
}
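
// Array example: for `a = [1, [2]]`, lexRvalue bumps depth on each '[' and
// drops it on each ']', so a newline hands control back to lexVoid only once
// depth has returned to zero.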

func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftCurlyBrace)
	return l.lexRvalue
}

func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenRightCurlyBrace)
	return l.lexRvalue
}

func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}

func (l *tomlLexer) lexTrue() tomlLexStateFn {
	l.fastForward(4)
	l.emit(tokenTrue)
	return l.lexRvalue
}

func (l *tomlLexer) lexFalse() tomlLexStateFn {
	l.fastForward(5)
	l.emit(tokenFalse)
	return l.lexRvalue
}

func (l *tomlLexer) lexInf() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenInf)
	return l.lexRvalue
}

func (l *tomlLexer) lexNan() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenNan)
	return l.lexRvalue
}

func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}

func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.next()
	l.emit(tokenComma)
	return l.lexRvalue
}

// lexKey parses a key and emits its value with escape sequences already
// resolved. Bare keys, basic string keys, and literal string keys are
// supported.
func (l *tomlLexer) lexKey() tomlLexStateFn {
	growingString := ""

	for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
		if r == '"' {
			l.next()
			str, err := l.lexStringAsString(`"`, false, true)
			if err != nil {
				return l.errorf("%s", err.Error())
			}
			growingString += str
			l.next()
			continue
		} else if r == '\'' {
			l.next()
			str, err := l.lexLiteralStringAsString(`'`, false)
			if err != nil {
				return l.errorf("%s", err.Error())
			}
			growingString += str
			l.next()
			continue
		} else if r == '\n' {
			return l.errorf("keys cannot contain newlines")
		} else if isSpace(r) {
			break
		} else if !isValidBareChar(r) {
			return l.errorf("keys cannot contain %c character", r)
		}
		growingString += string(r)
		l.next()
	}
	l.emitWithValue(tokenKey, growingString)
	return l.lexVoid
}
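
// Key example: the keys `bare_key`, `"basic key"` and `'literal key'` all
// reach lexKey; quoted segments are handed to the string lexers above so the
// single emitted tokenKey carries the unescaped text.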

// lexComment consumes a comment up to (but not including) the line ending,
// then returns control to previousState.
func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
	return func() tomlLexStateFn {
		for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
			if next == '\r' && l.follow("\r\n") {
				break
			}
			l.next()
		}
		l.ignore()
		return previousState
	}
}

func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftBracket)
	return l.lexRvalue
}

// lexLiteralStringAsString reads a literal string (no escape processing)
// until terminator and returns its contents.
func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
	growingString := ""

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	// find end of string
	for {
		if l.follow(terminator) {
			return growingString, nil
		}

		next := l.peek()
		if next == eof {
			break
		}
		growingString += string(l.next())
	}

	return "", errors.New("unclosed string")
}

func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
	l.skip()

	// handle special case for triple-quote
	terminator := "'"
	discardLeadingNewLine := false
	if l.follow("''") {
		l.skip()
		l.skip()
		terminator = "'''"
		discardLeadingNewLine = true
	}

	str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
	if err != nil {
		return l.errorf("%s", err.Error())
	}

	l.emitWithValue(tokenString, str)
	l.fastForward(len(terminator))
	l.ignore()
	return l.lexRvalue
}
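
// Literal string example: `'C:\Users\node'` is emitted verbatim, backslashes
// included, and a `'''`-delimited string additionally drops a single leading
// newline, per TOML's multi-line literal rules.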

// lexStringAsString lexes a basic string and returns its contents with
// escape sequences processed. terminator is the substring indicating the end
// of the token; the resulting string does not include it.
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
	growingString := ""

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	for {
		if l.follow(terminator) {
			return growingString, nil
		}

		if l.follow("\\") {
			l.next()
			switch l.peek() {
			case '\r':
				fallthrough
			case '\n':
				fallthrough
			case '\t':
				fallthrough
			case ' ':
				// skip all whitespace chars following backslash
				for strings.ContainsRune("\r\n\t ", l.peek()) {
					l.next()
				}
			case '"':
				growingString += "\""
				l.next()
			case 'n':
				growingString += "\n"
				l.next()
			case 'b':
				growingString += "\b"
				l.next()
			case 'f':
				growingString += "\f"
				l.next()
			case '/':
				growingString += "/"
				l.next()
			case 't':
				growingString += "\t"
				l.next()
			case 'r':
				growingString += "\r"
				l.next()
			case '\\':
				growingString += "\\"
				l.next()
			case 'u':
				l.next()
				code := ""
				for i := 0; i < 4; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 32)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\u" + code)
				}
				growingString += string(rune(intcode))
			case 'U':
				l.next()
				code := ""
				for i := 0; i < 8; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 64)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\U" + code)
				}
				growingString += string(rune(intcode))
			default:
				return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
			}
		} else {
			r := l.peek()

			if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) {
				return "", fmt.Errorf("unescaped control character %U", r)
			}
			l.next()
			growingString += string(r)
		}

		if l.peek() == eof {
			break
		}
	}

	return "", errors.New("unclosed string")
}
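
// Escape example: the basic string `"a\tb\u00E9"` lexes to "a", a tab, "b",
// then é. A backslash immediately followed by whitespace consumes the whole
// whitespace run, which implements TOML's line-ending backslash for
// multi-line strings.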

func (l *tomlLexer) lexString() tomlLexStateFn {
	l.skip()

	// handle special case for triple-quote
	terminator := `"`
	discardLeadingNewLine := false
	acceptNewLines := false
	if l.follow(`""`) {
		l.skip()
		l.skip()
		terminator = `"""`
		discardLeadingNewLine = true
		acceptNewLines = true
	}

	str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
	if err != nil {
		return l.errorf("%s", err.Error())
	}

	l.emitWithValue(tokenString, str)
	l.fastForward(len(terminator))
	l.ignore()
	return l.lexRvalue
}

// lexTableKey dispatches between standard tables ('[') and arrays of tables
// ('[[').
func (l *tomlLexer) lexTableKey() tomlLexStateFn {
	l.next()

	if l.peek() == '[' {
		// token '[[' signifies an array of tables
		l.next()
		l.emit(tokenDoubleLeftBracket)
		return l.lexInsideTableArrayKey
	}
	// vanilla table key
	l.emit(tokenLeftBracket)
	return l.lexInsideTableKey
}

// lexInsideTableArrayKey parses the key until "]]"; only bare keys are
// supported.
func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
	for r := l.peek(); r != eof; r = l.peek() {
		switch r {
		case ']':
			if l.currentTokenStop > l.currentTokenStart {
				l.emit(tokenKeyGroupArray)
			}
			l.next()
			if l.peek() != ']' {
				break
			}
			l.next()
			l.emit(tokenDoubleRightBracket)
			return l.lexVoid
		case '[':
			return l.errorf("table array key cannot contain '['")
		default:
			l.next()
		}
	}
	return l.errorf("unclosed table array key")
}

// lexInsideTableKey parses the key until "]"; only bare keys are supported.
func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
	for r := l.peek(); r != eof; r = l.peek() {
		switch r {
		case ']':
			if l.currentTokenStop > l.currentTokenStart {
				l.emit(tokenKeyGroup)
			}
			l.next()
			l.emit(tokenRightBracket)
			return l.lexVoid
		case '[':
			return l.errorf("table key cannot contain '['")
		default:
			l.next()
		}
	}
	return l.errorf("unclosed table key")
}
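
// Table header example: `[dependencies]` emits tokenLeftBracket,
// tokenKeyGroup("dependencies"), tokenRightBracket, while `[[products]]`
// emits the double-bracket variants around tokenKeyGroupArray.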

func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenRightBracket)
	return l.lexRvalue
}

type validRuneFn func(r rune) bool

func isValidHexRune(r rune) bool {
	return r >= 'a' && r <= 'f' ||
		r >= 'A' && r <= 'F' ||
		r >= '0' && r <= '9' ||
		r == '_'
}

func isValidOctalRune(r rune) bool {
	return r >= '0' && r <= '7' || r == '_'
}

func isValidBinaryRune(r rune) bool {
	return r == '0' || r == '1' || r == '_'
}
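
// Base prefix example: `0xDEAD_BEEF`, `0o755` and `0b1101_0101` are all
// accepted; the underscore digit separator is allowed by each isValid*Rune
// helper above.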

// lexNumber scans integers (including 0x/0o/0b forms), floats, and signed
// inf/nan.
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()

	if r == '0' {
		follow := l.peekString(2)
		if len(follow) == 2 {
			var isValidRune validRuneFn
			switch follow[1] {
			case 'x':
				isValidRune = isValidHexRune
			case 'o':
				isValidRune = isValidOctalRune
			case 'b':
				isValidRune = isValidBinaryRune
			default:
				if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' {
					return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1]))
				}
			}

			if isValidRune != nil {
				l.next()
				l.next()
				digitSeen := false
				for {
					next := l.peek()
					if !isValidRune(next) {
						break
					}
					digitSeen = true
					l.next()
				}

				if !digitSeen {
					return l.errorf("number needs at least one digit")
				}

				l.emit(tokenInteger)

				return l.lexRvalue
			}
		}
	}

	if r == '+' || r == '-' {
		l.next()
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
	}

	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			l.next()
		} else {
			break
		}
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}
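
// Number example: `42` and `1_000` emit tokenInteger; `3.14`, `1e6` and
// `6.022E23` emit tokenFloat, since a dot or an exponent marks the value as
// a float.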

// run executes state functions until one returns nil.
func (l *tomlLexer) run() {
	for state := l.lexVoid; state != nil; {
		state = state()
	}
}

func init() {
	dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
}
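
// The date pattern matches RFC 3339 timestamps with an offset, such as
// 1979-05-27T07:32:00Z or 1979-05-27T00:32:00.999999-07:00; the 35-rune
// lookahead in lexRvalue is sized to cover the longest such form.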

// lexToml is the lexer's entry point: it converts the input to runes, runs
// the state machine, and returns the accumulated tokens, ending with either
// tokenEOF or tokenError.
func lexToml(inputBytes []byte) []token {
	runes := bytes.Runes(inputBytes)
	l := &tomlLexer{
		input:         runes,
		tokens:        make([]token, 0, 256),
		line:          1,
		col:           1,
		endbufferLine: 1,
		endbufferCol:  1,
	}
	l.run()
	return l.tokens
}
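
// Usage sketch (hypothetical in-package caller; fmt prints the token struct
// directly, or via a String method if one is defined elsewhere in the
// package):
//
//	for _, tok := range lexToml([]byte("answer = 42")) {
//		fmt.Println(tok)
//	}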