1// TOML lexer.
2//
3// Written using the principles developed by Rob Pike in
4// http://www.youtube.com/watch?v=HxaD_trXwRE
5
6package toml
7
8import (
9	"errors"
10	"fmt"
11	"io"
12	"regexp"
13	"strconv"
14	"strings"
15
16	"github.com/pelletier/go-buffruneio"
17)
18
// dateRegexp matches an RFC 3339 datetime at the start of the input;
// compiled once in init below.
var dateRegexp *regexp.Regexp

// Define state functions
// A state function lexes one construct and returns the next state,
// or nil to stop the machine.
type tomlLexStateFn func() tomlLexStateFn

// Define lexer
type tomlLexer struct {
	input         *buffruneio.Reader // Textual source
	buffer        []rune             // Runes composing the current token
	tokens        chan token         // Emitted tokens; closed when lexing ends
	depth         int                // Array nesting depth ('[' minus ']')
	line          int                // Line where the current token starts
	col           int                // Column where the current token starts
	endbufferLine int                // Line just past the buffered runes
	endbufferCol  int                // Column just past the buffered runes
}
35
36// Basic read operations on input
37
38func (l *tomlLexer) read() rune {
39	r, _, err := l.input.ReadRune()
40	if err != nil {
41		panic(err)
42	}
43	if r == '\n' {
44		l.endbufferLine++
45		l.endbufferCol = 1
46	} else {
47		l.endbufferCol++
48	}
49	return r
50}
51
52func (l *tomlLexer) next() rune {
53	r := l.read()
54
55	if r != eof {
56		l.buffer = append(l.buffer, r)
57	}
58	return r
59}
60
61func (l *tomlLexer) ignore() {
62	l.buffer = make([]rune, 0)
63	l.line = l.endbufferLine
64	l.col = l.endbufferCol
65}
66
// skip consumes the next rune without making it part of any token.
func (l *tomlLexer) skip() {
	l.next()
	l.ignore()
}
71
72func (l *tomlLexer) fastForward(n int) {
73	for i := 0; i < n; i++ {
74		l.next()
75	}
76}
77
78func (l *tomlLexer) emitWithValue(t tokenType, value string) {
79	l.tokens <- token{
80		Position: Position{l.line, l.col},
81		typ:      t,
82		val:      value,
83	}
84	l.ignore()
85}
86
// emit sends a token of type t whose value is the current buffer content.
func (l *tomlLexer) emit(t tokenType) {
	l.emitWithValue(t, string(l.buffer))
}
90
// peek returns the next rune without consuming it. It panics on an
// underlying read error, matching read's behavior.
func (l *tomlLexer) peek() rune {
	r, _, err := l.input.ReadRune()
	if err != nil {
		panic(err)
	}
	l.input.UnreadRune()
	return r
}
99
100func (l *tomlLexer) follow(next string) bool {
101	for _, expectedRune := range next {
102		r, _, err := l.input.ReadRune()
103		defer l.input.UnreadRune()
104		if err != nil {
105			panic(err)
106		}
107		if expectedRune != r {
108			return false
109		}
110	}
111	return true
112}
113
114// Error management
115
// errorf emits a tokenError carrying the formatted message at the
// current token position and returns nil, terminating the state machine.
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
	l.tokens <- token{
		Position: Position{l.line, l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	}
	return nil
}
124
125// State functions
126
// lexVoid is the top-level state, active between key/value pairs and
// table headers. It dispatches to the appropriate state based on the
// next rune, and emits tokenEOF at end of input.
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			// Blank lines produce no tokens.
			l.skip()
			continue
		}

		// NOTE(review): no `continue` here, so the checks below still see
		// the pre-skip space rune. This is harmless: the depth check does
		// not depend on next, and the remaining checks are false for a
		// space, so the loop re-peeks on the next iteration.
		if isSpace(next) {
			l.skip()
		}

		// Inside an array (depth is maintained by lexRvalue), values keep
		// being lexed even across newlines.
		if l.depth > 0 {
			return l.lexRvalue
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		if next == eof {
			l.next()
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}
165
// lexRvalue lexes the value side of an assignment: scalars, arrays,
// inline tables, and the punctuation separating them.
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			// Entering an array: depth keeps newlines inside arrays from
			// switching back to lexVoid.
			l.depth++
			return l.lexLeftBracket
		case ']':
			l.depth--
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			// Outside any array, a newline terminates the value.
			if l.depth == 0 {
				return l.lexVoid
			}
			return l.lexRvalue
		case '_':
			return l.errorf("cannot start number with underscore")
		}

		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if isSpace(next) {
			l.skip()
			continue
		}

		if next == eof {
			l.next()
			break
		}

		// 35 runes of lookahead covers the longest datetime form matched
		// by dateRegexp (which is anchored at the start).
		possibleDate := string(l.input.PeekRunes(35))
		dateMatch := dateRegexp.FindString(possibleDate)
		if dateMatch != "" {
			// len is in bytes, but the match is ASCII so it equals the
			// rune count consumed here.
			l.fastForward(len(dateMatch))
			return l.lexDate
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		if isAlphanumeric(next) {
			return l.lexKey
		}

		return l.errorf("no value can start with %c", next)
	}

	l.emit(tokenEOF)
	return nil
}
243
// lexLeftCurlyBrace emits the '{' opening an inline table.
func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftCurlyBrace)
	return l.lexRvalue
}
249
// lexRightCurlyBrace emits the '}' closing an inline table.
func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenRightCurlyBrace)
	return l.lexRvalue
}
255
// lexDate emits a tokenDate; the date runes were already consumed into
// the buffer by lexRvalue's fastForward over the regexp match.
func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}
260
// lexTrue consumes the 4 runes of "true" and emits tokenTrue.
func (l *tomlLexer) lexTrue() tomlLexStateFn {
	l.fastForward(4)
	l.emit(tokenTrue)
	return l.lexRvalue
}
266
// lexFalse consumes the 5 runes of "false" and emits tokenFalse.
func (l *tomlLexer) lexFalse() tomlLexStateFn {
	l.fastForward(5)
	l.emit(tokenFalse)
	return l.lexRvalue
}
272
// lexEqual emits the '=' between a key and its value.
func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}
278
// lexComma emits the ',' separating array or inline-table elements.
func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.next()
	l.emit(tokenComma)
	return l.lexRvalue
}
284
285func (l *tomlLexer) lexKey() tomlLexStateFn {
286	growingString := ""
287
288	for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
289		if r == '"' {
290			l.next()
291			str, err := l.lexStringAsString(`"`, false, true)
292			if err != nil {
293				return l.errorf(err.Error())
294			}
295			growingString += `"` + str + `"`
296			l.next()
297			continue
298		} else if r == '\n' {
299			return l.errorf("keys cannot contain new lines")
300		} else if isSpace(r) {
301			break
302		} else if !isValidBareChar(r) {
303			return l.errorf("keys cannot contain %c character", r)
304		}
305		growingString += string(r)
306		l.next()
307	}
308	l.emitWithValue(tokenKey, growingString)
309	return l.lexVoid
310}
311
312func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
313	return func() tomlLexStateFn {
314		for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
315			if next == '\r' && l.follow("\r\n") {
316				break
317			}
318			l.next()
319		}
320		l.ignore()
321		return previousState
322	}
323}
324
// lexLeftBracket emits the '[' opening an array value.
func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftBracket)
	return l.lexRvalue
}
330
331func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
332	growingString := ""
333
334	if discardLeadingNewLine {
335		if l.follow("\r\n") {
336			l.skip()
337			l.skip()
338		} else if l.peek() == '\n' {
339			l.skip()
340		}
341	}
342
343	// find end of string
344	for {
345		if l.follow(terminator) {
346			return growingString, nil
347		}
348
349		next := l.peek()
350		if next == eof {
351			break
352		}
353		growingString += string(l.next())
354	}
355
356	return "", errors.New("unclosed string")
357}
358
359func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
360	l.skip()
361
362	// handle special case for triple-quote
363	terminator := "'"
364	discardLeadingNewLine := false
365	if l.follow("''") {
366		l.skip()
367		l.skip()
368		terminator = "'''"
369		discardLeadingNewLine = true
370	}
371
372	str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
373	if err != nil {
374		return l.errorf(err.Error())
375	}
376
377	l.emitWithValue(tokenString, str)
378	l.fastForward(len(terminator))
379	l.ignore()
380	return l.lexRvalue
381}
382
383// Lex a string and return the results as a string.
384// Terminator is the substring indicating the end of the token.
385// The resulting string does not include the terminator.
386func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
387	growingString := ""
388
389	if discardLeadingNewLine {
390		if l.follow("\r\n") {
391			l.skip()
392			l.skip()
393		} else if l.peek() == '\n' {
394			l.skip()
395		}
396	}
397
398	for {
399		if l.follow(terminator) {
400			return growingString, nil
401		}
402
403		if l.follow("\\") {
404			l.next()
405			switch l.peek() {
406			case '\r':
407				fallthrough
408			case '\n':
409				fallthrough
410			case '\t':
411				fallthrough
412			case ' ':
413				// skip all whitespace chars following backslash
414				for strings.ContainsRune("\r\n\t ", l.peek()) {
415					l.next()
416				}
417			case '"':
418				growingString += "\""
419				l.next()
420			case 'n':
421				growingString += "\n"
422				l.next()
423			case 'b':
424				growingString += "\b"
425				l.next()
426			case 'f':
427				growingString += "\f"
428				l.next()
429			case '/':
430				growingString += "/"
431				l.next()
432			case 't':
433				growingString += "\t"
434				l.next()
435			case 'r':
436				growingString += "\r"
437				l.next()
438			case '\\':
439				growingString += "\\"
440				l.next()
441			case 'u':
442				l.next()
443				code := ""
444				for i := 0; i < 4; i++ {
445					c := l.peek()
446					if !isHexDigit(c) {
447						return "", errors.New("unfinished unicode escape")
448					}
449					l.next()
450					code = code + string(c)
451				}
452				intcode, err := strconv.ParseInt(code, 16, 32)
453				if err != nil {
454					return "", errors.New("invalid unicode escape: \\u" + code)
455				}
456				growingString += string(rune(intcode))
457			case 'U':
458				l.next()
459				code := ""
460				for i := 0; i < 8; i++ {
461					c := l.peek()
462					if !isHexDigit(c) {
463						return "", errors.New("unfinished unicode escape")
464					}
465					l.next()
466					code = code + string(c)
467				}
468				intcode, err := strconv.ParseInt(code, 16, 64)
469				if err != nil {
470					return "", errors.New("invalid unicode escape: \\U" + code)
471				}
472				growingString += string(rune(intcode))
473			default:
474				return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
475			}
476		} else {
477			r := l.peek()
478
479			if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) {
480				return "", fmt.Errorf("unescaped control character %U", r)
481			}
482			l.next()
483			growingString += string(r)
484		}
485
486		if l.peek() == eof {
487			break
488		}
489	}
490
491	return "", errors.New("unclosed string")
492}
493
494func (l *tomlLexer) lexString() tomlLexStateFn {
495	l.skip()
496
497	// handle special case for triple-quote
498	terminator := `"`
499	discardLeadingNewLine := false
500	acceptNewLines := false
501	if l.follow(`""`) {
502		l.skip()
503		l.skip()
504		terminator = `"""`
505		discardLeadingNewLine = true
506		acceptNewLines = true
507	}
508
509	str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
510
511	if err != nil {
512		return l.errorf(err.Error())
513	}
514
515	l.emitWithValue(tokenString, str)
516	l.fastForward(len(terminator))
517	l.ignore()
518	return l.lexRvalue
519}
520
521func (l *tomlLexer) lexTableKey() tomlLexStateFn {
522	l.next()
523
524	if l.peek() == '[' {
525		// token '[[' signifies an array of tables
526		l.next()
527		l.emit(tokenDoubleLeftBracket)
528		return l.lexInsideTableArrayKey
529	}
530	// vanilla table key
531	l.emit(tokenLeftBracket)
532	return l.lexInsideTableKey
533}
534
535func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
536	for r := l.peek(); r != eof; r = l.peek() {
537		switch r {
538		case ']':
539			if len(l.buffer) > 0 {
540				l.emit(tokenKeyGroupArray)
541			}
542			l.next()
543			if l.peek() != ']' {
544				break
545			}
546			l.next()
547			l.emit(tokenDoubleRightBracket)
548			return l.lexVoid
549		case '[':
550			return l.errorf("table array key cannot contain ']'")
551		default:
552			l.next()
553		}
554	}
555	return l.errorf("unclosed table array key")
556}
557
558func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
559	for r := l.peek(); r != eof; r = l.peek() {
560		switch r {
561		case ']':
562			if len(l.buffer) > 0 {
563				l.emit(tokenKeyGroup)
564			}
565			l.next()
566			l.emit(tokenRightBracket)
567			return l.lexVoid
568		case '[':
569			return l.errorf("table key cannot contain ']'")
570		default:
571			l.next()
572		}
573	}
574	return l.errorf("unclosed table key")
575}
576
// lexRightBracket emits the ']' closing an array value.
func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenRightBracket)
	return l.lexRvalue
}
582
// lexNumber lexes an integer or a float (optional sign, decimal point,
// exponent, '_' digit separators) and emits tokenInteger or tokenFloat.
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()
	if r == '+' || r == '-' {
		l.next()
	}
	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			// A dot must be followed by at least one digit.
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			// Exponent may carry its own sign.
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			// Underscore separators are consumed but not validated here.
			l.next()
		} else {
			break
		}
		// A dot must also be preceded by at least one digit (".5" is
		// rejected even with a leading sign).
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}
632
633func (l *tomlLexer) run() {
634	for state := l.lexVoid; state != nil; {
635		state = state()
636	}
637	close(l.tokens)
638}
639
func init() {
	// Matches an RFC 3339 datetime anchored at the start of the input:
	// date, 'T', time, optional fractional seconds, and a Z or numeric
	// offset. MustCompile is appropriate: the pattern is constant.
	dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
}
643
644// Entry point
645func lexToml(input io.Reader) chan token {
646	bufferedInput := buffruneio.NewReader(input)
647	l := &tomlLexer{
648		input:         bufferedInput,
649		tokens:        make(chan token),
650		line:          1,
651		col:           1,
652		endbufferLine: 1,
653		endbufferCol:  1,
654	}
655	go l.run()
656	return l.tokens
657}
658