1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package parse
6
7import (
8	"fmt"
9	"strings"
10	"unicode"
11	"unicode/utf8"
12)
13
14// item represents a token or text string returned from the scanner.
15type item struct {
16	typ itemType // The type of this item.
17	pos Pos      // The starting position, in bytes, of this item in the input string.
18	val string   // The value of this item.
19}
20
21func (i item) String() string {
22	switch {
23	case i.typ == itemEOF:
24		return "EOF"
25	case i.typ == itemError:
26		return i.val
27	case i.typ > itemKeyword:
28		return fmt.Sprintf("<%s>", i.val)
29	case len(i.val) > 10:
30		return fmt.Sprintf("%.10q...", i.val)
31	}
32	return fmt.Sprintf("%q", i.val)
33}
34
35// itemType identifies the type of lex items.
36type itemType int
37
38const (
39	itemError        itemType = iota // error occurred; value is text of error
40	itemBool                         // boolean constant
41	itemChar                         // printable ASCII character; grab bag for comma etc.
42	itemCharConstant                 // character constant
43	itemComplex                      // complex constant (1+2i); imaginary is just a number
44	itemColonEquals                  // colon-equals (':=') introducing a declaration
45	itemEOF
46	itemField      // alphanumeric identifier starting with '.'
47	itemIdentifier // alphanumeric identifier not starting with '.'
48	itemLeftDelim  // left action delimiter
49	itemLeftParen  // '(' inside action
50	itemNumber     // simple number, including imaginary
51	itemPipe       // pipe symbol
52	itemRawString  // raw quoted string (includes quotes)
53	itemRightDelim // right action delimiter
54	itemRightParen // ')' inside action
55	itemSpace      // run of spaces separating arguments
56	itemString     // quoted string (includes quotes)
57	itemText       // plain text
58	itemVariable   // variable starting with '$', such as '$' or  '$1' or '$hello'
59	// Keywords appear after all the rest.
60	itemKeyword  // used only to delimit the keywords
61	itemDot      // the cursor, spelled '.'
62	itemDefine   // define keyword
63	itemElse     // else keyword
64	itemEnd      // end keyword
65	itemIf       // if keyword
66	itemNil      // the untyped nil constant, easiest to treat as a keyword
67	itemRange    // range keyword
68	itemTemplate // template keyword
69	itemWith     // with keyword
70)
71
72var key = map[string]itemType{
73	".":        itemDot,
74	"define":   itemDefine,
75	"else":     itemElse,
76	"end":      itemEnd,
77	"if":       itemIf,
78	"range":    itemRange,
79	"nil":      itemNil,
80	"template": itemTemplate,
81	"with":     itemWith,
82}
83
84const eof = -1
85
86// stateFn represents the state of the scanner as a function that returns the next state.
87type stateFn func(*lexer) stateFn
88
89// lexer holds the state of the scanner.
90type lexer struct {
91	name       string    // the name of the input; used only for error reports
92	input      string    // the string being scanned
93	leftDelim  string    // start of action
94	rightDelim string    // end of action
95	state      stateFn   // the next lexing function to enter
96	pos        Pos       // current position in the input
97	start      Pos       // start position of this item
98	width      Pos       // width of last rune read from input
99	lastPos    Pos       // position of most recent item returned by nextItem
100	items      chan item // channel of scanned items
101	parenDepth int       // nesting depth of ( ) exprs
102}
103
104// next returns the next rune in the input.
105func (l *lexer) next() rune {
106	if int(l.pos) >= len(l.input) {
107		l.width = 0
108		return eof
109	}
110	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
111	l.width = Pos(w)
112	l.pos += l.width
113	return r
114}
115
116// peek returns but does not consume the next rune in the input.
117func (l *lexer) peek() rune {
118	r := l.next()
119	l.backup()
120	return r
121}
122
123// backup steps back one rune. Can only be called once per call of next.
124func (l *lexer) backup() {
125	l.pos -= l.width
126}
127
128// emit passes an item back to the client.
129func (l *lexer) emit(t itemType) {
130	l.items <- item{t, l.start, l.input[l.start:l.pos]}
131	l.start = l.pos
132}
133
134// ignore skips over the pending input before this point.
135func (l *lexer) ignore() {
136	l.start = l.pos
137}
138
139// accept consumes the next rune if it's from the valid set.
140func (l *lexer) accept(valid string) bool {
141	if strings.IndexRune(valid, l.next()) >= 0 {
142		return true
143	}
144	l.backup()
145	return false
146}
147
148// acceptRun consumes a run of runes from the valid set.
149func (l *lexer) acceptRun(valid string) {
150	for strings.IndexRune(valid, l.next()) >= 0 {
151	}
152	l.backup()
153}
154
155// lineNumber reports which line we're on, based on the position of
156// the previous item returned by nextItem. Doing it this way
157// means we don't have to worry about peek double counting.
158func (l *lexer) lineNumber() int {
159	return 1 + strings.Count(l.input[:l.lastPos], "\n")
160}
161
162// errorf returns an error token and terminates the scan by passing
163// back a nil pointer that will be the next state, terminating l.nextItem.
164func (l *lexer) errorf(format string, args ...interface{}) stateFn {
165	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
166	return nil
167}
168
169// nextItem returns the next item from the input.
170func (l *lexer) nextItem() item {
171	item := <-l.items
172	l.lastPos = item.pos
173	return item
174}
175
176// lex creates a new scanner for the input string.
177func lex(name, input, left, right string) *lexer {
178	if left == "" {
179		left = leftDelim
180	}
181	if right == "" {
182		right = rightDelim
183	}
184	l := &lexer{
185		name:       name,
186		input:      input,
187		leftDelim:  left,
188		rightDelim: right,
189		items:      make(chan item),
190	}
191	go l.run()
192	return l
193}
194
195// run runs the state machine for the lexer.
196func (l *lexer) run() {
197	for l.state = lexText; l.state != nil; {
198		l.state = l.state(l)
199	}
200}
201
202// state functions
203
204const (
205	leftDelim    = "{{"
206	rightDelim   = "}}"
207	leftComment  = "/*"
208	rightComment = "*/"
209)
210
211// lexText scans until an opening action delimiter, "{{".
212func lexText(l *lexer) stateFn {
213	for {
214		if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
215			if l.pos > l.start {
216				l.emit(itemText)
217			}
218			return lexLeftDelim
219		}
220		if l.next() == eof {
221			break
222		}
223	}
224	// Correctly reached EOF.
225	if l.pos > l.start {
226		l.emit(itemText)
227	}
228	l.emit(itemEOF)
229	return nil
230}
231
232// lexLeftDelim scans the left delimiter, which is known to be present.
233func lexLeftDelim(l *lexer) stateFn {
234	l.pos += Pos(len(l.leftDelim))
235	if strings.HasPrefix(l.input[l.pos:], leftComment) {
236		return lexComment
237	}
238	l.emit(itemLeftDelim)
239	l.parenDepth = 0
240	return lexInsideAction
241}
242
243// lexComment scans a comment. The left comment marker is known to be present.
244func lexComment(l *lexer) stateFn {
245	l.pos += Pos(len(leftComment))
246	i := strings.Index(l.input[l.pos:], rightComment+l.rightDelim)
247	if i < 0 {
248		return l.errorf("unclosed comment")
249	}
250	l.pos += Pos(i + len(rightComment) + len(l.rightDelim))
251	l.ignore()
252	return lexText
253}
254
255// lexRightDelim scans the right delimiter, which is known to be present.
256func lexRightDelim(l *lexer) stateFn {
257	l.pos += Pos(len(l.rightDelim))
258	l.emit(itemRightDelim)
259	return lexText
260}
261
262// lexInsideAction scans the elements inside action delimiters.
263func lexInsideAction(l *lexer) stateFn {
264	// Either number, quoted string, or identifier.
265	// Spaces separate arguments; runs of spaces turn into itemSpace.
266	// Pipe symbols separate and are emitted.
267	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
268		if l.parenDepth == 0 {
269			return lexRightDelim
270		}
271		return l.errorf("unclosed left paren")
272	}
273	switch r := l.next(); {
274	case r == eof || isEndOfLine(r):
275		return l.errorf("unclosed action")
276	case isSpace(r):
277		return lexSpace
278	case r == ':':
279		if l.next() != '=' {
280			return l.errorf("expected :=")
281		}
282		l.emit(itemColonEquals)
283	case r == '|':
284		l.emit(itemPipe)
285	case r == '"':
286		return lexQuote
287	case r == '`':
288		return lexRawQuote
289	case r == '$':
290		return lexVariable
291	case r == '\'':
292		return lexChar
293	case r == '.':
294		// special look-ahead for ".field" so we don't break l.backup().
295		if l.pos < Pos(len(l.input)) {
296			r := l.input[l.pos]
297			if r < '0' || '9' < r {
298				return lexField
299			}
300		}
301		fallthrough // '.' can start a number.
302	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
303		l.backup()
304		return lexNumber
305	case isAlphaNumeric(r):
306		l.backup()
307		return lexIdentifier
308	case r == '(':
309		l.emit(itemLeftParen)
310		l.parenDepth++
311		return lexInsideAction
312	case r == ')':
313		l.emit(itemRightParen)
314		l.parenDepth--
315		if l.parenDepth < 0 {
316			return l.errorf("unexpected right paren %#U", r)
317		}
318		return lexInsideAction
319	case r <= unicode.MaxASCII && unicode.IsPrint(r):
320		l.emit(itemChar)
321		return lexInsideAction
322	default:
323		return l.errorf("unrecognized character in action: %#U", r)
324	}
325	return lexInsideAction
326}
327
328// lexSpace scans a run of space characters.
329// One space has already been seen.
330func lexSpace(l *lexer) stateFn {
331	for isSpace(l.peek()) {
332		l.next()
333	}
334	l.emit(itemSpace)
335	return lexInsideAction
336}
337
338// lexIdentifier scans an alphanumeric.
339func lexIdentifier(l *lexer) stateFn {
340Loop:
341	for {
342		switch r := l.next(); {
343		case isAlphaNumeric(r):
344			// absorb.
345		default:
346			l.backup()
347			word := l.input[l.start:l.pos]
348			if !l.atTerminator() {
349				return l.errorf("bad character %#U", r)
350			}
351			switch {
352			case key[word] > itemKeyword:
353				l.emit(key[word])
354			case word[0] == '.':
355				l.emit(itemField)
356			case word == "true", word == "false":
357				l.emit(itemBool)
358			default:
359				l.emit(itemIdentifier)
360			}
361			break Loop
362		}
363	}
364	return lexInsideAction
365}
366
367// lexField scans a field: .Alphanumeric.
368// The . has been scanned.
369func lexField(l *lexer) stateFn {
370	return lexFieldOrVariable(l, itemField)
371}
372
373// lexVariable scans a Variable: $Alphanumeric.
374// The $ has been scanned.
375func lexVariable(l *lexer) stateFn {
376	if l.atTerminator() { // Nothing interesting follows -> "$".
377		l.emit(itemVariable)
378		return lexInsideAction
379	}
380	return lexFieldOrVariable(l, itemVariable)
381}
382
383// lexVariable scans a field or variable: [.$]Alphanumeric.
384// The . or $ has been scanned.
385func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
386	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
387		if typ == itemVariable {
388			l.emit(itemVariable)
389		} else {
390			l.emit(itemDot)
391		}
392		return lexInsideAction
393	}
394	var r rune
395	for {
396		r = l.next()
397		if !isAlphaNumeric(r) {
398			l.backup()
399			break
400		}
401	}
402	if !l.atTerminator() {
403		return l.errorf("bad character %#U", r)
404	}
405	l.emit(typ)
406	return lexInsideAction
407}
408
409// atTerminator reports whether the input is at valid termination character to
410// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
411// like "$x+2" not being acceptable without a space, in case we decide one
412// day to implement arithmetic.
413func (l *lexer) atTerminator() bool {
414	r := l.peek()
415	if isSpace(r) || isEndOfLine(r) {
416		return true
417	}
418	switch r {
419	case eof, '.', ',', '|', ':', ')', '(':
420		return true
421	}
422	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
423	// succeed but should fail) but only in extremely rare cases caused by willfully
424	// bad choice of delimiter.
425	if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
426		return true
427	}
428	return false
429}
430
431// lexChar scans a character constant. The initial quote is already
432// scanned. Syntax checking is done by the parser.
433func lexChar(l *lexer) stateFn {
434Loop:
435	for {
436		switch l.next() {
437		case '\\':
438			if r := l.next(); r != eof && r != '\n' {
439				break
440			}
441			fallthrough
442		case eof, '\n':
443			return l.errorf("unterminated character constant")
444		case '\'':
445			break Loop
446		}
447	}
448	l.emit(itemCharConstant)
449	return lexInsideAction
450}
451
452// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
453// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
454// and "089" - but when it's wrong the input is invalid and the parser (via
455// strconv) will notice.
456func lexNumber(l *lexer) stateFn {
457	if !l.scanNumber() {
458		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
459	}
460	if sign := l.peek(); sign == '+' || sign == '-' {
461		// Complex: 1+2i. No spaces, must end in 'i'.
462		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
463			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
464		}
465		l.emit(itemComplex)
466	} else {
467		l.emit(itemNumber)
468	}
469	return lexInsideAction
470}
471
472func (l *lexer) scanNumber() bool {
473	// Optional leading sign.
474	l.accept("+-")
475	// Is it hex?
476	digits := "0123456789"
477	if l.accept("0") && l.accept("xX") {
478		digits = "0123456789abcdefABCDEF"
479	}
480	l.acceptRun(digits)
481	if l.accept(".") {
482		l.acceptRun(digits)
483	}
484	if l.accept("eE") {
485		l.accept("+-")
486		l.acceptRun("0123456789")
487	}
488	// Is it imaginary?
489	l.accept("i")
490	// Next thing mustn't be alphanumeric.
491	if isAlphaNumeric(l.peek()) {
492		l.next()
493		return false
494	}
495	return true
496}
497
498// lexQuote scans a quoted string.
499func lexQuote(l *lexer) stateFn {
500Loop:
501	for {
502		switch l.next() {
503		case '\\':
504			if r := l.next(); r != eof && r != '\n' {
505				break
506			}
507			fallthrough
508		case eof, '\n':
509			return l.errorf("unterminated quoted string")
510		case '"':
511			break Loop
512		}
513	}
514	l.emit(itemString)
515	return lexInsideAction
516}
517
518// lexRawQuote scans a raw quoted string.
519func lexRawQuote(l *lexer) stateFn {
520Loop:
521	for {
522		switch l.next() {
523		case eof, '\n':
524			return l.errorf("unterminated raw quoted string")
525		case '`':
526			break Loop
527		}
528	}
529	l.emit(itemRawString)
530	return lexInsideAction
531}
532
533// isSpace reports whether r is a space character.
534func isSpace(r rune) bool {
535	return r == ' ' || r == '\t'
536}
537
538// isEndOfLine reports whether r is an end-of-line character.
539func isEndOfLine(r rune) bool {
540	return r == '\r' || r == '\n'
541}
542
543// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
544func isAlphaNumeric(r rune) bool {
545	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
546}
547