1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package parse
6
7import (
8	"fmt"
9	"strings"
10	"unicode"
11	"unicode/utf8"
12)
13
14// item represents a token or text string returned from the scanner.
15type item struct {
16	typ itemType // The type of this item.
17	pos Pos      // The starting position, in bytes, of this item in the input string.
18	val string   // The value of this item.
19}
20
21func (i item) String() string {
22	switch {
23	case i.typ == itemEOF:
24		return "EOF"
25	case i.typ == itemError:
26		return i.val
27	case i.typ > itemKeyword:
28		return fmt.Sprintf("<%s>", i.val)
29	case len(i.val) > 10:
30		return fmt.Sprintf("%.10q...", i.val)
31	}
32	return fmt.Sprintf("%q", i.val)
33}
34
35// itemType identifies the type of lex items.
36type itemType int
37
38const (
39	itemError        itemType = iota // error occurred; value is text of error
40	itemBool                         // boolean constant
41	itemChar                         // printable ASCII character; grab bag for comma etc.
42	itemCharConstant                 // character constant
43	itemComplex                      // complex constant (1+2i); imaginary is just a number
44	itemColonEquals                  // colon-equals (':=') introducing a declaration
45	itemEOF
46	itemField        // alphanumeric identifier starting with '.'
47	itemIdentifier   // alphanumeric identifier not starting with '.'
48	itemLeftDelim    // left action delimiter
49	itemLeftParen    // '(' inside action
50	itemNumber       // simple number, including imaginary
51	itemPipe         // pipe symbol
52	itemRawString    // raw quoted string (includes quotes)
53	itemRightDelim   // right action delimiter
54	itemElideNewline // elide newline after right delim
55	itemRightParen   // ')' inside action
56	itemSpace        // run of spaces separating arguments
57	itemString       // quoted string (includes quotes)
58	itemText         // plain text
59	itemVariable     // variable starting with '$', such as '$' or  '$1' or '$hello'
60	// Keywords appear after all the rest.
61	itemKeyword  // used only to delimit the keywords
62	itemDot      // the cursor, spelled '.'
63	itemDefine   // define keyword
64	itemElse     // else keyword
65	itemEnd      // end keyword
66	itemIf       // if keyword
67	itemNil      // the untyped nil constant, easiest to treat as a keyword
68	itemRange    // range keyword
69	itemTemplate // template keyword
70	itemWith     // with keyword
71)
72
73var key = map[string]itemType{
74	".":        itemDot,
75	"define":   itemDefine,
76	"else":     itemElse,
77	"end":      itemEnd,
78	"if":       itemIf,
79	"range":    itemRange,
80	"nil":      itemNil,
81	"template": itemTemplate,
82	"with":     itemWith,
83}
84
85const eof = -1
86
87// stateFn represents the state of the scanner as a function that returns the next state.
88type stateFn func(*lexer) stateFn
89
90// lexer holds the state of the scanner.
91type lexer struct {
92	name       string    // the name of the input; used only for error reports
93	input      string    // the string being scanned
94	leftDelim  string    // start of action
95	rightDelim string    // end of action
96	state      stateFn   // the next lexing function to enter
97	pos        Pos       // current position in the input
98	start      Pos       // start position of this item
99	width      Pos       // width of last rune read from input
100	lastPos    Pos       // position of most recent item returned by nextItem
101	items      chan item // channel of scanned items
102	parenDepth int       // nesting depth of ( ) exprs
103}
104
105// next returns the next rune in the input.
106func (l *lexer) next() rune {
107	if int(l.pos) >= len(l.input) {
108		l.width = 0
109		return eof
110	}
111	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
112	l.width = Pos(w)
113	l.pos += l.width
114	return r
115}
116
117// peek returns but does not consume the next rune in the input.
118func (l *lexer) peek() rune {
119	r := l.next()
120	l.backup()
121	return r
122}
123
124// backup steps back one rune. Can only be called once per call of next.
125func (l *lexer) backup() {
126	l.pos -= l.width
127}
128
129// emit passes an item back to the client.
130func (l *lexer) emit(t itemType) {
131	l.items <- item{t, l.start, l.input[l.start:l.pos]}
132	l.start = l.pos
133}
134
135// ignore skips over the pending input before this point.
136func (l *lexer) ignore() {
137	l.start = l.pos
138}
139
140// accept consumes the next rune if it's from the valid set.
141func (l *lexer) accept(valid string) bool {
142	if strings.IndexRune(valid, l.next()) >= 0 {
143		return true
144	}
145	l.backup()
146	return false
147}
148
149// acceptRun consumes a run of runes from the valid set.
150func (l *lexer) acceptRun(valid string) {
151	for strings.IndexRune(valid, l.next()) >= 0 {
152	}
153	l.backup()
154}
155
156// lineNumber reports which line we're on, based on the position of
157// the previous item returned by nextItem. Doing it this way
158// means we don't have to worry about peek double counting.
159func (l *lexer) lineNumber() int {
160	return 1 + strings.Count(l.input[:l.lastPos], "\n")
161}
162
163// errorf returns an error token and terminates the scan by passing
164// back a nil pointer that will be the next state, terminating l.nextItem.
165func (l *lexer) errorf(format string, args ...interface{}) stateFn {
166	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
167	return nil
168}
169
170// nextItem returns the next item from the input.
171func (l *lexer) nextItem() item {
172	item := <-l.items
173	l.lastPos = item.pos
174	return item
175}
176
177// lex creates a new scanner for the input string.
178func lex(name, input, left, right string) *lexer {
179	if left == "" {
180		left = leftDelim
181	}
182	if right == "" {
183		right = rightDelim
184	}
185	l := &lexer{
186		name:       name,
187		input:      input,
188		leftDelim:  left,
189		rightDelim: right,
190		items:      make(chan item),
191	}
192	go l.run()
193	return l
194}
195
196// run runs the state machine for the lexer.
197func (l *lexer) run() {
198	for l.state = lexText; l.state != nil; {
199		l.state = l.state(l)
200	}
201}
202
203// state functions
204
205const (
206	leftDelim    = "{{"
207	rightDelim   = "}}"
208	leftComment  = "/*"
209	rightComment = "*/"
210)
211
212// lexText scans until an opening action delimiter, "{{".
213func lexText(l *lexer) stateFn {
214	for {
215		if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
216			if l.pos > l.start {
217				l.emit(itemText)
218			}
219			return lexLeftDelim
220		}
221		if l.next() == eof {
222			break
223		}
224	}
225	// Correctly reached EOF.
226	if l.pos > l.start {
227		l.emit(itemText)
228	}
229	l.emit(itemEOF)
230	return nil
231}
232
233// lexLeftDelim scans the left delimiter, which is known to be present.
234func lexLeftDelim(l *lexer) stateFn {
235	l.pos += Pos(len(l.leftDelim))
236	if strings.HasPrefix(l.input[l.pos:], leftComment) {
237		return lexComment
238	}
239	l.emit(itemLeftDelim)
240	l.parenDepth = 0
241	return lexInsideAction
242}
243
244// lexComment scans a comment. The left comment marker is known to be present.
245func lexComment(l *lexer) stateFn {
246	l.pos += Pos(len(leftComment))
247	i := strings.Index(l.input[l.pos:], rightComment)
248	if i < 0 {
249		return l.errorf("unclosed comment")
250	}
251	l.pos += Pos(i + len(rightComment))
252	if !strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
253		return l.errorf("comment ends before closing delimiter")
254
255	}
256	l.pos += Pos(len(l.rightDelim))
257	l.ignore()
258	return lexText
259}
260
261// lexRightDelim scans the right delimiter, which is known to be present.
262func lexRightDelim(l *lexer) stateFn {
263	l.pos += Pos(len(l.rightDelim))
264	l.emit(itemRightDelim)
265	if l.peek() == '\\' {
266		l.pos++
267		l.emit(itemElideNewline)
268	}
269	return lexText
270}
271
272// lexInsideAction scans the elements inside action delimiters.
273func lexInsideAction(l *lexer) stateFn {
274	// Either number, quoted string, or identifier.
275	// Spaces separate arguments; runs of spaces turn into itemSpace.
276	// Pipe symbols separate and are emitted.
277	if strings.HasPrefix(l.input[l.pos:], l.rightDelim+"\\") || strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
278		if l.parenDepth == 0 {
279			return lexRightDelim
280		}
281		return l.errorf("unclosed left paren")
282	}
283	switch r := l.next(); {
284	case r == eof || isEndOfLine(r):
285		return l.errorf("unclosed action")
286	case isSpace(r):
287		return lexSpace
288	case r == ':':
289		if l.next() != '=' {
290			return l.errorf("expected :=")
291		}
292		l.emit(itemColonEquals)
293	case r == '|':
294		l.emit(itemPipe)
295	case r == '"':
296		return lexQuote
297	case r == '`':
298		return lexRawQuote
299	case r == '$':
300		return lexVariable
301	case r == '\'':
302		return lexChar
303	case r == '.':
304		// special look-ahead for ".field" so we don't break l.backup().
305		if l.pos < Pos(len(l.input)) {
306			r := l.input[l.pos]
307			if r < '0' || '9' < r {
308				return lexField
309			}
310		}
311		fallthrough // '.' can start a number.
312	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
313		l.backup()
314		return lexNumber
315	case isAlphaNumeric(r):
316		l.backup()
317		return lexIdentifier
318	case r == '(':
319		l.emit(itemLeftParen)
320		l.parenDepth++
321		return lexInsideAction
322	case r == ')':
323		l.emit(itemRightParen)
324		l.parenDepth--
325		if l.parenDepth < 0 {
326			return l.errorf("unexpected right paren %#U", r)
327		}
328		return lexInsideAction
329	case r <= unicode.MaxASCII && unicode.IsPrint(r):
330		l.emit(itemChar)
331		return lexInsideAction
332	default:
333		return l.errorf("unrecognized character in action: %#U", r)
334	}
335	return lexInsideAction
336}
337
338// lexSpace scans a run of space characters.
339// One space has already been seen.
340func lexSpace(l *lexer) stateFn {
341	for isSpace(l.peek()) {
342		l.next()
343	}
344	l.emit(itemSpace)
345	return lexInsideAction
346}
347
348// lexIdentifier scans an alphanumeric.
349func lexIdentifier(l *lexer) stateFn {
350Loop:
351	for {
352		switch r := l.next(); {
353		case isAlphaNumeric(r):
354			// absorb.
355		default:
356			l.backup()
357			word := l.input[l.start:l.pos]
358			if !l.atTerminator() {
359				return l.errorf("bad character %#U", r)
360			}
361			switch {
362			case key[word] > itemKeyword:
363				l.emit(key[word])
364			case word[0] == '.':
365				l.emit(itemField)
366			case word == "true", word == "false":
367				l.emit(itemBool)
368			default:
369				l.emit(itemIdentifier)
370			}
371			break Loop
372		}
373	}
374	return lexInsideAction
375}
376
377// lexField scans a field: .Alphanumeric.
378// The . has been scanned.
379func lexField(l *lexer) stateFn {
380	return lexFieldOrVariable(l, itemField)
381}
382
383// lexVariable scans a Variable: $Alphanumeric.
384// The $ has been scanned.
385func lexVariable(l *lexer) stateFn {
386	if l.atTerminator() { // Nothing interesting follows -> "$".
387		l.emit(itemVariable)
388		return lexInsideAction
389	}
390	return lexFieldOrVariable(l, itemVariable)
391}
392
393// lexVariable scans a field or variable: [.$]Alphanumeric.
394// The . or $ has been scanned.
395func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
396	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
397		if typ == itemVariable {
398			l.emit(itemVariable)
399		} else {
400			l.emit(itemDot)
401		}
402		return lexInsideAction
403	}
404	var r rune
405	for {
406		r = l.next()
407		if !isAlphaNumeric(r) {
408			l.backup()
409			break
410		}
411	}
412	if !l.atTerminator() {
413		return l.errorf("bad character %#U", r)
414	}
415	l.emit(typ)
416	return lexInsideAction
417}
418
419// atTerminator reports whether the input is at valid termination character to
420// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
421// like "$x+2" not being acceptable without a space, in case we decide one
422// day to implement arithmetic.
423func (l *lexer) atTerminator() bool {
424	r := l.peek()
425	if isSpace(r) || isEndOfLine(r) {
426		return true
427	}
428	switch r {
429	case eof, '.', ',', '|', ':', ')', '(':
430		return true
431	}
432	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
433	// succeed but should fail) but only in extremely rare cases caused by willfully
434	// bad choice of delimiter.
435	if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
436		return true
437	}
438	return false
439}
440
441// lexChar scans a character constant. The initial quote is already
442// scanned. Syntax checking is done by the parser.
443func lexChar(l *lexer) stateFn {
444Loop:
445	for {
446		switch l.next() {
447		case '\\':
448			if r := l.next(); r != eof && r != '\n' {
449				break
450			}
451			fallthrough
452		case eof, '\n':
453			return l.errorf("unterminated character constant")
454		case '\'':
455			break Loop
456		}
457	}
458	l.emit(itemCharConstant)
459	return lexInsideAction
460}
461
462// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
463// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
464// and "089" - but when it's wrong the input is invalid and the parser (via
465// strconv) will notice.
466func lexNumber(l *lexer) stateFn {
467	if !l.scanNumber() {
468		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
469	}
470	if sign := l.peek(); sign == '+' || sign == '-' {
471		// Complex: 1+2i. No spaces, must end in 'i'.
472		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
473			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
474		}
475		l.emit(itemComplex)
476	} else {
477		l.emit(itemNumber)
478	}
479	return lexInsideAction
480}
481
482func (l *lexer) scanNumber() bool {
483	// Optional leading sign.
484	l.accept("+-")
485	// Is it hex?
486	digits := "0123456789"
487	if l.accept("0") && l.accept("xX") {
488		digits = "0123456789abcdefABCDEF"
489	}
490	l.acceptRun(digits)
491	if l.accept(".") {
492		l.acceptRun(digits)
493	}
494	if l.accept("eE") {
495		l.accept("+-")
496		l.acceptRun("0123456789")
497	}
498	// Is it imaginary?
499	l.accept("i")
500	// Next thing mustn't be alphanumeric.
501	if isAlphaNumeric(l.peek()) {
502		l.next()
503		return false
504	}
505	return true
506}
507
508// lexQuote scans a quoted string.
509func lexQuote(l *lexer) stateFn {
510Loop:
511	for {
512		switch l.next() {
513		case '\\':
514			if r := l.next(); r != eof && r != '\n' {
515				break
516			}
517			fallthrough
518		case eof, '\n':
519			return l.errorf("unterminated quoted string")
520		case '"':
521			break Loop
522		}
523	}
524	l.emit(itemString)
525	return lexInsideAction
526}
527
528// lexRawQuote scans a raw quoted string.
529func lexRawQuote(l *lexer) stateFn {
530Loop:
531	for {
532		switch l.next() {
533		case eof, '\n':
534			return l.errorf("unterminated raw quoted string")
535		case '`':
536			break Loop
537		}
538	}
539	l.emit(itemRawString)
540	return lexInsideAction
541}
542
543// isSpace reports whether r is a space character.
544func isSpace(r rune) bool {
545	return r == ' ' || r == '\t'
546}
547
548// isEndOfLine reports whether r is an end-of-line character.
549func isEndOfLine(r rune) bool {
550	return r == '\r' || r == '\n'
551}
552
553// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
554func isAlphaNumeric(r rune) bool {
555	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
556}
557