1// Package scanner implements a scanner for HCL (HashiCorp Configuration
2// Language) source text.
3package scanner
4
5import (
6	"bytes"
7	"fmt"
8	"os"
9	"regexp"
10	"unicode"
11	"unicode/utf8"
12
13	"github.com/hashicorp/hcl/hcl/token"
14)
15
// eof is the sentinel rune the scanner uses to signal the end of the input.
const eof rune = 0
18
19// Scanner defines a lexical scanner
20type Scanner struct {
21	buf *bytes.Buffer // Source buffer for advancing and scanning
22	src []byte        // Source buffer for immutable access
23
24	// Source Position
25	srcPos  token.Pos // current position
26	prevPos token.Pos // previous position, used for peek() method
27
28	lastCharLen int // length of last character in bytes
29	lastLineLen int // length of last line in characters (for correct column reporting)
30
31	tokStart int // token text start position
32	tokEnd   int // token text end  position
33
34	// Error is called for each error encountered. If no Error
35	// function is set, the error is reported to os.Stderr.
36	Error func(pos token.Pos, msg string)
37
38	// ErrorCount is incremented by one for each error encountered.
39	ErrorCount int
40
41	// tokPos is the start position of most recently scanned token; set by
42	// Scan. The Filename field is always left untouched by the Scanner.  If
43	// an error is reported (via Error) and Position is invalid, the scanner is
44	// not inside a token.
45	tokPos token.Pos
46}
47
48// New creates and initializes a new instance of Scanner using src as
49// its source content.
50func New(src []byte) *Scanner {
51	// even though we accept a src, we read from a io.Reader compatible type
52	// (*bytes.Buffer). So in the future we might easily change it to streaming
53	// read.
54	b := bytes.NewBuffer(src)
55	s := &Scanner{
56		buf: b,
57		src: src,
58	}
59
60	// srcPosition always starts with 1
61	s.srcPos.Line = 1
62	return s
63}
64
65// next reads the next rune from the bufferred reader. Returns the rune(0) if
66// an error occurs (or io.EOF is returned).
67func (s *Scanner) next() rune {
68	ch, size, err := s.buf.ReadRune()
69	if err != nil {
70		// advance for error reporting
71		s.srcPos.Column++
72		s.srcPos.Offset += size
73		s.lastCharLen = size
74		return eof
75	}
76
77	// remember last position
78	s.prevPos = s.srcPos
79
80	s.srcPos.Column++
81	s.lastCharLen = size
82	s.srcPos.Offset += size
83
84	if ch == utf8.RuneError && size == 1 {
85		s.err("illegal UTF-8 encoding")
86		return ch
87	}
88
89	if ch == '\n' {
90		s.srcPos.Line++
91		s.lastLineLen = s.srcPos.Column
92		s.srcPos.Column = 0
93	}
94
95	if ch == '\x00' {
96		s.err("unexpected null character (0x00)")
97		return eof
98	}
99
100	if ch == '\uE123' {
101		s.err("unicode code point U+E123 reserved for internal use")
102		return utf8.RuneError
103	}
104
105	// debug
106	// fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column)
107	return ch
108}
109
110// unread unreads the previous read Rune and updates the source position
111func (s *Scanner) unread() {
112	if err := s.buf.UnreadRune(); err != nil {
113		panic(err) // this is user fault, we should catch it
114	}
115	s.srcPos = s.prevPos // put back last position
116}
117
118// peek returns the next rune without advancing the reader.
119func (s *Scanner) peek() rune {
120	peek, _, err := s.buf.ReadRune()
121	if err != nil {
122		return eof
123	}
124
125	s.buf.UnreadRune()
126	return peek
127}
128
129// Scan scans the next token and returns the token.
130func (s *Scanner) Scan() token.Token {
131	ch := s.next()
132
133	// skip white space
134	for isWhitespace(ch) {
135		ch = s.next()
136	}
137
138	var tok token.Type
139
140	// token text markings
141	s.tokStart = s.srcPos.Offset - s.lastCharLen
142
143	// token position, initial next() is moving the offset by one(size of rune
144	// actually), though we are interested with the starting point
145	s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen
146	if s.srcPos.Column > 0 {
147		// common case: last character was not a '\n'
148		s.tokPos.Line = s.srcPos.Line
149		s.tokPos.Column = s.srcPos.Column
150	} else {
151		// last character was a '\n'
152		// (we cannot be at the beginning of the source
153		// since we have called next() at least once)
154		s.tokPos.Line = s.srcPos.Line - 1
155		s.tokPos.Column = s.lastLineLen
156	}
157
158	switch {
159	case isLetter(ch):
160		tok = token.IDENT
161		lit := s.scanIdentifier()
162		if lit == "true" || lit == "false" {
163			tok = token.BOOL
164		}
165	case isDecimal(ch):
166		tok = s.scanNumber(ch)
167	default:
168		switch ch {
169		case eof:
170			tok = token.EOF
171		case '"':
172			tok = token.STRING
173			s.scanString()
174		case '#', '/':
175			tok = token.COMMENT
176			s.scanComment(ch)
177		case '.':
178			tok = token.PERIOD
179			ch = s.peek()
180			if isDecimal(ch) {
181				tok = token.FLOAT
182				ch = s.scanMantissa(ch)
183				ch = s.scanExponent(ch)
184			}
185		case '<':
186			tok = token.HEREDOC
187			s.scanHeredoc()
188		case '[':
189			tok = token.LBRACK
190		case ']':
191			tok = token.RBRACK
192		case '{':
193			tok = token.LBRACE
194		case '}':
195			tok = token.RBRACE
196		case ',':
197			tok = token.COMMA
198		case '=':
199			tok = token.ASSIGN
200		case '+':
201			tok = token.ADD
202		case '-':
203			if isDecimal(s.peek()) {
204				ch := s.next()
205				tok = s.scanNumber(ch)
206			} else {
207				tok = token.SUB
208			}
209		default:
210			s.err("illegal char")
211		}
212	}
213
214	// finish token ending
215	s.tokEnd = s.srcPos.Offset
216
217	// create token literal
218	var tokenText string
219	if s.tokStart >= 0 {
220		tokenText = string(s.src[s.tokStart:s.tokEnd])
221	}
222	s.tokStart = s.tokEnd // ensure idempotency of tokenText() call
223
224	return token.Token{
225		Type: tok,
226		Pos:  s.tokPos,
227		Text: tokenText,
228	}
229}
230
231func (s *Scanner) scanComment(ch rune) {
232	// single line comments
233	if ch == '#' || (ch == '/' && s.peek() != '*') {
234		if ch == '/' && s.peek() != '/' {
235			s.err("expected '/' for comment")
236			return
237		}
238
239		ch = s.next()
240		for ch != '\n' && ch >= 0 && ch != eof {
241			ch = s.next()
242		}
243		if ch != eof && ch >= 0 {
244			s.unread()
245		}
246		return
247	}
248
249	// be sure we get the character after /* This allows us to find comment's
250	// that are not erminated
251	if ch == '/' {
252		s.next()
253		ch = s.next() // read character after "/*"
254	}
255
256	// look for /* - style comments
257	for {
258		if ch < 0 || ch == eof {
259			s.err("comment not terminated")
260			break
261		}
262
263		ch0 := ch
264		ch = s.next()
265		if ch0 == '*' && ch == '/' {
266			break
267		}
268	}
269}
270
271// scanNumber scans a HCL number definition starting with the given rune
272func (s *Scanner) scanNumber(ch rune) token.Type {
273	if ch == '0' {
274		// check for hexadecimal, octal or float
275		ch = s.next()
276		if ch == 'x' || ch == 'X' {
277			// hexadecimal
278			ch = s.next()
279			found := false
280			for isHexadecimal(ch) {
281				ch = s.next()
282				found = true
283			}
284
285			if !found {
286				s.err("illegal hexadecimal number")
287			}
288
289			if ch != eof {
290				s.unread()
291			}
292
293			return token.NUMBER
294		}
295
296		// now it's either something like: 0421(octal) or 0.1231(float)
297		illegalOctal := false
298		for isDecimal(ch) {
299			ch = s.next()
300			if ch == '8' || ch == '9' {
301				// this is just a possibility. For example 0159 is illegal, but
302				// 0159.23 is valid. So we mark a possible illegal octal. If
303				// the next character is not a period, we'll print the error.
304				illegalOctal = true
305			}
306		}
307
308		if ch == 'e' || ch == 'E' {
309			ch = s.scanExponent(ch)
310			return token.FLOAT
311		}
312
313		if ch == '.' {
314			ch = s.scanFraction(ch)
315
316			if ch == 'e' || ch == 'E' {
317				ch = s.next()
318				ch = s.scanExponent(ch)
319			}
320			return token.FLOAT
321		}
322
323		if illegalOctal {
324			s.err("illegal octal number")
325		}
326
327		if ch != eof {
328			s.unread()
329		}
330		return token.NUMBER
331	}
332
333	s.scanMantissa(ch)
334	ch = s.next() // seek forward
335	if ch == 'e' || ch == 'E' {
336		ch = s.scanExponent(ch)
337		return token.FLOAT
338	}
339
340	if ch == '.' {
341		ch = s.scanFraction(ch)
342		if ch == 'e' || ch == 'E' {
343			ch = s.next()
344			ch = s.scanExponent(ch)
345		}
346		return token.FLOAT
347	}
348
349	if ch != eof {
350		s.unread()
351	}
352	return token.NUMBER
353}
354
355// scanMantissa scans the mantissa beginning from the rune. It returns the next
356// non decimal rune. It's used to determine wheter it's a fraction or exponent.
357func (s *Scanner) scanMantissa(ch rune) rune {
358	scanned := false
359	for isDecimal(ch) {
360		ch = s.next()
361		scanned = true
362	}
363
364	if scanned && ch != eof {
365		s.unread()
366	}
367	return ch
368}
369
370// scanFraction scans the fraction after the '.' rune
371func (s *Scanner) scanFraction(ch rune) rune {
372	if ch == '.' {
373		ch = s.peek() // we peek just to see if we can move forward
374		ch = s.scanMantissa(ch)
375	}
376	return ch
377}
378
379// scanExponent scans the remaining parts of an exponent after the 'e' or 'E'
380// rune.
381func (s *Scanner) scanExponent(ch rune) rune {
382	if ch == 'e' || ch == 'E' {
383		ch = s.next()
384		if ch == '-' || ch == '+' {
385			ch = s.next()
386		}
387		ch = s.scanMantissa(ch)
388	}
389	return ch
390}
391
392// scanHeredoc scans a heredoc string
393func (s *Scanner) scanHeredoc() {
394	// Scan the second '<' in example: '<<EOF'
395	if s.next() != '<' {
396		s.err("heredoc expected second '<', didn't see it")
397		return
398	}
399
400	// Get the original offset so we can read just the heredoc ident
401	offs := s.srcPos.Offset
402
403	// Scan the identifier
404	ch := s.next()
405
406	// Indented heredoc syntax
407	if ch == '-' {
408		ch = s.next()
409	}
410
411	for isLetter(ch) || isDigit(ch) {
412		ch = s.next()
413	}
414
415	// If we reached an EOF then that is not good
416	if ch == eof {
417		s.err("heredoc not terminated")
418		return
419	}
420
421	// Ignore the '\r' in Windows line endings
422	if ch == '\r' {
423		if s.peek() == '\n' {
424			ch = s.next()
425		}
426	}
427
428	// If we didn't reach a newline then that is also not good
429	if ch != '\n' {
430		s.err("invalid characters in heredoc anchor")
431		return
432	}
433
434	// Read the identifier
435	identBytes := s.src[offs : s.srcPos.Offset-s.lastCharLen]
436	if len(identBytes) == 0 || (len(identBytes) == 1 && identBytes[0] == '-') {
437		s.err("zero-length heredoc anchor")
438		return
439	}
440
441	var identRegexp *regexp.Regexp
442	if identBytes[0] == '-' {
443		identRegexp = regexp.MustCompile(fmt.Sprintf(`^[[:space:]]*%s\r*\z`, identBytes[1:]))
444	} else {
445		identRegexp = regexp.MustCompile(fmt.Sprintf(`^[[:space:]]*%s\r*\z`, identBytes))
446	}
447
448	// Read the actual string value
449	lineStart := s.srcPos.Offset
450	for {
451		ch := s.next()
452
453		// Special newline handling.
454		if ch == '\n' {
455			// Math is fast, so we first compare the byte counts to see if we have a chance
456			// of seeing the same identifier - if the length is less than the number of bytes
457			// in the identifier, this cannot be a valid terminator.
458			lineBytesLen := s.srcPos.Offset - s.lastCharLen - lineStart
459			if lineBytesLen >= len(identBytes) && identRegexp.Match(s.src[lineStart:s.srcPos.Offset-s.lastCharLen]) {
460				break
461			}
462
463			// Not an anchor match, record the start of a new line
464			lineStart = s.srcPos.Offset
465		}
466
467		if ch == eof {
468			s.err("heredoc not terminated")
469			return
470		}
471	}
472
473	return
474}
475
476// scanString scans a quoted string
477func (s *Scanner) scanString() {
478	braces := 0
479	for {
480		// '"' opening already consumed
481		// read character after quote
482		ch := s.next()
483
484		if (ch == '\n' && braces == 0) || ch < 0 || ch == eof {
485			s.err("literal not terminated")
486			return
487		}
488
489		if ch == '"' && braces == 0 {
490			break
491		}
492
493		// If we're going into a ${} then we can ignore quotes for awhile
494		if braces == 0 && ch == '$' && s.peek() == '{' {
495			braces++
496			s.next()
497		} else if braces > 0 && ch == '{' {
498			braces++
499		}
500		if braces > 0 && ch == '}' {
501			braces--
502		}
503
504		if ch == '\\' {
505			s.scanEscape()
506		}
507	}
508
509	return
510}
511
512// scanEscape scans an escape sequence
513func (s *Scanner) scanEscape() rune {
514	// http://en.cppreference.com/w/cpp/language/escape
515	ch := s.next() // read character after '/'
516	switch ch {
517	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
518		// nothing to do
519	case '0', '1', '2', '3', '4', '5', '6', '7':
520		// octal notation
521		ch = s.scanDigits(ch, 8, 3)
522	case 'x':
523		// hexademical notation
524		ch = s.scanDigits(s.next(), 16, 2)
525	case 'u':
526		// universal character name
527		ch = s.scanDigits(s.next(), 16, 4)
528	case 'U':
529		// universal character name
530		ch = s.scanDigits(s.next(), 16, 8)
531	default:
532		s.err("illegal char escape")
533	}
534	return ch
535}
536
537// scanDigits scans a rune with the given base for n times. For example an
538// octal notation \184 would yield in scanDigits(ch, 8, 3)
539func (s *Scanner) scanDigits(ch rune, base, n int) rune {
540	start := n
541	for n > 0 && digitVal(ch) < base {
542		ch = s.next()
543		if ch == eof {
544			// If we see an EOF, we halt any more scanning of digits
545			// immediately.
546			break
547		}
548
549		n--
550	}
551	if n > 0 {
552		s.err("illegal char escape")
553	}
554
555	if n != start && ch != eof {
556		// we scanned all digits, put the last non digit char back,
557		// only if we read anything at all
558		s.unread()
559	}
560
561	return ch
562}
563
564// scanIdentifier scans an identifier and returns the literal string
565func (s *Scanner) scanIdentifier() string {
566	offs := s.srcPos.Offset - s.lastCharLen
567	ch := s.next()
568	for isLetter(ch) || isDigit(ch) || ch == '-' || ch == '.' {
569		ch = s.next()
570	}
571
572	if ch != eof {
573		s.unread() // we got identifier, put back latest char
574	}
575
576	return string(s.src[offs:s.srcPos.Offset])
577}
578
579// recentPosition returns the position of the character immediately after the
580// character or token returned by the last call to Scan.
581func (s *Scanner) recentPosition() (pos token.Pos) {
582	pos.Offset = s.srcPos.Offset - s.lastCharLen
583	switch {
584	case s.srcPos.Column > 0:
585		// common case: last character was not a '\n'
586		pos.Line = s.srcPos.Line
587		pos.Column = s.srcPos.Column
588	case s.lastLineLen > 0:
589		// last character was a '\n'
590		// (we cannot be at the beginning of the source
591		// since we have called next() at least once)
592		pos.Line = s.srcPos.Line - 1
593		pos.Column = s.lastLineLen
594	default:
595		// at the beginning of the source
596		pos.Line = 1
597		pos.Column = 1
598	}
599	return
600}
601
602// err prints the error of any scanning to s.Error function. If the function is
603// not defined, by default it prints them to os.Stderr
604func (s *Scanner) err(msg string) {
605	s.ErrorCount++
606	pos := s.recentPosition()
607
608	if s.Error != nil {
609		s.Error(pos, msg)
610		return
611	}
612
613	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
614}
615
// isLetter returns true if the given rune is an ASCII letter, an underscore
// or a non-ASCII Unicode letter.
func isLetter(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z':
		return true
	case ch == '_':
		return true
	case ch >= 0x80:
		// outside of ASCII we defer to the Unicode tables
		return unicode.IsLetter(ch)
	}
	return false
}
620
// isDigit returns true if the given rune is an ASCII decimal digit or a
// non-ASCII Unicode decimal digit.
func isDigit(ch rune) bool {
	if ch < 0x80 {
		return ch >= '0' && ch <= '9'
	}
	// outside of ASCII we defer to the Unicode tables
	return unicode.IsDigit(ch)
}
625
// isDecimal returns true if the given rune is one of the ASCII digits 0-9.
func isDecimal(ch rune) bool {
	if ch < '0' {
		return false
	}
	return ch <= '9'
}
630
// isHexadecimal returns true if the given rune is a hexadecimal digit, that
// is 0-9, a-f or A-F.
func isHexadecimal(ch rune) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'a' <= ch && ch <= 'f':
		return true
	case 'A' <= ch && ch <= 'F':
		return true
	}
	return false
}
635
// isWhitespace returns true if the rune is a space, tab, newline or carriage
// return.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}
640
// digitVal returns the integer value of the given octal, decimal or
// hexadecimal digit rune. Any other rune yields 16.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16 // larger than any legal digit val
}
653