1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package scanner implements a scanner for Go source text.
6// It takes a []byte as source which can then be tokenized
7// through repeated calls to the Scan method.
8//
9package scanner
10
11import (
12	"bytes"
13	"fmt"
14	"go/token"
15	"path/filepath"
16	"strconv"
17	"unicode"
18	"unicode/utf8"
19)
20
// ErrorHandler is the type of the callback that may be handed to
// Scanner.Init. When a handler is installed, it is invoked for each
// syntax error the scanner encounters and receives a position and an
// error message. The position points to the beginning of the offending
// token.
type ErrorHandler func(pos token.Position, msg string)
27
28// A Scanner holds the scanner's internal state while processing
29// a given text.  It can be allocated as part of another data
30// structure but must be initialized via Init before use.
31//
32type Scanner struct {
33	// immutable state
34	file *token.File  // source file handle
35	dir  string       // directory portion of file.Name()
36	src  []byte       // source
37	err  ErrorHandler // error reporting; or nil
38	mode Mode         // scanning mode
39
40	// scanning state
41	ch         rune // current character
42	offset     int  // character offset
43	rdOffset   int  // reading offset (position after current character)
44	lineOffset int  // current line offset
45	insertSemi bool // insert a semicolon before next newline
46
47	// public state - ok to modify
48	ErrorCount int // number of errors encountered
49}
50
// bom is the Unicode byte order mark (U+FEFF). It is permitted only as
// the very first character of a source text.
const bom = 0xFEFF
52
53// Read the next Unicode char into s.ch.
54// s.ch < 0 means end-of-file.
55//
56func (s *Scanner) next() {
57	if s.rdOffset < len(s.src) {
58		s.offset = s.rdOffset
59		if s.ch == '\n' {
60			s.lineOffset = s.offset
61			s.file.AddLine(s.offset)
62		}
63		r, w := rune(s.src[s.rdOffset]), 1
64		switch {
65		case r == 0:
66			s.error(s.offset, "illegal character NUL")
67		case r >= 0x80:
68			// not ASCII
69			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
70			if r == utf8.RuneError && w == 1 {
71				s.error(s.offset, "illegal UTF-8 encoding")
72			} else if r == bom && s.offset > 0 {
73				s.error(s.offset, "illegal byte order mark")
74			}
75		}
76		s.rdOffset += w
77		s.ch = r
78	} else {
79		s.offset = len(s.src)
80		if s.ch == '\n' {
81			s.lineOffset = s.offset
82			s.file.AddLine(s.offset)
83		}
84		s.ch = -1 // eof
85	}
86}
87
// Mode is a set of flags (or 0) that control scanner behavior.
type Mode uint

const (
	// ScanComments makes the scanner return comments as COMMENT tokens.
	ScanComments Mode = 1 << iota
	// dontInsertSemis turns off automatic semicolon insertion; it is
	// meant for testing only.
	dontInsertSemis
)
97
98// Init prepares the scanner s to tokenize the text src by setting the
99// scanner at the beginning of src. The scanner uses the file set file
100// for position information and it adds line information for each line.
101// It is ok to re-use the same file when re-scanning the same file as
102// line information which is already present is ignored. Init causes a
103// panic if the file size does not match the src size.
104//
105// Calls to Scan will invoke the error handler err if they encounter a
106// syntax error and err is not nil. Also, for each error encountered,
107// the Scanner field ErrorCount is incremented by one. The mode parameter
108// determines how comments are handled.
109//
110// Note that Init may call err if there is an error in the first character
111// of the file.
112//
113func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
114	// Explicitly initialize all fields since a scanner may be reused.
115	if file.Size() != len(src) {
116		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
117	}
118	s.file = file
119	s.dir, _ = filepath.Split(file.Name())
120	s.src = src
121	s.err = err
122	s.mode = mode
123
124	s.ch = ' '
125	s.offset = 0
126	s.rdOffset = 0
127	s.lineOffset = 0
128	s.insertSemi = false
129	s.ErrorCount = 0
130
131	s.next()
132	if s.ch == bom {
133		s.next() // ignore BOM at file beginning
134	}
135}
136
137func (s *Scanner) error(offs int, msg string) {
138	if s.err != nil {
139		s.err(s.file.Position(s.file.Pos(offs)), msg)
140	}
141	s.ErrorCount++
142}
143
144var prefix = []byte("//line ")
145
146func (s *Scanner) interpretLineComment(text []byte) {
147	if bytes.HasPrefix(text, prefix) {
148		// get filename and line number, if any
149		if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
150			if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
151				// valid //line filename:line comment;
152				filename := filepath.Clean(string(text[len(prefix):i]))
153				if !filepath.IsAbs(filename) {
154					// make filename relative to current directory
155					filename = filepath.Join(s.dir, filename)
156				}
157				// update scanner position
158				s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
159			}
160		}
161	}
162}
163
164func (s *Scanner) scanComment() string {
165	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
166	offs := s.offset - 1 // position of initial '/'
167	hasCR := false
168
169	if s.ch == '/' {
170		//-style comment
171		s.next()
172		for s.ch != '\n' && s.ch >= 0 {
173			if s.ch == '\r' {
174				hasCR = true
175			}
176			s.next()
177		}
178		if offs == s.lineOffset {
179			// comment starts at the beginning of the current line
180			s.interpretLineComment(s.src[offs:s.offset])
181		}
182		goto exit
183	}
184
185	/*-style comment */
186	s.next()
187	for s.ch >= 0 {
188		ch := s.ch
189		if ch == '\r' {
190			hasCR = true
191		}
192		s.next()
193		if ch == '*' && s.ch == '/' {
194			s.next()
195			goto exit
196		}
197	}
198
199	s.error(offs, "comment not terminated")
200
201exit:
202	lit := s.src[offs:s.offset]
203	if hasCR {
204		lit = stripCR(lit)
205	}
206
207	return string(lit)
208}
209
210func (s *Scanner) findLineEnd() bool {
211	// initial '/' already consumed
212
213	defer func(offs int) {
214		// reset scanner state to where it was upon calling findLineEnd
215		s.ch = '/'
216		s.offset = offs
217		s.rdOffset = offs + 1
218		s.next() // consume initial '/' again
219	}(s.offset - 1)
220
221	// read ahead until a newline, EOF, or non-comment token is found
222	for s.ch == '/' || s.ch == '*' {
223		if s.ch == '/' {
224			//-style comment always contains a newline
225			return true
226		}
227		/*-style comment: look for newline */
228		s.next()
229		for s.ch >= 0 {
230			ch := s.ch
231			if ch == '\n' {
232				return true
233			}
234			s.next()
235			if ch == '*' && s.ch == '/' {
236				s.next()
237				break
238			}
239		}
240		s.skipWhitespace() // s.insertSemi is set
241		if s.ch < 0 || s.ch == '\n' {
242			return true
243		}
244		if s.ch != '/' {
245			// non-comment token
246			return false
247		}
248		s.next() // consume '/'
249	}
250
251	return false
252}
253
// isLetter reports whether ch may appear in an identifier as a letter:
// an ASCII letter, an underscore, or a non-ASCII Unicode letter.
func isLetter(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z', ch == '_':
		return true
	case ch >= 0x80:
		return unicode.IsLetter(ch)
	}
	return false
}
257
// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII
// Unicode digit.
func isDigit(ch rune) bool {
	if ch < 0x80 {
		return '0' <= ch && ch <= '9'
	}
	return unicode.IsDigit(ch)
}
261
262func (s *Scanner) scanIdentifier() string {
263	offs := s.offset
264	for isLetter(s.ch) || isDigit(s.ch) {
265		s.next()
266	}
267	return string(s.src[offs:s.offset])
268}
269
// digitVal returns the numeric value of the hexadecimal digit ch. For
// any other character it returns 16, which is larger than every legal
// digit value.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}
281
282func (s *Scanner) scanMantissa(base int) {
283	for digitVal(s.ch) < base {
284		s.next()
285	}
286}
287
288func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
289	// digitVal(s.ch) < 10
290	offs := s.offset
291	tok := token.INT
292
293	if seenDecimalPoint {
294		offs--
295		tok = token.FLOAT
296		s.scanMantissa(10)
297		goto exponent
298	}
299
300	if s.ch == '0' {
301		// int or float
302		offs := s.offset
303		s.next()
304		if s.ch == 'x' || s.ch == 'X' {
305			// hexadecimal int
306			s.next()
307			s.scanMantissa(16)
308			if s.offset-offs <= 2 {
309				// only scanned "0x" or "0X"
310				s.error(offs, "illegal hexadecimal number")
311			}
312		} else {
313			// octal int or float
314			seenDecimalDigit := false
315			s.scanMantissa(8)
316			if s.ch == '8' || s.ch == '9' {
317				// illegal octal int or float
318				seenDecimalDigit = true
319				s.scanMantissa(10)
320			}
321			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
322				goto fraction
323			}
324			// octal int
325			if seenDecimalDigit {
326				s.error(offs, "illegal octal number")
327			}
328		}
329		goto exit
330	}
331
332	// decimal int or float
333	s.scanMantissa(10)
334
335fraction:
336	if s.ch == '.' {
337		tok = token.FLOAT
338		s.next()
339		s.scanMantissa(10)
340	}
341
342exponent:
343	if s.ch == 'e' || s.ch == 'E' {
344		tok = token.FLOAT
345		s.next()
346		if s.ch == '-' || s.ch == '+' {
347			s.next()
348		}
349		s.scanMantissa(10)
350	}
351
352	if s.ch == 'i' {
353		tok = token.IMAG
354		s.next()
355	}
356
357exit:
358	return tok, string(s.src[offs:s.offset])
359}
360
361func (s *Scanner) scanEscape(quote rune) {
362	offs := s.offset
363
364	var i, base, max uint32
365	switch s.ch {
366	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
367		s.next()
368		return
369	case '0', '1', '2', '3', '4', '5', '6', '7':
370		i, base, max = 3, 8, 255
371	case 'x':
372		s.next()
373		i, base, max = 2, 16, 255
374	case 'u':
375		s.next()
376		i, base, max = 4, 16, unicode.MaxRune
377	case 'U':
378		s.next()
379		i, base, max = 8, 16, unicode.MaxRune
380	default:
381		s.next() // always make progress
382		s.error(offs, "unknown escape sequence")
383		return
384	}
385
386	var x uint32
387	for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
388		d := uint32(digitVal(s.ch))
389		if d >= base {
390			s.error(s.offset, "illegal character in escape sequence")
391			break
392		}
393		x = x*base + d
394		s.next()
395	}
396	// in case of an error, consume remaining chars
397	for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
398		s.next()
399	}
400	if x > max || 0xD800 <= x && x < 0xE000 {
401		s.error(offs, "escape sequence is invalid Unicode code point")
402	}
403}
404
405func (s *Scanner) scanChar() string {
406	// '\'' opening already consumed
407	offs := s.offset - 1
408
409	n := 0
410	for s.ch != '\'' {
411		ch := s.ch
412		n++
413		s.next()
414		if ch == '\n' || ch < 0 {
415			s.error(offs, "character literal not terminated")
416			n = 1
417			break
418		}
419		if ch == '\\' {
420			s.scanEscape('\'')
421		}
422	}
423
424	s.next()
425
426	if n != 1 {
427		s.error(offs, "illegal character literal")
428	}
429
430	return string(s.src[offs:s.offset])
431}
432
433func (s *Scanner) scanString() string {
434	// '"' opening already consumed
435	offs := s.offset - 1
436
437	for s.ch != '"' {
438		ch := s.ch
439		s.next()
440		if ch == '\n' || ch < 0 {
441			s.error(offs, "string not terminated")
442			break
443		}
444		if ch == '\\' {
445			s.scanEscape('"')
446		}
447	}
448
449	s.next()
450
451	return string(s.src[offs:s.offset])
452}
453
// stripCR returns a copy of b from which every carriage return ('\r')
// has been removed. The argument itself is left untouched.
func stripCR(b []byte) []byte {
	out := make([]byte, 0, len(b))
	for i := 0; i < len(b); i++ {
		if b[i] == '\r' {
			continue
		}
		out = append(out, b[i])
	}
	return out
}
465
466func (s *Scanner) scanRawString() string {
467	// '`' opening already consumed
468	offs := s.offset - 1
469
470	hasCR := false
471	for s.ch != '`' {
472		ch := s.ch
473		s.next()
474		if ch == '\r' {
475			hasCR = true
476		}
477		if ch < 0 {
478			s.error(offs, "string not terminated")
479			break
480		}
481	}
482
483	s.next()
484
485	lit := s.src[offs:s.offset]
486	if hasCR {
487		lit = stripCR(lit)
488	}
489
490	return string(lit)
491}
492
493func (s *Scanner) skipWhitespace() {
494	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
495		s.next()
496	}
497}
498
499// Helper functions for scanning multi-byte tokens such as >> += >>= .
500// Different routines recognize different length tok_i based on matches
501// of ch_i. If a token ends in '=', the result is tok1 or tok3
502// respectively. Otherwise, the result is tok0 if there was no other
503// matching character, or tok2 if the matching character was ch2.
504
505func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
506	if s.ch == '=' {
507		s.next()
508		return tok1
509	}
510	return tok0
511}
512
513func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
514	if s.ch == '=' {
515		s.next()
516		return tok1
517	}
518	if s.ch == ch2 {
519		s.next()
520		return tok2
521	}
522	return tok0
523}
524
525func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
526	if s.ch == '=' {
527		s.next()
528		return tok1
529	}
530	if s.ch == ch2 {
531		s.next()
532		if s.ch == '=' {
533			s.next()
534			return tok3
535		}
536		return tok2
537	}
538	return tok0
539}
540
541// Scan scans the next token and returns the token position, the token,
542// and its literal string if applicable. The source end is indicated by
543// token.EOF.
544//
545// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
546// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
547// has the corresponding value.
548//
549// If the returned token is a keyword, the literal string is the keyword.
550//
551// If the returned token is token.SEMICOLON, the corresponding
552// literal string is ";" if the semicolon was present in the source,
553// and "\n" if the semicolon was inserted because of a newline or
554// at EOF.
555//
556// If the returned token is token.ILLEGAL, the literal string is the
557// offending character.
558//
559// In all other cases, Scan returns an empty literal string.
560//
561// For more tolerant parsing, Scan will return a valid token if
562// possible even if a syntax error was encountered. Thus, even
563// if the resulting token sequence contains no illegal tokens,
564// a client may not assume that no error occurred. Instead it
565// must check the scanner's ErrorCount or the number of calls
566// of the error handler, if there was one installed.
567//
568// Scan adds line information to the file added to the file
569// set with Init. Token positions are relative to that file
570// and thus relative to the file set.
571//
572func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
573scanAgain:
574	s.skipWhitespace()
575
576	// current token start
577	pos = s.file.Pos(s.offset)
578
579	// determine token value
580	insertSemi := false
581	switch ch := s.ch; {
582	case isLetter(ch):
583		lit = s.scanIdentifier()
584		if len(lit) > 1 {
585			// keywords are longer than one letter - avoid lookup otherwise
586			tok = token.Lookup(lit)
587			switch tok {
588			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
589				insertSemi = true
590			}
591		} else {
592			insertSemi = true
593			tok = token.IDENT
594		}
595	case '0' <= ch && ch <= '9':
596		insertSemi = true
597		tok, lit = s.scanNumber(false)
598	default:
599		s.next() // always make progress
600		switch ch {
601		case -1:
602			if s.insertSemi {
603				s.insertSemi = false // EOF consumed
604				return pos, token.SEMICOLON, "\n"
605			}
606			tok = token.EOF
607		case '\n':
608			// we only reach here if s.insertSemi was
609			// set in the first place and exited early
610			// from s.skipWhitespace()
611			s.insertSemi = false // newline consumed
612			return pos, token.SEMICOLON, "\n"
613		case '"':
614			insertSemi = true
615			tok = token.STRING
616			lit = s.scanString()
617		case '\'':
618			insertSemi = true
619			tok = token.CHAR
620			lit = s.scanChar()
621		case '`':
622			insertSemi = true
623			tok = token.STRING
624			lit = s.scanRawString()
625		case ':':
626			tok = s.switch2(token.COLON, token.DEFINE)
627		case '.':
628			if '0' <= s.ch && s.ch <= '9' {
629				insertSemi = true
630				tok, lit = s.scanNumber(true)
631			} else if s.ch == '.' {
632				s.next()
633				if s.ch == '.' {
634					s.next()
635					tok = token.ELLIPSIS
636				}
637			} else {
638				tok = token.PERIOD
639			}
640		case ',':
641			tok = token.COMMA
642		case ';':
643			tok = token.SEMICOLON
644			lit = ";"
645		case '(':
646			tok = token.LPAREN
647		case ')':
648			insertSemi = true
649			tok = token.RPAREN
650		case '[':
651			tok = token.LBRACK
652		case ']':
653			insertSemi = true
654			tok = token.RBRACK
655		case '{':
656			tok = token.LBRACE
657		case '}':
658			insertSemi = true
659			tok = token.RBRACE
660		case '+':
661			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
662			if tok == token.INC {
663				insertSemi = true
664			}
665		case '-':
666			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
667			if tok == token.DEC {
668				insertSemi = true
669			}
670		case '*':
671			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
672		case '/':
673			if s.ch == '/' || s.ch == '*' {
674				// comment
675				if s.insertSemi && s.findLineEnd() {
676					// reset position to the beginning of the comment
677					s.ch = '/'
678					s.offset = s.file.Offset(pos)
679					s.rdOffset = s.offset + 1
680					s.insertSemi = false // newline consumed
681					return pos, token.SEMICOLON, "\n"
682				}
683				lit = s.scanComment()
684				if s.mode&ScanComments == 0 {
685					// skip comment
686					s.insertSemi = false // newline consumed
687					goto scanAgain
688				}
689				tok = token.COMMENT
690			} else {
691				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
692			}
693		case '%':
694			tok = s.switch2(token.REM, token.REM_ASSIGN)
695		case '^':
696			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
697		case '<':
698			if s.ch == '-' {
699				s.next()
700				tok = token.ARROW
701			} else {
702				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
703			}
704		case '>':
705			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
706		case '=':
707			tok = s.switch2(token.ASSIGN, token.EQL)
708		case '!':
709			tok = s.switch2(token.NOT, token.NEQ)
710		case '&':
711			if s.ch == '^' {
712				s.next()
713				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
714			} else {
715				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
716			}
717		case '|':
718			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
719		default:
720			// next reports unexpected BOMs - don't repeat
721			if ch != bom {
722				s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
723			}
724			insertSemi = s.insertSemi // preserve insertSemi info
725			tok = token.ILLEGAL
726			lit = string(ch)
727		}
728	}
729	if s.mode&dontInsertSemis == 0 {
730		s.insertSemi = insertSemi
731	}
732
733	return
734}
735