// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
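//
// A typical use is sketched below (the file name and the printed output
// are illustrative, not required by the API):
//
//	src := []byte("cos(x) + 1i*sin(x) // Euler")
//	var s scanner.Scanner
//	fset := token.NewFileSet()                            // positions are relative to fset
//	file := fset.AddFile("src.go", fset.Base(), len(src)) // register input "file"
//	s.Init(file, src, nil /* no error handler */, scanner.ScanComments)
//	for {
//		pos, tok, lit := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		fmt.Printf("%s\t%s\t%q\n", fset.Position(pos), tok, lit)
//	}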
//
package scanner

import (
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
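//
// For example, a handler may simply collect the reported errors
// (a minimal sketch; ErrorList is defined elsewhere in this package):
//
//	var errs scanner.ErrorList
//	eh := func(pos token.Position, msg string) { errs.Add(pos, msg) }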
//
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character
	offset     int  // character offset
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file
)

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
// For optimization, there is some overlap between this method and
// s.scanIdentifier.
func (s *Scanner) next() {
	if s.rdOffset < len(s.src) {
		s.offset = s.rdOffset
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.rdOffset]), 1
		switch {
		case r == 0:
			s.error(s.offset, "illegal character NUL")
		case r >= utf8.RuneSelf:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
			if r == utf8.RuneError && w == 1 {
				s.error(s.offset, "illegal UTF-8 encoding")
			} else if r == bom && s.offset > 0 {
				s.error(s.offset, "illegal byte order mark")
			}
		}
		s.rdOffset += w
		s.ch = r
	} else {
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		s.ch = eof
	}
}

// peek returns the byte following the most recently read character without
// advancing the scanner. If the scanner is at EOF, peek returns 0.
func (s *Scanner) peek() byte {
	if s.rdOffset < len(s.src) {
		return s.src[s.rdOffset]
	}
	return 0
}

// A mode value is a set of flags (or 0).
// They control scanner behavior.
//
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)

// Init prepares the scanner s to tokenize the text src by setting the
// scanner at the beginning of src. The scanner uses the file set file
// for position information and it adds line information for each line.
// It is ok to re-use the same file when re-scanning the same file as
// line information which is already present is ignored. Init causes a
// panic if the file size does not match the src size.
//
// Calls to Scan will invoke the error handler err if they encounter a
// syntax error and err is not nil. Also, for each error encountered,
// the Scanner field ErrorCount is incremented by one. The mode parameter
// determines how comments are handled.
//
// Note that Init may call err if there is an error in the first character
// of the file.
//
func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
	// Explicitly initialize all fields since a scanner may be reused.
	if file.Size() != len(src) {
		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
	}
	s.file = file
	s.dir, _ = filepath.Split(file.Name())
	s.src = src
	s.err = err
	s.mode = mode

	s.ch = ' '
	s.offset = 0
	s.rdOffset = 0
	s.lineOffset = 0
	s.insertSemi = false
	s.ErrorCount = 0

	s.next()
	if s.ch == bom {
		s.next() // ignore BOM at file beginning
	}
}

func (s *Scanner) error(offs int, msg string) {
	if s.err != nil {
		s.err(s.file.Position(s.file.Pos(offs)), msg)
	}
	s.ErrorCount++
}

func (s *Scanner) errorf(offs int, format string, args ...any) {
	s.error(offs, fmt.Sprintf(format, args...))
}

func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	next := -1           // position immediately following the comment; < 0 means invalid comment
	numCR := 0

	if s.ch == '/' {
		//-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		// if we are at '\n', the position following the comment is afterwards
		next = s.offset
		if s.ch == '\n' {
			next++
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			next = s.offset
			goto exit
		}
	}

	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// On Windows, a (//-comment) line may end in "\r\n".
	// Remove the final '\r' before analyzing the text for
	// line directives (matching the compiler). Remove any
	// other '\r' afterwards (matching the pre-existing
	// behavior of the scanner).
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}

	// interpret line directives
	// (//line directives must start at the beginning of the current line)
	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
		s.updateLineInfo(next, offs, lit)
	}

	if numCR > 0 {
		lit = stripCR(lit, lit[1] == '*')
	}

	return string(lit)
}

var prefix = []byte("line ")

// updateLineInfo parses the incoming comment text at offset offs
// as a line directive. If successful, it updates the line info table
// for the position next per the line directive.
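//
// For example (illustrative), the directive comment "//line c.go:10:4"
// causes the position immediately following the comment to be reported
// as line 10, column 4 of file c.go.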
func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	// extract comment text
	if text[1] == '*' {
		text = text[:len(text)-2] // lop off trailing "*/"
	}
	text = text[7:] // lop off leading "//line " or "/*line "
	offs += 7

	i, n, ok := trailingDigits(text)
	if i == 0 {
		return // ignore (not a line directive)
	}
	// i > 0

	if !ok {
		// text has a suffix :xxx but xxx is not a number
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	var line, col int
	i2, n2, ok2 := trailingDigits(text[:i-1])
	if ok2 {
		//line filename:line:col
		i, i2 = i2, i
		line, col = n2, n
		if col == 0 {
			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
			return
		}
		text = text[:i2-1] // lop off ":col"
	} else {
		//line filename:line
		line = n
	}

	if line == 0 {
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// If we have a column (//line filename:line:col form),
	// an empty filename means to use the previous filename.
	filename := string(text[:i-1]) // lop off ":line", and trim white space
	if filename == "" && ok2 {
		filename = s.file.Position(s.file.Pos(offs)).Filename
	} else if filename != "" {
		// Put a relative filename in the current directory.
		// This is for compatibility with earlier releases.
		// See issue 26671.
		filename = filepath.Clean(filename)
		if !filepath.IsAbs(filename) {
			filename = filepath.Join(s.dir, filename)
		}
	}

	s.file.AddLineColumnInfo(next, filename, line, col)
}

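// trailingDigits splits text at its last ':' and returns the offset of the
// text following that colon, the unsigned integer value of that text, and
// whether the parse succeeded. It returns an offset of 0 if text contains
// no ':'.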
func trailingDigits(text []byte) (int, int, bool) {
	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
	if i < 0 {
		return 0, 0, false // no ":"
	}
	// i >= 0
	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	return i + 1, int(n), err == nil
}

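// findLineEnd reports whether the comment sequence starting at the already
// consumed '/' is followed, possibly after further comments and white space,
// by a newline or EOF. The scanner state is restored before returning.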
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	return false
}

func isLetter(ch rune) bool {
	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
}

func isDigit(ch rune) bool {
	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}

// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}

func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= lower(ch) && lower(ch) <= 'f':
		return int(lower(ch) - 'a' + 10)
	}
	return 16 // larger than any legal digit val
}

func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }

// digits accepts the sequence { digit | '_' }.
// If base <= 10, digits accepts any decimal digit but records
// the offset (relative to the source start) of a digit >= base
// in *invalid, if *invalid < 0.
// digits returns a bitset describing whether the sequence contained
// digits (bit 0 is set), or separators '_' (bit 1 is set).
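// For example (illustrative): scanning the digits of "1_000" yields 3
// (digit and separator bits set), while the part after the prefix of "0x_"
// yields 2 (separator bit only), which later triggers a "has no digits" error.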
func (s *Scanner) digits(base int, invalid *int) (digsep int) {
	if base <= 10 {
		max := rune('0' + base)
		for isDecimal(s.ch) || s.ch == '_' {
			ds := 1
			if s.ch == '_' {
				ds = 2
			} else if s.ch >= max && *invalid < 0 {
				*invalid = s.offset // record invalid rune offset
			}
			digsep |= ds
			s.next()
		}
	} else {
		for isHex(s.ch) || s.ch == '_' {
			ds := 1
			if s.ch == '_' {
				ds = 2
			}
			digsep |= ds
			s.next()
		}
	}
	return
}

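// scanNumber scans an integer, floating-point, or imaginary literal and
// returns the token type together with the literal text. For example
// (illustrative): "0b1010" yields token.INT, "0x1p-2" token.FLOAT, and
// "3.14i" token.IMAG.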
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}

func litname(prefix rune) string {
	switch prefix {
	case 'x':
		return "hexadecimal literal"
	case 'o', '0':
		return "octal literal"
	case 'b':
		return "binary literal"
	}
	return "decimal literal"
}

// invalidSep returns the index of the first invalid separator in x, or -1.
func invalidSep(x string) int {
	x1 := ' ' // prefix char, we only care if it's 'x'
	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
	i := 0

	// a prefix counts as a digit
	if len(x) >= 2 && x[0] == '0' {
		x1 = lower(rune(x[1]))
		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
			d = '0'
			i = 2
		}
	}

	// mantissa and exponent
	for ; i < len(x); i++ {
		p := d // previous digit
		d = rune(x[i])
		switch {
		case d == '_':
			if p != '0' {
				return i
			}
		case isDecimal(d) || x1 == 'x' && isHex(d):
			d = '0'
		default:
			if p == '_' {
				return i - 1
			}
			d = '.'
		}
	}
	if d == '_' {
		return len(x) - 1
	}

	return -1
}

// scanEscape parses an escape sequence where quote is the accepted
// escaped quote character. In case of a syntax error, it stops at the
// offending character (without consuming it) and returns false.
// Otherwise it returns true.
func (s *Scanner) scanEscape(quote rune) bool {
	offs := s.offset

	var n int
	var base, max uint32
	switch s.ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		s.next()
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		n, base, max = 3, 8, 255
	case 'x':
		s.next()
		n, base, max = 2, 16, 255
	case 'u':
		s.next()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		msg := "unknown escape sequence"
		if s.ch < 0 {
			msg = "escape sequence not terminated"
		}
		s.error(offs, msg)
		return false
	}

	var x uint32
	for n > 0 {
		d := uint32(digitVal(s.ch))
		if d >= base {
			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
			if s.ch < 0 {
				msg = "escape sequence not terminated"
			}
			s.error(s.offset, msg)
			return false
		}
		x = x*base + d
		s.next()
		n--
	}

	if x > max || 0xD800 <= x && x < 0xE000 {
		s.error(offs, "escape sequence is invalid Unicode code point")
		return false
	}

	return true
}

func (s *Scanner) scanRune() string {
	// '\'' opening already consumed
	offs := s.offset - 1

	valid := true
	n := 0
	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			// only report error if we don't have one already
			if valid {
				s.error(offs, "rune literal not terminated")
				valid = false
			}
			break
		}
		s.next()
		if ch == '\'' {
			break
		}
		n++
		if ch == '\\' {
			if !s.scanEscape('\'') {
				valid = false
			}
			// continue to read to closing quote
		}
	}

	if valid && n != 1 {
		s.error(offs, "illegal rune literal")
	}

	return string(s.src[offs:s.offset])
}

func (s *Scanner) scanString() string {
	// '"' opening already consumed
	offs := s.offset - 1

	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			s.error(offs, "string literal not terminated")
			break
		}
		s.next()
		if ch == '"' {
			break
		}
		if ch == '\\' {
			s.scanEscape('"')
		}
	}

	return string(s.src[offs:s.offset])
}

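// stripCR removes carriage return characters from b and returns the result.
// If comment is set, b holds a /*-style comment, and a '\r' that would turn
// a "*\r...\r/" sequence into a premature "*/" is preserved (see the comment
// in the loop below and issue #11151).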
func stripCR(b []byte, comment bool) []byte {
	c := make([]byte, len(b))
	i := 0
	for j, ch := range b {
		// In a /*-style comment, don't strip \r from *\r/ (incl.
		// sequences of \r from *\r\r...\r/) since the resulting
		// */ would terminate the comment too early unless the \r
		// is immediately following the opening /* in which case
		// it's ok because /*/ is not closed yet (issue #11151).
		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
			c[i] = ch
			i++
		}
	}
	return c[:i]
}

func (s *Scanner) scanRawString() string {
	// '`' opening already consumed
	offs := s.offset - 1

	hasCR := false
	for {
		ch := s.ch
		if ch < 0 {
			s.error(offs, "raw string literal not terminated")
			break
		}
		s.next()
		if ch == '`' {
			break
		}
		if ch == '\r' {
			hasCR = true
		}
	}

	lit := s.src[offs:s.offset]
	if hasCR {
		lit = stripCR(lit, false)
	}

	return string(lit)
}

func (s *Scanner) skipWhitespace() {
	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
		s.next()
	}
}

// Helper functions for scanning multi-byte tokens such as >> += >>= .
// Different routines recognize different length tok_i based on matches
// of ch_i. If a token ends in '=', the result is tok1 or tok3
// respectively. Otherwise, the result is tok0 if there was no other
// matching character, or tok2 if the matching character was ch2.
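//
// For example (illustrative), '>' is scanned with
// switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN):
// ">" yields GTR, ">=" GEQ, ">>" SHR, and ">>=" SHR_ASSIGN.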

func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	return tok0
}

func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		return tok2
	}
	return tok0
}

func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		if s.ch == '=' {
			s.next()
			return tok3
		}
		return tok2
	}
	return tok0
}

// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
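//
// For example (illustrative), scanning the source "foo\n" yields an IDENT
// token with literal "foo", then a SEMICOLON with literal "\n" inserted at
// the newline, and then EOF.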
//
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}