1package parser
2
3import (
4	"fmt"
5	"unicode"
6	"unicode/utf8"
7
8	"github.com/d5/tengo/v2/token"
9)
10
// bom is the Unicode byte order mark (U+FEFF). NewScanner silently skips one
// at the very beginning of the source; next reports it as an error when it
// appears at any later offset.
const bom = 0xFEFF
13
// ScanMode represents a scanner mode. It is a set of bit flags that Scan
// tests with a bitwise AND.
type ScanMode int

// List of scanner modes. Each value is a distinct bit.
const (
	// ScanComments makes Scan return comments as token.Comment tokens;
	// without it comments are skipped.
	ScanComments ScanMode = 1 << iota
	// DontInsertSemis stops Scan from recording that a semicolon should be
	// inserted before the next newline, i.e. it disables automatic
	// semicolon insertion.
	DontInsertSemis
)
22
// ScannerErrorHandler is an error handler for the scanner. When one is
// installed, the scanner calls it once per error with the resolved source
// position of the error and a description of the problem.
type ScannerErrorHandler func(pos SourceFilePos, msg string)
25
// Scanner reads the Tengo source text. It's based on Go's scanner
// implementation.
//
// A Scanner is created with NewScanner and then driven by repeated calls to
// Scan, each of which returns one token.
type Scanner struct {
	file         *SourceFile         // source file handle
	src          []byte              // source
	ch           rune                // current character (-1 at end of input)
	offset       int                 // character offset
	readOffset   int                 // reading offset (position after current character)
	lineOffset   int                 // current line offset
	insertSemi   bool                // insert a semicolon before next newline
	errorHandler ScannerErrorHandler // error reporting; or nil
	errorCount   int                 // number of errors encountered
	mode         ScanMode            // scanner mode flags
}
40
41// NewScanner creates a Scanner.
42func NewScanner(
43	file *SourceFile,
44	src []byte,
45	errorHandler ScannerErrorHandler,
46	mode ScanMode,
47) *Scanner {
48	if file.Size != len(src) {
49		panic(fmt.Sprintf("file size (%d) does not match src len (%d)",
50			file.Size, len(src)))
51	}
52
53	s := &Scanner{
54		file:         file,
55		src:          src,
56		errorHandler: errorHandler,
57		ch:           ' ',
58		mode:         mode,
59	}
60
61	s.next()
62	if s.ch == bom {
63		s.next() // ignore BOM at file beginning
64	}
65
66	return s
67}
68
69// ErrorCount returns the number of errors.
70func (s *Scanner) ErrorCount() int {
71	return s.errorCount
72}
73
74// Scan returns a token, token literal and its position.
75func (s *Scanner) Scan() (
76	tok token.Token,
77	literal string,
78	pos Pos,
79) {
80	s.skipWhitespace()
81
82	pos = s.file.FileSetPos(s.offset)
83
84	insertSemi := false
85
86	// determine token value
87	switch ch := s.ch; {
88	case isLetter(ch):
89		literal = s.scanIdentifier()
90		tok = token.Lookup(literal)
91		switch tok {
92		case token.Ident, token.Break, token.Continue, token.Return,
93			token.Export, token.True, token.False, token.Undefined:
94			insertSemi = true
95		}
96	case '0' <= ch && ch <= '9':
97		insertSemi = true
98		tok, literal = s.scanNumber(false)
99	default:
100		s.next() // always make progress
101
102		switch ch {
103		case -1: // EOF
104			if s.insertSemi {
105				s.insertSemi = false // EOF consumed
106				return token.Semicolon, "\n", pos
107			}
108			tok = token.EOF
109		case '\n':
110			// we only reach here if s.insertSemi was set in the first place
111			s.insertSemi = false // newline consumed
112			return token.Semicolon, "\n", pos
113		case '"':
114			insertSemi = true
115			tok = token.String
116			literal = s.scanString()
117		case '\'':
118			insertSemi = true
119			tok = token.Char
120			literal = s.scanRune()
121		case '`':
122			insertSemi = true
123			tok = token.String
124			literal = s.scanRawString()
125		case ':':
126			tok = s.switch2(token.Colon, token.Define)
127		case '.':
128			if '0' <= s.ch && s.ch <= '9' {
129				insertSemi = true
130				tok, literal = s.scanNumber(true)
131			} else {
132				tok = token.Period
133				if s.ch == '.' && s.peek() == '.' {
134					s.next()
135					s.next() // consume last '.'
136					tok = token.Ellipsis
137				}
138			}
139		case ',':
140			tok = token.Comma
141		case '?':
142			tok = token.Question
143		case ';':
144			tok = token.Semicolon
145			literal = ";"
146		case '(':
147			tok = token.LParen
148		case ')':
149			insertSemi = true
150			tok = token.RParen
151		case '[':
152			tok = token.LBrack
153		case ']':
154			insertSemi = true
155			tok = token.RBrack
156		case '{':
157			tok = token.LBrace
158		case '}':
159			insertSemi = true
160			tok = token.RBrace
161		case '+':
162			tok = s.switch3(token.Add, token.AddAssign, '+', token.Inc)
163			if tok == token.Inc {
164				insertSemi = true
165			}
166		case '-':
167			tok = s.switch3(token.Sub, token.SubAssign, '-', token.Dec)
168			if tok == token.Dec {
169				insertSemi = true
170			}
171		case '*':
172			tok = s.switch2(token.Mul, token.MulAssign)
173		case '/':
174			if s.ch == '/' || s.ch == '*' {
175				// comment
176				if s.insertSemi && s.findLineEnd() {
177					// reset position to the beginning of the comment
178					s.ch = '/'
179					s.offset = s.file.Offset(pos)
180					s.readOffset = s.offset + 1
181					s.insertSemi = false // newline consumed
182					return token.Semicolon, "\n", pos
183				}
184				comment := s.scanComment()
185				if s.mode&ScanComments == 0 {
186					// skip comment
187					s.insertSemi = false // newline consumed
188					return s.Scan()
189				}
190				tok = token.Comment
191				literal = comment
192			} else {
193				tok = s.switch2(token.Quo, token.QuoAssign)
194			}
195		case '%':
196			tok = s.switch2(token.Rem, token.RemAssign)
197		case '^':
198			tok = s.switch2(token.Xor, token.XorAssign)
199		case '<':
200			tok = s.switch4(token.Less, token.LessEq, '<',
201				token.Shl, token.ShlAssign)
202		case '>':
203			tok = s.switch4(token.Greater, token.GreaterEq, '>',
204				token.Shr, token.ShrAssign)
205		case '=':
206			tok = s.switch2(token.Assign, token.Equal)
207		case '!':
208			tok = s.switch2(token.Not, token.NotEqual)
209		case '&':
210			if s.ch == '^' {
211				s.next()
212				tok = s.switch2(token.AndNot, token.AndNotAssign)
213			} else {
214				tok = s.switch3(token.And, token.AndAssign, '&', token.LAnd)
215			}
216		case '|':
217			tok = s.switch3(token.Or, token.OrAssign, '|', token.LOr)
218		default:
219			// next reports unexpected BOMs - don't repeat
220			if ch != bom {
221				s.error(s.file.Offset(pos),
222					fmt.Sprintf("illegal character %#U", ch))
223			}
224			insertSemi = s.insertSemi // preserve insertSemi info
225			tok = token.Illegal
226			literal = string(ch)
227		}
228	}
229	if s.mode&DontInsertSemis == 0 {
230		s.insertSemi = insertSemi
231	}
232	return
233}
234
235func (s *Scanner) next() {
236	if s.readOffset < len(s.src) {
237		s.offset = s.readOffset
238		if s.ch == '\n' {
239			s.lineOffset = s.offset
240			s.file.AddLine(s.offset)
241		}
242		r, w := rune(s.src[s.readOffset]), 1
243		switch {
244		case r == 0:
245			s.error(s.offset, "illegal character NUL")
246		case r >= utf8.RuneSelf:
247			// not ASCII
248			r, w = utf8.DecodeRune(s.src[s.readOffset:])
249			if r == utf8.RuneError && w == 1 {
250				s.error(s.offset, "illegal UTF-8 encoding")
251			} else if r == bom && s.offset > 0 {
252				s.error(s.offset, "illegal byte order mark")
253			}
254		}
255		s.readOffset += w
256		s.ch = r
257	} else {
258		s.offset = len(s.src)
259		if s.ch == '\n' {
260			s.lineOffset = s.offset
261			s.file.AddLine(s.offset)
262		}
263		s.ch = -1 // eof
264	}
265}
266
267func (s *Scanner) peek() byte {
268	if s.readOffset < len(s.src) {
269		return s.src[s.readOffset]
270	}
271	return 0
272}
273
274func (s *Scanner) error(offset int, msg string) {
275	if s.errorHandler != nil {
276		s.errorHandler(s.file.Position(s.file.FileSetPos(offset)), msg)
277	}
278	s.errorCount++
279}
280
281func (s *Scanner) scanComment() string {
282	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
283	offs := s.offset - 1 // position of initial '/'
284	var numCR int
285
286	if s.ch == '/' {
287		//-style comment
288		// (the final '\n' is not considered part of the comment)
289		s.next()
290		for s.ch != '\n' && s.ch >= 0 {
291			if s.ch == '\r' {
292				numCR++
293			}
294			s.next()
295		}
296		goto exit
297	}
298
299	/*-style comment */
300	s.next()
301	for s.ch >= 0 {
302		ch := s.ch
303		if ch == '\r' {
304			numCR++
305		}
306		s.next()
307		if ch == '*' && s.ch == '/' {
308			s.next()
309			goto exit
310		}
311	}
312
313	s.error(offs, "comment not terminated")
314
315exit:
316	lit := s.src[offs:s.offset]
317
318	// On Windows, a (//-comment) line may end in "\r\n".
319	// Remove the final '\r' before analyzing the text for line directives (matching the compiler).
320	// Remove any other '\r' afterwards (matching the pre-existing behavior of the scanner).
321	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
322		lit = lit[:len(lit)-1]
323		numCR--
324	}
325	if numCR > 0 {
326		lit = StripCR(lit, lit[1] == '*')
327	}
328	return string(lit)
329}
330
331func (s *Scanner) findLineEnd() bool {
332	// initial '/' already consumed
333
334	defer func(offs int) {
335		// reset scanner state to where it was upon calling findLineEnd
336		s.ch = '/'
337		s.offset = offs
338		s.readOffset = offs + 1
339		s.next() // consume initial '/' again
340	}(s.offset - 1)
341
342	// read ahead until a newline, EOF, or non-comment tok is found
343	for s.ch == '/' || s.ch == '*' {
344		if s.ch == '/' {
345			//-style comment always contains a newline
346			return true
347		}
348		/*-style comment: look for newline */
349		s.next()
350		for s.ch >= 0 {
351			ch := s.ch
352			if ch == '\n' {
353				return true
354			}
355			s.next()
356			if ch == '*' && s.ch == '/' {
357				s.next()
358				break
359			}
360		}
361		s.skipWhitespace() // s.insertSemi is set
362		if s.ch < 0 || s.ch == '\n' {
363			return true
364		}
365		if s.ch != '/' {
366			// non-comment tok
367			return false
368		}
369		s.next() // consume '/'
370	}
371	return false
372}
373
374func (s *Scanner) scanIdentifier() string {
375	offs := s.offset
376	for isLetter(s.ch) || isDigit(s.ch) {
377		s.next()
378	}
379	return string(s.src[offs:s.offset])
380}
381
382func (s *Scanner) scanMantissa(base int) {
383	for digitVal(s.ch) < base {
384		s.next()
385	}
386}
387
388func (s *Scanner) scanNumber(
389	seenDecimalPoint bool,
390) (tok token.Token, lit string) {
391	// digitVal(s.ch) < 10
392	offs := s.offset
393	tok = token.Int
394
395	defer func() {
396		lit = string(s.src[offs:s.offset])
397	}()
398
399	if seenDecimalPoint {
400		offs--
401		tok = token.Float
402		s.scanMantissa(10)
403		goto exponent
404	}
405
406	if s.ch == '0' {
407		// int or float
408		offs := s.offset
409		s.next()
410		if s.ch == 'x' || s.ch == 'X' {
411			// hexadecimal int
412			s.next()
413			s.scanMantissa(16)
414			if s.offset-offs <= 2 {
415				// only scanned "0x" or "0X"
416				s.error(offs, "illegal hexadecimal number")
417			}
418		} else {
419			// octal int or float
420			seenDecimalDigit := false
421			s.scanMantissa(8)
422			if s.ch == '8' || s.ch == '9' {
423				// illegal octal int or float
424				seenDecimalDigit = true
425				s.scanMantissa(10)
426			}
427			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
428				goto fraction
429			}
430			// octal int
431			if seenDecimalDigit {
432				s.error(offs, "illegal octal number")
433			}
434		}
435		return
436	}
437
438	// decimal int or float
439	s.scanMantissa(10)
440
441fraction:
442	if s.ch == '.' {
443		tok = token.Float
444		s.next()
445		s.scanMantissa(10)
446	}
447
448exponent:
449	if s.ch == 'e' || s.ch == 'E' {
450		tok = token.Float
451		s.next()
452		if s.ch == '-' || s.ch == '+' {
453			s.next()
454		}
455		if digitVal(s.ch) < 10 {
456			s.scanMantissa(10)
457		} else {
458			s.error(offs, "illegal floating-point exponent")
459		}
460	}
461	return
462}
463
464func (s *Scanner) scanEscape(quote rune) bool {
465	offs := s.offset
466
467	var n int
468	var base, max uint32
469	switch s.ch {
470	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
471		s.next()
472		return true
473	case '0', '1', '2', '3', '4', '5', '6', '7':
474		n, base, max = 3, 8, 255
475	case 'x':
476		s.next()
477		n, base, max = 2, 16, 255
478	case 'u':
479		s.next()
480		n, base, max = 4, 16, unicode.MaxRune
481	case 'U':
482		s.next()
483		n, base, max = 8, 16, unicode.MaxRune
484	default:
485		msg := "unknown escape sequence"
486		if s.ch < 0 {
487			msg = "escape sequence not terminated"
488		}
489		s.error(offs, msg)
490		return false
491	}
492
493	var x uint32
494	for n > 0 {
495		d := uint32(digitVal(s.ch))
496		if d >= base {
497			msg := fmt.Sprintf(
498				"illegal character %#U in escape sequence", s.ch)
499			if s.ch < 0 {
500				msg = "escape sequence not terminated"
501			}
502			s.error(s.offset, msg)
503			return false
504		}
505		x = x*base + d
506		s.next()
507		n--
508	}
509
510	if x > max || 0xD800 <= x && x < 0xE000 {
511		s.error(offs, "escape sequence is invalid Unicode code point")
512		return false
513	}
514	return true
515}
516
517func (s *Scanner) scanRune() string {
518	offs := s.offset - 1 // '\'' opening already consumed
519
520	valid := true
521	n := 0
522	for {
523		ch := s.ch
524		if ch == '\n' || ch < 0 {
525			// only report error if we don't have one already
526			if valid {
527				s.error(offs, "rune literal not terminated")
528				valid = false
529			}
530			break
531		}
532		s.next()
533		if ch == '\'' {
534			break
535		}
536		n++
537		if ch == '\\' {
538			if !s.scanEscape('\'') {
539				valid = false
540			}
541			// continue to read to closing quote
542		}
543	}
544
545	if valid && n != 1 {
546		s.error(offs, "illegal rune literal")
547	}
548	return string(s.src[offs:s.offset])
549}
550
551func (s *Scanner) scanString() string {
552	offs := s.offset - 1 // '"' opening already consumed
553
554	for {
555		ch := s.ch
556		if ch == '\n' || ch < 0 {
557			s.error(offs, "string literal not terminated")
558			break
559		}
560		s.next()
561		if ch == '"' {
562			break
563		}
564		if ch == '\\' {
565			s.scanEscape('"')
566		}
567	}
568	return string(s.src[offs:s.offset])
569}
570
571func (s *Scanner) scanRawString() string {
572	offs := s.offset - 1 // '`' opening already consumed
573
574	hasCR := false
575	for {
576		ch := s.ch
577		if ch < 0 {
578			s.error(offs, "raw string literal not terminated")
579			break
580		}
581
582		s.next()
583
584		if ch == '`' {
585			break
586		}
587
588		if ch == '\r' {
589			hasCR = true
590		}
591	}
592
593	lit := s.src[offs:s.offset]
594	if hasCR {
595		lit = StripCR(lit, false)
596	}
597	return string(lit)
598}
599
// StripCR returns a copy of b with carriage return characters removed.
//
// When comment is true, b is treated as a /*-style comment and a '\r' that
// sits between a '*' and a '/' is kept, because removing it would produce a
// "*/" that terminates the comment too early. The exception is a '\r'
// directly after the opening "/*", which is removed since "/*/" does not
// close the comment.
func StripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for idx, c := range b {
		if c == '\r' {
			keep := comment &&
				len(out) > len("/*") &&
				out[len(out)-1] == '*' &&
				idx+1 < len(b) && b[idx+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, c)
	}
	return out
}
617
618func (s *Scanner) skipWhitespace() {
619	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi ||
620		s.ch == '\r' {
621		s.next()
622	}
623}
624
625func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
626	if s.ch == '=' {
627		s.next()
628		return tok1
629	}
630	return tok0
631}
632
633func (s *Scanner) switch3(
634	tok0, tok1 token.Token,
635	ch2 rune,
636	tok2 token.Token,
637) token.Token {
638	if s.ch == '=' {
639		s.next()
640		return tok1
641	}
642	if s.ch == ch2 {
643		s.next()
644		return tok2
645	}
646	return tok0
647}
648
649func (s *Scanner) switch4(
650	tok0, tok1 token.Token,
651	ch2 rune,
652	tok2, tok3 token.Token,
653) token.Token {
654	if s.ch == '=' {
655		s.next()
656		return tok1
657	}
658	if s.ch == ch2 {
659		s.next()
660		if s.ch == '=' {
661			s.next()
662			return tok3
663		}
664		return tok2
665	}
666	return tok0
667}
668
// isLetter reports whether ch may appear in an identifier as a letter: an
// ASCII letter, an underscore, or a non-ASCII Unicode letter.
func isLetter(ch rune) bool {
	switch {
	case ch == '_', 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z':
		return true
	case ch < utf8.RuneSelf:
		// every other ASCII character (and any negative value) is not a letter
		return false
	}
	return unicode.IsLetter(ch)
}
673
// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII
// Unicode digit.
func isDigit(ch rune) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	if ch < utf8.RuneSelf {
		return false
	}
	return unicode.IsDigit(ch)
}
678
// digitVal returns the numeric value of ch interpreted as a hexadecimal
// digit (0-15). Any other character yields 16, which is larger than every
// legal digit value in the bases the scanner uses.
func digitVal(ch rune) int {
	const notADigit = 16

	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return notADigit
}
690