package parser

import (
	"fmt"
	"unicode"
	"unicode/utf8"

	"github.com/d5/tengo/v2/token"
)

// byte order mark
const bom = 0xFEFF

// ScanMode represents a scanner mode.
type ScanMode int

// List of scanner modes.
const (
	ScanComments ScanMode = 1 << iota
	DontInsertSemis
)

// ScannerErrorHandler is an error handler for the scanner.
type ScannerErrorHandler func(pos SourceFilePos, msg string)

// Scanner reads the Tengo source text. It's based on Go's scanner
// implementation.
type Scanner struct {
	file         *SourceFile         // source file handle
	src          []byte              // source
	ch           rune                // current character
	offset       int                 // character offset
	readOffset   int                 // reading offset (position after current character)
	lineOffset   int                 // current line offset
	insertSemi   bool                // insert a semicolon before next newline
	errorHandler ScannerErrorHandler // error reporting; or nil
	errorCount   int                 // number of errors encountered
	mode         ScanMode
}

// NewScanner creates a Scanner.
func NewScanner(
	file *SourceFile,
	src []byte,
	errorHandler ScannerErrorHandler,
	mode ScanMode,
) *Scanner {
	if file.Size != len(src) {
		panic(fmt.Sprintf("file size (%d) does not match src len (%d)",
			file.Size, len(src)))
	}

	s := &Scanner{
		file:         file,
		src:          src,
		errorHandler: errorHandler,
		ch:           ' ',
		mode:         mode,
	}

	s.next()
	if s.ch == bom {
		s.next() // ignore BOM at file beginning
	}

	return s
}
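
// A minimal usage sketch. NewFileSet and (*SourceFileSet).AddFile are assumed
// to be this package's helpers for obtaining a *SourceFile; adjust to however
// the source file handle is constructed in the surrounding code:
//
//	src := []byte("a := 1")
//	fileSet := NewFileSet()
//	srcFile := fileSet.AddFile("example", -1, len(src))
//	s := NewScanner(srcFile, src, func(pos SourceFilePos, msg string) {
//		fmt.Printf("%s: %s\n", pos, msg)
//	}, 0)
//	for {
//		tok, lit, _ := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		fmt.Println(tok, lit)
//	}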

// ErrorCount returns the number of errors.
func (s *Scanner) ErrorCount() int {
	return s.errorCount
}

// Scan returns a token, token literal and its position.
func (s *Scanner) Scan() (
	tok token.Token,
	literal string,
	pos Pos,
) {
	s.skipWhitespace()

	pos = s.file.FileSetPos(s.offset)

	insertSemi := false

	// determine token value
	switch ch := s.ch; {
	case isLetter(ch):
		literal = s.scanIdentifier()
		tok = token.Lookup(literal)
		switch tok {
		case token.Ident, token.Break, token.Continue, token.Return,
			token.Export, token.True, token.False, token.Undefined:
			insertSemi = true
		}
	case '0' <= ch && ch <= '9':
		insertSemi = true
		tok, literal = s.scanNumber(false)
	default:
		s.next() // always make progress

		switch ch {
		case -1: // EOF
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return token.Semicolon, "\n", pos
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was set in the first place
			s.insertSemi = false // newline consumed
			return token.Semicolon, "\n", pos
		case '"':
			insertSemi = true
			tok = token.String
			literal = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.Char
			literal = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.String
			literal = s.scanRawString()
		case ':':
			tok = s.switch2(token.Colon, token.Define)
		case '.':
			if '0' <= s.ch && s.ch <= '9' {
				insertSemi = true
				tok, literal = s.scanNumber(true)
			} else {
				tok = token.Period
				if s.ch == '.' && s.peek() == '.' {
					s.next()
					s.next() // consume last '.'
					tok = token.Ellipsis
				}
			}
		case ',':
			tok = token.Comma
		case '?':
			tok = token.Question
		case ';':
			tok = token.Semicolon
			literal = ";"
		case '(':
			tok = token.LParen
		case ')':
			insertSemi = true
			tok = token.RParen
		case '[':
			tok = token.LBrack
		case ']':
			insertSemi = true
			tok = token.RBrack
		case '{':
			tok = token.LBrace
		case '}':
			insertSemi = true
			tok = token.RBrace
		case '+':
			tok = s.switch3(token.Add, token.AddAssign, '+', token.Inc)
			if tok == token.Inc {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.Sub, token.SubAssign, '-', token.Dec)
			if tok == token.Dec {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.Mul, token.MulAssign)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.readOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return token.Semicolon, "\n", pos
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					return s.Scan()
				}
				tok = token.Comment
				literal = comment
			} else {
				tok = s.switch2(token.Quo, token.QuoAssign)
			}
		case '%':
			tok = s.switch2(token.Rem, token.RemAssign)
		case '^':
			tok = s.switch2(token.Xor, token.XorAssign)
		case '<':
			tok = s.switch4(token.Less, token.LessEq, '<',
				token.Shl, token.ShlAssign)
		case '>':
			tok = s.switch4(token.Greater, token.GreaterEq, '>',
				token.Shr, token.ShrAssign)
		case '=':
			tok = s.switch2(token.Assign, token.Equal)
		case '!':
			tok = s.switch2(token.Not, token.NotEqual)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AndNot, token.AndNotAssign)
			} else {
				tok = s.switch3(token.And, token.AndAssign, '&', token.LAnd)
			}
		case '|':
			tok = s.switch3(token.Or, token.OrAssign, '|', token.LOr)
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.error(s.file.Offset(pos),
					fmt.Sprintf("illegal character %#U", ch))
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.Illegal
			literal = string(ch)
		}
	}
	if s.mode&DontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}
	return
}
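
// An illustration of the automatic semicolon insertion performed by Scan
// (a sketch, mirroring Go's rule): scanning "a := 1\nreturn a" yields
//
//	Ident("a"), Define, Int("1"), Semicolon("\n"),
//	Return, Ident("a"), Semicolon("\n") at EOF
//
// because identifiers, literals, ')', ']', '}', '++', '--', and the keywords
// listed above set insertSemi, and the next newline (or EOF) is then
// returned as token.Semicolon with literal "\n".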

func (s *Scanner) next() {
	if s.readOffset < len(s.src) {
		s.offset = s.readOffset
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.readOffset]), 1
		switch {
		case r == 0:
			s.error(s.offset, "illegal character NUL")
		case r >= utf8.RuneSelf:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.readOffset:])
			if r == utf8.RuneError && w == 1 {
				s.error(s.offset, "illegal UTF-8 encoding")
			} else if r == bom && s.offset > 0 {
				s.error(s.offset, "illegal byte order mark")
			}
		}
		s.readOffset += w
		s.ch = r
	} else {
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		s.ch = -1 // eof
	}
}

func (s *Scanner) peek() byte {
	if s.readOffset < len(s.src) {
		return s.src[s.readOffset]
	}
	return 0
}

func (s *Scanner) error(offset int, msg string) {
	if s.errorHandler != nil {
		s.errorHandler(s.file.Position(s.file.FileSetPos(offset)), msg)
	}
	s.errorCount++
}

func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	var numCR int

	if s.ch == '/' {
		//-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			goto exit
		}
	}

	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// On Windows, a (//-comment) line may end in "\r\n".
	// Remove the final '\r' before analyzing the text for line directives
	// (matching the compiler).
	// Remove any other '\r' afterwards (matching the pre-existing behavior
	// of the scanner).
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}
	if numCR > 0 {
		lit = StripCR(lit, lit[1] == '*')
	}
	return string(lit)
}

func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.readOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment tok is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment tok
			return false
		}
		s.next() // consume '/'
	}
	return false
}

func (s *Scanner) scanIdentifier() string {
	offs := s.offset
	for isLetter(s.ch) || isDigit(s.ch) {
		s.next()
	}
	return string(s.src[offs:s.offset])
}

func (s *Scanner) scanMantissa(base int) {
	for digitVal(s.ch) < base {
		s.next()
	}
}

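// scanNumber scans an integer or float literal. A sketch of the forms the
// code below accepts (not an exhaustive grammar): decimal ints ("42"), octal
// ints with a leading zero ("0755"), hex ints ("0xFF"), and floats with an
// optional fraction and exponent ("3.14", ".5", "1e9", "2.5e-3");
// seenDecimalPoint is true when the caller has already consumed a leading '.'.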
func (s *Scanner) scanNumber(
	seenDecimalPoint bool,
) (tok token.Token, lit string) {
	// digitVal(s.ch) < 10
	offs := s.offset
	tok = token.Int

	defer func() {
		lit = string(s.src[offs:s.offset])
	}()

	if seenDecimalPoint {
		offs--
		tok = token.Float
		s.scanMantissa(10)
		goto exponent
	}

	if s.ch == '0' {
		// int or float
		offs := s.offset
		s.next()
		if s.ch == 'x' || s.ch == 'X' {
			// hexadecimal int
			s.next()
			s.scanMantissa(16)
			if s.offset-offs <= 2 {
				// only scanned "0x" or "0X"
				s.error(offs, "illegal hexadecimal number")
			}
		} else {
			// octal int or float
			seenDecimalDigit := false
			s.scanMantissa(8)
			if s.ch == '8' || s.ch == '9' {
				// illegal octal int or float
				seenDecimalDigit = true
				s.scanMantissa(10)
			}
			if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
				goto fraction
			}
			// octal int
			if seenDecimalDigit {
				s.error(offs, "illegal octal number")
			}
		}
		return
	}

	// decimal int or float
	s.scanMantissa(10)

fraction:
	if s.ch == '.' {
		tok = token.Float
		s.next()
		s.scanMantissa(10)
	}

exponent:
	if s.ch == 'e' || s.ch == 'E' {
		tok = token.Float
		s.next()
		if s.ch == '-' || s.ch == '+' {
			s.next()
		}
		if digitVal(s.ch) < 10 {
			s.scanMantissa(10)
		} else {
			s.error(offs, "illegal floating-point exponent")
		}
	}
	return
}

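// scanEscape validates one escape sequence following a '\' inside a rune or
// string literal. A sketch of the accepted forms: the single-character
// escapes \a \b \f \n \r \t \v \\ and the surrounding quote, three octal
// digits (max 255), \x with two hex digits, \u with four hex digits, and
// \U with eight hex digits, rejecting values above the relevant maximum or
// in the surrogate range U+D800..U+DFFF.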
func (s *Scanner) scanEscape(quote rune) bool {
	offs := s.offset

	var n int
	var base, max uint32
	switch s.ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		s.next()
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		n, base, max = 3, 8, 255
	case 'x':
		s.next()
		n, base, max = 2, 16, 255
	case 'u':
		s.next()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		msg := "unknown escape sequence"
		if s.ch < 0 {
			msg = "escape sequence not terminated"
		}
		s.error(offs, msg)
		return false
	}

	var x uint32
	for n > 0 {
		d := uint32(digitVal(s.ch))
		if d >= base {
			msg := fmt.Sprintf(
				"illegal character %#U in escape sequence", s.ch)
			if s.ch < 0 {
				msg = "escape sequence not terminated"
			}
			s.error(s.offset, msg)
			return false
		}
		x = x*base + d
		s.next()
		n--
	}

	if x > max || 0xD800 <= x && x < 0xE000 {
		s.error(offs, "escape sequence is invalid Unicode code point")
		return false
	}
	return true
}

func (s *Scanner) scanRune() string {
	offs := s.offset - 1 // '\'' opening already consumed

	valid := true
	n := 0
	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			// only report error if we don't have one already
			if valid {
				s.error(offs, "rune literal not terminated")
				valid = false
			}
			break
		}
		s.next()
		if ch == '\'' {
			break
		}
		n++
		if ch == '\\' {
			if !s.scanEscape('\'') {
				valid = false
			}
			// continue to read to closing quote
		}
	}

	if valid && n != 1 {
		s.error(offs, "illegal rune literal")
	}
	return string(s.src[offs:s.offset])
}

func (s *Scanner) scanString() string {
	offs := s.offset - 1 // '"' opening already consumed

	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			s.error(offs, "string literal not terminated")
			break
		}
		s.next()
		if ch == '"' {
			break
		}
		if ch == '\\' {
			s.scanEscape('"')
		}
	}
	return string(s.src[offs:s.offset])
}

func (s *Scanner) scanRawString() string {
	offs := s.offset - 1 // '`' opening already consumed

	hasCR := false
	for {
		ch := s.ch
		if ch < 0 {
			s.error(offs, "raw string literal not terminated")
			break
		}

		s.next()

		if ch == '`' {
			break
		}

		if ch == '\r' {
			hasCR = true
		}
	}

	lit := s.src[offs:s.offset]
	if hasCR {
		lit = StripCR(lit, false)
	}
	return string(lit)
}

// StripCR removes carriage return characters.
func StripCR(b []byte, comment bool) []byte {
	c := make([]byte, len(b))
	i := 0
	for j, ch := range b {
		// In a /*-style comment, don't strip \r from *\r/ (incl. sequences of
		// \r from *\r\r...\r/) since the resulting */ would terminate the
		// comment too early unless the \r is immediately following the opening
		// /* in which case it's ok because /*/ is not closed yet.
		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' &&
			j+1 < len(b) && b[j+1] == '/' {
			c[i] = ch
			i++
		}
	}
	return c[:i]
}

func (s *Scanner) skipWhitespace() {
	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi ||
		s.ch == '\r' {
		s.next()
	}
}

func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	return tok0
}

func (s *Scanner) switch3(
	tok0, tok1 token.Token,
	ch2 rune,
	tok2 token.Token,
) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		return tok2
	}
	return tok0
}

func (s *Scanner) switch4(
	tok0, tok1 token.Token,
	ch2 rune,
	tok2, tok3 token.Token,
) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		if s.ch == '=' {
			s.next()
			return tok3
		}
		return tok2
	}
	return tok0
}

func isLetter(ch rune) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' ||
		ch >= utf8.RuneSelf && unicode.IsLetter(ch)
}

func isDigit(ch rune) bool {
	return '0' <= ch && ch <= '9' ||
		ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}

func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch - 'a' + 10)
	case 'A' <= ch && ch <= 'F':
		return int(ch - 'A' + 10)
	}
	return 16 // larger than any legal digit val
}