// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for Go source text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
package scanner

import (
	"bytes"
	"fmt"
	"go/token"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; eof (< 0) at end of source
	offset     int  // offset of the current character in src
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // offset in src at which the current line starts
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file
)

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
// For optimization, there is some overlap between this method and
// s.scanIdentifier.
61func (s *Scanner) next() { 62 if s.rdOffset < len(s.src) { 63 s.offset = s.rdOffset 64 if s.ch == '\n' { 65 s.lineOffset = s.offset 66 s.file.AddLine(s.offset) 67 } 68 r, w := rune(s.src[s.rdOffset]), 1 69 switch { 70 case r == 0: 71 s.error(s.offset, "illegal character NUL") 72 case r >= utf8.RuneSelf: 73 // not ASCII 74 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 75 if r == utf8.RuneError && w == 1 { 76 s.error(s.offset, "illegal UTF-8 encoding") 77 } else if r == bom && s.offset > 0 { 78 s.error(s.offset, "illegal byte order mark") 79 } 80 } 81 s.rdOffset += w 82 s.ch = r 83 } else { 84 s.offset = len(s.src) 85 if s.ch == '\n' { 86 s.lineOffset = s.offset 87 s.file.AddLine(s.offset) 88 } 89 s.ch = eof 90 } 91} 92 93// peek returns the byte following the most recently read character without 94// advancing the scanner. If the scanner is at EOF, peek returns 0. 95func (s *Scanner) peek() byte { 96 if s.rdOffset < len(s.src) { 97 return s.src[s.rdOffset] 98 } 99 return 0 100} 101 102// A mode value is a set of flags (or 0). 103// They control scanner behavior. 104// 105type Mode uint 106 107const ( 108 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 109 dontInsertSemis // do not automatically insert semicolons - for testing only 110) 111 112// Init prepares the scanner s to tokenize the text src by setting the 113// scanner at the beginning of src. The scanner uses the file set file 114// for position information and it adds line information for each line. 115// It is ok to re-use the same file when re-scanning the same file as 116// line information which is already present is ignored. Init causes a 117// panic if the file size does not match the src size. 118// 119// Calls to Scan will invoke the error handler err if they encounter a 120// syntax error and err is not nil. Also, for each error encountered, 121// the Scanner field ErrorCount is incremented by one. The mode parameter 122// determines how comments are handled. 
123// 124// Note that Init may call err if there is an error in the first character 125// of the file. 126// 127func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 128 // Explicitly initialize all fields since a scanner may be reused. 129 if file.Size() != len(src) { 130 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 131 } 132 s.file = file 133 s.dir, _ = filepath.Split(file.Name()) 134 s.src = src 135 s.err = err 136 s.mode = mode 137 138 s.ch = ' ' 139 s.offset = 0 140 s.rdOffset = 0 141 s.lineOffset = 0 142 s.insertSemi = false 143 s.ErrorCount = 0 144 145 s.next() 146 if s.ch == bom { 147 s.next() // ignore BOM at file beginning 148 } 149} 150 151func (s *Scanner) error(offs int, msg string) { 152 if s.err != nil { 153 s.err(s.file.Position(s.file.Pos(offs)), msg) 154 } 155 s.ErrorCount++ 156} 157 158func (s *Scanner) errorf(offs int, format string, args ...any) { 159 s.error(offs, fmt.Sprintf(format, args...)) 160} 161 162func (s *Scanner) scanComment() string { 163 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 164 offs := s.offset - 1 // position of initial '/' 165 next := -1 // position immediately following the comment; < 0 means invalid comment 166 numCR := 0 167 168 if s.ch == '/' { 169 //-style comment 170 // (the final '\n' is not considered part of the comment) 171 s.next() 172 for s.ch != '\n' && s.ch >= 0 { 173 if s.ch == '\r' { 174 numCR++ 175 } 176 s.next() 177 } 178 // if we are at '\n', the position following the comment is afterwards 179 next = s.offset 180 if s.ch == '\n' { 181 next++ 182 } 183 goto exit 184 } 185 186 /*-style comment */ 187 s.next() 188 for s.ch >= 0 { 189 ch := s.ch 190 if ch == '\r' { 191 numCR++ 192 } 193 s.next() 194 if ch == '*' && s.ch == '/' { 195 s.next() 196 next = s.offset 197 goto exit 198 } 199 } 200 201 s.error(offs, "comment not terminated") 202 203exit: 204 lit := s.src[offs:s.offset] 205 206 // On Windows, a (//-comment) line 
may end in "\r\n". 207 // Remove the final '\r' before analyzing the text for 208 // line directives (matching the compiler). Remove any 209 // other '\r' afterwards (matching the pre-existing be- 210 // havior of the scanner). 211 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' { 212 lit = lit[:len(lit)-1] 213 numCR-- 214 } 215 216 // interpret line directives 217 // (//line directives must start at the beginning of the current line) 218 if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) { 219 s.updateLineInfo(next, offs, lit) 220 } 221 222 if numCR > 0 { 223 lit = stripCR(lit, lit[1] == '*') 224 } 225 226 return string(lit) 227} 228 229var prefix = []byte("line ") 230 231// updateLineInfo parses the incoming comment text at offset offs 232// as a line directive. If successful, it updates the line info table 233// for the position next per the line directive. 234func (s *Scanner) updateLineInfo(next, offs int, text []byte) { 235 // extract comment text 236 if text[1] == '*' { 237 text = text[:len(text)-2] // lop off trailing "*/" 238 } 239 text = text[7:] // lop off leading "//line " or "/*line " 240 offs += 7 241 242 i, n, ok := trailingDigits(text) 243 if i == 0 { 244 return // ignore (not a line directive) 245 } 246 // i > 0 247 248 if !ok { 249 // text has a suffix :xxx but xxx is not a number 250 s.error(offs+i, "invalid line number: "+string(text[i:])) 251 return 252 } 253 254 var line, col int 255 i2, n2, ok2 := trailingDigits(text[:i-1]) 256 if ok2 { 257 //line filename:line:col 258 i, i2 = i2, i 259 line, col = n2, n 260 if col == 0 { 261 s.error(offs+i2, "invalid column number: "+string(text[i2:])) 262 return 263 } 264 text = text[:i2-1] // lop off ":col" 265 } else { 266 //line filename:line 267 line = n 268 } 269 270 if line == 0 { 271 s.error(offs+i, "invalid line number: "+string(text[i:])) 272 return 273 } 274 275 // If we have a column (//line 
filename:line:col form), 276 // an empty filename means to use the previous filename. 277 filename := string(text[:i-1]) // lop off ":line", and trim white space 278 if filename == "" && ok2 { 279 filename = s.file.Position(s.file.Pos(offs)).Filename 280 } else if filename != "" { 281 // Put a relative filename in the current directory. 282 // This is for compatibility with earlier releases. 283 // See issue 26671. 284 filename = filepath.Clean(filename) 285 if !filepath.IsAbs(filename) { 286 filename = filepath.Join(s.dir, filename) 287 } 288 } 289 290 s.file.AddLineColumnInfo(next, filename, line, col) 291} 292 293func trailingDigits(text []byte) (int, int, bool) { 294 i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':') 295 if i < 0 { 296 return 0, 0, false // no ":" 297 } 298 // i >= 0 299 n, err := strconv.ParseUint(string(text[i+1:]), 10, 0) 300 return i + 1, int(n), err == nil 301} 302 303func (s *Scanner) findLineEnd() bool { 304 // initial '/' already consumed 305 306 defer func(offs int) { 307 // reset scanner state to where it was upon calling findLineEnd 308 s.ch = '/' 309 s.offset = offs 310 s.rdOffset = offs + 1 311 s.next() // consume initial '/' again 312 }(s.offset - 1) 313 314 // read ahead until a newline, EOF, or non-comment token is found 315 for s.ch == '/' || s.ch == '*' { 316 if s.ch == '/' { 317 //-style comment always contains a newline 318 return true 319 } 320 /*-style comment: look for newline */ 321 s.next() 322 for s.ch >= 0 { 323 ch := s.ch 324 if ch == '\n' { 325 return true 326 } 327 s.next() 328 if ch == '*' && s.ch == '/' { 329 s.next() 330 break 331 } 332 } 333 s.skipWhitespace() // s.insertSemi is set 334 if s.ch < 0 || s.ch == '\n' { 335 return true 336 } 337 if s.ch != '/' { 338 // non-comment token 339 return false 340 } 341 s.next() // consume '/' 342 } 343 344 return false 345} 346 347func isLetter(ch rune) bool { 348 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= 
utf8.RuneSelf && unicode.IsLetter(ch) 349} 350 351func isDigit(ch rune) bool { 352 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch) 353} 354 355// scanIdentifier reads the string of valid identifier characters at s.offset. 356// It must only be called when s.ch is known to be a valid letter. 357// 358// Be careful when making changes to this function: it is optimized and affects 359// scanning performance significantly. 360func (s *Scanner) scanIdentifier() string { 361 offs := s.offset 362 363 // Optimize for the common case of an ASCII identifier. 364 // 365 // Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and 366 // avoids conversions to runes. 367 // 368 // In case we encounter a non-ASCII character, fall back on the slower path 369 // of calling into s.next(). 370 for rdOffset, b := range s.src[s.rdOffset:] { 371 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' { 372 // Avoid assigning a rune for the common case of an ascii character. 373 continue 374 } 375 s.rdOffset += rdOffset 376 if 0 < b && b < utf8.RuneSelf { 377 // Optimization: we've encountered an ASCII character that's not a letter 378 // or number. Avoid the call into s.next() and corresponding set up. 379 // 380 // Note that s.next() does some line accounting if s.ch is '\n', so this 381 // shortcut is only possible because we know that the preceding character 382 // is not '\n'. 383 s.ch = rune(b) 384 s.offset = s.rdOffset 385 s.rdOffset++ 386 goto exit 387 } 388 // We know that the preceding character is valid for an identifier because 389 // scanIdentifier is only called when s.ch is a letter, so calling s.next() 390 // at s.rdOffset resets the scanner state. 
391 s.next() 392 for isLetter(s.ch) || isDigit(s.ch) { 393 s.next() 394 } 395 goto exit 396 } 397 s.offset = len(s.src) 398 s.rdOffset = len(s.src) 399 s.ch = eof 400 401exit: 402 return string(s.src[offs:s.offset]) 403} 404 405func digitVal(ch rune) int { 406 switch { 407 case '0' <= ch && ch <= '9': 408 return int(ch - '0') 409 case 'a' <= lower(ch) && lower(ch) <= 'f': 410 return int(lower(ch) - 'a' + 10) 411 } 412 return 16 // larger than any legal digit val 413} 414 415func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter 416func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 417func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' } 418 419// digits accepts the sequence { digit | '_' }. 420// If base <= 10, digits accepts any decimal digit but records 421// the offset (relative to the source start) of a digit >= base 422// in *invalid, if *invalid < 0. 423// digits returns a bitset describing whether the sequence contained 424// digits (bit 0 is set), or separators '_' (bit 1 is set). 425func (s *Scanner) digits(base int, invalid *int) (digsep int) { 426 if base <= 10 { 427 max := rune('0' + base) 428 for isDecimal(s.ch) || s.ch == '_' { 429 ds := 1 430 if s.ch == '_' { 431 ds = 2 432 } else if s.ch >= max && *invalid < 0 { 433 *invalid = s.offset // record invalid rune offset 434 } 435 digsep |= ds 436 s.next() 437 } 438 } else { 439 for isHex(s.ch) || s.ch == '_' { 440 ds := 1 441 if s.ch == '_' { 442 ds = 2 443 } 444 digsep |= ds 445 s.next() 446 } 447 } 448 return 449} 450 451func (s *Scanner) scanNumber() (token.Token, string) { 452 offs := s.offset 453 tok := token.ILLEGAL 454 455 base := 10 // number base 456 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' 457 digsep := 0 // bit 0: digit present, bit 1: '_' present 458 invalid := -1 // index of invalid digit in literal, or < 0 459 460 // integer part 461 if s.ch != '.' 
{ 462 tok = token.INT 463 if s.ch == '0' { 464 s.next() 465 switch lower(s.ch) { 466 case 'x': 467 s.next() 468 base, prefix = 16, 'x' 469 case 'o': 470 s.next() 471 base, prefix = 8, 'o' 472 case 'b': 473 s.next() 474 base, prefix = 2, 'b' 475 default: 476 base, prefix = 8, '0' 477 digsep = 1 // leading 0 478 } 479 } 480 digsep |= s.digits(base, &invalid) 481 } 482 483 // fractional part 484 if s.ch == '.' { 485 tok = token.FLOAT 486 if prefix == 'o' || prefix == 'b' { 487 s.error(s.offset, "invalid radix point in "+litname(prefix)) 488 } 489 s.next() 490 digsep |= s.digits(base, &invalid) 491 } 492 493 if digsep&1 == 0 { 494 s.error(s.offset, litname(prefix)+" has no digits") 495 } 496 497 // exponent 498 if e := lower(s.ch); e == 'e' || e == 'p' { 499 switch { 500 case e == 'e' && prefix != 0 && prefix != '0': 501 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch) 502 case e == 'p' && prefix != 'x': 503 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch) 504 } 505 s.next() 506 tok = token.FLOAT 507 if s.ch == '+' || s.ch == '-' { 508 s.next() 509 } 510 ds := s.digits(10, nil) 511 digsep |= ds 512 if ds&1 == 0 { 513 s.error(s.offset, "exponent has no digits") 514 } 515 } else if prefix == 'x' && tok == token.FLOAT { 516 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent") 517 } 518 519 // suffix 'i' 520 if s.ch == 'i' { 521 tok = token.IMAG 522 s.next() 523 } 524 525 lit := string(s.src[offs:s.offset]) 526 if tok == token.INT && invalid >= 0 { 527 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix)) 528 } 529 if digsep&2 != 0 { 530 if i := invalidSep(lit); i >= 0 { 531 s.error(offs+i, "'_' must separate successive digits") 532 } 533 } 534 535 return tok, lit 536} 537 538func litname(prefix rune) string { 539 switch prefix { 540 case 'x': 541 return "hexadecimal literal" 542 case 'o', '0': 543 return "octal literal" 544 case 'b': 545 return "binary literal" 546 } 547 return "decimal literal" 
548} 549 550// invalidSep returns the index of the first invalid separator in x, or -1. 551func invalidSep(x string) int { 552 x1 := ' ' // prefix char, we only care if it's 'x' 553 d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else) 554 i := 0 555 556 // a prefix counts as a digit 557 if len(x) >= 2 && x[0] == '0' { 558 x1 = lower(rune(x[1])) 559 if x1 == 'x' || x1 == 'o' || x1 == 'b' { 560 d = '0' 561 i = 2 562 } 563 } 564 565 // mantissa and exponent 566 for ; i < len(x); i++ { 567 p := d // previous digit 568 d = rune(x[i]) 569 switch { 570 case d == '_': 571 if p != '0' { 572 return i 573 } 574 case isDecimal(d) || x1 == 'x' && isHex(d): 575 d = '0' 576 default: 577 if p == '_' { 578 return i - 1 579 } 580 d = '.' 581 } 582 } 583 if d == '_' { 584 return len(x) - 1 585 } 586 587 return -1 588} 589 590// scanEscape parses an escape sequence where rune is the accepted 591// escaped quote. In case of a syntax error, it stops at the offending 592// character (without consuming it) and returns false. Otherwise 593// it returns true. 
594func (s *Scanner) scanEscape(quote rune) bool { 595 offs := s.offset 596 597 var n int 598 var base, max uint32 599 switch s.ch { 600 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 601 s.next() 602 return true 603 case '0', '1', '2', '3', '4', '5', '6', '7': 604 n, base, max = 3, 8, 255 605 case 'x': 606 s.next() 607 n, base, max = 2, 16, 255 608 case 'u': 609 s.next() 610 n, base, max = 4, 16, unicode.MaxRune 611 case 'U': 612 s.next() 613 n, base, max = 8, 16, unicode.MaxRune 614 default: 615 msg := "unknown escape sequence" 616 if s.ch < 0 { 617 msg = "escape sequence not terminated" 618 } 619 s.error(offs, msg) 620 return false 621 } 622 623 var x uint32 624 for n > 0 { 625 d := uint32(digitVal(s.ch)) 626 if d >= base { 627 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch) 628 if s.ch < 0 { 629 msg = "escape sequence not terminated" 630 } 631 s.error(s.offset, msg) 632 return false 633 } 634 x = x*base + d 635 s.next() 636 n-- 637 } 638 639 if x > max || 0xD800 <= x && x < 0xE000 { 640 s.error(offs, "escape sequence is invalid Unicode code point") 641 return false 642 } 643 644 return true 645} 646 647func (s *Scanner) scanRune() string { 648 // '\'' opening already consumed 649 offs := s.offset - 1 650 651 valid := true 652 n := 0 653 for { 654 ch := s.ch 655 if ch == '\n' || ch < 0 { 656 // only report error if we don't have one already 657 if valid { 658 s.error(offs, "rune literal not terminated") 659 valid = false 660 } 661 break 662 } 663 s.next() 664 if ch == '\'' { 665 break 666 } 667 n++ 668 if ch == '\\' { 669 if !s.scanEscape('\'') { 670 valid = false 671 } 672 // continue to read to closing quote 673 } 674 } 675 676 if valid && n != 1 { 677 s.error(offs, "illegal rune literal") 678 } 679 680 return string(s.src[offs:s.offset]) 681} 682 683func (s *Scanner) scanString() string { 684 // '"' opening already consumed 685 offs := s.offset - 1 686 687 for { 688 ch := s.ch 689 if ch == '\n' || ch < 0 { 690 s.error(offs, "string 
literal not terminated") 691 break 692 } 693 s.next() 694 if ch == '"' { 695 break 696 } 697 if ch == '\\' { 698 s.scanEscape('"') 699 } 700 } 701 702 return string(s.src[offs:s.offset]) 703} 704 705func stripCR(b []byte, comment bool) []byte { 706 c := make([]byte, len(b)) 707 i := 0 708 for j, ch := range b { 709 // In a /*-style comment, don't strip \r from *\r/ (incl. 710 // sequences of \r from *\r\r...\r/) since the resulting 711 // */ would terminate the comment too early unless the \r 712 // is immediately following the opening /* in which case 713 // it's ok because /*/ is not closed yet (issue #11151). 714 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' { 715 c[i] = ch 716 i++ 717 } 718 } 719 return c[:i] 720} 721 722func (s *Scanner) scanRawString() string { 723 // '`' opening already consumed 724 offs := s.offset - 1 725 726 hasCR := false 727 for { 728 ch := s.ch 729 if ch < 0 { 730 s.error(offs, "raw string literal not terminated") 731 break 732 } 733 s.next() 734 if ch == '`' { 735 break 736 } 737 if ch == '\r' { 738 hasCR = true 739 } 740 } 741 742 lit := s.src[offs:s.offset] 743 if hasCR { 744 lit = stripCR(lit, false) 745 } 746 747 return string(lit) 748} 749 750func (s *Scanner) skipWhitespace() { 751 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 752 s.next() 753 } 754} 755 756// Helper functions for scanning multi-byte tokens such as >> += >>= . 757// Different routines recognize different length tok_i based on matches 758// of ch_i. If a token ends in '=', the result is tok1 or tok3 759// respectively. Otherwise, the result is tok0 if there was no other 760// matching character, or tok2 if the matching character was ch2. 
761 762func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 763 if s.ch == '=' { 764 s.next() 765 return tok1 766 } 767 return tok0 768} 769 770func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 771 if s.ch == '=' { 772 s.next() 773 return tok1 774 } 775 if s.ch == ch2 { 776 s.next() 777 return tok2 778 } 779 return tok0 780} 781 782func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 783 if s.ch == '=' { 784 s.next() 785 return tok1 786 } 787 if s.ch == ch2 { 788 s.next() 789 if s.ch == '=' { 790 s.next() 791 return tok3 792 } 793 return tok2 794 } 795 return tok0 796} 797 798// Scan scans the next token and returns the token position, the token, 799// and its literal string if applicable. The source end is indicated by 800// token.EOF. 801// 802// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, 803// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string 804// has the corresponding value. 805// 806// If the returned token is a keyword, the literal string is the keyword. 807// 808// If the returned token is token.SEMICOLON, the corresponding 809// literal string is ";" if the semicolon was present in the source, 810// and "\n" if the semicolon was inserted because of a newline or 811// at EOF. 812// 813// If the returned token is token.ILLEGAL, the literal string is the 814// offending character. 815// 816// In all other cases, Scan returns an empty literal string. 817// 818// For more tolerant parsing, Scan will return a valid token if 819// possible even if a syntax error was encountered. Thus, even 820// if the resulting token sequence contains no illegal tokens, 821// a client may not assume that no error occurred. Instead it 822// must check the scanner's ErrorCount or the number of calls 823// of the error handler, if there was one installed. 
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	// insertSemi records whether the token scanned by this call may be
	// followed by an automatically inserted semicolon; it is copied to
	// s.insertSemi at the end unless dontInsertSemis is set.
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		// From here on ch is the character being classified and s.ch is
		// already the character that follows it.
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// A semicolon is pending and the line ends at or within
					// the comment: return the semicolon first and leave the
					// comment to be scanned by the next call.
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	// pos, tok and lit are named results and were assigned above.
	return
}