1// Package scanner implements a scanner for HCL (HashiCorp Configuration 2// Language) source text. 3package scanner 4 5import ( 6 "bytes" 7 "fmt" 8 "os" 9 "regexp" 10 "unicode" 11 "unicode/utf8" 12 13 "github.com/hashicorp/hcl/hcl/token" 14) 15 16// eof represents a marker rune for the end of the reader. 17const eof = rune(0) 18 19// Scanner defines a lexical scanner 20type Scanner struct { 21 buf *bytes.Buffer // Source buffer for advancing and scanning 22 src []byte // Source buffer for immutable access 23 24 // Source Position 25 srcPos token.Pos // current position 26 prevPos token.Pos // previous position, used for peek() method 27 28 lastCharLen int // length of last character in bytes 29 lastLineLen int // length of last line in characters (for correct column reporting) 30 31 tokStart int // token text start position 32 tokEnd int // token text end position 33 34 // Error is called for each error encountered. If no Error 35 // function is set, the error is reported to os.Stderr. 36 Error func(pos token.Pos, msg string) 37 38 // ErrorCount is incremented by one for each error encountered. 39 ErrorCount int 40 41 // tokPos is the start position of most recently scanned token; set by 42 // Scan. The Filename field is always left untouched by the Scanner. If 43 // an error is reported (via Error) and Position is invalid, the scanner is 44 // not inside a token. 45 tokPos token.Pos 46} 47 48// New creates and initializes a new instance of Scanner using src as 49// its source content. 50func New(src []byte) *Scanner { 51 // even though we accept a src, we read from a io.Reader compatible type 52 // (*bytes.Buffer). So in the future we might easily change it to streaming 53 // read. 54 b := bytes.NewBuffer(src) 55 s := &Scanner{ 56 buf: b, 57 src: src, 58 } 59 60 // srcPosition always starts with 1 61 s.srcPos.Line = 1 62 return s 63} 64 65// next reads the next rune from the bufferred reader. Returns the rune(0) if 66// an error occurs (or io.EOF is returned). 67func (s *Scanner) next() rune { 68 ch, size, err := s.buf.ReadRune() 69 if err != nil { 70 // advance for error reporting 71 s.srcPos.Column++ 72 s.srcPos.Offset += size 73 s.lastCharLen = size 74 return eof 75 } 76 77 // remember last position 78 s.prevPos = s.srcPos 79 80 s.srcPos.Column++ 81 s.lastCharLen = size 82 s.srcPos.Offset += size 83 84 if ch == utf8.RuneError && size == 1 { 85 s.err("illegal UTF-8 encoding") 86 return ch 87 } 88 89 if ch == '\n' { 90 s.srcPos.Line++ 91 s.lastLineLen = s.srcPos.Column 92 s.srcPos.Column = 0 93 } 94 95 if ch == '\x00' { 96 s.err("unexpected null character (0x00)") 97 return eof 98 } 99 100 if ch == '\uE123' { 101 s.err("unicode code point U+E123 reserved for internal use") 102 return utf8.RuneError 103 } 104 105 // debug 106 // fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column) 107 return ch 108} 109 110// unread unreads the previous read Rune and updates the source position 111func (s *Scanner) unread() { 112 if err := s.buf.UnreadRune(); err != nil { 113 panic(err) // this is user fault, we should catch it 114 } 115 s.srcPos = s.prevPos // put back last position 116} 117 118// peek returns the next rune without advancing the reader. 119func (s *Scanner) peek() rune { 120 peek, _, err := s.buf.ReadRune() 121 if err != nil { 122 return eof 123 } 124 125 s.buf.UnreadRune() 126 return peek 127} 128 129// Scan scans the next token and returns the token. 130func (s *Scanner) Scan() token.Token { 131 ch := s.next() 132 133 // skip white space 134 for isWhitespace(ch) { 135 ch = s.next() 136 } 137 138 var tok token.Type 139 140 // token text markings 141 s.tokStart = s.srcPos.Offset - s.lastCharLen 142 143 // token position, initial next() is moving the offset by one(size of rune 144 // actually), though we are interested with the starting point 145 s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen 146 if s.srcPos.Column > 0 { 147 // common case: last character was not a '\n' 148 s.tokPos.Line = s.srcPos.Line 149 s.tokPos.Column = s.srcPos.Column 150 } else { 151 // last character was a '\n' 152 // (we cannot be at the beginning of the source 153 // since we have called next() at least once) 154 s.tokPos.Line = s.srcPos.Line - 1 155 s.tokPos.Column = s.lastLineLen 156 } 157 158 switch { 159 case isLetter(ch): 160 tok = token.IDENT 161 lit := s.scanIdentifier() 162 if lit == "true" || lit == "false" { 163 tok = token.BOOL 164 } 165 case isDecimal(ch): 166 tok = s.scanNumber(ch) 167 default: 168 switch ch { 169 case eof: 170 tok = token.EOF 171 case '"': 172 tok = token.STRING 173 s.scanString() 174 case '#', '/': 175 tok = token.COMMENT 176 s.scanComment(ch) 177 case '.': 178 tok = token.PERIOD 179 ch = s.peek() 180 if isDecimal(ch) { 181 tok = token.FLOAT 182 ch = s.scanMantissa(ch) 183 ch = s.scanExponent(ch) 184 } 185 case '<': 186 tok = token.HEREDOC 187 s.scanHeredoc() 188 case '[': 189 tok = token.LBRACK 190 case ']': 191 tok = token.RBRACK 192 case '{': 193 tok = token.LBRACE 194 case '}': 195 tok = token.RBRACE 196 case ',': 197 tok = token.COMMA 198 case '=': 199 tok = token.ASSIGN 200 case '+': 201 tok = token.ADD 202 case '-': 203 if isDecimal(s.peek()) { 204 ch := s.next() 205 tok = s.scanNumber(ch) 206 } else { 207 tok = token.SUB 208 } 209 default: 210 s.err("illegal char") 211 } 212 } 213 214 // finish token ending 215 s.tokEnd = s.srcPos.Offset 216 217 // create token literal 218 var tokenText string 219 if s.tokStart >= 0 { 220 tokenText = string(s.src[s.tokStart:s.tokEnd]) 221 } 222 s.tokStart = s.tokEnd // ensure idempotency of tokenText() call 223 224 return token.Token{ 225 Type: tok, 226 Pos: s.tokPos, 227 Text: tokenText, 228 } 229} 230 231func (s *Scanner) scanComment(ch rune) { 232 // single line comments 233 if ch == '#' || (ch == '/' && s.peek() != '*') { 234 if ch == '/' && s.peek() != '/' { 235 s.err("expected '/' for comment") 236 return 237 } 238 239 ch = s.next() 240 for ch != '\n' && ch >= 0 && ch != eof { 241 ch = s.next() 242 } 243 if ch != eof && ch >= 0 { 244 s.unread() 245 } 246 return 247 } 248 249 // be sure we get the character after /* This allows us to find comment's 250 // that are not erminated 251 if ch == '/' { 252 s.next() 253 ch = s.next() // read character after "/*" 254 } 255 256 // look for /* - style comments 257 for { 258 if ch < 0 || ch == eof { 259 s.err("comment not terminated") 260 break 261 } 262 263 ch0 := ch 264 ch = s.next() 265 if ch0 == '*' && ch == '/' { 266 break 267 } 268 } 269} 270 271// scanNumber scans a HCL number definition starting with the given rune 272func (s *Scanner) scanNumber(ch rune) token.Type { 273 if ch == '0' { 274 // check for hexadecimal, octal or float 275 ch = s.next() 276 if ch == 'x' || ch == 'X' { 277 // hexadecimal 278 ch = s.next() 279 found := false 280 for isHexadecimal(ch) { 281 ch = s.next() 282 found = true 283 } 284 285 if !found { 286 s.err("illegal hexadecimal number") 287 } 288 289 if ch != eof { 290 s.unread() 291 } 292 293 return token.NUMBER 294 } 295 296 // now it's either something like: 0421(octal) or 0.1231(float) 297 illegalOctal := false 298 for isDecimal(ch) { 299 ch = s.next() 300 if ch == '8' || ch == '9' { 301 // this is just a possibility. For example 0159 is illegal, but 302 // 0159.23 is valid. So we mark a possible illegal octal. If 303 // the next character is not a period, we'll print the error. 304 illegalOctal = true 305 } 306 } 307 308 if ch == 'e' || ch == 'E' { 309 ch = s.scanExponent(ch) 310 return token.FLOAT 311 } 312 313 if ch == '.' { 314 ch = s.scanFraction(ch) 315 316 if ch == 'e' || ch == 'E' { 317 ch = s.next() 318 ch = s.scanExponent(ch) 319 } 320 return token.FLOAT 321 } 322 323 if illegalOctal { 324 s.err("illegal octal number") 325 } 326 327 if ch != eof { 328 s.unread() 329 } 330 return token.NUMBER 331 } 332 333 s.scanMantissa(ch) 334 ch = s.next() // seek forward 335 if ch == 'e' || ch == 'E' { 336 ch = s.scanExponent(ch) 337 return token.FLOAT 338 } 339 340 if ch == '.' { 341 ch = s.scanFraction(ch) 342 if ch == 'e' || ch == 'E' { 343 ch = s.next() 344 ch = s.scanExponent(ch) 345 } 346 return token.FLOAT 347 } 348 349 if ch != eof { 350 s.unread() 351 } 352 return token.NUMBER 353} 354 355// scanMantissa scans the mantissa beginning from the rune. It returns the next 356// non decimal rune. It's used to determine wheter it's a fraction or exponent. 357func (s *Scanner) scanMantissa(ch rune) rune { 358 scanned := false 359 for isDecimal(ch) { 360 ch = s.next() 361 scanned = true 362 } 363 364 if scanned && ch != eof { 365 s.unread() 366 } 367 return ch 368} 369 370// scanFraction scans the fraction after the '.' rune 371func (s *Scanner) scanFraction(ch rune) rune { 372 if ch == '.' { 373 ch = s.peek() // we peek just to see if we can move forward 374 ch = s.scanMantissa(ch) 375 } 376 return ch 377} 378 379// scanExponent scans the remaining parts of an exponent after the 'e' or 'E' 380// rune. 381func (s *Scanner) scanExponent(ch rune) rune { 382 if ch == 'e' || ch == 'E' { 383 ch = s.next() 384 if ch == '-' || ch == '+' { 385 ch = s.next() 386 } 387 ch = s.scanMantissa(ch) 388 } 389 return ch 390} 391 392// scanHeredoc scans a heredoc string 393func (s *Scanner) scanHeredoc() { 394 // Scan the second '<' in example: '<<EOF' 395 if s.next() != '<' { 396 s.err("heredoc expected second '<', didn't see it") 397 return 398 } 399 400 // Get the original offset so we can read just the heredoc ident 401 offs := s.srcPos.Offset 402 403 // Scan the identifier 404 ch := s.next() 405 406 // Indented heredoc syntax 407 if ch == '-' { 408 ch = s.next() 409 } 410 411 for isLetter(ch) || isDigit(ch) { 412 ch = s.next() 413 } 414 415 // If we reached an EOF then that is not good 416 if ch == eof { 417 s.err("heredoc not terminated") 418 return 419 } 420 421 // Ignore the '\r' in Windows line endings 422 if ch == '\r' { 423 if s.peek() == '\n' { 424 ch = s.next() 425 } 426 } 427 428 // If we didn't reach a newline then that is also not good 429 if ch != '\n' { 430 s.err("invalid characters in heredoc anchor") 431 return 432 } 433 434 // Read the identifier 435 identBytes := s.src[offs : s.srcPos.Offset-s.lastCharLen] 436 if len(identBytes) == 0 || (len(identBytes) == 1 && identBytes[0] == '-') { 437 s.err("zero-length heredoc anchor") 438 return 439 } 440 441 var identRegexp *regexp.Regexp 442 if identBytes[0] == '-' { 443 identRegexp = regexp.MustCompile(fmt.Sprintf(`^[[:space:]]*%s\r*\z`, identBytes[1:])) 444 } else { 445 identRegexp = regexp.MustCompile(fmt.Sprintf(`^[[:space:]]*%s\r*\z`, identBytes)) 446 } 447 448 // Read the actual string value 449 lineStart := s.srcPos.Offset 450 for { 451 ch := s.next() 452 453 // Special newline handling. 454 if ch == '\n' { 455 // Math is fast, so we first compare the byte counts to see if we have a chance 456 // of seeing the same identifier - if the length is less than the number of bytes 457 // in the identifier, this cannot be a valid terminator. 458 lineBytesLen := s.srcPos.Offset - s.lastCharLen - lineStart 459 if lineBytesLen >= len(identBytes) && identRegexp.Match(s.src[lineStart:s.srcPos.Offset-s.lastCharLen]) { 460 break 461 } 462 463 // Not an anchor match, record the start of a new line 464 lineStart = s.srcPos.Offset 465 } 466 467 if ch == eof { 468 s.err("heredoc not terminated") 469 return 470 } 471 } 472 473 return 474} 475 476// scanString scans a quoted string 477func (s *Scanner) scanString() { 478 braces := 0 479 for { 480 // '"' opening already consumed 481 // read character after quote 482 ch := s.next() 483 484 if (ch == '\n' && braces == 0) || ch < 0 || ch == eof { 485 s.err("literal not terminated") 486 return 487 } 488 489 if ch == '"' && braces == 0 { 490 break 491 } 492 493 // If we're going into a ${} then we can ignore quotes for awhile 494 if braces == 0 && ch == '$' && s.peek() == '{' { 495 braces++ 496 s.next() 497 } else if braces > 0 && ch == '{' { 498 braces++ 499 } 500 if braces > 0 && ch == '}' { 501 braces-- 502 } 503 504 if ch == '\\' { 505 s.scanEscape() 506 } 507 } 508 509 return 510} 511 512// scanEscape scans an escape sequence 513func (s *Scanner) scanEscape() rune { 514 // http://en.cppreference.com/w/cpp/language/escape 515 ch := s.next() // read character after '/' 516 switch ch { 517 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"': 518 // nothing to do 519 case '0', '1', '2', '3', '4', '5', '6', '7': 520 // octal notation 521 ch = s.scanDigits(ch, 8, 3) 522 case 'x': 523 // hexademical notation 524 ch = s.scanDigits(s.next(), 16, 2) 525 case 'u': 526 // universal character name 527 ch = s.scanDigits(s.next(), 16, 4) 528 case 'U': 529 // universal character name 530 ch = s.scanDigits(s.next(), 16, 8) 531 default: 532 s.err("illegal char escape") 533 } 534 return ch 535} 536 537// scanDigits scans a rune with the given base for n times. For example an 538// octal notation \184 would yield in scanDigits(ch, 8, 3) 539func (s *Scanner) scanDigits(ch rune, base, n int) rune { 540 start := n 541 for n > 0 && digitVal(ch) < base { 542 ch = s.next() 543 if ch == eof { 544 // If we see an EOF, we halt any more scanning of digits 545 // immediately. 546 break 547 } 548 549 n-- 550 } 551 if n > 0 { 552 s.err("illegal char escape") 553 } 554 555 if n != start && ch != eof { 556 // we scanned all digits, put the last non digit char back, 557 // only if we read anything at all 558 s.unread() 559 } 560 561 return ch 562} 563 564// scanIdentifier scans an identifier and returns the literal string 565func (s *Scanner) scanIdentifier() string { 566 offs := s.srcPos.Offset - s.lastCharLen 567 ch := s.next() 568 for isLetter(ch) || isDigit(ch) || ch == '-' || ch == '.' { 569 ch = s.next() 570 } 571 572 if ch != eof { 573 s.unread() // we got identifier, put back latest char 574 } 575 576 return string(s.src[offs:s.srcPos.Offset]) 577} 578 579// recentPosition returns the position of the character immediately after the 580// character or token returned by the last call to Scan. 581func (s *Scanner) recentPosition() (pos token.Pos) { 582 pos.Offset = s.srcPos.Offset - s.lastCharLen 583 switch { 584 case s.srcPos.Column > 0: 585 // common case: last character was not a '\n' 586 pos.Line = s.srcPos.Line 587 pos.Column = s.srcPos.Column 588 case s.lastLineLen > 0: 589 // last character was a '\n' 590 // (we cannot be at the beginning of the source 591 // since we have called next() at least once) 592 pos.Line = s.srcPos.Line - 1 593 pos.Column = s.lastLineLen 594 default: 595 // at the beginning of the source 596 pos.Line = 1 597 pos.Column = 1 598 } 599 return 600} 601 602// err prints the error of any scanning to s.Error function. If the function is 603// not defined, by default it prints them to os.Stderr 604func (s *Scanner) err(msg string) { 605 s.ErrorCount++ 606 pos := s.recentPosition() 607 608 if s.Error != nil { 609 s.Error(pos, msg) 610 return 611 } 612 613 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 614} 615 616// isHexadecimal returns true if the given rune is a letter 617func isLetter(ch rune) bool { 618 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch) 619} 620 621// isDigit returns true if the given rune is a decimal digit 622func isDigit(ch rune) bool { 623 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch) 624} 625 626// isDecimal returns true if the given rune is a decimal number 627func isDecimal(ch rune) bool { 628 return '0' <= ch && ch <= '9' 629} 630 631// isHexadecimal returns true if the given rune is an hexadecimal number 632func isHexadecimal(ch rune) bool { 633 return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F' 634} 635 636// isWhitespace returns true if the rune is a space, tab, newline or carriage return 637func isWhitespace(ch rune) bool { 638 return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' 639} 640 641// digitVal returns the integer value of a given octal,decimal or hexadecimal rune 642func digitVal(ch rune) int { 643 switch { 644 case '0' <= ch && ch <= '9': 645 return int(ch - '0') 646 case 'a' <= ch && ch <= 'f': 647 return int(ch - 'a' + 10) 648 case 'A' <= ch && ch <= 'F': 649 return int(ch - 'A' + 10) 650 } 651 return 16 // larger than any legal digit val 652} 653