1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package parse 6 7import ( 8 "fmt" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12) 13 14// item represents a token or text string returned from the scanner. 15type item struct { 16 typ itemType // The type of this item. 17 pos Pos // The starting position, in bytes, of this item in the input string. 18 val string // The value of this item. 19} 20 21func (i item) String() string { 22 switch { 23 case i.typ == itemEOF: 24 return "EOF" 25 case i.typ == itemError: 26 return i.val 27 case i.typ > itemKeyword: 28 return fmt.Sprintf("<%s>", i.val) 29 case len(i.val) > 10: 30 return fmt.Sprintf("%.10q...", i.val) 31 } 32 return fmt.Sprintf("%q", i.val) 33} 34 35// itemType identifies the type of lex items. 36type itemType int 37 38const ( 39 itemError itemType = iota // error occurred; value is text of error 40 itemBool // boolean constant 41 itemChar // printable ASCII character; grab bag for comma etc. 42 itemCharConstant // character constant 43 itemComplex // complex constant (1+2i); imaginary is just a number 44 itemColonEquals // colon-equals (':=') introducing a declaration 45 itemEOF 46 itemField // alphanumeric identifier starting with '.' 47 itemIdentifier // alphanumeric identifier not starting with '.' 48 itemLeftDelim // left action delimiter 49 itemLeftParen // '(' inside action 50 itemNumber // simple number, including imaginary 51 itemPipe // pipe symbol 52 itemRawString // raw quoted string (includes quotes) 53 itemRightDelim // right action delimiter 54 itemElideNewline // elide newline after right delim 55 itemRightParen // ')' inside action 56 itemSpace // run of spaces separating arguments 57 itemString // quoted string (includes quotes) 58 itemText // plain text 59 itemVariable // variable starting with '$', such as '$' or '$1' or '$hello' 60 // Keywords appear after all the rest. 61 itemKeyword // used only to delimit the keywords 62 itemDot // the cursor, spelled '.' 63 itemDefine // define keyword 64 itemElse // else keyword 65 itemEnd // end keyword 66 itemIf // if keyword 67 itemNil // the untyped nil constant, easiest to treat as a keyword 68 itemRange // range keyword 69 itemTemplate // template keyword 70 itemWith // with keyword 71) 72 73var key = map[string]itemType{ 74 ".": itemDot, 75 "define": itemDefine, 76 "else": itemElse, 77 "end": itemEnd, 78 "if": itemIf, 79 "range": itemRange, 80 "nil": itemNil, 81 "template": itemTemplate, 82 "with": itemWith, 83} 84 85const eof = -1 86 87// stateFn represents the state of the scanner as a function that returns the next state. 88type stateFn func(*lexer) stateFn 89 90// lexer holds the state of the scanner. 91type lexer struct { 92 name string // the name of the input; used only for error reports 93 input string // the string being scanned 94 leftDelim string // start of action 95 rightDelim string // end of action 96 state stateFn // the next lexing function to enter 97 pos Pos // current position in the input 98 start Pos // start position of this item 99 width Pos // width of last rune read from input 100 lastPos Pos // position of most recent item returned by nextItem 101 items chan item // channel of scanned items 102 parenDepth int // nesting depth of ( ) exprs 103} 104 105// next returns the next rune in the input. 106func (l *lexer) next() rune { 107 if int(l.pos) >= len(l.input) { 108 l.width = 0 109 return eof 110 } 111 r, w := utf8.DecodeRuneInString(l.input[l.pos:]) 112 l.width = Pos(w) 113 l.pos += l.width 114 return r 115} 116 117// peek returns but does not consume the next rune in the input. 118func (l *lexer) peek() rune { 119 r := l.next() 120 l.backup() 121 return r 122} 123 124// backup steps back one rune. Can only be called once per call of next. 125func (l *lexer) backup() { 126 l.pos -= l.width 127} 128 129// emit passes an item back to the client. 130func (l *lexer) emit(t itemType) { 131 l.items <- item{t, l.start, l.input[l.start:l.pos]} 132 l.start = l.pos 133} 134 135// ignore skips over the pending input before this point. 136func (l *lexer) ignore() { 137 l.start = l.pos 138} 139 140// accept consumes the next rune if it's from the valid set. 141func (l *lexer) accept(valid string) bool { 142 if strings.IndexRune(valid, l.next()) >= 0 { 143 return true 144 } 145 l.backup() 146 return false 147} 148 149// acceptRun consumes a run of runes from the valid set. 150func (l *lexer) acceptRun(valid string) { 151 for strings.IndexRune(valid, l.next()) >= 0 { 152 } 153 l.backup() 154} 155 156// lineNumber reports which line we're on, based on the position of 157// the previous item returned by nextItem. Doing it this way 158// means we don't have to worry about peek double counting. 159func (l *lexer) lineNumber() int { 160 return 1 + strings.Count(l.input[:l.lastPos], "\n") 161} 162 163// errorf returns an error token and terminates the scan by passing 164// back a nil pointer that will be the next state, terminating l.nextItem. 165func (l *lexer) errorf(format string, args ...interface{}) stateFn { 166 l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)} 167 return nil 168} 169 170// nextItem returns the next item from the input. 171func (l *lexer) nextItem() item { 172 item := <-l.items 173 l.lastPos = item.pos 174 return item 175} 176 177// lex creates a new scanner for the input string. 178func lex(name, input, left, right string) *lexer { 179 if left == "" { 180 left = leftDelim 181 } 182 if right == "" { 183 right = rightDelim 184 } 185 l := &lexer{ 186 name: name, 187 input: input, 188 leftDelim: left, 189 rightDelim: right, 190 items: make(chan item), 191 } 192 go l.run() 193 return l 194} 195 196// run runs the state machine for the lexer. 197func (l *lexer) run() { 198 for l.state = lexText; l.state != nil; { 199 l.state = l.state(l) 200 } 201} 202 203// state functions 204 205const ( 206 leftDelim = "{{" 207 rightDelim = "}}" 208 leftComment = "/*" 209 rightComment = "*/" 210) 211 212// lexText scans until an opening action delimiter, "{{". 213func lexText(l *lexer) stateFn { 214 for { 215 if strings.HasPrefix(l.input[l.pos:], l.leftDelim) { 216 if l.pos > l.start { 217 l.emit(itemText) 218 } 219 return lexLeftDelim 220 } 221 if l.next() == eof { 222 break 223 } 224 } 225 // Correctly reached EOF. 226 if l.pos > l.start { 227 l.emit(itemText) 228 } 229 l.emit(itemEOF) 230 return nil 231} 232 233// lexLeftDelim scans the left delimiter, which is known to be present. 234func lexLeftDelim(l *lexer) stateFn { 235 l.pos += Pos(len(l.leftDelim)) 236 if strings.HasPrefix(l.input[l.pos:], leftComment) { 237 return lexComment 238 } 239 l.emit(itemLeftDelim) 240 l.parenDepth = 0 241 return lexInsideAction 242} 243 244// lexComment scans a comment. The left comment marker is known to be present. 245func lexComment(l *lexer) stateFn { 246 l.pos += Pos(len(leftComment)) 247 i := strings.Index(l.input[l.pos:], rightComment) 248 if i < 0 { 249 return l.errorf("unclosed comment") 250 } 251 l.pos += Pos(i + len(rightComment)) 252 if !strings.HasPrefix(l.input[l.pos:], l.rightDelim) { 253 return l.errorf("comment ends before closing delimiter") 254 255 } 256 l.pos += Pos(len(l.rightDelim)) 257 l.ignore() 258 return lexText 259} 260 261// lexRightDelim scans the right delimiter, which is known to be present. 262func lexRightDelim(l *lexer) stateFn { 263 l.pos += Pos(len(l.rightDelim)) 264 l.emit(itemRightDelim) 265 if l.peek() == '\\' { 266 l.pos++ 267 l.emit(itemElideNewline) 268 } 269 return lexText 270} 271 272// lexInsideAction scans the elements inside action delimiters. 273func lexInsideAction(l *lexer) stateFn { 274 // Either number, quoted string, or identifier. 275 // Spaces separate arguments; runs of spaces turn into itemSpace. 276 // Pipe symbols separate and are emitted. 277 if strings.HasPrefix(l.input[l.pos:], l.rightDelim+"\\") || strings.HasPrefix(l.input[l.pos:], l.rightDelim) { 278 if l.parenDepth == 0 { 279 return lexRightDelim 280 } 281 return l.errorf("unclosed left paren") 282 } 283 switch r := l.next(); { 284 case r == eof || isEndOfLine(r): 285 return l.errorf("unclosed action") 286 case isSpace(r): 287 return lexSpace 288 case r == ':': 289 if l.next() != '=' { 290 return l.errorf("expected :=") 291 } 292 l.emit(itemColonEquals) 293 case r == '|': 294 l.emit(itemPipe) 295 case r == '"': 296 return lexQuote 297 case r == '`': 298 return lexRawQuote 299 case r == '$': 300 return lexVariable 301 case r == '\'': 302 return lexChar 303 case r == '.': 304 // special look-ahead for ".field" so we don't break l.backup(). 305 if l.pos < Pos(len(l.input)) { 306 r := l.input[l.pos] 307 if r < '0' || '9' < r { 308 return lexField 309 } 310 } 311 fallthrough // '.' can start a number. 312 case r == '+' || r == '-' || ('0' <= r && r <= '9'): 313 l.backup() 314 return lexNumber 315 case isAlphaNumeric(r): 316 l.backup() 317 return lexIdentifier 318 case r == '(': 319 l.emit(itemLeftParen) 320 l.parenDepth++ 321 return lexInsideAction 322 case r == ')': 323 l.emit(itemRightParen) 324 l.parenDepth-- 325 if l.parenDepth < 0 { 326 return l.errorf("unexpected right paren %#U", r) 327 } 328 return lexInsideAction 329 case r <= unicode.MaxASCII && unicode.IsPrint(r): 330 l.emit(itemChar) 331 return lexInsideAction 332 default: 333 return l.errorf("unrecognized character in action: %#U", r) 334 } 335 return lexInsideAction 336} 337 338// lexSpace scans a run of space characters. 339// One space has already been seen. 340func lexSpace(l *lexer) stateFn { 341 for isSpace(l.peek()) { 342 l.next() 343 } 344 l.emit(itemSpace) 345 return lexInsideAction 346} 347 348// lexIdentifier scans an alphanumeric. 349func lexIdentifier(l *lexer) stateFn { 350Loop: 351 for { 352 switch r := l.next(); { 353 case isAlphaNumeric(r): 354 // absorb. 355 default: 356 l.backup() 357 word := l.input[l.start:l.pos] 358 if !l.atTerminator() { 359 return l.errorf("bad character %#U", r) 360 } 361 switch { 362 case key[word] > itemKeyword: 363 l.emit(key[word]) 364 case word[0] == '.': 365 l.emit(itemField) 366 case word == "true", word == "false": 367 l.emit(itemBool) 368 default: 369 l.emit(itemIdentifier) 370 } 371 break Loop 372 } 373 } 374 return lexInsideAction 375} 376 377// lexField scans a field: .Alphanumeric. 378// The . has been scanned. 379func lexField(l *lexer) stateFn { 380 return lexFieldOrVariable(l, itemField) 381} 382 383// lexVariable scans a Variable: $Alphanumeric. 384// The $ has been scanned. 385func lexVariable(l *lexer) stateFn { 386 if l.atTerminator() { // Nothing interesting follows -> "$". 387 l.emit(itemVariable) 388 return lexInsideAction 389 } 390 return lexFieldOrVariable(l, itemVariable) 391} 392 393// lexVariable scans a field or variable: [.$]Alphanumeric. 394// The . or $ has been scanned. 395func lexFieldOrVariable(l *lexer, typ itemType) stateFn { 396 if l.atTerminator() { // Nothing interesting follows -> "." or "$". 397 if typ == itemVariable { 398 l.emit(itemVariable) 399 } else { 400 l.emit(itemDot) 401 } 402 return lexInsideAction 403 } 404 var r rune 405 for { 406 r = l.next() 407 if !isAlphaNumeric(r) { 408 l.backup() 409 break 410 } 411 } 412 if !l.atTerminator() { 413 return l.errorf("bad character %#U", r) 414 } 415 l.emit(typ) 416 return lexInsideAction 417} 418 419// atTerminator reports whether the input is at valid termination character to 420// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases 421// like "$x+2" not being acceptable without a space, in case we decide one 422// day to implement arithmetic. 423func (l *lexer) atTerminator() bool { 424 r := l.peek() 425 if isSpace(r) || isEndOfLine(r) { 426 return true 427 } 428 switch r { 429 case eof, '.', ',', '|', ':', ')', '(': 430 return true 431 } 432 // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will 433 // succeed but should fail) but only in extremely rare cases caused by willfully 434 // bad choice of delimiter. 435 if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r { 436 return true 437 } 438 return false 439} 440 441// lexChar scans a character constant. The initial quote is already 442// scanned. Syntax checking is done by the parser. 443func lexChar(l *lexer) stateFn { 444Loop: 445 for { 446 switch l.next() { 447 case '\\': 448 if r := l.next(); r != eof && r != '\n' { 449 break 450 } 451 fallthrough 452 case eof, '\n': 453 return l.errorf("unterminated character constant") 454 case '\'': 455 break Loop 456 } 457 } 458 l.emit(itemCharConstant) 459 return lexInsideAction 460} 461 462// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This 463// isn't a perfect number scanner - for instance it accepts "." and "0x0.2" 464// and "089" - but when it's wrong the input is invalid and the parser (via 465// strconv) will notice. 466func lexNumber(l *lexer) stateFn { 467 if !l.scanNumber() { 468 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) 469 } 470 if sign := l.peek(); sign == '+' || sign == '-' { 471 // Complex: 1+2i. No spaces, must end in 'i'. 472 if !l.scanNumber() || l.input[l.pos-1] != 'i' { 473 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) 474 } 475 l.emit(itemComplex) 476 } else { 477 l.emit(itemNumber) 478 } 479 return lexInsideAction 480} 481 482func (l *lexer) scanNumber() bool { 483 // Optional leading sign. 484 l.accept("+-") 485 // Is it hex? 486 digits := "0123456789" 487 if l.accept("0") && l.accept("xX") { 488 digits = "0123456789abcdefABCDEF" 489 } 490 l.acceptRun(digits) 491 if l.accept(".") { 492 l.acceptRun(digits) 493 } 494 if l.accept("eE") { 495 l.accept("+-") 496 l.acceptRun("0123456789") 497 } 498 // Is it imaginary? 499 l.accept("i") 500 // Next thing mustn't be alphanumeric. 501 if isAlphaNumeric(l.peek()) { 502 l.next() 503 return false 504 } 505 return true 506} 507 508// lexQuote scans a quoted string. 509func lexQuote(l *lexer) stateFn { 510Loop: 511 for { 512 switch l.next() { 513 case '\\': 514 if r := l.next(); r != eof && r != '\n' { 515 break 516 } 517 fallthrough 518 case eof, '\n': 519 return l.errorf("unterminated quoted string") 520 case '"': 521 break Loop 522 } 523 } 524 l.emit(itemString) 525 return lexInsideAction 526} 527 528// lexRawQuote scans a raw quoted string. 529func lexRawQuote(l *lexer) stateFn { 530Loop: 531 for { 532 switch l.next() { 533 case eof, '\n': 534 return l.errorf("unterminated raw quoted string") 535 case '`': 536 break Loop 537 } 538 } 539 l.emit(itemRawString) 540 return lexInsideAction 541} 542 543// isSpace reports whether r is a space character. 544func isSpace(r rune) bool { 545 return r == ' ' || r == '\t' 546} 547 548// isEndOfLine reports whether r is an end-of-line character. 549func isEndOfLine(r rune) bool { 550 return r == '\r' || r == '\n' 551} 552 553// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore. 554func isAlphaNumeric(r rune) bool { 555 return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) 556} 557