1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package parse 6 7import ( 8 "fmt" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12) 13 14// item represents a token or text string returned from the scanner. 15type item struct { 16 typ itemType // The type of this item. 17 pos Pos // The starting position, in bytes, of this item in the input string. 18 val string // The value of this item. 19} 20 21func (i item) String() string { 22 switch { 23 case i.typ == itemEOF: 24 return "EOF" 25 case i.typ == itemError: 26 return i.val 27 case i.typ > itemKeyword: 28 return fmt.Sprintf("<%s>", i.val) 29 case len(i.val) > 10: 30 return fmt.Sprintf("%.10q...", i.val) 31 } 32 return fmt.Sprintf("%q", i.val) 33} 34 35// itemType identifies the type of lex items. 36type itemType int 37 38const ( 39 itemError itemType = iota // error occurred; value is text of error 40 itemBool // boolean constant 41 itemChar // printable ASCII character; grab bag for comma etc. 42 itemCharConstant // character constant 43 itemComplex // complex constant (1+2i); imaginary is just a number 44 itemColonEquals // colon-equals (':=') introducing a declaration 45 itemEOF 46 itemField // alphanumeric identifier starting with '.' 47 itemIdentifier // alphanumeric identifier not starting with '.' 48 itemLeftDelim // left action delimiter 49 itemLeftParen // '(' inside action 50 itemNumber // simple number, including imaginary 51 itemPipe // pipe symbol 52 itemRawString // raw quoted string (includes quotes) 53 itemRightDelim // right action delimiter 54 itemRightParen // ')' inside action 55 itemSpace // run of spaces separating arguments 56 itemString // quoted string (includes quotes) 57 itemText // plain text 58 itemVariable // variable starting with '$', such as '$' or '$1' or '$hello' 59 // Keywords appear after all the rest. 60 itemKeyword // used only to delimit the keywords 61 itemDot // the cursor, spelled '.' 62 itemDefine // define keyword 63 itemElse // else keyword 64 itemEnd // end keyword 65 itemIf // if keyword 66 itemNil // the untyped nil constant, easiest to treat as a keyword 67 itemRange // range keyword 68 itemTemplate // template keyword 69 itemWith // with keyword 70) 71 72var key = map[string]itemType{ 73 ".": itemDot, 74 "define": itemDefine, 75 "else": itemElse, 76 "end": itemEnd, 77 "if": itemIf, 78 "range": itemRange, 79 "nil": itemNil, 80 "template": itemTemplate, 81 "with": itemWith, 82} 83 84const eof = -1 85 86// stateFn represents the state of the scanner as a function that returns the next state. 87type stateFn func(*lexer) stateFn 88 89// lexer holds the state of the scanner. 90type lexer struct { 91 name string // the name of the input; used only for error reports 92 input string // the string being scanned 93 leftDelim string // start of action 94 rightDelim string // end of action 95 state stateFn // the next lexing function to enter 96 pos Pos // current position in the input 97 start Pos // start position of this item 98 width Pos // width of last rune read from input 99 lastPos Pos // position of most recent item returned by nextItem 100 items chan item // channel of scanned items 101 parenDepth int // nesting depth of ( ) exprs 102} 103 104// next returns the next rune in the input. 105func (l *lexer) next() rune { 106 if int(l.pos) >= len(l.input) { 107 l.width = 0 108 return eof 109 } 110 r, w := utf8.DecodeRuneInString(l.input[l.pos:]) 111 l.width = Pos(w) 112 l.pos += l.width 113 return r 114} 115 116// peek returns but does not consume the next rune in the input. 117func (l *lexer) peek() rune { 118 r := l.next() 119 l.backup() 120 return r 121} 122 123// backup steps back one rune. Can only be called once per call of next. 124func (l *lexer) backup() { 125 l.pos -= l.width 126} 127 128// emit passes an item back to the client. 129func (l *lexer) emit(t itemType) { 130 l.items <- item{t, l.start, l.input[l.start:l.pos]} 131 l.start = l.pos 132} 133 134// ignore skips over the pending input before this point. 135func (l *lexer) ignore() { 136 l.start = l.pos 137} 138 139// accept consumes the next rune if it's from the valid set. 140func (l *lexer) accept(valid string) bool { 141 if strings.IndexRune(valid, l.next()) >= 0 { 142 return true 143 } 144 l.backup() 145 return false 146} 147 148// acceptRun consumes a run of runes from the valid set. 149func (l *lexer) acceptRun(valid string) { 150 for strings.IndexRune(valid, l.next()) >= 0 { 151 } 152 l.backup() 153} 154 155// lineNumber reports which line we're on, based on the position of 156// the previous item returned by nextItem. Doing it this way 157// means we don't have to worry about peek double counting. 158func (l *lexer) lineNumber() int { 159 return 1 + strings.Count(l.input[:l.lastPos], "\n") 160} 161 162// errorf returns an error token and terminates the scan by passing 163// back a nil pointer that will be the next state, terminating l.nextItem. 164func (l *lexer) errorf(format string, args ...interface{}) stateFn { 165 l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)} 166 return nil 167} 168 169// nextItem returns the next item from the input. 170func (l *lexer) nextItem() item { 171 item := <-l.items 172 l.lastPos = item.pos 173 return item 174} 175 176// lex creates a new scanner for the input string. 177func lex(name, input, left, right string) *lexer { 178 if left == "" { 179 left = leftDelim 180 } 181 if right == "" { 182 right = rightDelim 183 } 184 l := &lexer{ 185 name: name, 186 input: input, 187 leftDelim: left, 188 rightDelim: right, 189 items: make(chan item), 190 } 191 go l.run() 192 return l 193} 194 195// run runs the state machine for the lexer. 196func (l *lexer) run() { 197 for l.state = lexText; l.state != nil; { 198 l.state = l.state(l) 199 } 200} 201 202// state functions 203 204const ( 205 leftDelim = "{{" 206 rightDelim = "}}" 207 leftComment = "/*" 208 rightComment = "*/" 209) 210 211// lexText scans until an opening action delimiter, "{{". 212func lexText(l *lexer) stateFn { 213 for { 214 if strings.HasPrefix(l.input[l.pos:], l.leftDelim) { 215 if l.pos > l.start { 216 l.emit(itemText) 217 } 218 return lexLeftDelim 219 } 220 if l.next() == eof { 221 break 222 } 223 } 224 // Correctly reached EOF. 225 if l.pos > l.start { 226 l.emit(itemText) 227 } 228 l.emit(itemEOF) 229 return nil 230} 231 232// lexLeftDelim scans the left delimiter, which is known to be present. 233func lexLeftDelim(l *lexer) stateFn { 234 l.pos += Pos(len(l.leftDelim)) 235 if strings.HasPrefix(l.input[l.pos:], leftComment) { 236 return lexComment 237 } 238 l.emit(itemLeftDelim) 239 l.parenDepth = 0 240 return lexInsideAction 241} 242 243// lexComment scans a comment. The left comment marker is known to be present. 244func lexComment(l *lexer) stateFn { 245 l.pos += Pos(len(leftComment)) 246 i := strings.Index(l.input[l.pos:], rightComment+l.rightDelim) 247 if i < 0 { 248 return l.errorf("unclosed comment") 249 } 250 l.pos += Pos(i + len(rightComment) + len(l.rightDelim)) 251 l.ignore() 252 return lexText 253} 254 255// lexRightDelim scans the right delimiter, which is known to be present. 256func lexRightDelim(l *lexer) stateFn { 257 l.pos += Pos(len(l.rightDelim)) 258 l.emit(itemRightDelim) 259 return lexText 260} 261 262// lexInsideAction scans the elements inside action delimiters. 263func lexInsideAction(l *lexer) stateFn { 264 // Either number, quoted string, or identifier. 265 // Spaces separate arguments; runs of spaces turn into itemSpace. 266 // Pipe symbols separate and are emitted. 267 if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { 268 if l.parenDepth == 0 { 269 return lexRightDelim 270 } 271 return l.errorf("unclosed left paren") 272 } 273 switch r := l.next(); { 274 case r == eof || isEndOfLine(r): 275 return l.errorf("unclosed action") 276 case isSpace(r): 277 return lexSpace 278 case r == ':': 279 if l.next() != '=' { 280 return l.errorf("expected :=") 281 } 282 l.emit(itemColonEquals) 283 case r == '|': 284 l.emit(itemPipe) 285 case r == '"': 286 return lexQuote 287 case r == '`': 288 return lexRawQuote 289 case r == '$': 290 return lexVariable 291 case r == '\'': 292 return lexChar 293 case r == '.': 294 // special look-ahead for ".field" so we don't break l.backup(). 295 if l.pos < Pos(len(l.input)) { 296 r := l.input[l.pos] 297 if r < '0' || '9' < r { 298 return lexField 299 } 300 } 301 fallthrough // '.' can start a number. 302 case r == '+' || r == '-' || ('0' <= r && r <= '9'): 303 l.backup() 304 return lexNumber 305 case isAlphaNumeric(r): 306 l.backup() 307 return lexIdentifier 308 case r == '(': 309 l.emit(itemLeftParen) 310 l.parenDepth++ 311 return lexInsideAction 312 case r == ')': 313 l.emit(itemRightParen) 314 l.parenDepth-- 315 if l.parenDepth < 0 { 316 return l.errorf("unexpected right paren %#U", r) 317 } 318 return lexInsideAction 319 case r <= unicode.MaxASCII && unicode.IsPrint(r): 320 l.emit(itemChar) 321 return lexInsideAction 322 default: 323 return l.errorf("unrecognized character in action: %#U", r) 324 } 325 return lexInsideAction 326} 327 328// lexSpace scans a run of space characters. 329// One space has already been seen. 330func lexSpace(l *lexer) stateFn { 331 for isSpace(l.peek()) { 332 l.next() 333 } 334 l.emit(itemSpace) 335 return lexInsideAction 336} 337 338// lexIdentifier scans an alphanumeric. 339func lexIdentifier(l *lexer) stateFn { 340Loop: 341 for { 342 switch r := l.next(); { 343 case isAlphaNumeric(r): 344 // absorb. 345 default: 346 l.backup() 347 word := l.input[l.start:l.pos] 348 if !l.atTerminator() { 349 return l.errorf("bad character %#U", r) 350 } 351 switch { 352 case key[word] > itemKeyword: 353 l.emit(key[word]) 354 case word[0] == '.': 355 l.emit(itemField) 356 case word == "true", word == "false": 357 l.emit(itemBool) 358 default: 359 l.emit(itemIdentifier) 360 } 361 break Loop 362 } 363 } 364 return lexInsideAction 365} 366 367// lexField scans a field: .Alphanumeric. 368// The . has been scanned. 369func lexField(l *lexer) stateFn { 370 return lexFieldOrVariable(l, itemField) 371} 372 373// lexVariable scans a Variable: $Alphanumeric. 374// The $ has been scanned. 375func lexVariable(l *lexer) stateFn { 376 if l.atTerminator() { // Nothing interesting follows -> "$". 377 l.emit(itemVariable) 378 return lexInsideAction 379 } 380 return lexFieldOrVariable(l, itemVariable) 381} 382 383// lexVariable scans a field or variable: [.$]Alphanumeric. 384// The . or $ has been scanned. 385func lexFieldOrVariable(l *lexer, typ itemType) stateFn { 386 if l.atTerminator() { // Nothing interesting follows -> "." or "$". 387 if typ == itemVariable { 388 l.emit(itemVariable) 389 } else { 390 l.emit(itemDot) 391 } 392 return lexInsideAction 393 } 394 var r rune 395 for { 396 r = l.next() 397 if !isAlphaNumeric(r) { 398 l.backup() 399 break 400 } 401 } 402 if !l.atTerminator() { 403 return l.errorf("bad character %#U", r) 404 } 405 l.emit(typ) 406 return lexInsideAction 407} 408 409// atTerminator reports whether the input is at valid termination character to 410// appear after an identifier. Breaks .X.Y into two pieces. Also catches cases 411// like "$x+2" not being acceptable without a space, in case we decide one 412// day to implement arithmetic. 413func (l *lexer) atTerminator() bool { 414 r := l.peek() 415 if isSpace(r) || isEndOfLine(r) { 416 return true 417 } 418 switch r { 419 case eof, '.', ',', '|', ':', ')', '(': 420 return true 421 } 422 // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will 423 // succeed but should fail) but only in extremely rare cases caused by willfully 424 // bad choice of delimiter. 425 if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r { 426 return true 427 } 428 return false 429} 430 431// lexChar scans a character constant. The initial quote is already 432// scanned. Syntax checking is done by the parser. 433func lexChar(l *lexer) stateFn { 434Loop: 435 for { 436 switch l.next() { 437 case '\\': 438 if r := l.next(); r != eof && r != '\n' { 439 break 440 } 441 fallthrough 442 case eof, '\n': 443 return l.errorf("unterminated character constant") 444 case '\'': 445 break Loop 446 } 447 } 448 l.emit(itemCharConstant) 449 return lexInsideAction 450} 451 452// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This 453// isn't a perfect number scanner - for instance it accepts "." and "0x0.2" 454// and "089" - but when it's wrong the input is invalid and the parser (via 455// strconv) will notice. 456func lexNumber(l *lexer) stateFn { 457 if !l.scanNumber() { 458 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) 459 } 460 if sign := l.peek(); sign == '+' || sign == '-' { 461 // Complex: 1+2i. No spaces, must end in 'i'. 462 if !l.scanNumber() || l.input[l.pos-1] != 'i' { 463 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) 464 } 465 l.emit(itemComplex) 466 } else { 467 l.emit(itemNumber) 468 } 469 return lexInsideAction 470} 471 472func (l *lexer) scanNumber() bool { 473 // Optional leading sign. 474 l.accept("+-") 475 // Is it hex? 476 digits := "0123456789" 477 if l.accept("0") && l.accept("xX") { 478 digits = "0123456789abcdefABCDEF" 479 } 480 l.acceptRun(digits) 481 if l.accept(".") { 482 l.acceptRun(digits) 483 } 484 if l.accept("eE") { 485 l.accept("+-") 486 l.acceptRun("0123456789") 487 } 488 // Is it imaginary? 489 l.accept("i") 490 // Next thing mustn't be alphanumeric. 491 if isAlphaNumeric(l.peek()) { 492 l.next() 493 return false 494 } 495 return true 496} 497 498// lexQuote scans a quoted string. 499func lexQuote(l *lexer) stateFn { 500Loop: 501 for { 502 switch l.next() { 503 case '\\': 504 if r := l.next(); r != eof && r != '\n' { 505 break 506 } 507 fallthrough 508 case eof, '\n': 509 return l.errorf("unterminated quoted string") 510 case '"': 511 break Loop 512 } 513 } 514 l.emit(itemString) 515 return lexInsideAction 516} 517 518// lexRawQuote scans a raw quoted string. 519func lexRawQuote(l *lexer) stateFn { 520Loop: 521 for { 522 switch l.next() { 523 case eof, '\n': 524 return l.errorf("unterminated raw quoted string") 525 case '`': 526 break Loop 527 } 528 } 529 l.emit(itemRawString) 530 return lexInsideAction 531} 532 533// isSpace reports whether r is a space character. 534func isSpace(r rune) bool { 535 return r == ' ' || r == '\t' 536} 537 538// isEndOfLine reports whether r is an end-of-line character. 539func isEndOfLine(r rune) bool { 540 return r == '\r' || r == '\n' 541} 542 543// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore. 544func isAlphaNumeric(r rune) bool { 545 return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) 546} 547