1// Copyright 2018 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package text 6 7import ( 8 "bytes" 9 "fmt" 10 "io" 11 "regexp" 12 "strconv" 13 "unicode/utf8" 14 15 "google.golang.org/protobuf/internal/errors" 16) 17 18// Decoder is a token-based textproto decoder. 19type Decoder struct { 20 // lastCall is last method called, either readCall or peekCall. 21 // Initial value is readCall. 22 lastCall call 23 24 // lastToken contains the last read token. 25 lastToken Token 26 27 // lastErr contains the last read error. 28 lastErr error 29 30 // openStack is a stack containing the byte characters for MessageOpen and 31 // ListOpen kinds. The top of stack represents the message or the list that 32 // the current token is nested in. An empty stack means the current token is 33 // at the top level message. The characters '{' and '<' both represent the 34 // MessageOpen kind. 35 openStack []byte 36 37 // orig is used in reporting line and column. 38 orig []byte 39 // in contains the unconsumed input. 40 in []byte 41} 42 43// NewDecoder returns a Decoder to read the given []byte. 44func NewDecoder(b []byte) *Decoder { 45 return &Decoder{orig: b, in: b} 46} 47 48// ErrUnexpectedEOF means that EOF was encountered in the middle of the input. 49var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF) 50 51// call specifies which Decoder method was invoked. 52type call uint8 53 54const ( 55 readCall call = iota 56 peekCall 57) 58 59// Peek looks ahead and returns the next token and error without advancing a read. 60func (d *Decoder) Peek() (Token, error) { 61 defer func() { d.lastCall = peekCall }() 62 if d.lastCall == readCall { 63 d.lastToken, d.lastErr = d.Read() 64 } 65 return d.lastToken, d.lastErr 66} 67 68// Read returns the next token. 69// It will return an error if there is no valid token. 70func (d *Decoder) Read() (Token, error) { 71 defer func() { d.lastCall = readCall }() 72 if d.lastCall == peekCall { 73 return d.lastToken, d.lastErr 74 } 75 76 tok, err := d.parseNext(d.lastToken.kind) 77 if err != nil { 78 return Token{}, err 79 } 80 81 switch tok.kind { 82 case comma, semicolon: 83 tok, err = d.parseNext(tok.kind) 84 if err != nil { 85 return Token{}, err 86 } 87 } 88 d.lastToken = tok 89 return tok, nil 90} 91 92const ( 93 mismatchedFmt = "mismatched close character %q" 94 unexpectedFmt = "unexpected character %q" 95) 96 97// parseNext parses the next Token based on given last kind. 98func (d *Decoder) parseNext(lastKind Kind) (Token, error) { 99 // Trim leading spaces. 100 d.consume(0) 101 isEOF := false 102 if len(d.in) == 0 { 103 isEOF = true 104 } 105 106 switch lastKind { 107 case EOF: 108 return d.consumeToken(EOF, 0, 0), nil 109 110 case bof: 111 // Start of top level message. Next token can be EOF or Name. 112 if isEOF { 113 return d.consumeToken(EOF, 0, 0), nil 114 } 115 return d.parseFieldName() 116 117 case Name: 118 // Next token can be MessageOpen, ListOpen or Scalar. 119 if isEOF { 120 return Token{}, ErrUnexpectedEOF 121 } 122 switch ch := d.in[0]; ch { 123 case '{', '<': 124 d.pushOpenStack(ch) 125 return d.consumeToken(MessageOpen, 1, 0), nil 126 case '[': 127 d.pushOpenStack(ch) 128 return d.consumeToken(ListOpen, 1, 0), nil 129 default: 130 return d.parseScalar() 131 } 132 133 case Scalar: 134 openKind, closeCh := d.currentOpenKind() 135 switch openKind { 136 case bof: 137 // Top level message. 138 // Next token can be EOF, comma, semicolon or Name. 139 if isEOF { 140 return d.consumeToken(EOF, 0, 0), nil 141 } 142 switch d.in[0] { 143 case ',': 144 return d.consumeToken(comma, 1, 0), nil 145 case ';': 146 return d.consumeToken(semicolon, 1, 0), nil 147 default: 148 return d.parseFieldName() 149 } 150 151 case MessageOpen: 152 // Next token can be MessageClose, comma, semicolon or Name. 153 if isEOF { 154 return Token{}, ErrUnexpectedEOF 155 } 156 switch ch := d.in[0]; ch { 157 case closeCh: 158 d.popOpenStack() 159 return d.consumeToken(MessageClose, 1, 0), nil 160 case otherCloseChar[closeCh]: 161 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 162 case ',': 163 return d.consumeToken(comma, 1, 0), nil 164 case ';': 165 return d.consumeToken(semicolon, 1, 0), nil 166 default: 167 return d.parseFieldName() 168 } 169 170 case ListOpen: 171 // Next token can be ListClose or comma. 172 if isEOF { 173 return Token{}, ErrUnexpectedEOF 174 } 175 switch ch := d.in[0]; ch { 176 case ']': 177 d.popOpenStack() 178 return d.consumeToken(ListClose, 1, 0), nil 179 case ',': 180 return d.consumeToken(comma, 1, 0), nil 181 default: 182 return Token{}, d.newSyntaxError(unexpectedFmt, ch) 183 } 184 } 185 186 case MessageOpen: 187 // Next token can be MessageClose or Name. 188 if isEOF { 189 return Token{}, ErrUnexpectedEOF 190 } 191 _, closeCh := d.currentOpenKind() 192 switch ch := d.in[0]; ch { 193 case closeCh: 194 d.popOpenStack() 195 return d.consumeToken(MessageClose, 1, 0), nil 196 case otherCloseChar[closeCh]: 197 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 198 default: 199 return d.parseFieldName() 200 } 201 202 case MessageClose: 203 openKind, closeCh := d.currentOpenKind() 204 switch openKind { 205 case bof: 206 // Top level message. 207 // Next token can be EOF, comma, semicolon or Name. 208 if isEOF { 209 return d.consumeToken(EOF, 0, 0), nil 210 } 211 switch ch := d.in[0]; ch { 212 case ',': 213 return d.consumeToken(comma, 1, 0), nil 214 case ';': 215 return d.consumeToken(semicolon, 1, 0), nil 216 default: 217 return d.parseFieldName() 218 } 219 220 case MessageOpen: 221 // Next token can be MessageClose, comma, semicolon or Name. 222 if isEOF { 223 return Token{}, ErrUnexpectedEOF 224 } 225 switch ch := d.in[0]; ch { 226 case closeCh: 227 d.popOpenStack() 228 return d.consumeToken(MessageClose, 1, 0), nil 229 case otherCloseChar[closeCh]: 230 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 231 case ',': 232 return d.consumeToken(comma, 1, 0), nil 233 case ';': 234 return d.consumeToken(semicolon, 1, 0), nil 235 default: 236 return d.parseFieldName() 237 } 238 239 case ListOpen: 240 // Next token can be ListClose or comma 241 if isEOF { 242 return Token{}, ErrUnexpectedEOF 243 } 244 switch ch := d.in[0]; ch { 245 case closeCh: 246 d.popOpenStack() 247 return d.consumeToken(ListClose, 1, 0), nil 248 case ',': 249 return d.consumeToken(comma, 1, 0), nil 250 default: 251 return Token{}, d.newSyntaxError(unexpectedFmt, ch) 252 } 253 } 254 255 case ListOpen: 256 // Next token can be ListClose, MessageStart or Scalar. 257 if isEOF { 258 return Token{}, ErrUnexpectedEOF 259 } 260 switch ch := d.in[0]; ch { 261 case ']': 262 d.popOpenStack() 263 return d.consumeToken(ListClose, 1, 0), nil 264 case '{', '<': 265 d.pushOpenStack(ch) 266 return d.consumeToken(MessageOpen, 1, 0), nil 267 default: 268 return d.parseScalar() 269 } 270 271 case ListClose: 272 openKind, closeCh := d.currentOpenKind() 273 switch openKind { 274 case bof: 275 // Top level message. 276 // Next token can be EOF, comma, semicolon or Name. 277 if isEOF { 278 return d.consumeToken(EOF, 0, 0), nil 279 } 280 switch ch := d.in[0]; ch { 281 case ',': 282 return d.consumeToken(comma, 1, 0), nil 283 case ';': 284 return d.consumeToken(semicolon, 1, 0), nil 285 default: 286 return d.parseFieldName() 287 } 288 289 case MessageOpen: 290 // Next token can be MessageClose, comma, semicolon or Name. 291 if isEOF { 292 return Token{}, ErrUnexpectedEOF 293 } 294 switch ch := d.in[0]; ch { 295 case closeCh: 296 d.popOpenStack() 297 return d.consumeToken(MessageClose, 1, 0), nil 298 case otherCloseChar[closeCh]: 299 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 300 case ',': 301 return d.consumeToken(comma, 1, 0), nil 302 case ';': 303 return d.consumeToken(semicolon, 1, 0), nil 304 default: 305 return d.parseFieldName() 306 } 307 308 default: 309 // It is not possible to have this case. Let it panic below. 310 } 311 312 case comma, semicolon: 313 openKind, closeCh := d.currentOpenKind() 314 switch openKind { 315 case bof: 316 // Top level message. Next token can be EOF or Name. 317 if isEOF { 318 return d.consumeToken(EOF, 0, 0), nil 319 } 320 return d.parseFieldName() 321 322 case MessageOpen: 323 // Next token can be MessageClose or Name. 324 if isEOF { 325 return Token{}, ErrUnexpectedEOF 326 } 327 switch ch := d.in[0]; ch { 328 case closeCh: 329 d.popOpenStack() 330 return d.consumeToken(MessageClose, 1, 0), nil 331 case otherCloseChar[closeCh]: 332 return Token{}, d.newSyntaxError(mismatchedFmt, ch) 333 default: 334 return d.parseFieldName() 335 } 336 337 case ListOpen: 338 if lastKind == semicolon { 339 // It is not be possible to have this case as logic here 340 // should not have produced a semicolon Token when inside a 341 // list. Let it panic below. 342 break 343 } 344 // Next token can be MessageOpen or Scalar. 345 if isEOF { 346 return Token{}, ErrUnexpectedEOF 347 } 348 switch ch := d.in[0]; ch { 349 case '{', '<': 350 d.pushOpenStack(ch) 351 return d.consumeToken(MessageOpen, 1, 0), nil 352 default: 353 return d.parseScalar() 354 } 355 } 356 } 357 358 line, column := d.Position(len(d.orig) - len(d.in)) 359 panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind)) 360} 361 362var otherCloseChar = map[byte]byte{ 363 '}': '>', 364 '>': '}', 365} 366 367// currentOpenKind indicates whether current position is inside a message, list 368// or top-level message by returning MessageOpen, ListOpen or bof respectively. 369// If the returned kind is either a MessageOpen or ListOpen, it also returns the 370// corresponding closing character. 371func (d *Decoder) currentOpenKind() (Kind, byte) { 372 if len(d.openStack) == 0 { 373 return bof, 0 374 } 375 openCh := d.openStack[len(d.openStack)-1] 376 switch openCh { 377 case '{': 378 return MessageOpen, '}' 379 case '<': 380 return MessageOpen, '>' 381 case '[': 382 return ListOpen, ']' 383 } 384 panic(fmt.Sprintf("Decoder: openStack contains invalid byte %s", string(openCh))) 385} 386 387func (d *Decoder) pushOpenStack(ch byte) { 388 d.openStack = append(d.openStack, ch) 389} 390 391func (d *Decoder) popOpenStack() { 392 d.openStack = d.openStack[:len(d.openStack)-1] 393} 394 395// parseFieldName parses field name and separator. 396func (d *Decoder) parseFieldName() (tok Token, err error) { 397 defer func() { 398 if err == nil && d.tryConsumeChar(':') { 399 tok.attrs |= hasSeparator 400 } 401 }() 402 403 // Extension or Any type URL. 404 if d.in[0] == '[' { 405 return d.parseTypeName() 406 } 407 408 // Identifier. 409 if size := parseIdent(d.in, false); size > 0 { 410 return d.consumeToken(Name, size, uint8(IdentName)), nil 411 } 412 413 // Field number. Identify if input is a valid number that is not negative 414 // and is decimal integer within 32-bit range. 415 if num := parseNumber(d.in); num.size > 0 { 416 if !num.neg && num.kind == numDec { 417 if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil { 418 return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil 419 } 420 } 421 return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size]) 422 } 423 424 return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in)) 425} 426 427// parseTypeName parses Any type URL or extension field name. The name is 428// enclosed in [ and ] characters. The C++ parser does not handle many legal URL 429// strings. This implementation is more liberal and allows for the pattern 430// ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed 431// in between [ ], '.', '/' and the sub names. 432func (d *Decoder) parseTypeName() (Token, error) { 433 startPos := len(d.orig) - len(d.in) 434 // Use alias s to advance first in order to use d.in for error handling. 435 // Caller already checks for [ as first character. 436 s := consume(d.in[1:], 0) 437 if len(s) == 0 { 438 return Token{}, ErrUnexpectedEOF 439 } 440 441 var name []byte 442 for len(s) > 0 && isTypeNameChar(s[0]) { 443 name = append(name, s[0]) 444 s = s[1:] 445 } 446 s = consume(s, 0) 447 448 var closed bool 449 for len(s) > 0 && !closed { 450 switch { 451 case s[0] == ']': 452 s = s[1:] 453 closed = true 454 455 case s[0] == '/', s[0] == '.': 456 if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') { 457 return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s", 458 d.orig[startPos:len(d.orig)-len(s)+1]) 459 } 460 name = append(name, s[0]) 461 s = s[1:] 462 s = consume(s, 0) 463 for len(s) > 0 && isTypeNameChar(s[0]) { 464 name = append(name, s[0]) 465 s = s[1:] 466 } 467 s = consume(s, 0) 468 469 default: 470 return Token{}, d.newSyntaxError( 471 "invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1]) 472 } 473 } 474 475 if !closed { 476 return Token{}, ErrUnexpectedEOF 477 } 478 479 // First character cannot be '.'. Last character cannot be '.' or '/'. 480 size := len(name) 481 if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' { 482 return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s", 483 d.orig[startPos:len(d.orig)-len(s)]) 484 } 485 486 d.in = s 487 endPos := len(d.orig) - len(d.in) 488 d.consume(0) 489 490 return Token{ 491 kind: Name, 492 attrs: uint8(TypeName), 493 pos: startPos, 494 raw: d.orig[startPos:endPos], 495 str: string(name), 496 }, nil 497} 498 499func isTypeNameChar(b byte) bool { 500 return (b == '-' || b == '_' || 501 ('0' <= b && b <= '9') || 502 ('a' <= b && b <= 'z') || 503 ('A' <= b && b <= 'Z')) 504} 505 506func isWhiteSpace(b byte) bool { 507 switch b { 508 case ' ', '\n', '\r', '\t': 509 return true 510 default: 511 return false 512 } 513} 514 515// parseIdent parses an unquoted proto identifier and returns size. 516// If allowNeg is true, it allows '-' to be the first character in the 517// identifier. This is used when parsing literal values like -infinity, etc. 518// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*` 519func parseIdent(input []byte, allowNeg bool) int { 520 var size int 521 522 s := input 523 if len(s) == 0 { 524 return 0 525 } 526 527 if allowNeg && s[0] == '-' { 528 s = s[1:] 529 size++ 530 if len(s) == 0 { 531 return 0 532 } 533 } 534 535 switch { 536 case s[0] == '_', 537 'a' <= s[0] && s[0] <= 'z', 538 'A' <= s[0] && s[0] <= 'Z': 539 s = s[1:] 540 size++ 541 default: 542 return 0 543 } 544 545 for len(s) > 0 && (s[0] == '_' || 546 'a' <= s[0] && s[0] <= 'z' || 547 'A' <= s[0] && s[0] <= 'Z' || 548 '0' <= s[0] && s[0] <= '9') { 549 s = s[1:] 550 size++ 551 } 552 553 if len(s) > 0 && !isDelim(s[0]) { 554 return 0 555 } 556 557 return size 558} 559 560// parseScalar parses for a string, literal or number value. 561func (d *Decoder) parseScalar() (Token, error) { 562 if d.in[0] == '"' || d.in[0] == '\'' { 563 return d.parseStringValue() 564 } 565 566 if tok, ok := d.parseLiteralValue(); ok { 567 return tok, nil 568 } 569 570 if tok, ok := d.parseNumberValue(); ok { 571 return tok, nil 572 } 573 574 return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in)) 575} 576 577// parseLiteralValue parses a literal value. A literal value is used for 578// bools, special floats and enums. This function simply identifies that the 579// field value is a literal. 580func (d *Decoder) parseLiteralValue() (Token, bool) { 581 size := parseIdent(d.in, true) 582 if size == 0 { 583 return Token{}, false 584 } 585 return d.consumeToken(Scalar, size, literalValue), true 586} 587 588// consumeToken constructs a Token for given Kind from d.in and consumes given 589// size-length from it. 590func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token { 591 // Important to compute raw and pos before consuming. 592 tok := Token{ 593 kind: kind, 594 attrs: attrs, 595 pos: len(d.orig) - len(d.in), 596 raw: d.in[:size], 597 } 598 d.consume(size) 599 return tok 600} 601 602// newSyntaxError returns a syntax error with line and column information for 603// current position. 604func (d *Decoder) newSyntaxError(f string, x ...interface{}) error { 605 e := errors.New(f, x...) 606 line, column := d.Position(len(d.orig) - len(d.in)) 607 return errors.New("syntax error (line %d:%d): %v", line, column, e) 608} 609 610// Position returns line and column number of given index of the original input. 611// It will panic if index is out of range. 612func (d *Decoder) Position(idx int) (line int, column int) { 613 b := d.orig[:idx] 614 line = bytes.Count(b, []byte("\n")) + 1 615 if i := bytes.LastIndexByte(b, '\n'); i >= 0 { 616 b = b[i+1:] 617 } 618 column = utf8.RuneCount(b) + 1 // ignore multi-rune characters 619 return line, column 620} 621 622func (d *Decoder) tryConsumeChar(c byte) bool { 623 if len(d.in) > 0 && d.in[0] == c { 624 d.consume(1) 625 return true 626 } 627 return false 628} 629 630// consume consumes n bytes of input and any subsequent whitespace or comments. 631func (d *Decoder) consume(n int) { 632 d.in = consume(d.in, n) 633 return 634} 635 636// consume consumes n bytes of input and any subsequent whitespace or comments. 637func consume(b []byte, n int) []byte { 638 b = b[n:] 639 for len(b) > 0 { 640 switch b[0] { 641 case ' ', '\n', '\r', '\t': 642 b = b[1:] 643 case '#': 644 if i := bytes.IndexByte(b, '\n'); i >= 0 { 645 b = b[i+len("\n"):] 646 } else { 647 b = nil 648 } 649 default: 650 return b 651 } 652 } 653 return b 654} 655 656// Any sequence that looks like a non-delimiter (for error reporting). 657var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`) 658 659// isDelim returns true if given byte is a delimiter character. 660func isDelim(c byte) bool { 661 return !(c == '-' || c == '+' || c == '.' || c == '_' || 662 ('a' <= c && c <= 'z') || 663 ('A' <= c && c <= 'Z') || 664 ('0' <= c && c <= '9')) 665} 666