1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package text
6
7import (
8	"bytes"
9	"fmt"
10	"io"
11	"regexp"
12	"strconv"
13	"unicode/utf8"
14
15	"google.golang.org/protobuf/internal/errors"
16)
17
18// Decoder is a token-based textproto decoder.
19type Decoder struct {
20	// lastCall is last method called, either readCall or peekCall.
21	// Initial value is readCall.
22	lastCall call
23
24	// lastToken contains the last read token.
25	lastToken Token
26
27	// lastErr contains the last read error.
28	lastErr error
29
30	// openStack is a stack containing the byte characters for MessageOpen and
31	// ListOpen kinds. The top of stack represents the message or the list that
32	// the current token is nested in. An empty stack means the current token is
33	// at the top level message. The characters '{' and '<' both represent the
34	// MessageOpen kind.
35	openStack []byte
36
37	// orig is used in reporting line and column.
38	orig []byte
39	// in contains the unconsumed input.
40	in []byte
41}
42
43// NewDecoder returns a Decoder to read the given []byte.
44func NewDecoder(b []byte) *Decoder {
45	return &Decoder{orig: b, in: b}
46}
47
48// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
49var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
50
51// call specifies which Decoder method was invoked.
52type call uint8
53
54const (
55	readCall call = iota
56	peekCall
57)
58
59// Peek looks ahead and returns the next token and error without advancing a read.
60func (d *Decoder) Peek() (Token, error) {
61	defer func() { d.lastCall = peekCall }()
62	if d.lastCall == readCall {
63		d.lastToken, d.lastErr = d.Read()
64	}
65	return d.lastToken, d.lastErr
66}
67
68// Read returns the next token.
69// It will return an error if there is no valid token.
70func (d *Decoder) Read() (Token, error) {
71	defer func() { d.lastCall = readCall }()
72	if d.lastCall == peekCall {
73		return d.lastToken, d.lastErr
74	}
75
76	tok, err := d.parseNext(d.lastToken.kind)
77	if err != nil {
78		return Token{}, err
79	}
80
81	switch tok.kind {
82	case comma, semicolon:
83		tok, err = d.parseNext(tok.kind)
84		if err != nil {
85			return Token{}, err
86		}
87	}
88	d.lastToken = tok
89	return tok, nil
90}
91
92const (
93	mismatchedFmt = "mismatched close character %q"
94	unexpectedFmt = "unexpected character %q"
95)
96
97// parseNext parses the next Token based on given last kind.
98func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
99	// Trim leading spaces.
100	d.consume(0)
101	isEOF := false
102	if len(d.in) == 0 {
103		isEOF = true
104	}
105
106	switch lastKind {
107	case EOF:
108		return d.consumeToken(EOF, 0, 0), nil
109
110	case bof:
111		// Start of top level message. Next token can be EOF or Name.
112		if isEOF {
113			return d.consumeToken(EOF, 0, 0), nil
114		}
115		return d.parseFieldName()
116
117	case Name:
118		// Next token can be MessageOpen, ListOpen or Scalar.
119		if isEOF {
120			return Token{}, ErrUnexpectedEOF
121		}
122		switch ch := d.in[0]; ch {
123		case '{', '<':
124			d.pushOpenStack(ch)
125			return d.consumeToken(MessageOpen, 1, 0), nil
126		case '[':
127			d.pushOpenStack(ch)
128			return d.consumeToken(ListOpen, 1, 0), nil
129		default:
130			return d.parseScalar()
131		}
132
133	case Scalar:
134		openKind, closeCh := d.currentOpenKind()
135		switch openKind {
136		case bof:
137			// Top level message.
138			// 	Next token can be EOF, comma, semicolon or Name.
139			if isEOF {
140				return d.consumeToken(EOF, 0, 0), nil
141			}
142			switch d.in[0] {
143			case ',':
144				return d.consumeToken(comma, 1, 0), nil
145			case ';':
146				return d.consumeToken(semicolon, 1, 0), nil
147			default:
148				return d.parseFieldName()
149			}
150
151		case MessageOpen:
152			// Next token can be MessageClose, comma, semicolon or Name.
153			if isEOF {
154				return Token{}, ErrUnexpectedEOF
155			}
156			switch ch := d.in[0]; ch {
157			case closeCh:
158				d.popOpenStack()
159				return d.consumeToken(MessageClose, 1, 0), nil
160			case otherCloseChar[closeCh]:
161				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
162			case ',':
163				return d.consumeToken(comma, 1, 0), nil
164			case ';':
165				return d.consumeToken(semicolon, 1, 0), nil
166			default:
167				return d.parseFieldName()
168			}
169
170		case ListOpen:
171			// Next token can be ListClose or comma.
172			if isEOF {
173				return Token{}, ErrUnexpectedEOF
174			}
175			switch ch := d.in[0]; ch {
176			case ']':
177				d.popOpenStack()
178				return d.consumeToken(ListClose, 1, 0), nil
179			case ',':
180				return d.consumeToken(comma, 1, 0), nil
181			default:
182				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
183			}
184		}
185
186	case MessageOpen:
187		// Next token can be MessageClose or Name.
188		if isEOF {
189			return Token{}, ErrUnexpectedEOF
190		}
191		_, closeCh := d.currentOpenKind()
192		switch ch := d.in[0]; ch {
193		case closeCh:
194			d.popOpenStack()
195			return d.consumeToken(MessageClose, 1, 0), nil
196		case otherCloseChar[closeCh]:
197			return Token{}, d.newSyntaxError(mismatchedFmt, ch)
198		default:
199			return d.parseFieldName()
200		}
201
202	case MessageClose:
203		openKind, closeCh := d.currentOpenKind()
204		switch openKind {
205		case bof:
206			// Top level message.
207			// Next token can be EOF, comma, semicolon or Name.
208			if isEOF {
209				return d.consumeToken(EOF, 0, 0), nil
210			}
211			switch ch := d.in[0]; ch {
212			case ',':
213				return d.consumeToken(comma, 1, 0), nil
214			case ';':
215				return d.consumeToken(semicolon, 1, 0), nil
216			default:
217				return d.parseFieldName()
218			}
219
220		case MessageOpen:
221			// Next token can be MessageClose, comma, semicolon or Name.
222			if isEOF {
223				return Token{}, ErrUnexpectedEOF
224			}
225			switch ch := d.in[0]; ch {
226			case closeCh:
227				d.popOpenStack()
228				return d.consumeToken(MessageClose, 1, 0), nil
229			case otherCloseChar[closeCh]:
230				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
231			case ',':
232				return d.consumeToken(comma, 1, 0), nil
233			case ';':
234				return d.consumeToken(semicolon, 1, 0), nil
235			default:
236				return d.parseFieldName()
237			}
238
239		case ListOpen:
240			// Next token can be ListClose or comma
241			if isEOF {
242				return Token{}, ErrUnexpectedEOF
243			}
244			switch ch := d.in[0]; ch {
245			case closeCh:
246				d.popOpenStack()
247				return d.consumeToken(ListClose, 1, 0), nil
248			case ',':
249				return d.consumeToken(comma, 1, 0), nil
250			default:
251				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
252			}
253		}
254
255	case ListOpen:
256		// Next token can be ListClose, MessageStart or Scalar.
257		if isEOF {
258			return Token{}, ErrUnexpectedEOF
259		}
260		switch ch := d.in[0]; ch {
261		case ']':
262			d.popOpenStack()
263			return d.consumeToken(ListClose, 1, 0), nil
264		case '{', '<':
265			d.pushOpenStack(ch)
266			return d.consumeToken(MessageOpen, 1, 0), nil
267		default:
268			return d.parseScalar()
269		}
270
271	case ListClose:
272		openKind, closeCh := d.currentOpenKind()
273		switch openKind {
274		case bof:
275			// Top level message.
276			// Next token can be EOF, comma, semicolon or Name.
277			if isEOF {
278				return d.consumeToken(EOF, 0, 0), nil
279			}
280			switch ch := d.in[0]; ch {
281			case ',':
282				return d.consumeToken(comma, 1, 0), nil
283			case ';':
284				return d.consumeToken(semicolon, 1, 0), nil
285			default:
286				return d.parseFieldName()
287			}
288
289		case MessageOpen:
290			// Next token can be MessageClose, comma, semicolon or Name.
291			if isEOF {
292				return Token{}, ErrUnexpectedEOF
293			}
294			switch ch := d.in[0]; ch {
295			case closeCh:
296				d.popOpenStack()
297				return d.consumeToken(MessageClose, 1, 0), nil
298			case otherCloseChar[closeCh]:
299				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
300			case ',':
301				return d.consumeToken(comma, 1, 0), nil
302			case ';':
303				return d.consumeToken(semicolon, 1, 0), nil
304			default:
305				return d.parseFieldName()
306			}
307
308		default:
309			// It is not possible to have this case. Let it panic below.
310		}
311
312	case comma, semicolon:
313		openKind, closeCh := d.currentOpenKind()
314		switch openKind {
315		case bof:
316			// Top level message. Next token can be EOF or Name.
317			if isEOF {
318				return d.consumeToken(EOF, 0, 0), nil
319			}
320			return d.parseFieldName()
321
322		case MessageOpen:
323			// Next token can be MessageClose or Name.
324			if isEOF {
325				return Token{}, ErrUnexpectedEOF
326			}
327			switch ch := d.in[0]; ch {
328			case closeCh:
329				d.popOpenStack()
330				return d.consumeToken(MessageClose, 1, 0), nil
331			case otherCloseChar[closeCh]:
332				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
333			default:
334				return d.parseFieldName()
335			}
336
337		case ListOpen:
338			if lastKind == semicolon {
339				// It is not be possible to have this case as logic here
340				// should not have produced a semicolon Token when inside a
341				// list. Let it panic below.
342				break
343			}
344			// Next token can be MessageOpen or Scalar.
345			if isEOF {
346				return Token{}, ErrUnexpectedEOF
347			}
348			switch ch := d.in[0]; ch {
349			case '{', '<':
350				d.pushOpenStack(ch)
351				return d.consumeToken(MessageOpen, 1, 0), nil
352			default:
353				return d.parseScalar()
354			}
355		}
356	}
357
358	line, column := d.Position(len(d.orig) - len(d.in))
359	panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
360}
361
362var otherCloseChar = map[byte]byte{
363	'}': '>',
364	'>': '}',
365}
366
367// currentOpenKind indicates whether current position is inside a message, list
368// or top-level message by returning MessageOpen, ListOpen or bof respectively.
369// If the returned kind is either a MessageOpen or ListOpen, it also returns the
370// corresponding closing character.
371func (d *Decoder) currentOpenKind() (Kind, byte) {
372	if len(d.openStack) == 0 {
373		return bof, 0
374	}
375	openCh := d.openStack[len(d.openStack)-1]
376	switch openCh {
377	case '{':
378		return MessageOpen, '}'
379	case '<':
380		return MessageOpen, '>'
381	case '[':
382		return ListOpen, ']'
383	}
384	panic(fmt.Sprintf("Decoder: openStack contains invalid byte %s", string(openCh)))
385}
386
387func (d *Decoder) pushOpenStack(ch byte) {
388	d.openStack = append(d.openStack, ch)
389}
390
391func (d *Decoder) popOpenStack() {
392	d.openStack = d.openStack[:len(d.openStack)-1]
393}
394
395// parseFieldName parses field name and separator.
396func (d *Decoder) parseFieldName() (tok Token, err error) {
397	defer func() {
398		if err == nil && d.tryConsumeChar(':') {
399			tok.attrs |= hasSeparator
400		}
401	}()
402
403	// Extension or Any type URL.
404	if d.in[0] == '[' {
405		return d.parseTypeName()
406	}
407
408	// Identifier.
409	if size := parseIdent(d.in, false); size > 0 {
410		return d.consumeToken(Name, size, uint8(IdentName)), nil
411	}
412
413	// Field number. Identify if input is a valid number that is not negative
414	// and is decimal integer within 32-bit range.
415	if num := parseNumber(d.in); num.size > 0 {
416		if !num.neg && num.kind == numDec {
417			if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil {
418				return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
419			}
420		}
421		return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
422	}
423
424	return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
425}
426
427// parseTypeName parses Any type URL or extension field name. The name is
428// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
429// strings. This implementation is more liberal and allows for the pattern
430// ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
431// in between [ ], '.', '/' and the sub names.
432func (d *Decoder) parseTypeName() (Token, error) {
433	startPos := len(d.orig) - len(d.in)
434	// Use alias s to advance first in order to use d.in for error handling.
435	// Caller already checks for [ as first character.
436	s := consume(d.in[1:], 0)
437	if len(s) == 0 {
438		return Token{}, ErrUnexpectedEOF
439	}
440
441	var name []byte
442	for len(s) > 0 && isTypeNameChar(s[0]) {
443		name = append(name, s[0])
444		s = s[1:]
445	}
446	s = consume(s, 0)
447
448	var closed bool
449	for len(s) > 0 && !closed {
450		switch {
451		case s[0] == ']':
452			s = s[1:]
453			closed = true
454
455		case s[0] == '/', s[0] == '.':
456			if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
457				return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
458					d.orig[startPos:len(d.orig)-len(s)+1])
459			}
460			name = append(name, s[0])
461			s = s[1:]
462			s = consume(s, 0)
463			for len(s) > 0 && isTypeNameChar(s[0]) {
464				name = append(name, s[0])
465				s = s[1:]
466			}
467			s = consume(s, 0)
468
469		default:
470			return Token{}, d.newSyntaxError(
471				"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
472		}
473	}
474
475	if !closed {
476		return Token{}, ErrUnexpectedEOF
477	}
478
479	// First character cannot be '.'. Last character cannot be '.' or '/'.
480	size := len(name)
481	if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
482		return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
483			d.orig[startPos:len(d.orig)-len(s)])
484	}
485
486	d.in = s
487	endPos := len(d.orig) - len(d.in)
488	d.consume(0)
489
490	return Token{
491		kind:  Name,
492		attrs: uint8(TypeName),
493		pos:   startPos,
494		raw:   d.orig[startPos:endPos],
495		str:   string(name),
496	}, nil
497}
498
499func isTypeNameChar(b byte) bool {
500	return (b == '-' || b == '_' ||
501		('0' <= b && b <= '9') ||
502		('a' <= b && b <= 'z') ||
503		('A' <= b && b <= 'Z'))
504}
505
506func isWhiteSpace(b byte) bool {
507	switch b {
508	case ' ', '\n', '\r', '\t':
509		return true
510	default:
511		return false
512	}
513}
514
515// parseIdent parses an unquoted proto identifier and returns size.
516// If allowNeg is true, it allows '-' to be the first character in the
517// identifier. This is used when parsing literal values like -infinity, etc.
518// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
519func parseIdent(input []byte, allowNeg bool) int {
520	var size int
521
522	s := input
523	if len(s) == 0 {
524		return 0
525	}
526
527	if allowNeg && s[0] == '-' {
528		s = s[1:]
529		size++
530		if len(s) == 0 {
531			return 0
532		}
533	}
534
535	switch {
536	case s[0] == '_',
537		'a' <= s[0] && s[0] <= 'z',
538		'A' <= s[0] && s[0] <= 'Z':
539		s = s[1:]
540		size++
541	default:
542		return 0
543	}
544
545	for len(s) > 0 && (s[0] == '_' ||
546		'a' <= s[0] && s[0] <= 'z' ||
547		'A' <= s[0] && s[0] <= 'Z' ||
548		'0' <= s[0] && s[0] <= '9') {
549		s = s[1:]
550		size++
551	}
552
553	if len(s) > 0 && !isDelim(s[0]) {
554		return 0
555	}
556
557	return size
558}
559
560// parseScalar parses for a string, literal or number value.
561func (d *Decoder) parseScalar() (Token, error) {
562	if d.in[0] == '"' || d.in[0] == '\'' {
563		return d.parseStringValue()
564	}
565
566	if tok, ok := d.parseLiteralValue(); ok {
567		return tok, nil
568	}
569
570	if tok, ok := d.parseNumberValue(); ok {
571		return tok, nil
572	}
573
574	return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
575}
576
577// parseLiteralValue parses a literal value. A literal value is used for
578// bools, special floats and enums. This function simply identifies that the
579// field value is a literal.
580func (d *Decoder) parseLiteralValue() (Token, bool) {
581	size := parseIdent(d.in, true)
582	if size == 0 {
583		return Token{}, false
584	}
585	return d.consumeToken(Scalar, size, literalValue), true
586}
587
588// consumeToken constructs a Token for given Kind from d.in and consumes given
589// size-length from it.
590func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
591	// Important to compute raw and pos before consuming.
592	tok := Token{
593		kind:  kind,
594		attrs: attrs,
595		pos:   len(d.orig) - len(d.in),
596		raw:   d.in[:size],
597	}
598	d.consume(size)
599	return tok
600}
601
602// newSyntaxError returns a syntax error with line and column information for
603// current position.
604func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
605	e := errors.New(f, x...)
606	line, column := d.Position(len(d.orig) - len(d.in))
607	return errors.New("syntax error (line %d:%d): %v", line, column, e)
608}
609
610// Position returns line and column number of given index of the original input.
611// It will panic if index is out of range.
612func (d *Decoder) Position(idx int) (line int, column int) {
613	b := d.orig[:idx]
614	line = bytes.Count(b, []byte("\n")) + 1
615	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
616		b = b[i+1:]
617	}
618	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
619	return line, column
620}
621
622func (d *Decoder) tryConsumeChar(c byte) bool {
623	if len(d.in) > 0 && d.in[0] == c {
624		d.consume(1)
625		return true
626	}
627	return false
628}
629
630// consume consumes n bytes of input and any subsequent whitespace or comments.
631func (d *Decoder) consume(n int) {
632	d.in = consume(d.in, n)
633	return
634}
635
636// consume consumes n bytes of input and any subsequent whitespace or comments.
637func consume(b []byte, n int) []byte {
638	b = b[n:]
639	for len(b) > 0 {
640		switch b[0] {
641		case ' ', '\n', '\r', '\t':
642			b = b[1:]
643		case '#':
644			if i := bytes.IndexByte(b, '\n'); i >= 0 {
645				b = b[i+len("\n"):]
646			} else {
647				b = nil
648			}
649		default:
650			return b
651		}
652	}
653	return b
654}
655
656// Any sequence that looks like a non-delimiter (for error reporting).
657var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
658
659// isDelim returns true if given byte is a delimiter character.
660func isDelim(c byte) bool {
661	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
662		('a' <= c && c <= 'z') ||
663		('A' <= c && c <= 'Z') ||
664		('0' <= c && c <= '9'))
665}
666