1// Go support for Protocol Buffers - Google's data interchange format
2//
3// Copyright 2010 The Go Authors.  All rights reserved.
4// https://github.com/golang/protobuf
5//
6// Redistribution and use in source and binary forms, with or without
7// modification, are permitted provided that the following conditions are
8// met:
9//
10//     * Redistributions of source code must retain the above copyright
11// notice, this list of conditions and the following disclaimer.
12//     * Redistributions in binary form must reproduce the above
13// copyright notice, this list of conditions and the following disclaimer
14// in the documentation and/or other materials provided with the
15// distribution.
16//     * Neither the name of Google Inc. nor the names of its
17// contributors may be used to endorse or promote products derived from
18// this software without specific prior written permission.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
32package proto
33
34// Functions for parsing the Text protocol buffer format.
35// TODO: message sets.
36
37import (
38	"encoding"
39	"errors"
40	"fmt"
41	"reflect"
42	"strconv"
43	"strings"
44	"unicode/utf8"
45)
46
// ParseError describes a failure encountered while parsing text-format
// input, together with the position of the token that triggered it.
type ParseError struct {
	Message string // text included in the Error output
	Line    int    // 1-based line number
	Offset  int    // 0-based byte offset from start of input
}

// Error implements the error interface. The byte offset is printed only
// for errors on the first line; errors on later lines report just the
// line number.
func (p *ParseError) Error() string {
	if p.Line != 1 {
		return fmt.Sprintf("line %d: %v", p.Line, p.Message)
	}
	return fmt.Sprintf("line 1.%d: %v", p.Offset, p.Message)
}
60
61type token struct {
62	value    string
63	err      *ParseError
64	line     int    // line number
65	offset   int    // byte number from start of input, not start of line
66	unquoted string // the unquoted version of value, if it was a quoted string
67}
68
69func (t *token) String() string {
70	if t.err == nil {
71		return fmt.Sprintf("%q (line=%d, offset=%d)", t.value, t.line, t.offset)
72	}
73	return fmt.Sprintf("parse error: %v", t.err)
74}
75
76type textParser struct {
77	s            string // remaining input
78	done         bool   // whether the parsing is finished (success or error)
79	backed       bool   // whether back() was called
80	offset, line int
81	cur          token
82}
83
84func newTextParser(s string) *textParser {
85	p := new(textParser)
86	p.s = s
87	p.line = 1
88	p.cur.line = 1
89	return p
90}
91
92func (p *textParser) errorf(format string, a ...interface{}) *ParseError {
93	pe := &ParseError{fmt.Sprintf(format, a...), p.cur.line, p.cur.offset}
94	p.cur.err = pe
95	p.done = true
96	return pe
97}
98
// isIdentOrNumberChar reports whether c may appear in a number or an
// identifier, i.e. whether it matches [-+._A-Za-z0-9].
func isIdentOrNumberChar(c byte) bool {
	if c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9' {
		return true
	}
	return c == '-' || c == '+' || c == '.' || c == '_'
}
113
// isWhitespace reports whether c is a space, tab, newline or carriage return.
func isWhitespace(c byte) bool {
	return c == ' ' || c == '\t' || c == '\n' || c == '\r'
}
121
// isQuote reports whether c is a single or double quote character.
func isQuote(c byte) bool {
	return c == '"' || c == '\''
}
129
130func (p *textParser) skipWhitespace() {
131	i := 0
132	for i < len(p.s) && (isWhitespace(p.s[i]) || p.s[i] == '#') {
133		if p.s[i] == '#' {
134			// comment; skip to end of line or input
135			for i < len(p.s) && p.s[i] != '\n' {
136				i++
137			}
138			if i == len(p.s) {
139				break
140			}
141		}
142		if p.s[i] == '\n' {
143			p.line++
144		}
145		i++
146	}
147	p.offset += i
148	p.s = p.s[i:len(p.s)]
149	if len(p.s) == 0 {
150		p.done = true
151	}
152}
153
154func (p *textParser) advance() {
155	// Skip whitespace
156	p.skipWhitespace()
157	if p.done {
158		return
159	}
160
161	// Start of non-whitespace
162	p.cur.err = nil
163	p.cur.offset, p.cur.line = p.offset, p.line
164	p.cur.unquoted = ""
165	switch p.s[0] {
166	case '<', '>', '{', '}', ':', '[', ']', ';', ',':
167		// Single symbol
168		p.cur.value, p.s = p.s[0:1], p.s[1:len(p.s)]
169	case '"', '\'':
170		// Quoted string
171		i := 1
172		for i < len(p.s) && p.s[i] != p.s[0] && p.s[i] != '\n' {
173			if p.s[i] == '\\' && i+1 < len(p.s) {
174				// skip escaped char
175				i++
176			}
177			i++
178		}
179		if i >= len(p.s) || p.s[i] != p.s[0] {
180			p.errorf("unmatched quote")
181			return
182		}
183		unq, err := unquoteC(p.s[1:i], rune(p.s[0]))
184		if err != nil {
185			p.errorf("invalid quoted string %s: %v", p.s[0:i+1], err)
186			return
187		}
188		p.cur.value, p.s = p.s[0:i+1], p.s[i+1:len(p.s)]
189		p.cur.unquoted = unq
190	default:
191		i := 0
192		for i < len(p.s) && isIdentOrNumberChar(p.s[i]) {
193			i++
194		}
195		if i == 0 {
196			p.errorf("unexpected byte %#x", p.s[0])
197			return
198		}
199		p.cur.value, p.s = p.s[0:i], p.s[i:len(p.s)]
200	}
201	p.offset += len(p.cur.value)
202}
203
// Errors returned while decoding the contents of a quoted string.
var (
	errBadUTF8 = errors.New("proto: bad UTF-8")
	errBadHex  = errors.New("proto: bad hexadecimal")
)

// unquoteC decodes the contents of a quoted text-format string. s is the
// text between the quotes and quote is the quote character that delimited
// it. Escape sequences are expanded by unescape; all other characters are
// copied through unchanged.
//
// It returns errBadUTF8 if the escape-processing path meets invalid UTF-8,
// or the error from unescape for a malformed escape sequence.
func unquoteC(s string, quote rune) (string, error) {
	// This is based on C++'s tokenizer.cc.
	// Despite its name, this is *not* parsing C syntax.
	// For instance, "\0" is an invalid quoted string.

	// Avoid allocation in trivial cases: a string without backslashes or
	// quote characters is returned as is.
	simple := true
	for _, r := range s {
		if r == '\\' || r == quote {
			simple = false
			break
		}
	}
	if simple {
		return s, nil
	}

	buf := make([]byte, 0, 3*len(s)/2)
	for len(s) > 0 {
		r, n := utf8.DecodeRuneInString(s)
		if r == utf8.RuneError && n == 1 {
			return "", errBadUTF8
		}
		s = s[n:]
		if r != '\\' {
			if r < utf8.RuneSelf {
				buf = append(buf, byte(r))
			} else {
				buf = append(buf, string(r)...)
			}
			continue
		}

		ch, tail, err := unescape(s)
		if err != nil {
			return "", err
		}
		buf = append(buf, ch...)
		s = tail
	}
	return string(buf), nil
}

// unescape decodes a single escape sequence. s is the input immediately
// after the backslash. It returns the decoded text in ch and the rest of
// the input in tail.
//
// Supported escapes are the single-character ones (\a \b \f \n \r \t \v
// \? \' \" \\), three-digit octal (\ooo), two-digit hex (\xhh or \Xhh),
// and Unicode code points (\uhhhh and \Uhhhhhhhh). Octal and hex escapes
// yield one raw byte; Unicode escapes yield the UTF-8 encoding of the code
// point. Surrogates and values above U+10FFFF are rejected.
func unescape(s string) (ch string, tail string, err error) {
	r, n := utf8.DecodeRuneInString(s)
	if r == utf8.RuneError && n == 1 {
		return "", "", errBadUTF8
	}
	s = s[n:]
	switch r {
	case 'a':
		return "\a", s, nil
	case 'b':
		return "\b", s, nil
	case 'f':
		return "\f", s, nil
	case 'n':
		return "\n", s, nil
	case 'r':
		return "\r", s, nil
	case 't':
		return "\t", s, nil
	case 'v':
		return "\v", s, nil
	case '?':
		return "?", s, nil // trigraph workaround
	case '\'', '"', '\\':
		return string(r), s, nil
	case '0', '1', '2', '3', '4', '5', '6', '7', 'x', 'X':
		if len(s) < 2 {
			return "", "", fmt.Errorf(`\%c requires 2 following digits`, r)
		}
		base := 8
		ss := s[:2]
		s = s[2:]
		if r == 'x' || r == 'X' {
			base = 16
		} else {
			// For octal the character after the backslash is the first digit.
			ss = string(r) + ss
		}
		i, err := strconv.ParseUint(ss, base, 8)
		if err != nil {
			return "", "", err
		}
		return string([]byte{byte(i)}), s, nil
	case 'u', 'U':
		n := 4
		if r == 'U' {
			n = 8
		}
		if len(s) < n {
			return "", "", fmt.Errorf(`\%c requires %d digits`, r, n)
		}

		// Accumulate the hex digits into a code point and emit its UTF-8
		// encoding. Copying the hex pairs out as raw bytes would turn
		// "\u00e9" into the two bytes 0x00 0xE9 rather than "é".
		var cp uint32
		for i := 0; i < n; i++ {
			d, ok := unhex(s[i])
			if !ok {
				return "", "", errBadHex
			}
			cp = cp<<4 | uint32(d)
		}
		// Values that do not fit in a rune convert to a negative number,
		// which ValidRune rejects along with surrogates and values above
		// utf8.MaxRune.
		if !utf8.ValidRune(rune(cp)) {
			return "", "", fmt.Errorf(`\%c%s is not a valid Unicode code point`, r, s[:n])
		}
		return string(rune(cp)), s[n:], nil
	}
	return "", "", fmt.Errorf(`unknown escape \%c`, r)
}

// unhex returns the numeric value of the hexadecimal digit b. ok is false
// if b is not a hexadecimal digit.
// Adapted from src/pkg/strconv/quote.go.
func unhex(b byte) (v byte, ok bool) {
	switch {
	case '0' <= b && b <= '9':
		return b - '0', true
	case 'a' <= b && b <= 'f':
		return b - 'a' + 10, true
	case 'A' <= b && b <= 'F':
		return b - 'A' + 10, true
	}
	return 0, false
}
330
331// Back off the parser by one token. Can only be done between calls to next().
332// It makes the next advance() a no-op.
333func (p *textParser) back() { p.backed = true }
334
335// Advances the parser and returns the new current token.
336func (p *textParser) next() *token {
337	if p.backed || p.done {
338		p.backed = false
339		return &p.cur
340	}
341	p.advance()
342	if p.done {
343		p.cur.value = ""
344	} else if len(p.cur.value) > 0 && isQuote(p.cur.value[0]) {
345		// Look for multiple quoted strings separated by whitespace,
346		// and concatenate them.
347		cat := p.cur
348		for {
349			p.skipWhitespace()
350			if p.done || !isQuote(p.s[0]) {
351				break
352			}
353			p.advance()
354			if p.cur.err != nil {
355				return &p.cur
356			}
357			cat.value += " " + p.cur.value
358			cat.unquoted += p.cur.unquoted
359		}
360		p.done = false // parser may have seen EOF, but we want to return cat
361		p.cur = cat
362	}
363	return &p.cur
364}
365
366func (p *textParser) consumeToken(s string) error {
367	tok := p.next()
368	if tok.err != nil {
369		return tok.err
370	}
371	if tok.value != s {
372		p.back()
373		return p.errorf("expected %q, found %q", s, tok.value)
374	}
375	return nil
376}
377
378// Return a RequiredNotSetError indicating which required field was not set.
379func (p *textParser) missingRequiredFieldError(sv reflect.Value) *RequiredNotSetError {
380	st := sv.Type()
381	sprops := GetProperties(st)
382	for i := 0; i < st.NumField(); i++ {
383		if !isNil(sv.Field(i)) {
384			continue
385		}
386
387		props := sprops.Prop[i]
388		if props.Required {
389			return &RequiredNotSetError{fmt.Sprintf("%v.%v", st, props.OrigName)}
390		}
391	}
392	return &RequiredNotSetError{fmt.Sprintf("%v.<unknown field name>", st)} // should not happen
393}
394
395// Returns the index in the struct for the named field, as well as the parsed tag properties.
396func structFieldByName(sprops *StructProperties, name string) (int, *Properties, bool) {
397	i, ok := sprops.decoderOrigNames[name]
398	if ok {
399		return i, sprops.Prop[i], true
400	}
401	return -1, nil, false
402}
403
404// Consume a ':' from the input stream (if the next token is a colon),
405// returning an error if a colon is needed but not present.
406func (p *textParser) checkForColon(props *Properties, typ reflect.Type) *ParseError {
407	tok := p.next()
408	if tok.err != nil {
409		return tok.err
410	}
411	if tok.value != ":" {
412		// Colon is optional when the field is a group or message.
413		needColon := true
414		switch props.Wire {
415		case "group":
416			needColon = false
417		case "bytes":
418			// A "bytes" field is either a message, a string, or a repeated field;
419			// those three become *T, *string and []T respectively, so we can check for
420			// this field being a pointer to a non-string.
421			if typ.Kind() == reflect.Ptr {
422				// *T or *string
423				if typ.Elem().Kind() == reflect.String {
424					break
425				}
426			} else if typ.Kind() == reflect.Slice {
427				// []T or []*T
428				if typ.Elem().Kind() != reflect.Ptr {
429					break
430				}
431			} else if typ.Kind() == reflect.String {
432				// The proto3 exception is for a string field,
433				// which requires a colon.
434				break
435			}
436			needColon = false
437		}
438		if needColon {
439			return p.errorf("expected ':', found %q", tok.value)
440		}
441		p.back()
442	}
443	return nil
444}
445
446func (p *textParser) readStruct(sv reflect.Value, terminator string) error {
447	st := sv.Type()
448	sprops := GetProperties(st)
449	reqCount := sprops.reqCount
450	var reqFieldErr error
451	fieldSet := make(map[string]bool)
452	// A struct is a sequence of "name: value", terminated by one of
453	// '>' or '}', or the end of the input.  A name may also be
454	// "[extension]".
455	for {
456		tok := p.next()
457		if tok.err != nil {
458			return tok.err
459		}
460		if tok.value == terminator {
461			break
462		}
463		if tok.value == "[" {
464			// Looks like an extension.
465			//
466			// TODO: Check whether we need to handle
467			// namespace rooted names (e.g. ".something.Foo").
468			tok = p.next()
469			if tok.err != nil {
470				return tok.err
471			}
472			var desc *ExtensionDesc
473			// This could be faster, but it's functional.
474			// TODO: Do something smarter than a linear scan.
475			for _, d := range RegisteredExtensions(reflect.New(st).Interface().(Message)) {
476				if d.Name == tok.value {
477					desc = d
478					break
479				}
480			}
481			if desc == nil {
482				return p.errorf("unrecognized extension %q", tok.value)
483			}
484			// Check the extension terminator.
485			tok = p.next()
486			if tok.err != nil {
487				return tok.err
488			}
489			if tok.value != "]" {
490				return p.errorf("unrecognized extension terminator %q", tok.value)
491			}
492
493			props := &Properties{}
494			props.Parse(desc.Tag)
495
496			typ := reflect.TypeOf(desc.ExtensionType)
497			if err := p.checkForColon(props, typ); err != nil {
498				return err
499			}
500
501			rep := desc.repeated()
502
503			// Read the extension structure, and set it in
504			// the value we're constructing.
505			var ext reflect.Value
506			if !rep {
507				ext = reflect.New(typ).Elem()
508			} else {
509				ext = reflect.New(typ.Elem()).Elem()
510			}
511			if err := p.readAny(ext, props); err != nil {
512				if _, ok := err.(*RequiredNotSetError); !ok {
513					return err
514				}
515				reqFieldErr = err
516			}
517			ep := sv.Addr().Interface().(extendableProto)
518			if !rep {
519				SetExtension(ep, desc, ext.Interface())
520			} else {
521				old, err := GetExtension(ep, desc)
522				var sl reflect.Value
523				if err == nil {
524					sl = reflect.ValueOf(old) // existing slice
525				} else {
526					sl = reflect.MakeSlice(typ, 0, 1)
527				}
528				sl = reflect.Append(sl, ext)
529				SetExtension(ep, desc, sl.Interface())
530			}
531			if err := p.consumeOptionalSeparator(); err != nil {
532				return err
533			}
534			continue
535		}
536
537		// This is a normal, non-extension field.
538		name := tok.value
539		var dst reflect.Value
540		fi, props, ok := structFieldByName(sprops, name)
541		if ok {
542			dst = sv.Field(fi)
543		} else if oop, ok := sprops.OneofTypes[name]; ok {
544			// It is a oneof.
545			props = oop.Prop
546			nv := reflect.New(oop.Type.Elem())
547			dst = nv.Elem().Field(0)
548			sv.Field(oop.Field).Set(nv)
549		}
550		if !dst.IsValid() {
551			return p.errorf("unknown field name %q in %v", name, st)
552		}
553
554		if dst.Kind() == reflect.Map {
555			// Consume any colon.
556			if err := p.checkForColon(props, dst.Type()); err != nil {
557				return err
558			}
559
560			// Construct the map if it doesn't already exist.
561			if dst.IsNil() {
562				dst.Set(reflect.MakeMap(dst.Type()))
563			}
564			key := reflect.New(dst.Type().Key()).Elem()
565			val := reflect.New(dst.Type().Elem()).Elem()
566
567			// The map entry should be this sequence of tokens:
568			//	< key : KEY value : VALUE >
569			// Technically the "key" and "value" could come in any order,
570			// but in practice they won't.
571
572			tok := p.next()
573			var terminator string
574			switch tok.value {
575			case "<":
576				terminator = ">"
577			case "{":
578				terminator = "}"
579			default:
580				return p.errorf("expected '{' or '<', found %q", tok.value)
581			}
582			if err := p.consumeToken("key"); err != nil {
583				return err
584			}
585			if err := p.consumeToken(":"); err != nil {
586				return err
587			}
588			if err := p.readAny(key, props.mkeyprop); err != nil {
589				return err
590			}
591			if err := p.consumeOptionalSeparator(); err != nil {
592				return err
593			}
594			if err := p.consumeToken("value"); err != nil {
595				return err
596			}
597			if err := p.checkForColon(props.mvalprop, dst.Type().Elem()); err != nil {
598				return err
599			}
600			if err := p.readAny(val, props.mvalprop); err != nil {
601				return err
602			}
603			if err := p.consumeOptionalSeparator(); err != nil {
604				return err
605			}
606			if err := p.consumeToken(terminator); err != nil {
607				return err
608			}
609
610			dst.SetMapIndex(key, val)
611			continue
612		}
613
614		// Check that it's not already set if it's not a repeated field.
615		if !props.Repeated && fieldSet[name] {
616			return p.errorf("non-repeated field %q was repeated", name)
617		}
618
619		if err := p.checkForColon(props, dst.Type()); err != nil {
620			return err
621		}
622
623		// Parse into the field.
624		fieldSet[name] = true
625		if err := p.readAny(dst, props); err != nil {
626			if _, ok := err.(*RequiredNotSetError); !ok {
627				return err
628			}
629			reqFieldErr = err
630		} else if props.Required {
631			reqCount--
632		}
633
634		if err := p.consumeOptionalSeparator(); err != nil {
635			return err
636		}
637
638	}
639
640	if reqCount > 0 {
641		return p.missingRequiredFieldError(sv)
642	}
643	return reqFieldErr
644}
645
646// consumeOptionalSeparator consumes an optional semicolon or comma.
647// It is used in readStruct to provide backward compatibility.
648func (p *textParser) consumeOptionalSeparator() error {
649	tok := p.next()
650	if tok.err != nil {
651		return tok.err
652	}
653	if tok.value != ";" && tok.value != "," {
654		p.back()
655	}
656	return nil
657}
658
659func (p *textParser) readAny(v reflect.Value, props *Properties) error {
660	tok := p.next()
661	if tok.err != nil {
662		return tok.err
663	}
664	if tok.value == "" {
665		return p.errorf("unexpected EOF")
666	}
667
668	switch fv := v; fv.Kind() {
669	case reflect.Slice:
670		at := v.Type()
671		if at.Elem().Kind() == reflect.Uint8 {
672			// Special case for []byte
673			if tok.value[0] != '"' && tok.value[0] != '\'' {
674				// Deliberately written out here, as the error after
675				// this switch statement would write "invalid []byte: ...",
676				// which is not as user-friendly.
677				return p.errorf("invalid string: %v", tok.value)
678			}
679			bytes := []byte(tok.unquoted)
680			fv.Set(reflect.ValueOf(bytes))
681			return nil
682		}
683		// Repeated field.
684		if tok.value == "[" {
685			// Repeated field with list notation, like [1,2,3].
686			for {
687				fv.Set(reflect.Append(fv, reflect.New(at.Elem()).Elem()))
688				err := p.readAny(fv.Index(fv.Len()-1), props)
689				if err != nil {
690					return err
691				}
692				tok := p.next()
693				if tok.err != nil {
694					return tok.err
695				}
696				if tok.value == "]" {
697					break
698				}
699				if tok.value != "," {
700					return p.errorf("Expected ']' or ',' found %q", tok.value)
701				}
702			}
703			return nil
704		}
705		// One value of the repeated field.
706		p.back()
707		fv.Set(reflect.Append(fv, reflect.New(at.Elem()).Elem()))
708		return p.readAny(fv.Index(fv.Len()-1), props)
709	case reflect.Bool:
710		// Either "true", "false", 1 or 0.
711		switch tok.value {
712		case "true", "1":
713			fv.SetBool(true)
714			return nil
715		case "false", "0":
716			fv.SetBool(false)
717			return nil
718		}
719	case reflect.Float32, reflect.Float64:
720		v := tok.value
721		// Ignore 'f' for compatibility with output generated by C++, but don't
722		// remove 'f' when the value is "-inf" or "inf".
723		if strings.HasSuffix(v, "f") && tok.value != "-inf" && tok.value != "inf" {
724			v = v[:len(v)-1]
725		}
726		if f, err := strconv.ParseFloat(v, fv.Type().Bits()); err == nil {
727			fv.SetFloat(f)
728			return nil
729		}
730	case reflect.Int32:
731		if x, err := strconv.ParseInt(tok.value, 0, 32); err == nil {
732			fv.SetInt(x)
733			return nil
734		}
735
736		if len(props.Enum) == 0 {
737			break
738		}
739		m, ok := enumValueMaps[props.Enum]
740		if !ok {
741			break
742		}
743		x, ok := m[tok.value]
744		if !ok {
745			break
746		}
747		fv.SetInt(int64(x))
748		return nil
749	case reflect.Int64:
750		if x, err := strconv.ParseInt(tok.value, 0, 64); err == nil {
751			fv.SetInt(x)
752			return nil
753		}
754
755	case reflect.Ptr:
756		// A basic field (indirected through pointer), or a repeated message/group
757		p.back()
758		fv.Set(reflect.New(fv.Type().Elem()))
759		return p.readAny(fv.Elem(), props)
760	case reflect.String:
761		if tok.value[0] == '"' || tok.value[0] == '\'' {
762			fv.SetString(tok.unquoted)
763			return nil
764		}
765	case reflect.Struct:
766		var terminator string
767		switch tok.value {
768		case "{":
769			terminator = "}"
770		case "<":
771			terminator = ">"
772		default:
773			return p.errorf("expected '{' or '<', found %q", tok.value)
774		}
775		// TODO: Handle nested messages which implement encoding.TextUnmarshaler.
776		return p.readStruct(fv, terminator)
777	case reflect.Uint32:
778		if x, err := strconv.ParseUint(tok.value, 0, 32); err == nil {
779			fv.SetUint(uint64(x))
780			return nil
781		}
782	case reflect.Uint64:
783		if x, err := strconv.ParseUint(tok.value, 0, 64); err == nil {
784			fv.SetUint(x)
785			return nil
786		}
787	}
788	return p.errorf("invalid %v: %v", v.Type(), tok.value)
789}
790
791// UnmarshalText reads a protocol buffer in Text format. UnmarshalText resets pb
792// before starting to unmarshal, so any existing data in pb is always removed.
793// If a required field is not set and no other error occurs,
794// UnmarshalText returns *RequiredNotSetError.
795func UnmarshalText(s string, pb Message) error {
796	if um, ok := pb.(encoding.TextUnmarshaler); ok {
797		err := um.UnmarshalText([]byte(s))
798		return err
799	}
800	pb.Reset()
801	v := reflect.ValueOf(pb)
802	if pe := newTextParser(s).readStruct(v.Elem(), ""); pe != nil {
803		return pe
804	}
805	return nil
806}
807