1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package ebnf is a library for EBNF grammars. The input is text ([]byte)
6// satisfying the following grammar (represented itself in EBNF):
7//
8//	Production  = name "=" [ Expression ] "." .
9//	Expression  = Alternative { "|" Alternative } .
10//	Alternative = Term { Term } .
11//	Term        = name | token [ "…" token ] | Group | Option | Repetition .
12//	Group       = "(" Expression ")" .
13//	Option      = "[" Expression "]" .
14//	Repetition  = "{" Expression "}" .
15//
16// A name is a Go identifier, a token is a Go string, and comments
17// and white space follow the same rules as for the Go language.
18// Production names starting with an uppercase Unicode letter denote
19// non-terminal productions (i.e., productions which allow white-space
20// and comments between tokens); all other production names denote
21// lexical productions.
22//
23package ebnf
24
25import (
26	"errors"
27	"fmt"
28	"text/scanner"
29	"unicode"
30	"unicode/utf8"
31)
32
33// ----------------------------------------------------------------------------
34// Error handling
35
36type errorList []error
37
38func (list errorList) Err() error {
39	if len(list) == 0 {
40		return nil
41	}
42	return list
43}
44
45func (list errorList) Error() string {
46	switch len(list) {
47	case 0:
48		return "no errors"
49	case 1:
50		return list[0].Error()
51	}
52	return fmt.Sprintf("%s (and %d more errors)", list[0], len(list)-1)
53}
54
55func newError(pos scanner.Position, msg string) error {
56	return errors.New(fmt.Sprintf("%s: %s", pos, msg))
57}
58
59// ----------------------------------------------------------------------------
60// Internal representation
61
62type (
63	// An Expression node represents a production expression.
64	Expression interface {
65		// Pos is the position of the first character of the syntactic construct
66		Pos() scanner.Position
67	}
68
69	// An Alternative node represents a non-empty list of alternative expressions.
70	Alternative []Expression // x | y | z
71
72	// A Sequence node represents a non-empty list of sequential expressions.
73	Sequence []Expression // x y z
74
75	// A Name node represents a production name.
76	Name struct {
77		StringPos scanner.Position
78		String    string
79	}
80
81	// A Token node represents a literal.
82	Token struct {
83		StringPos scanner.Position
84		String    string
85	}
86
87	// A List node represents a range of characters.
88	Range struct {
89		Begin, End *Token // begin ... end
90	}
91
92	// A Group node represents a grouped expression.
93	Group struct {
94		Lparen scanner.Position
95		Body   Expression // (body)
96	}
97
98	// An Option node represents an optional expression.
99	Option struct {
100		Lbrack scanner.Position
101		Body   Expression // [body]
102	}
103
104	// A Repetition node represents a repeated expression.
105	Repetition struct {
106		Lbrace scanner.Position
107		Body   Expression // {body}
108	}
109
110	// A Production node represents an EBNF production.
111	Production struct {
112		Name *Name
113		Expr Expression
114	}
115
116	// A Bad node stands for pieces of source code that lead to a parse error.
117	Bad struct {
118		TokPos scanner.Position
119		Error  string // parser error message
120	}
121
122	// A Grammar is a set of EBNF productions. The map
123	// is indexed by production name.
124	//
125	Grammar map[string]*Production
126)
127
128func (x Alternative) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Alternative
129func (x Sequence) Pos() scanner.Position    { return x[0].Pos() } // the parser always generates non-empty Sequences
130func (x *Name) Pos() scanner.Position       { return x.StringPos }
131func (x *Token) Pos() scanner.Position      { return x.StringPos }
132func (x *Range) Pos() scanner.Position      { return x.Begin.Pos() }
133func (x *Group) Pos() scanner.Position      { return x.Lparen }
134func (x *Option) Pos() scanner.Position     { return x.Lbrack }
135func (x *Repetition) Pos() scanner.Position { return x.Lbrace }
136func (x *Production) Pos() scanner.Position { return x.Name.Pos() }
137func (x *Bad) Pos() scanner.Position        { return x.TokPos }
138
139// ----------------------------------------------------------------------------
140// Grammar verification
141
142func isLexical(name string) bool {
143	ch, _ := utf8.DecodeRuneInString(name)
144	return !unicode.IsUpper(ch)
145}
146
147type verifier struct {
148	errors   errorList
149	worklist []*Production
150	reached  Grammar // set of productions reached from (and including) the root production
151	grammar  Grammar
152}
153
154func (v *verifier) error(pos scanner.Position, msg string) {
155	v.errors = append(v.errors, newError(pos, msg))
156}
157
158func (v *verifier) push(prod *Production) {
159	name := prod.Name.String
160	if _, found := v.reached[name]; !found {
161		v.worklist = append(v.worklist, prod)
162		v.reached[name] = prod
163	}
164}
165
166func (v *verifier) verifyChar(x *Token) rune {
167	s := x.String
168	if utf8.RuneCountInString(s) != 1 {
169		v.error(x.Pos(), "single char expected, found "+s)
170		return 0
171	}
172	ch, _ := utf8.DecodeRuneInString(s)
173	return ch
174}
175
176func (v *verifier) verifyExpr(expr Expression, lexical bool) {
177	switch x := expr.(type) {
178	case nil:
179		// empty expression
180	case Alternative:
181		for _, e := range x {
182			v.verifyExpr(e, lexical)
183		}
184	case Sequence:
185		for _, e := range x {
186			v.verifyExpr(e, lexical)
187		}
188	case *Name:
189		// a production with this name must exist;
190		// add it to the worklist if not yet processed
191		if prod, found := v.grammar[x.String]; found {
192			v.push(prod)
193		} else {
194			v.error(x.Pos(), "missing production "+x.String)
195		}
196		// within a lexical production references
197		// to non-lexical productions are invalid
198		if lexical && !isLexical(x.String) {
199			v.error(x.Pos(), "reference to non-lexical production "+x.String)
200		}
201	case *Token:
202		// nothing to do for now
203	case *Range:
204		i := v.verifyChar(x.Begin)
205		j := v.verifyChar(x.End)
206		if i >= j {
207			v.error(x.Pos(), "decreasing character range")
208		}
209	case *Group:
210		v.verifyExpr(x.Body, lexical)
211	case *Option:
212		v.verifyExpr(x.Body, lexical)
213	case *Repetition:
214		v.verifyExpr(x.Body, lexical)
215	case *Bad:
216		v.error(x.Pos(), x.Error)
217	default:
218		panic(fmt.Sprintf("internal error: unexpected type %T", expr))
219	}
220}
221
222func (v *verifier) verify(grammar Grammar, start string) {
223	// find root production
224	root, found := grammar[start]
225	if !found {
226		var noPos scanner.Position
227		v.error(noPos, "no start production "+start)
228		return
229	}
230
231	// initialize verifier
232	v.worklist = v.worklist[0:0]
233	v.reached = make(Grammar)
234	v.grammar = grammar
235
236	// work through the worklist
237	v.push(root)
238	for {
239		n := len(v.worklist) - 1
240		if n < 0 {
241			break
242		}
243		prod := v.worklist[n]
244		v.worklist = v.worklist[0:n]
245		v.verifyExpr(prod.Expr, isLexical(prod.Name.String))
246	}
247
248	// check if all productions were reached
249	if len(v.reached) < len(v.grammar) {
250		for name, prod := range v.grammar {
251			if _, found := v.reached[name]; !found {
252				v.error(prod.Pos(), name+" is unreachable")
253			}
254		}
255	}
256}
257
258// Verify checks that:
259//	- all productions used are defined
260//	- all productions defined are used when beginning at start
261//	- lexical productions refer only to other lexical productions
262//
263// Position information is interpreted relative to the file set fset.
264//
265func Verify(grammar Grammar, start string) error {
266	var v verifier
267	v.verify(grammar, start)
268	return v.errors.Err()
269}
270