1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package ebnf is a library for EBNF grammars. The input is text ([]byte) 6// satisfying the following grammar (represented itself in EBNF): 7// 8// Production = name "=" [ Expression ] "." . 9// Expression = Alternative { "|" Alternative } . 10// Alternative = Term { Term } . 11// Term = name | token [ "…" token ] | Group | Option | Repetition . 12// Group = "(" Expression ")" . 13// Option = "[" Expression "]" . 14// Repetition = "{" Expression "}" . 15// 16// A name is a Go identifier, a token is a Go string, and comments 17// and white space follow the same rules as for the Go language. 18// Production names starting with an uppercase Unicode letter denote 19// non-terminal productions (i.e., productions which allow white-space 20// and comments between tokens); all other production names denote 21// lexical productions. 22// 23package ebnf 24 25import ( 26 "errors" 27 "fmt" 28 "text/scanner" 29 "unicode" 30 "unicode/utf8" 31) 32 33// ---------------------------------------------------------------------------- 34// Error handling 35 36type errorList []error 37 38func (list errorList) Err() error { 39 if len(list) == 0 { 40 return nil 41 } 42 return list 43} 44 45func (list errorList) Error() string { 46 switch len(list) { 47 case 0: 48 return "no errors" 49 case 1: 50 return list[0].Error() 51 } 52 return fmt.Sprintf("%s (and %d more errors)", list[0], len(list)-1) 53} 54 55func newError(pos scanner.Position, msg string) error { 56 return errors.New(fmt.Sprintf("%s: %s", pos, msg)) 57} 58 59// ---------------------------------------------------------------------------- 60// Internal representation 61 62type ( 63 // An Expression node represents a production expression. 64 Expression interface { 65 // Pos is the position of the first character of the syntactic construct 66 Pos() scanner.Position 67 } 68 69 // An Alternative node represents a non-empty list of alternative expressions. 70 Alternative []Expression // x | y | z 71 72 // A Sequence node represents a non-empty list of sequential expressions. 73 Sequence []Expression // x y z 74 75 // A Name node represents a production name. 76 Name struct { 77 StringPos scanner.Position 78 String string 79 } 80 81 // A Token node represents a literal. 82 Token struct { 83 StringPos scanner.Position 84 String string 85 } 86 87 // A List node represents a range of characters. 88 Range struct { 89 Begin, End *Token // begin ... end 90 } 91 92 // A Group node represents a grouped expression. 93 Group struct { 94 Lparen scanner.Position 95 Body Expression // (body) 96 } 97 98 // An Option node represents an optional expression. 99 Option struct { 100 Lbrack scanner.Position 101 Body Expression // [body] 102 } 103 104 // A Repetition node represents a repeated expression. 105 Repetition struct { 106 Lbrace scanner.Position 107 Body Expression // {body} 108 } 109 110 // A Production node represents an EBNF production. 111 Production struct { 112 Name *Name 113 Expr Expression 114 } 115 116 // A Bad node stands for pieces of source code that lead to a parse error. 117 Bad struct { 118 TokPos scanner.Position 119 Error string // parser error message 120 } 121 122 // A Grammar is a set of EBNF productions. The map 123 // is indexed by production name. 124 // 125 Grammar map[string]*Production 126) 127 128func (x Alternative) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Alternative 129func (x Sequence) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Sequences 130func (x *Name) Pos() scanner.Position { return x.StringPos } 131func (x *Token) Pos() scanner.Position { return x.StringPos } 132func (x *Range) Pos() scanner.Position { return x.Begin.Pos() } 133func (x *Group) Pos() scanner.Position { return x.Lparen } 134func (x *Option) Pos() scanner.Position { return x.Lbrack } 135func (x *Repetition) Pos() scanner.Position { return x.Lbrace } 136func (x *Production) Pos() scanner.Position { return x.Name.Pos() } 137func (x *Bad) Pos() scanner.Position { return x.TokPos } 138 139// ---------------------------------------------------------------------------- 140// Grammar verification 141 142func isLexical(name string) bool { 143 ch, _ := utf8.DecodeRuneInString(name) 144 return !unicode.IsUpper(ch) 145} 146 147type verifier struct { 148 errors errorList 149 worklist []*Production 150 reached Grammar // set of productions reached from (and including) the root production 151 grammar Grammar 152} 153 154func (v *verifier) error(pos scanner.Position, msg string) { 155 v.errors = append(v.errors, newError(pos, msg)) 156} 157 158func (v *verifier) push(prod *Production) { 159 name := prod.Name.String 160 if _, found := v.reached[name]; !found { 161 v.worklist = append(v.worklist, prod) 162 v.reached[name] = prod 163 } 164} 165 166func (v *verifier) verifyChar(x *Token) rune { 167 s := x.String 168 if utf8.RuneCountInString(s) != 1 { 169 v.error(x.Pos(), "single char expected, found "+s) 170 return 0 171 } 172 ch, _ := utf8.DecodeRuneInString(s) 173 return ch 174} 175 176func (v *verifier) verifyExpr(expr Expression, lexical bool) { 177 switch x := expr.(type) { 178 case nil: 179 // empty expression 180 case Alternative: 181 for _, e := range x { 182 v.verifyExpr(e, lexical) 183 } 184 case Sequence: 185 for _, e := range x { 186 v.verifyExpr(e, lexical) 187 } 188 case *Name: 189 // a production with this name must exist; 190 // add it to the worklist if not yet processed 191 if prod, found := v.grammar[x.String]; found { 192 v.push(prod) 193 } else { 194 v.error(x.Pos(), "missing production "+x.String) 195 } 196 // within a lexical production references 197 // to non-lexical productions are invalid 198 if lexical && !isLexical(x.String) { 199 v.error(x.Pos(), "reference to non-lexical production "+x.String) 200 } 201 case *Token: 202 // nothing to do for now 203 case *Range: 204 i := v.verifyChar(x.Begin) 205 j := v.verifyChar(x.End) 206 if i >= j { 207 v.error(x.Pos(), "decreasing character range") 208 } 209 case *Group: 210 v.verifyExpr(x.Body, lexical) 211 case *Option: 212 v.verifyExpr(x.Body, lexical) 213 case *Repetition: 214 v.verifyExpr(x.Body, lexical) 215 case *Bad: 216 v.error(x.Pos(), x.Error) 217 default: 218 panic(fmt.Sprintf("internal error: unexpected type %T", expr)) 219 } 220} 221 222func (v *verifier) verify(grammar Grammar, start string) { 223 // find root production 224 root, found := grammar[start] 225 if !found { 226 var noPos scanner.Position 227 v.error(noPos, "no start production "+start) 228 return 229 } 230 231 // initialize verifier 232 v.worklist = v.worklist[0:0] 233 v.reached = make(Grammar) 234 v.grammar = grammar 235 236 // work through the worklist 237 v.push(root) 238 for { 239 n := len(v.worklist) - 1 240 if n < 0 { 241 break 242 } 243 prod := v.worklist[n] 244 v.worklist = v.worklist[0:n] 245 v.verifyExpr(prod.Expr, isLexical(prod.Name.String)) 246 } 247 248 // check if all productions were reached 249 if len(v.reached) < len(v.grammar) { 250 for name, prod := range v.grammar { 251 if _, found := v.reached[name]; !found { 252 v.error(prod.Pos(), name+" is unreachable") 253 } 254 } 255 } 256} 257 258// Verify checks that: 259// - all productions used are defined 260// - all productions defined are used when beginning at start 261// - lexical productions refer only to other lexical productions 262// 263// Position information is interpreted relative to the file set fset. 264// 265func Verify(grammar Grammar, start string) error { 266 var v verifier 267 v.verify(grammar, start) 268 return v.errors.Err() 269} 270