"Converts between Lark and Nearley grammars. Work in progress!"

import os.path
import sys
import codecs


from lark import Lark, InlineTransformer

# Lark grammar describing the subset of the Nearley grammar syntax that this
# tool understands: rule definitions, "@" directives, inline "{% ... %}"
# JavaScript blocks, and macro definitions (parsed, but not yet converted).
nearley_grammar = r"""
    start: (ruledef|directive)+

    directive: "@" NAME (STRING|NAME)
             | "@" JS -> js_code
    ruledef: NAME "->" expansions
           | NAME REGEXP "->" expansions -> macro
    expansions: expansion ("|" expansion)*

    expansion: expr+ js

    ?expr: item (":" /[+*?]/)?

    ?item: rule|string|regexp|null
         | "(" expansions ")"

    rule: NAME
    string: STRING
    regexp: REGEXP
    null: "null"
    JS: /{%.*?%}/s
    js: JS?

    NAME: /[a-zA-Z_$]\w*/
    COMMENT: /#[^\n]*/
    REGEXP: /\[.*?\]/

    %import common.ESCAPED_STRING -> STRING
    %import common.WS
    %ignore WS
    %ignore COMMENT

    """

nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')


def _get_rulename(name):
    """Return the Lark rule name used for the Nearley rule called *name*.

    The Nearley names ``_`` and ``__`` are first mapped to ``_ws_maybe`` and
    ``_ws``.  The result is prefixed with ``n_``, every ``$`` is replaced by
    ``__DOLLAR__``, and the whole string is lower-cased (so the replacement
    ends up as ``__dollar__``).
    """
    name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
    return 'n_' + name.replace('$', '__DOLLAR__').lower()


class NearleyToLark(InlineTransformer):
    """Transform a parsed Nearley rule definition into Lark grammar text.

    While transforming, the instance accumulates state that the caller reads
    afterwards:

    * ``extra_rules``: generated rule name -> rule body, for helper rules
      created for string literals and for items with a postfix operator.
    * ``extra_rules_rev``: the reverse mapping, used to reuse a helper rule
      when the same body is seen again.
    * ``alias_js_code``: alias name -> JavaScript source of the
      post-processing function attached to an expansion.
    """

    def __init__(self):
        self._count = 0  # number of aliases handed out so far
        self.extra_rules = {}
        self.extra_rules_rev = {}
        self.alias_js_code = {}

    def _new_function(self, code):
        """Register the JavaScript snippet *code* and return its new alias name."""
        name = 'alias_%d' % self._count
        self._count += 1

        self.alias_js_code[name] = code
        return name

    def _extra_rule(self, rule):
        """Return the helper rule name for body *rule*, creating it if needed."""
        if rule in self.extra_rules_rev:
            return self.extra_rules_rev[rule]

        name = 'xrule_%d' % len(self.extra_rules)
        assert name not in self.extra_rules
        self.extra_rules[name] = rule
        self.extra_rules_rev[rule] = name
        return name

    def rule(self, name):
        """Translate a reference to a Nearley rule into its Lark rule name."""
        return _get_rulename(name)

    def ruledef(self, name, exps):
        """Build the Lark definition line for rule *name* with body *exps*."""
        # The leading "!" is Lark's keep-all-tokens rule modifier.
        return '!%s: %s' % (_get_rulename(name), exps)

    def expr(self, item, op):
        """Wrap *item* with its postfix operator *op* in a helper rule."""
        rule = '(%s)%s' % (item, op)
        return self._extra_rule(rule)

    def regexp(self, r):
        """Wrap the Nearley character class *r* in Lark regexp delimiters."""
        return '/%s/' % r

    def null(self):
        """Translate Nearley's ``null`` into an empty expansion."""
        return ''

    def string(self, s):
        """Move the string literal *s* into a helper rule and return its name."""
        return self._extra_rule(s)

    def expansion(self, *x):
        """Join the items of one alternative, appending an alias if it has JS.

        The last argument is always the ``js`` subtree; it has a child only
        when the alternative carries a ``{% ... %}`` post-processing block.
        """
        x, js = x[:-1], x[-1]
        if js.children:
            (js_code,) = js.children
            js_code = js_code[2:-2]  # strip the "{%" and "%}" delimiters
            alias = '-> ' + self._new_function(js_code)
        else:
            alias = ''
        return ' '.join(x) + alias

    def expansions(self, *x):
        """Join the alternatives of a rule body, one per line."""
        return '%s' % ('\n |'.join(x))

    def start(self, *rules):
        """Join the non-empty top-level results, one per line."""
        return '\n'.join(filter(None, rules))


def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
    """Convert Nearley grammar text into a list of Lark rule definitions.

    Args:
        g: Text of the Nearley grammar.
        builtin_path: Folder searched for files named by ``@builtin``.
        n2l: The ``NearleyToLark`` instance used to transform rule
            definitions; its accumulated state is shared across includes.
        js_code: List that top-level ``@{% ... %}`` JavaScript blocks are
            appended to (mutated in place).
        folder_path: Folder searched for files named by ``@include``.
        includes: Set of already-processed file paths (mutated in place), so
            each file is converted at most once.

    Returns:
        A list of Lark rule definition strings, including those coming from
        included files.

    Raises:
        AssertionError: On a directive other than ``builtin``/``include``.
        Exception: On a statement kind that is not recognised.
    """
    rule_defs = []

    tree = nearley_grammar_parser.parse(g)
    for statement in tree.children:
        if statement.data == 'directive':
            directive, arg = statement.children
            if directive in ('builtin', 'include'):
                folder = builtin_path if directive == 'builtin' else folder_path
                path = os.path.join(folder, arg[1:-1])  # arg[1:-1] drops the quotes
                if path not in includes:
                    includes.add(path)
                    with codecs.open(path, encoding='utf8') as f:
                        text = f.read()
                    # Nested includes resolve relative to the included file.
                    rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
            else:
                # Raised explicitly rather than via ``assert`` so that unknown
                # directives are still rejected when Python runs with -O.
                raise AssertionError(directive)
        elif statement.data == 'js_code':
            (code,) = statement.children
            code = code[2:-2]  # strip the "{%" and "%}" delimiters
            js_code.append(code)
        elif statement.data == 'macro':
            pass    # TODO Add support for macros!
        elif statement.data == 'ruledef':
            rule_defs.append(n2l.transform(statement))
        else:
            raise Exception("Unknown statement: %s" % statement)

    return rule_defs


def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
    """Return Python source for a Lark parser equivalent to Nearley grammar *g*.

    Args:
        g: Text of the Nearley grammar.
        start: Name of the start rule, as written in the Nearley grammar.
        builtin_path: Folder searched for files named by ``@builtin``.
        folder_path: Folder searched for files named by ``@include``.

    Returns:
        The generated module source as a single string.  It defines
        ``grammar``, the translated JavaScript functions, a
        ``TransformNearley`` transformer, ``parser`` and ``parse(text)``.
    """
    import js2py

    emit_code = []
    def emit(x=None):
        # Append *x* (if given and non-empty) followed by a newline.
        if x:
            emit_code.append(x)
        emit_code.append('\n')

    js_code = ['function id(x) {return x[0];}']
    n2l = NearleyToLark()
    rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
    lark_g = '\n'.join(rule_defs)
    lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())

    emit('from lark import Lark, Transformer')
    emit()
    emit('grammar = ' + repr(lark_g))
    emit()

    for alias, code in n2l.alias_js_code.items():
        js_code.append('%s = (%s);' % (alias, code))

    emit(js2py.translate_js('\n'.join(js_code)))
    emit('class TransformNearley(Transformer):')
    for alias in n2l.alias_js_code:
        emit(" %s = var.get('%s').to_python()" % (alias, alias))
    emit(" __default__ = lambda self, n, c, m: c if c else None")

    emit()
    # The start symbol must go through the same renaming as the rule
    # definitions (lower-casing, "$" escaping, "_"/"__" mapping); otherwise it
    # would name a rule that does not exist in the generated grammar.
    emit('parser = Lark(grammar, start="%s", maybe_placeholders=False)' % _get_rulename(start))
    emit('def parse(text):')
    emit(' return TransformNearley().transform(parser.parse(text))')

    return ''.join(emit_code)


def main(fn, start, nearley_lib):
    """Read the Nearley grammar file *fn* and return the generated parser code.

    Args:
        fn: Path of the Nearley grammar file (read as UTF-8).
        start: Name of the start rule in the Nearley grammar.
        nearley_lib: Path of the Nearley library; its ``builtin`` sub-folder
            is used to resolve ``@builtin`` directives.
    """
    with codecs.open(fn, encoding='utf8') as f:
        grammar = f.read()
    return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)))


if __name__ == '__main__':
    # Exactly three arguments are unpacked below, so any other count (too few
    # or too many) gets the usage message instead of a ValueError traceback.
    if len(sys.argv) != 4:
        print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.")
        print("Usage: %s <nearley_grammar_path> <start_rule> <nearley_lib_path>" % sys.argv[0])
        sys.exit(1)

    fn, start, nearley_lib = sys.argv[1:]

    print(main(fn, start, nearley_lib))