#=======================================================================
#
#   Python Lexical Analyser
#
#   Lexical Analyser Specification
#
#=======================================================================

from __future__ import absolute_import

import types

from . import Actions
from . import DFA
from . import Errors
from . import Machines
from . import Regexps

# debug_flags for Lexicon constructor
DUMP_NFA = 1
DUMP_DFA = 2


class State(object):
    """
    This class is used as part of a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)
    """

    name = None
    tokens = None

    def __init__(self, name, tokens):
        self.name = name
        self.tokens = tokens


class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

       2) One of the following special actions:

          IGNORE means that the recognised characters will be treated as
                 white space and ignored. Scanning will continue until
                 the next non-ignored token is recognised before returning.

          TEXT   causes the scanned text itself to be returned as the
                 value of the token.

       3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
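
    Example
    -------

    For illustration only, a small specification might look like the
    following sketch. The token names are made up, and the pattern
    constructors (Str, Any, Rep1) and special actions (IGNORE, Begin)
    are assumed to come from the Plex package's public interface:

       lexicon = Lexicon([
           (Rep1(Any("0123456789")), 'number'),
           (Str(" "),                IGNORE),
           (Str("'"),                Begin('quoted')),
           State('quoted', [
               (Str("'"),            Begin('')),   # back to the default state
               (Any("abcdef"),       'char'),
           ]),
       ])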
112 """ 113 114 machine = None # Machine 115 tables = None # StateTableMachine 116 117 def __init__(self, specifications, debug=None, debug_flags=7, timings=None): 118 if not isinstance(specifications, list): 119 raise Errors.InvalidScanner("Scanner definition is not a list") 120 if timings: 121 from .Timing import time 122 123 total_time = 0.0 124 time1 = time() 125 nfa = Machines.Machine() 126 default_initial_state = nfa.new_initial_state('') 127 token_number = 1 128 for spec in specifications: 129 if isinstance(spec, State): 130 user_initial_state = nfa.new_initial_state(spec.name) 131 for token in spec.tokens: 132 self.add_token_to_machine( 133 nfa, user_initial_state, token, token_number) 134 token_number += 1 135 elif isinstance(spec, tuple): 136 self.add_token_to_machine( 137 nfa, default_initial_state, spec, token_number) 138 token_number += 1 139 else: 140 raise Errors.InvalidToken( 141 token_number, 142 "Expected a token definition (tuple) or State instance") 143 if timings: 144 time2 = time() 145 total_time = total_time + (time2 - time1) 146 time3 = time() 147 if debug and (debug_flags & 1): 148 debug.write("\n============= NFA ===========\n") 149 nfa.dump(debug) 150 dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug) 151 if timings: 152 time4 = time() 153 total_time = total_time + (time4 - time3) 154 if debug and (debug_flags & 2): 155 debug.write("\n============= DFA ===========\n") 156 dfa.dump(debug) 157 if timings: 158 timings.write("Constructing NFA : %5.2f\n" % (time2 - time1)) 159 timings.write("Converting to DFA: %5.2f\n" % (time4 - time3)) 160 timings.write("TOTAL : %5.2f\n" % total_time) 161 self.machine = dfa 162 163 def add_token_to_machine(self, machine, initial_state, token_spec, token_number): 164 try: 165 (re, action_spec) = self.parse_token_definition(token_spec) 166 # Disabled this -- matching empty strings can be useful 167 #if re.nullable: 168 # raise Errors.InvalidToken( 169 # token_number, "Pattern can match 0 input symbols") 170 if isinstance(action_spec, Actions.Action): 171 action = action_spec 172 else: 173 try: 174 action_spec.__call__ 175 except AttributeError: 176 action = Actions.Return(action_spec) 177 else: 178 action = Actions.Call(action_spec) 179 final_state = machine.new_state() 180 re.build_machine(machine, initial_state, final_state, 181 match_bol=1, nocase=0) 182 final_state.set_action(action, priority=-token_number) 183 except Errors.PlexError as e: 184 raise e.__class__("Token number %d: %s" % (token_number, e)) 185 186 def parse_token_definition(self, token_spec): 187 if not isinstance(token_spec, tuple): 188 raise Errors.InvalidToken("Token definition is not a tuple") 189 if len(token_spec) != 2: 190 raise Errors.InvalidToken("Wrong number of items in token definition") 191 pattern, action = token_spec 192 if not isinstance(pattern, Regexps.RE): 193 raise Errors.InvalidToken("Pattern is not an RE instance") 194 return (pattern, action) 195 196 def get_initial_state(self, name): 197 return self.machine.get_initial_state(name) 198 199 200 201