#=======================================================================
#
#   Python Lexical Analyser
#
#   Lexical Analyser Specification
#
#=======================================================================

import Actions
import DFA
import Errors
import Machines
import Regexps

# debug_flags for Lexicon constructor
DUMP_NFA = 1
DUMP_DFA = 2
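# (bit masks: combine with |, e.g. debug_flags = DUMP_NFA | DUMP_DFA)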

class State:
  """
  This class is used as part of a Plex.Lexicon specification to
  introduce a user-defined state.

  Constructor:

     State(name, token_specifications)
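
  For example (a sketch; the Str and AnyChar patterns and the
  Begin and IGNORE actions come from the Plex module):

     State('string', [
        (Str('"'),  Begin('')),   # closing quote: back to the default state
        (AnyChar,   IGNORE),      # skip characters inside the string
     ])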
  """

  name = None
  tokens = None

  def __init__(self, name, tokens):
    self.name = name
    self.tokens = tokens

class Lexicon:
  """
  Lexicon(specification) builds a lexical analyser from the given
  |specification|. The specification consists of a list of
  specification items. Each specification item may be either:

     1) A token definition, which is a tuple:

           (pattern, action)

        The |pattern| is a regular expression built using the
        constructors defined in the Plex module.

        The |action| is the action to be performed when this pattern
        is recognised (see below).

     2) A state definition:

           State(name, tokens)

        where |name| is a character string naming the state,
        and |tokens| is a list of token definitions as
        above. The meaning and usage of states is described
        below.
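
  For example, a small specification (the Str, Rep1 and Range
  constructors come from the Plex module; the token values on the
  right are illustrative):

     lexicon = Lexicon([
        (Str("if"),         'keyword'),
        (Rep1(Range("az")), 'identifier'),
        (Rep1(Str(" ")),    IGNORE),
     ])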

  Actions
  -------

  The |action| in a token specification may be one of three things:

     1) A function, which is called as follows:

           function(scanner, text)

        where |scanner| is the relevant Scanner instance, and |text|
        is the matched text. If the function returns anything
        other than None, that value is returned as the value of the
        token. If it returns None, scanning continues as if the IGNORE
        action were specified (see below).

     2) One of the following special actions:

        IGNORE means that the recognised characters will be treated as
               white space and ignored. Scanning will continue until
               the next non-ignored token is recognised before returning.

        TEXT   causes the scanned text itself to be returned as the
               value of the token.

     3) Any other value, which is returned as the value of the token.
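
  For example (a sketch; |make_int| is a hypothetical function):

     def make_int(scanner, text):
        return int(text)               # the token's value is the integer

     lexicon = Lexicon([
        (Rep1(Range("09")), make_int), # 1) function action
        (Str(" "),          IGNORE),   # 2) special action
        (Str("("),          'lparen'), # 3) any other value
     ])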

  States
  ------

  At any given time, the scanner is in one of a number of states.
  Associated with each state is a set of possible tokens. When scanning,
  only tokens associated with the current state are recognised.

  There is a default state, whose name is the empty string. Token
  definitions which are not inside any State definition belong to
  the default state.

  The initial state of the scanner is the default state. The state can
  be changed in one of two ways:

     1) Using Begin(state_name) as the action of a token.

     2) Calling the begin(state_name) method of the Scanner.

  To change back to the default state, use '' as the state name.
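
  For example, entering and leaving a hypothetical 'comment' state:

     lexicon = Lexicon([
        (Str("{"), Begin('comment')),   # 1) the action changes state
        State('comment', [
           (Str("}"), Begin('')),       # back to the default state
           (AnyChar,  IGNORE),
        ]),
     ])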
  """

  machine = None # Machine
  tables = None # StateTableMachine

  def __init__(self, specifications, debug = None, debug_flags = 7, timings = None):
    if not isinstance(specifications, list):
      raise Errors.InvalidScanner("Scanner definition is not a list")
    if timings:
      from Timing import time
      total_time = 0.0
      time1 = time()
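    # Build a single NFA for the whole specification: one initial
    # state per scanner state, with each token's pattern branching
    # off the appropriate initial state.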
    nfa = Machines.Machine()
    default_initial_state = nfa.new_initial_state('')
    token_number = 1
    for spec in specifications:
      if isinstance(spec, State):
        user_initial_state = nfa.new_initial_state(spec.name)
        for token in spec.tokens:
          self.add_token_to_machine(
            nfa, user_initial_state, token, token_number)
          token_number += 1
      elif isinstance(spec, tuple):
        self.add_token_to_machine(
          nfa, default_initial_state, spec, token_number)
        token_number += 1
      else:
        raise Errors.InvalidToken(
          token_number,
          "Expected a token definition (tuple) or State instance")
    if timings:
      time2 = time()
      total_time += time2 - time1
      time3 = time()
    if debug and (debug_flags & DUMP_NFA):
      debug.write("\n============= NFA ===========\n")
      nfa.dump(debug)
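    # Convert the NFA into an equivalent DFA. The debug file is
    # passed through only when both dump flags are requested.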
    dfa = DFA.nfa_to_dfa(nfa, debug = (debug_flags & 3) == 3 and debug)
    if timings:
      time4 = time()
      total_time += time4 - time3
    if debug and (debug_flags & DUMP_DFA):
      debug.write("\n============= DFA ===========\n")
      dfa.dump(debug)
    if timings:
      timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
      timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
      timings.write("TOTAL            : %5.2f\n" % total_time)
    self.machine = dfa

  def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
    try:
      (re, action_spec) = self.parse_token_definition(token_spec)
      # Disabled this -- matching empty strings can be useful
      #if re.nullable:
      #  raise Errors.InvalidToken(
      #    token_number, "Pattern can match 0 input symbols")
      if isinstance(action_spec, Actions.Action):
        action = action_spec
      elif callable(action_spec):
        action = Actions.Call(action_spec)
      else:
        action = Actions.Return(action_spec)
      final_state = machine.new_state()
      re.build_machine(machine, initial_state, final_state,
                       match_bol = 1, nocase = 0)
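      # Earlier definitions get a higher (less negative) priority, so
      # they win when two patterns match the same text.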
      final_state.set_action(action, priority = -token_number)
    except Errors.PlexError as e:
      raise e.__class__("Token number %d: %s" % (token_number, e))

  def parse_token_definition(self, token_spec):
    if not isinstance(token_spec, tuple):
      raise Errors.InvalidToken("Token definition is not a tuple")
    if len(token_spec) != 2:
      raise Errors.InvalidToken("Wrong number of items in token definition")
    pattern, action = token_spec
    if not isinstance(pattern, Regexps.RE):
      raise Errors.InvalidToken("Pattern is not an RE instance")
    return (pattern, action)

  def get_initial_state(self, name):
    return self.machine.get_initial_state(name)