tools/docco/t_parse.py

#Copyright ReportLab Europe Ltd. 2000-2017
#see license.txt for license details
#history https://hg.reportlab.com/hg-public/reportlab/log/tip/tools/docco/t_parse.py
"""
Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).

Template initialization has the form:
   T = Template(template_string, wild_card_marker, single_char_marker,
             x = regex_x, y = regex_y, ...)
Parsing has the form
   ([match1, match2, ..., matchn], lastindex) = T.PARSE(string)

Only the first argument is mandatory.

The resultant object efficiently parses strings that match the template_string,
giving a list of substrings that correspond to each "directive" of the template.

Template directives:

  Wildcard:
    The template may be initialized with a wildcard that matches any string
    up to the string matching the next directive (which may not be a wild
    card or single character marker) or the next literal sequence of characters
    of the template.  The character that represents a wildcard is specified
    by the wild_card_marker parameter, which has no default.

    For example, using X as the wildcard:


    >>> T = Template("prefixXinteriorX", "X")
    >>> T.PARSE("prefix this is before interior and this is after")
    ([' this is before ', ' and this is after'], 48)
    >>> T = Template("<X>X<X>", "X")
    >>> T.PARSE('<A HREF="index.html">go to index</A>')
    (['A HREF="index.html"', 'go to index', '/A'], 36)

    Obviously the character used to represent the wildcard must be distinct
    from the characters used to represent literals or other directives.

  Fixed length character sequences:
    The template may have a marker character which indicates a fixed
    length field.  All adjacent instances of this marker will be matched
    by a substring of the same length in the parsed string.  For example:

      >>> T = Template("NNN-NN-NNNN", single_char_marker="N")
      >>> T.PARSE("1-2-34-5-12")
      (['1-2', '34', '5-12'], 11)
      >>> T.PARSE("111-22-3333")
      (['111', '22', '3333'], 11)
      >>> T.PARSE("1111-22-3333")
      ValueError: literal not found at (3, '-')

    A template may have multiple fixed length markers, which allows fixed
    length fields to be adjacent, but recognized separately.  For example:

      >>> T = Template("MMDDYYX", "X", "MDY")
      >>> T.PARSE("112489 Somebody's birthday!")
      (['11', '24', '89', " Somebody's birthday!"], 27)

  Regular expression markers:
    The template may have markers associated with regular expressions.
    The regular expressions may be given either as pattern strings or as
    compiled pattern objects.
    For example:
      >>> T = Template("v: s i", v=id, s=str, i=int)
      >>> T.PARSE("this_is_an_identifier: 'a string' 12344")
      (['this_is_an_identifier', "'a string'", '12344'], 39)
      >>>

    Here id, str, and int are regular expression conveniences provided by
    this module.

  Directive markers may be mixed and matched, except that wildcards cannot precede
  wildcards or single character markers.
  Example:
>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)
>>>

"""

import re, string
from reportlab.lib.utils import ascii_letters

#
# template parsing
#
# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
#     ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
#
class Template:
    """A pre-analysed parsing template made of literals and one-character markers.

    The template string is scanned once at construction time and turned into
    ``self.parse_seq``, a list of ``(indicator, data)`` pairs:

    * ``(None, text)``           -- literal text that must appear verbatim
    * ``(wild_card, None)``      -- wildcard, ends where the next item begins
    * ``(self.char, length)``    -- fixed-length field of ``length`` characters
    * ``(marker, regex)``        -- field matched by a compiled regular expression

    ``PARSE`` then walks that sequence over an input string.
    """

    def __init__(self,
                 template,
                 wild_card_marker=None,
                 single_char_marker=None,
                 **marker_to_regex_dict):
        """Analyse ``template`` and record its parse sequence.

        template             -- the template string
        wild_card_marker     -- optional single character standing for a wildcard
        single_char_marker   -- optional string; each character in it marks a
                                fixed-length field (a run of the same character
                                is one field)
        marker_to_regex_dict -- marker=regex pairs; a regex may be a pattern
                                string or an already compiled pattern object

        Raises ValueError if a marker is longer than one character, if two
        wildcards are adjacent, or if a wildcard is immediately followed by a
        fixed-length field.
        """
        self.template = template
        self.wild_card = wild_card_marker
        self.char = single_char_marker
        # determine the set of markers for this template
        markers = list(marker_to_regex_dict.keys())
        if wild_card_marker:
            markers.append(wild_card_marker)
        if single_char_marker:
            # allow multiple single char markers; all of them are reported
            # in the parse sequence under the first one
            markers.extend(single_char_marker)
            self.char = single_char_marker[0]
        self.markers = markers
        for mark in markers:
            if len(mark) > 1:
                raise ValueError("Marks must be single characters: "+repr(mark))
        # compile the regular expressions if needed
        self.marker_dict = marker_dict = {}
        for mark, rgex in marker_to_regex_dict.items():
            # Duck-typed rather than isinstance(rgex, str): a module that binds
            # the global name ``str`` to something other than the builtin type
            # would make an isinstance test against that name fail.
            if not hasattr(rgex, "match"):
                rgex = re.compile(rgex)
            marker_dict[mark] = rgex
        # determine the parse sequence
        parse_seq = []
        # count the number of directives encountered
        ndirectives = 0
        # True when the item just appended was a wildcard.  Tracked as a flag
        # (not by comparing the previous character with wild_card_marker) so
        # that a template without any wildcard may start with a fixed-length
        # field: None == None must not count as "preceded by a wildcard".
        after_wild = False
        index = 0
        last = len(template)
        while index < last:
            start = index
            thischar = template[index]
            if thischar == wild_card_marker:
                # a wildcard
                if after_wild:
                    raise ValueError("two wild cards in sequence is not allowed")
                parse_seq.append((wild_card_marker, None))
                index += 1
                ndirectives += 1
            elif single_char_marker and thischar in single_char_marker:
                # a run of one single character marker: one fixed-length field
                if after_wild:
                    raise ValueError("wild card cannot precede single char marker")
                while index < last and template[index] == thischar:
                    index += 1
                parse_seq.append((self.char, index - start))
                ndirectives += 1
            elif thischar not in markers:
                # a literal sequence: everything up to the next marker
                while index < last and template[index] not in markers:
                    index += 1
                parse_seq.append((None, template[start:index]))
            else:
                # otherwise it must be a regular expression marker
                parse_seq.append((thischar, marker_dict[thischar]))
                ndirectives += 1
                index += 1
            after_wild = (thischar == wild_card_marker)
        self.parse_seq = parse_seq
        self.ndirectives = ndirectives

    def PARSE(self, s, start=0):
        """Match ``s`` against the template, beginning at index ``start``.

        Returns ``(matches, index)`` where ``matches`` holds one substring per
        directive, in template order, and ``index`` is the position reached
        after the last template item.

        Fixed-length fields are taken by slicing, so a field that runs past the
        end of ``s`` is not itself reported as an error.

        Raises ValueError when a literal is not found at the current position,
        when a wildcard cannot be terminated by the following literal or
        regular expression, or when a regular expression does not match at the
        current position.  Raises SystemError if the number of collected
        matches differs from the number of directives (internal sanity check).
        """
        wild_card = self.wild_card
        single_char = self.char
        parse_seq = self.parse_seq
        final_index = len(parse_seq) - 1
        result = []
        currentindex = start
        for parse_index, (indicator, data) in enumerate(parse_seq):
            if indicator is None:
                # literal: must occur exactly at the current position
                if not s.startswith(data, currentindex):
                    raise ValueError("literal not found at "+repr((currentindex, data)))
                currentindex += len(data)
                continue
            # anything else is a directive
            if indicator == wild_card:
                if parse_index == final_index:
                    # a trailing wildcard matches the rest of the string
                    last = len(s)
                else:
                    # the wildcard ends where the next item starts; the
                    # constructor guarantees that item is a literal or a regex
                    (nextindicator, nextdata) = parse_seq[parse_index + 1]
                    if nextindicator is None:
                        last = s.find(nextdata, currentindex)
                        if last < currentindex:
                            raise ValueError("couldn't terminate wild with lit "+repr(currentindex))
                    else:
                        # search() yields a match object or None, never an index
                        found = nextdata.search(s, currentindex)
                        if found is None:
                            raise ValueError("couldn't terminate wild with re "+repr(currentindex))
                        last = found.start()
            elif indicator == single_char:
                # data is the number of characters to consume
                last = currentindex + data
            else:
                # other directives are always regular expressions;
                # match() yields a match object or None, never a length
                matched = data.match(s, currentindex)
                if matched is None:
                    raise ValueError("couldn't match re at "+repr(currentindex))
                last = matched.end()
            result.append(s[currentindex:last])
            currentindex = last
        # sanity check
        if len(result) != self.ndirectives:
            raise SystemError("not enough directives found?")
        return (result, currentindex)

# Ready-made regular expressions for use as Template keyword arguments.
# NOTE: the compiled patterns below are bound to the names id, str and int,
# which shadow the builtins of the same name for the rest of this module.
USERNAMEREGEX = "".join(
    ("[", ascii_letters, "][", ascii_letters, string.digits, "_]*"))  # identifier
STRINGLITREGEX = "'[^\n']*'"  # single-quoted string on one line
SIMPLEINTREGEX = "".join(("[", string.digits, "]+"))  # run of decimal digits
id, str, int = (
    re.compile(source)
    for source in (USERNAMEREGEX, STRINGLITREGEX, SIMPLEINTREGEX)
)

def test():
    """Build the sample templates and print the result of parsing with each.

    The first four templates are left behind as the module globals
    T, T1, T2 and T3; the last one is local to this function.
    """
    global T, T1, T2, T3

    def show(template, text):
        # print the (matches, index) pair produced by a single parse
        print(template.PARSE(text))

    # fixed-length fields followed by wildcards
    T = Template("(NNN)NNN-NNNN X X", "X", "N")
    show(T, "(908)949-2726 Aaron Watters")

    # regular expression markers separated by literals
    T1 = Template("s --> s blah", s=str)
    show(T1, "' <-- a string --> ' --> 'blah blah another string blah' blah")

    # every kind of directive in one template
    T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
    show(T2, "'A STRING' --> 15964653alpha beta gamma")

    # wildcards terminated by regular expressions
    T3 = Template("XsXi", "X", "N", s=str, i=int)
    show(T3, "prefix'string'interior1234junk not parsed")

    # several distinct fixed-length markers side by side
    T4 = Template("MMDDYYX", "X", "MDY")
    show(T4, "122961 Somebody's birthday!")


if __name__ == "__main__":
    test()