1#!/usr/bin/env python
2
3"""
4NAME:
5    sparser.py
6
7SYNOPSIS:
8    sparser.py [options] filename
9
10DESCRIPTION:
11    The sparser.py script is a Specified PARSER.  It is unique (as far as I can
12    tell) because it doesn't care about the delimiter(s).  The user specifies
13    what is expected, and the order, for each line of text.  All of the heavy
14    lifting is handled by pyparsing (http://pyparsing.sf.net).
15
16OPTIONS:
17    -h,--help        this message
18    -v,--version     version
19    -d,--debug       turn on debug messages
20
21EXAMPLES:
22    1. As standalone
23        sparser.py myfile
24    2. As library
25        import sparser
26        ...
27
28#Copyright (C) 2006  Tim Cera timcera@earthlink.net
29#
30#
31#    This program is free software; you can redistribute it and/or modify it
32#    under the terms of the GNU General Public License as published by the Free
33#    Software Foundation; either version 2 of the License, or (at your option)
34#    any later version.
35#
36#    This program is distributed in the hope that it will be useful, but
37#    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
38#    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
39#    for more details.
40#
41#    You should have received a copy of the GNU General Public License along
42#    with this program; if not, write to the Free Software Foundation, Inc.,
43#    675 Mass Ave, Cambridge, MA 02139, USA.
44"""
45
46#===imports======================
import getopt
import gzip
import os
import shlex
import sys

from pyparsing import *
52
53
54#===globals======================
55modname = "sparser"
56__version__ = "0.1"
57
58
59#--option args--
60debug_p = 0
61#opt_b=None  #string arg, default is undefined
62
63
64#---positional args, default is empty---
65pargs = []
66
67
68#---other---
69
70
71#===utilities====================
def msg(txt):
    """Write *txt* to standard output and flush immediately.

    No newline is appended; the caller supplies any line ending.
    """
    out = sys.stdout
    out.write(txt)
    out.flush()
76
def debug(ftn, txt):
    """Emit a debug line on stdout when the module-level ``debug_p`` is set.

    The line has the form ``<modname>.<ftn>:<txt>`` followed by a newline.
    Nothing is written when ``debug_p`` is falsy.
    """
    if not debug_p:
        return
    line = "{0}.{1}:{2}\n".format(modname, ftn, txt)
    sys.stdout.write(line)
    sys.stdout.flush()
82
def fatal(ftn, txt):
    """Abort the program with a FATAL message.

    Raises ``SystemExit`` whose argument is
    ``<modname>.<ftn>:FATAL:<txt>`` followed by a newline.
    """
    # Build the message inline; a local called ``msg`` would shadow msg().
    raise SystemExit("{0}.{1}:FATAL:{2}\n".format(modname, ftn, txt))
87
def usage():
    """Print the module docstring (the usage text) to stdout."""
    help_text = str(__doc__)
    sys.stdout.write(help_text + "\n")
91
92
93
94#====================================
class ToInteger(TokenConverter):
    """Token converter that turns the first matched token into an ``int``."""

    def postParse(self, instring, loc, tokenlist):
        first_token = tokenlist[0]
        return int(first_token)
99
class ToFloat(TokenConverter):
    """Token converter that turns the first matched token into a ``float``."""

    def postParse(self, instring, loc, tokenlist):
        first_token = tokenlist[0]
        return float(first_token)
104
class ParseFileLineByLine:
    """
    Bring data from text files into a program, optionally parsing each line
    according to specifications in a parse definition file.

    ParseFileLineByLine instances can be used like normal file objects (i.e. by
    calling readline(), readlines(), and write()), but can also be used as
    sequences of lines in for-loops.

    ParseFileLineByLine objects also handle compression transparently. i.e. it
    is possible to read lines from a compressed text file as if it were not
    compressed.  Compression is deduced from the file name suffixes '.Z'
    (compress/uncompress), '.gz' (gzip/gunzip), and '.bz2' (bzip2).

    The parse definition file name is developed based on the input file name.
    If the input file name is 'basename.ext', then the definition file is
    'basename_def.ext'.  If a definition file specific to the input file is not
    found, then the program searches for the file 'sparse.def' which would be
    the definition file for all files in that directory without a file specific
    definition file.

    The definition file is executed as Python code, so only use definition
    files that you trust.

    Finally, ParseFileLineByLine objects accept file names that start with '~'
    or '~user' to indicate a home directory, as well as URLs (for reading
    only; no definition file is looked up for URLs, so their lines are
    returned unparsed, exactly as the URL handle delivers them).

    Constructor:
    ParseFileLineByLine(|filename|, |mode|='"r"'), where |filename| is the name
    of the file (or a URL) and |mode| is one of '"r"' (read), '"w"' (write) or
    '"a"' (append, not supported for .Z and .bz2 files).

    Attributes:
    file     -- the underlying file-like object (None until opened)
    parsedef -- path of the definition file in use, or None
    grammar  -- pyparsing grammar built from the definition file, or None
    """

    def __init__(self, filename, mode = 'r'):
        """Opens input file, and if available the definition file.  If the
        definition file is available __init__ will then build the pyparsing
        grammar used by readline() and readlines().

        Raises IOError for an illegal mode, a non-read mode on a URL, a
        missing file in 'r'/'a' mode, or an append to a .Z/.bz2 file.
        Raises NameError if the definition file does not define 'parse' or
        refers to an expression with no matching junk skipper.
        """
        # Defined up front so close() and __del__ stay safe even when the
        # constructor raises before a file has been opened.
        self.file = None
        self.grammar = None
        self.parsedef = None

        if mode not in ['r', 'w', 'a']:
            raise IOError(0, 'Illegal mode: ' + repr(mode))

        if filename.find(':/') > 1: # URL
            if mode != 'r':
                raise IOError("can't write to a URL")
            import urllib.request
            self.file = urllib.request.urlopen(filename)
            # There is no local path to derive a definition file from.
            return

        filename = os.path.expanduser(filename)
        if mode in ('r', 'a') and not os.path.exists(filename):
            raise IOError(2, 'No such file or directory: ' + filename)
        filen, file_extension = os.path.splitext(filename)
        self.file = self._open_local(filename, file_extension, mode)

        # Try to find a parse ('*_def.ext') definition file.  First try to find
        # a file specific parse definition file, then look for 'sparse.def'
        # that would be the definition file for all files within the directory.

        # The definition file is pure Python.  The one variable that needs to
        # be specified is 'parse'.  The 'parse' variable is a list of tuples
        # defining the name, type, and because it is a list, the order of
        # variables on each line in the data file.  The variable name is a
        # string, the type variable is defined as integer, real, and qString.

        # parse = [
        #          ('year', integer),
        #          ('month', integer),
        #          ('day', integer),
        #          ('value', real),
        #         ]

        definition_file_one = filen + "_def" + file_extension
        # os.path.join keeps a bare relative name in the current directory
        # instead of turning an empty dirname into the filesystem root.
        definition_file_two = os.path.join(os.path.dirname(filen),
                                           "sparse.def")
        if os.path.exists(definition_file_one):
            self.parsedef = definition_file_one
        elif os.path.exists(definition_file_two):
            self.parsedef = definition_file_two
        else:
            return

        self.grammar = self._build_grammar(self.parsedef)

    @staticmethod
    def _open_local(filename, file_extension, mode):
        """Open a local file, picking a (de)compressor from its suffix."""
        if file_extension == '.gz':
            # Text mode so gzip files yield str lines like every other path.
            return gzip.open(filename, mode + 't')
        if file_extension in ('.Z', '.bz2'):
            if mode == 'a':
                raise IOError(0, "Can't append to " + file_extension
                                 + " files")
            if file_extension == '.Z':
                command = 'uncompress -c ' if mode == 'r' else 'compress > '
            else:
                command = 'bzip2 -dc ' if mode == 'r' else 'bzip2 > '
            # Quote the name: it is interpolated into a shell command line.
            return os.popen(command + shlex.quote(filename), mode)
        return open(filename, mode)

    @staticmethod
    def _build_grammar(parsedef):
        """Execute the definition file and return the per-line grammar."""
        # Create some handy pyparsing constructs.  I kept 'decimal_sep' so that
        # could easily change to parse if the decimal separator is a ",".
        decimal_sep = "."
        sign = oneOf("+ -")
        # part of printables without decimal_sep, +, -
        special_chars = '!"#$%&\'()*,./:;<=>?@[\\]^_`{|}~'.replace(
                                       decimal_sep, "")
        integer = ToInteger(
                  Combine(Optional(sign) +
                          Word(nums))).setName("integer")
        positive_integer = ToInteger(
                           Combine(Optional("+") +
                                   Word(nums))).setName("integer")
        negative_integer = ToInteger(
                           Combine("-" +
                                   Word(nums))).setName("integer")
        real = ToFloat(
               Combine(Optional(sign) +
                       Word(nums) +
                       decimal_sep +
                       Optional(Word(nums)) +
                       Optional(oneOf("E e") +
                                Word(nums)))).setName("real")
        positive_real = ToFloat(
                        Combine(Optional("+") +
                                Word(nums) +
                                decimal_sep +
                                Optional(Word(nums)) +
                                Optional(oneOf("E e") +
                                         Word(nums)))).setName("real")
        negative_real = ToFloat(
                        Combine("-" +
                                Word(nums) +
                                decimal_sep +
                                Optional(Word(nums)) +
                                Optional(oneOf("E e") +
                                         Word(nums)))).setName("real")
        qString = ( sglQuotedString | dblQuotedString ).setName("qString")

        # add other characters we should skip over between interesting fields
        integer_junk = Optional(
                       Suppress(
                       Word(alphas +
                            special_chars +
                            decimal_sep))).setName("integer_junk")
        real_junk = Optional(
                    Suppress(
                    Word(alphas +
                         special_chars))).setName("real_junk")
        qString_junk = SkipTo(qString).setName("qString_junk")
        junk_for = {
            "integer": integer_junk,
            "real": real_junk,
            "qString": qString_junk,
        }

        # Now that 'integer', 'real', and 'qString' have been assigned I can
        # execute the definition file.  exec() inside a function cannot create
        # local variables, so the file runs in an explicit namespace (module
        # globals plus the constructs above) and 'parse' is read back from it.
        # NOTE: this runs arbitrary Python; only use trusted definition files.
        namespace = dict(globals())
        namespace.pop("parse", None)
        namespace.update(
            decimal_sep=decimal_sep,
            sign=sign,
            integer=integer,
            positive_integer=positive_integer,
            negative_integer=negative_integer,
            real=real,
            positive_real=positive_real,
            negative_real=negative_real,
            qString=qString,
        )
        with open(parsedef) as def_file:
            def_source = def_file.read()
        exec(compile(def_source, parsedef, 'exec'), namespace)
        if "parse" not in namespace:
            raise NameError("definition file " + repr(parsedef) +
                            " does not define 'parse'")

        # Build the grammar, combination of the 'integer', 'real, 'qString',
        # and '*_junk' variables assigned above in the order specified in the
        # definition file.
        grammar = []
        for nam, expr in namespace["parse"]:
            if expr.name not in junk_for:
                raise NameError("no junk skipper for expression type " +
                                repr(expr.name))
            grammar.append( junk_for[expr.name] )
            grammar.append( expr.setResultsName(nam) )
        # The first junk skipper is dropped: a line starts with its first field.
        return And( grammar[1:] + [restOfLine] )

    @staticmethod
    def _at_eof(line):
        """True when *line* is the empty str/bytes a file returns at EOF."""
        return isinstance(line, (str, bytes)) and not line

    def _parse_line(self, line):
        """Parse one raw line; return its dict, or None if it doesn't match."""
        try:
            return self.grammar.parseString(line).asDict()
        except ParseException:
            return None

    def __del__(self):
        """Delete (close) the file wrapper."""
        self.close()

    def __iter__(self):
        """Yield each line (parsed if a grammar is set) until end of file."""
        while True:
            line = self.readline()
            if self._at_eof(line):
                return
            yield line

    def __getitem__(self, item):
        """Legacy sequence protocol: returns the NEXT line regardless of
        |item|, raising IndexError at end of file."""
        line = self.readline()
        if self._at_eof(line):
            raise IndexError
        return line

    def readline(self):
        """Reads (and optionally parses) a single line.

        Without a grammar the raw line is returned.  With a grammar, lines
        that do not match are skipped and the next matching line is returned
        as a dict.  At end of file the empty line from the file is returned.
        """
        # A loop (not recursion) so long runs of unparseable lines cannot
        # exhaust the recursion limit.
        while True:
            line = self.file.readline()
            if not line or self.grammar is None:
                return line
            record = self._parse_line(line)
            if record is not None:
                return record

    def readlines(self):
        """Returns a list of all lines (optionally parsed) in the file.

        With a grammar, each matching line becomes a dict and non-matching
        lines are skipped, mirroring readline().
        """
        if self.grammar is None:
            return self.file.readlines()
        tot = []
        # Read line by line instead of looping over self.file.readlines()
        # so that there aren't two copies of the file in memory.
        while True:
            line = self.file.readline()
            if not line:
                break
            record = self._parse_line(line)
            if record is not None:
                tot.append(record)
        return tot

    def write(self, data):
        """Write to a file."""
        self.file.write(data)

    def writelines(self, list):
        """Write a list to a file. Each item in the list is a line in the
        file.
        """
        for line in list:
            self.file.write(line)

    def close(self):
        """Close the file (a no-op if no file was ever opened)."""
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    def flush(self):
        """Flush in memory contents to file."""
        self.file.flush()
326
327
328#=============================
def main(pargs):
    """This should only be used for testing. The primary mode of operation is
    as an imported library.

    |pargs| is the list of positional command-line arguments (options already
    removed); its first element is the input file name.  Every line of that
    file is printed, parsed when a definition file is found.

    Exits through fatal() (SystemExit) when no file name is given.
    """
    if not pargs:
        fatal("main", "no input file name given")
    # Use the positional argument, not sys.argv[1], which is an option flag
    # whenever options precede the file name (e.g. 'sparser.py -d myfile').
    input_file = pargs[0]
    fp = ParseFileLineByLine(input_file)
    try:
        for i in fp:
            print(i)
    finally:
        fp.close()
337
338
339#-------------------------
if __name__ == '__main__':
    ftn = "main"
    try:
        opts, pargs = getopt.getopt(sys.argv[1:], 'hvd',
                     ['help', 'version', 'debug', 'bb='])
    except getopt.GetoptError as err:
        # Report bad options through the module's own fatal-exit helper
        # instead of letting a raw traceback reach the user.
        fatal(ftn, str(err))
    for flag, value in opts:
        if flag in ('-h', '--help'):
            print(modname+": version="+__version__)
            usage()
            sys.exit(0)
        elif flag in ('-v', '--version'):
            print(modname+": version="+__version__)
            sys.exit(0)
        elif flag in ('-d', '--debug'):
            # Module-level flag read by debug().
            debug_p = 1
        elif flag == '--bb':
            opt_b = value

    #---make the object and run it---
    main(pargs)
359
360#===Revision Log===
361#Created by mkpythonproj:
362#2006-02-06  Tim Cera
363#
364