#!/usr/bin/env python

"""
NAME:
    sparser.py

SYNOPSIS:
    sparser.py [options] filename

DESCRIPTION:
    The sparser.py script is a Specified PARSER. It is unique (as far as I can
    tell) because it doesn't care about the delimiter(s). The user specifies
    what is expected, and the order, for each line of text. All of the heavy
    lifting is handled by pyparsing (http://pyparsing.sf.net).

OPTIONS:
    -h,--help        this message
    -v,--version     version
    -d,--debug       turn on debug messages

EXAMPLES:
    1. As standalone
        sparser.py myfile
    2. As library
        import sparser
        ...

#Copyright (C) 2006 Tim Cera timcera@earthlink.net
#
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 675 Mass Ave, Cambridge, MA 02139, USA.
"""

#===imports======================
import bz2
import getopt
import gzip
import os
import shlex
import sys

from pyparsing import *


#===globals======================
modname = "sparser"
__version__ = "0.1"


#--option args--
debug_p = 0
#opt_b=None  #string arg, default is undefined


#---positional args, default is empty---
pargs = []


#---other---


#===utilities====================
def msg(txt):
    """Send message to stdout."""
    sys.stdout.write(txt)
    sys.stdout.flush()


def debug(ftn, txt):
    """Write a debug message to stdout when the module-level debug_p is set."""
    if debug_p:
        sys.stdout.write("{0}.{1}:{2}\n".format(modname, ftn, txt))
        sys.stdout.flush()


def fatal(ftn, txt):
    """Abort by raising SystemExit with a formatted FATAL message."""
    # Named 'text' so the local does not shadow the msg() function above.
    text = "{0}.{1}:FATAL:{2}\n".format(modname, ftn, txt)
    raise SystemExit(text)


def usage():
    """Prints the docstring."""
    print(__doc__)


#====================================
class ToInteger(TokenConverter):
    """Converter to make token into an integer."""
    def postParse(self, instring, loc, tokenlist):
        return int(tokenlist[0])


class ToFloat(TokenConverter):
    """Converter to make token into a float."""
    def postParse(self, instring, loc, tokenlist):
        return float(tokenlist[0])


class ParseFileLineByLine:
    """
    Bring data from text files into a program, optionally parsing each line
    according to specifications in a parse definition file.

    ParseFileLineByLine instances can be used like normal file objects (i.e. by
    calling readline(), readlines(), and write()), but can also be used as
    sequences of lines in for-loops.

    ParseFileLineByLine objects also handle compression transparently. i.e. it
    is possible to read lines from a compressed text file as if it were not
    compressed. Compression is deduced from the file name suffixes '.Z'
    (compress/uncompress), '.gz' (gzip/gunzip), and '.bz2' (bzip2).

    The parse definition file name is developed based on the input file name.
    If the input file name is 'basename.ext', then the definition file is
    'basename_def.ext'. If a definition file specific to the input file is not
    found, then the program searches for the file 'sparse.def' which would be
    the definition file for all files in that directory without a file specific
    definition file.

    Finally, ParseFileLineByLine objects accept file names that start with '~'
    or '~user' to indicate a home directory, as well as URLs (for reading
    only).

    Constructor:
    ParseFileLineByLine(|filename|, |mode|='"r"'), where |filename| is the name
    of the file (or a URL) and |mode| is one of '"r"' (read), '"w"' (write) or
    '"a"' (append, not supported for .Z and .bz2 files).
    """

    def __init__(self, filename, mode = 'r'):
        """Opens input file, and if available the definition file. If the
        definition file is available __init__ will then create some pyparsing
        helper variables and build self.grammar from them.

        Raises IOError for an illegal mode, a non-readable URL mode, a missing
        input file (modes 'r' and 'a'), or an unsupported append target.
        Raises NameError if the definition file does not define 'parse'.
        """
        # Set first so that close()/__del__ are safe even if we raise below.
        self.file = None
        self.grammar = None
        self.parsedef = None

        if mode not in ('r', 'w', 'a'):
            raise IOError(0, 'Illegal mode: ' + repr(mode))

        if filename.find(':/') > 1:  # URL
            if mode != 'r':
                raise IOError("can't write to a URL")
            import urllib.request
            self.file = urllib.request.urlopen(filename)
            # A URL has no local directory to search for a definition file,
            # so lines are returned unparsed.
            return

        filename = os.path.expanduser(filename)
        if mode in ('r', 'a') and not os.path.exists(filename):
            raise IOError(2, 'No such file or directory: ' + filename)
        filen, file_extension = os.path.splitext(filename)
        self.file = self._open_file(filename, file_extension, mode)

        # Try to find a parse ('*_def.ext') definition file.  First try to find
        # a file specific parse definition file, then look for 'sparse.def'
        # that would be the definition file for all files within the directory.

        # The definition file is pure Python.  The one variable that needs to
        # be specified is 'parse'.  The 'parse' variable is a list of tuples
        # defining the name, type, and because it is a list, the order of
        # variables on each line in the data file.  The variable name is a
        # string, the type variable is defined as integer, real, and qString.

        # parse = [
        #          ('year', integer),
        #          ('month', integer),
        #          ('day', integer),
        #          ('value', real),
        #         ]

        definition_file_one = filen + "_def" + file_extension
        # os.path.join keeps the path relative when the input file has no
        # directory component (plain concatenation would yield '/sparse.def').
        definition_file_two = os.path.join(os.path.dirname(filen),
                                           "sparse.def")
        if os.path.exists(definition_file_one):
            self.parsedef = definition_file_one
        elif os.path.exists(definition_file_two):
            self.parsedef = definition_file_two
        else:
            return

        # Create some handy pyparsing constructs.  I kept 'decimal_sep' so that
        # could easily change to parse if the decimal separator is a ",".
        decimal_sep = "."
        sign = oneOf("+ -")
        # part of printables without decimal_sep, +, -
        special_chars = '!"#$%&\'()*,./:;<=>?@[\\]^_`{|}~'.replace(
            decimal_sep, "")

        def make_real(sign_expr):
            """Build a float-converting expression with the given sign part."""
            return ToFloat(
                Combine(sign_expr +
                        Word(nums) +
                        decimal_sep +
                        Optional(Word(nums)) +
                        # The exponent may carry its own sign, e.g. '1.0e-5'.
                        Optional(oneOf("E e") +
                                 Optional(oneOf("+ -")) +
                                 Word(nums)))).setName("real")

        integer = ToInteger(
            Combine(Optional(sign) +
                    Word(nums))).setName("integer")
        positive_integer = ToInteger(
            Combine(Optional("+") +
                    Word(nums))).setName("integer")
        negative_integer = ToInteger(
            Combine("-" +
                    Word(nums))).setName("integer")
        real = make_real(Optional(sign))
        positive_real = make_real(Optional("+"))
        negative_real = make_real(Literal("-"))
        qString = (sglQuotedString | dblQuotedString).setName("qString")

        # add other characters we should skip over between interesting fields
        integer_junk = Optional(
            Suppress(
                Word(alphas +
                     special_chars +
                     decimal_sep))).setName("integer_junk")
        real_junk = Optional(
            Suppress(
                Word(alphas +
                     special_chars))).setName("real_junk")
        qString_junk = SkipTo(qString).setName("qString_junk")

        # Now that 'integer', 'real', and 'qString' have been assigned I can
        # execute the definition file.  It runs in an explicit namespace
        # because exec() inside a function cannot create function locals in
        # Python 3, so 'parse' has to be read back from the namespace.
        namespace = dict(globals())
        namespace.update(
            decimal_sep=decimal_sep,
            sign=sign,
            special_chars=special_chars,
            integer=integer,
            positive_integer=positive_integer,
            negative_integer=negative_integer,
            real=real,
            positive_real=positive_real,
            negative_real=negative_real,
            qString=qString,
            integer_junk=integer_junk,
            real_junk=real_junk,
            qString_junk=qString_junk,
        )
        with open(self.parsedef) as def_file:
            definition_source = def_file.read()
        # NOTE(review): this executes arbitrary Python from the definition
        # file; only use definition files from a trusted location.
        exec(compile(definition_source, self.parsedef, 'exec'), namespace)
        try:
            parse = namespace['parse']
        except KeyError:
            raise NameError("definition file " + repr(self.parsedef) +
                            " does not define 'parse'") from None

        # Build the grammar, combination of the 'integer', 'real, 'qString',
        # and '*_junk' variables assigned above in the order specified in the
        # definition file.  The junk pattern is chosen by the expression's
        # name, so e.g. 'positive_integer' (named "integer") uses integer_junk.
        grammar = []
        for nam, expr in parse:
            grammar.append(namespace[expr.name + "_junk"])
            grammar.append(expr.setResultsName(nam))
        # grammar[1:] drops the junk pattern that precedes the first field.
        self.grammar = And(grammar[1:] + [restOfLine])

    @staticmethod
    def _open_file(filename, file_extension, mode):
        """Return an open text-mode file object for filename.

        The extension selects the (de)compression: '.gz' and '.bz2' use the
        standard library, '.Z' pipes through the external compress/uncompress
        programs, anything else is opened directly.
        """
        if file_extension == '.gz':
            # Text mode so that lines are str, as the parser expects.
            return gzip.open(filename, mode + 't')
        if file_extension == '.bz2':
            if mode == 'a':
                raise IOError(0, "Can't append to .bz2 files")
            return bz2.open(filename, mode + 't')
        if file_extension == '.Z':
            if mode == 'a':
                raise IOError(0, "Can't append to .Z files")
            # Quote the name: it is interpolated into a shell command line.
            quoted = shlex.quote(filename)
            if mode == 'r':
                return os.popen('uncompress -c ' + quoted, mode)
            return os.popen('compress > ' + quoted, mode)
        return open(filename, mode)

    def __del__(self):
        """Delete (close) the file wrapper."""
        self.close()

    def __iter__(self):
        """Yield lines (optionally parsed) until the end of the file."""
        while True:
            line = self.readline()
            # An empty non-dict value is the end-of-file marker.
            if not line and not isinstance(line, dict):
                return
            yield line

    def __getitem__(self, item):
        """Legacy sequence protocol: return the next line, ignoring item.

        Raises IndexError at the end of the file.
        """
        line = self.readline()
        if not line and not isinstance(line, dict):
            raise IndexError
        return line

    def readline(self):
        """Reads (and optionally parses) a single line.

        With a grammar, returns the parsed fields as a dict and silently
        skips lines that do not match; at end of file (or without a grammar)
        returns the raw line as read from the underlying file.
        """
        # A loop rather than recursion, so a long run of unparsable lines
        # cannot exhaust the recursion limit.
        while True:
            line = self.file.readline()
            if not (self.grammar and line):
                return line
            try:
                return self.grammar.parseString(line).asDict()
            except ParseException:
                continue

    def readlines(self):
        """Returns a list of all lines (optionally parsed) in the file."""
        if not self.grammar:
            return self.file.readlines()
        tot = []
        # Used this way instead of a 'for' loop against
        # self.file.readlines() so that there wasn't two copies of the file
        # in memory.
        while True:
            record = self.readline()
            # With a grammar, readline() returns a dict per parsed line and
            # the raw empty line only at end of file.
            if not isinstance(record, dict):
                break
            tot.append(record)
        return tot

    def write(self, data):
        """Write to a file."""
        self.file.write(data)

    def writelines(self, list):
        """Write a list to a file. Each item in the list is a line in the
        file.
        """
        for line in list:
            self.file.write(line)

    def close(self):
        """Close the file, if one was opened."""
        # getattr guards against instances whose __init__ never ran.
        fileobj = getattr(self, 'file', None)
        if fileobj is not None:
            fileobj.close()

    def flush(self):
        """Flush in memory contents to file."""
        self.file.flush()


#=============================
def main(pargs):
    """This should only be used for testing. The primary mode of operation is
    as an imported library.

    pargs is the list of positional arguments; the first one is the input
    file. Prints usage and exits with status 2 when it is missing.
    """
    if not pargs:
        usage()
        raise SystemExit(2)
    # Use the positional arguments, not sys.argv[1], which may be an option.
    input_file = pargs[0]
    fp = ParseFileLineByLine(input_file)
    try:
        for i in fp:
            print(i)
    finally:
        fp.close()


#-------------------------
if __name__ == '__main__':
    ftn = "main"
    try:
        opts, pargs = getopt.getopt(sys.argv[1:], 'hvd',
                                    ['help', 'version', 'debug', 'bb='])
    except getopt.GetoptError as err:
        usage()
        fatal(ftn, str(err))
    for opt in opts:
        if opt[0] == '-h' or opt[0] == '--help':
            print(modname+": version="+__version__)
            usage()
            sys.exit(0)
        elif opt[0] == '-v' or opt[0] == '--version':
            print(modname+": version="+__version__)
            sys.exit(0)
        elif opt[0] == '-d' or opt[0] == '--debug':
            debug_p = 1
        elif opt[0] == '--bb':
            opt_b = opt[1]

    #---make the object and run it---
    main(pargs)

#===Revision Log===
#Created by mkpythonproj:
#2006-02-06  Tim Cera
#