#!/usr/bin/python

import os
import sys
import getopt
import string
import re
import collections

# ================================================================
# o inclflds a,x,b
# o newflds '{$y:$x*$x, $z:$x/2, $n:-$z}'
# o greprecs '$x <= 2 && $y eq "zebra"'
#
# o tabular pretty-print
# o mean
# o sort

# absolute essentials:
# * RECORD-LEVEL:
#   k include/exclude fields
#   o new field as function of old
#   o vertical pretty-print
# * STREAM-LEVEL:
#   o include/exclude records
#   o sort
#   o summarizations: min, max, mean, count, sum, first, last
#   o tabular pretty-print

# ================================================================
def usage(ostream=None, exit_code=1):
    """Print the usage message and exit the process.

    ostream:   stream to write the message to; defaults to sys.stderr.
    exit_code: process exit status; defaults to 1 (error).

    Never returns: always calls sys.exit(exit_code).
    """
    if ostream is None:
        ostream = sys.stderr
    lines = [
        "Usage: %s [options] {modulator-spec} {zero or more filenames}" % os.path.basename(sys.argv[0]),
        "Options:",
        " -R {rs} Input/output record separator",
        " -F {fs} Input/output field separator",
        " -P {ps} Input/output key-value-pair separator",
        " -v {name=value} xxx needs more doc",
        "",
        " --idkvp Input format is delimited by IRS,IFS,IPS",
        " --odkvp Output format is delimited by IRS,IFS,IPS",
        " --icsv Input format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
        " --ocsv Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
        " --inidx Input format is implicitly integer-indexed (awk-style)",
        " --onidx Output format is implicitly integer-indexed (awk-style)",
        " --ixtab Input format is transposed-tabular-pretty-print",
        " --oxtab Output format is transposed-tabular-pretty-print",
        "Modulator specs:",
        '--cat',
        '--tac',
        '--cut',
        '--cutx',
        '--sortfields',
        '--sortfieldsup',
        '--sortfieldsdown',
    ]
    ostream.write("\n".join(lines) + "\n")
    sys.exit(exit_code)

# ----------------------------------------------------------------
def parse_command_line():
    """Parse sys.argv into a namespace, record reader, record writer and
    record modulator.

    Returns a dict with keys 'namespace', 'rreader', 'rwriter', 'rmodulator'.
    Anything not selected on the command line defaults to the delimited
    key-value-pair reader/writer and the pass-through (cat) modulator.

    Exits the process (via usage() or sys.exit) on malformed options.

    NOTE: readers, writers and field lists are built at the moment their
    option is seen, so -R/-F/-P only affect format/modulator options that
    appear AFTER them on the command line.
    """
    namespace = set_up_namespace()
    rreader = None
    rwriter = None
    rmodulator = None

    try:
        optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
            'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx', 'ixtab', 'oxtab',
            'cat', 'tac', 'cut=', 'cutx=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
    except getopt.GetoptError as err:
        # Diagnostics belong on stderr so they do not pollute record output.
        sys.stderr.write(str(err) + "\n")
        usage()

    for opt, arg in optargs:
        if opt == '-R':
            namespace.put("ORS", namespace.put("IRS", arg))
        elif opt == '-F':
            namespace.put("OFS", namespace.put("IFS", arg))
        elif opt == '-P':
            namespace.put("OPS", namespace.put("IPS", arg))
        elif opt == '-v':
            name, sep, value = arg.partition("=")
            if not sep:
                sys.stderr.write("Option -v requires name=value; got \"%s\".\n" % arg)
                sys.exit(1)
            namespace.put(name, value)

        elif opt == '--idkvp':
            rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace,
                irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
        elif opt == '--odkvp':
            rwriter = RecordWriterDefault(ostream=sys.stdout,
                ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))

        elif opt == '--icsv':
            rreader = RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace,
                irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
        elif opt == '--ocsv':
            rwriter = RecordWriterHeaderFirst(ostream=sys.stdout,
                ors=namespace.get("ORS"), ofs=namespace.get("OFS"))

        elif opt == '--inidx':
            rreader = RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace,
                irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
        elif opt == '--onidx':
            rwriter = RecordWriterIntegerIndexed(ostream=sys.stdout,
                ors=namespace.get("ORS"), ofs=namespace.get("OFS"))

        #elif opt == '--ixtab':
        #    pass
        elif opt == '--oxtab':
            rwriter = RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?

        elif opt == '--cat':
            rmodulator = CatModulator()
        elif opt == '--tac':
            rmodulator = TacModulator()
        elif opt == '--cut':
            rmodulator = SelectFieldsModulator(arg.split(namespace.get("IFS")))
        elif opt == '--cutx':
            rmodulator = DeselectFieldsModulator(arg.split(namespace.get("IFS")))
        elif opt == '--sortfields':
            rmodulator = SortFieldsInRecordModulator(True)
        elif opt == '--sortfieldsup':
            rmodulator = SortFieldsInRecordModulator(True)
        elif opt == '--sortfieldsdown':
            rmodulator = SortFieldsInRecordModulator(False)

        elif opt in ('-h', '--help'):
            # Explicitly requested help is not an error.
            usage(sys.stdout, 0)
        else:
            sys.stderr.write("Unhandled option \"%s\".\n" % opt)
            sys.exit(1)

    #xxx non_option_arg_count = len(non_option_args)

    if rreader is None:
        rreader = RecordReaderDefault(istream=sys.stdin, namespace=namespace,
            irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
    if rwriter is None:
        rwriter = RecordWriterDefault(ostream=sys.stdout,
            ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))
    if rmodulator is None:
        rmodulator = CatModulator()

    return {'namespace':namespace, 'rreader':rreader, 'rwriter':rwriter, 'rmodulator':rmodulator}

def main():
    """Parse the command line, then stream records from the reader through
    the modulator to the writer."""
    options = parse_command_line()

    # parse ARGV:
    # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
    # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
    # * which-control-language spec?!?
    # * modulators/script ... this is the key decision area for language(s) design.
    # * filenames

    rreader = options['rreader']
    rmodulator = options['rmodulator']
    rwriter = options['rwriter']

    smodulator = StreamModulator()
    smodulator.modulate(rreader, rmodulator, rwriter)

# ================================================================
class MillerNamespace:
    """Two independent name->value tables: 'mapping' (accessed with get/put)
    and 'imapping' (accessed with iget/iput)."""
    def __init__(self):
        self.mapping = {}
        self.imapping = {}
    def get(self, name):
        """Return the value stored under name; raises KeyError if absent."""
        return self.mapping[name]
    def iget(self, name):
        """Return the i-table value stored under name; raises KeyError if absent."""
        return self.imapping[name]
    def put(self, name, value):
        """Store value under name and return value, so puts can be chained."""
        self.mapping[name] = value
        return value
    def iput(self, name, ivalue):
        """Store ivalue under name in the i-table and return ivalue."""
        self.imapping[name] = ivalue
        return ivalue

# ================================================================
class Record:
    """An insertion-ordered mapping from field name to field value."""
    # kvs is any iterable of two-element sequences (lists or tuples).
    def __init__(self, kvs=()):
        self.fields = collections.OrderedDict()
        self.mput(kvs)
    def put(self, k, v):
        """Set field k to v (appended at the end if k is new)."""
        self.fields[k] = v
    def mput(self, kvs):
        """Set several fields from an iterable of (key, value) pairs."""
        for [k,v] in kvs:
            self.fields[k] = v
    def get(self, k):
        """Return the value of field k; raises KeyError if absent."""
        return self.fields[k]
    def has_key(self, k):
        """Return True if the record has a field named k."""
        return k in self.fields
    def get_field_names(self):
        """Return the field names as a list, in insertion order."""
        return list(self.fields.keys())
    def get_pairs(self):
        """Return the (name, value) pairs as a list, in insertion order."""
        return list(self.fields.items())
    def num_pairs(self):
        """Return the number of fields."""
        return len(self.fields)
    # xxx xref to record-formatter classes
    def __str__(self):
        return repr(self.fields)
    def __repr__(self):
        return repr(self.fields)

# ================================================================
# Each record is a sequence of fields delimited by FS, each of which is a
# key-value pair separated by PS.

class RecordReader:
    """Base class holding the input stream, namespace and input separators."""
    def __init__(self, istream, namespace, irs, ifs, ips):
        self.istream = istream
        self.namespace = namespace
        self.irs = irs
        self.ifs = ifs
        self.ips = ips

class RecordReaderDefault(RecordReader):
    """Reads one line per record; fields are split on ifs and each field is
    split into key and value on the first occurrence of ips."""
    def __init__(self, istream, namespace, irs, ifs, ips):
        RecordReader.__init__(self, istream, namespace, irs, ifs, ips)

    def read(self):
        """Return the next Record, or None at end of input.

        Updates NF, NR and FNR in the namespace's i-table and FILENAME in its
        main table.  A blank line yields an empty record.  Raises ValueError
        if a field does not contain the pair separator.
        """
        line = self.istream.readline() # xxx use self.irs
        if line == '':
            return None

        line = line.strip() # Remove leading/trailing whitespace including carriage return from readline().
        record = Record()
        if line != '':
            for field in line.split(self.ifs):
                kv = field.split(self.ips, 1)
                if len(kv) != 2:
                    raise ValueError("Field \"%s\" lacks pair separator \"%s\" in line \"%s\"" % \
                        (field, self.ips, line))
                record.put(kv[0], kv[1])

        self.namespace.iput("NF", record.num_pairs())
        self.namespace.iput("NR", self.namespace.iget("NR") + 1)

        # xxx stub
        self.namespace.put("FILENAME", None)
        self.namespace.iput("FNR", self.namespace.iget("FNR") + 1)

        return record

# ----------------------------------------------------------------
# awk-style
class RecordReaderIntegerIndexed(RecordReader):
    """Reads one line per record; fields are keyed by 1-based integer position.
    Note that ifs is used as a regular expression here (re.split)."""
    # xxx ctor with istream context?!? or independent of that?!? for cskv, no matter.
    # csv reader of course needs context.
    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)

    def read(self):
        """Return the next Record, or None at end of input."""
        # xxx use self.irs
        line = self.istream.readline()
        if line == '':
            return None
        line = line.strip() # Remove leading/trailing whitespace including carriage return from readline().
        fields = re.split(self.ifs, line)
        return Record([[i, field] for i, field in enumerate(fields, 1)])

# ----------------------------------------------------------------
# csv-style
class RecordReaderHeaderFirst(RecordReader):
    """Reads a header line of field names, then one line of values per record."""
    def __init__(self, istream, namespace, irs, ifs):
        RecordReader.__init__(self, istream, namespace, irs, ifs, None)
        self.field_names = None
        self.header_line = None

    def read(self):
        """Return the next Record, or None at end of input.

        The header line is consumed lazily on the first call.  Raises
        Exception if a data line has a different number of fields than the
        header.
        """
        if self.field_names is None:
            header_line = self.istream.readline()
            if header_line == '':
                return None
            # Remove leading/trailing whitespace including carriage return from readline().
            header_line = header_line.strip()
            self.field_names = header_line.split(self.ifs)
            self.header_line = header_line

        data_line = self.istream.readline()
        if data_line == '':
            return None
        # Remove leading/trailing whitespace including carriage return from readline().
        data_line = data_line.strip()
        field_values = data_line.split(self.ifs)
        if len(self.field_names) != len(field_values):
            raise Exception("Header/data length mismatch: %d != %d in \"%s\" and \"%s\"" % \
                (len(self.field_names), len(field_values), self.header_line, data_line))

        return Record(zip(self.field_names, field_values))

# ================================================================
# xxx ostream at ctor?? needs drain-at-end logic for prettyprint.

class RecordWriter:
    """Base class holding the output stream and output separators."""
    def __init__(self, ostream, ors, ofs, ops):
        self.ostream = ostream
        self.ors = ors
        self.ofs = ofs
        self.ops = ops

class RecordWriterDefault(RecordWriter):
    """Writes each record as key<ops>value pairs joined by ofs, one per line."""
    def __init__(self, ostream, ors, ofs, ops):
        RecordWriter.__init__(self, ostream, ors, ofs, ops)

    def write(self, record):
        self.ostream.write(self.ofs.join([str(k)+self.ops+str(v) for [k,v] in record.get_pairs()]))
        self.ostream.write("\n")

# ----------------------------------------------------------------
class RecordWriterHeaderFirst(RecordWriter):
    """Writes a header line of field names (taken from the first record only),
    then one line of ofs-joined values per record."""
    def __init__(self, ostream, ors, ofs):
        RecordWriter.__init__(self, ostream, ors, ofs, None)
        self.field_names = None

    def write(self, record):
        data_string = self.ofs.join([str(v) for [k,v] in record.get_pairs()])
        if self.field_names is None:
            self.field_names = record.get_field_names()
            header_string = self.ofs.join([str(k) for k in self.field_names])
            self.ostream.write(header_string)
            self.ostream.write("\n")
        self.ostream.write(data_string)
        self.ostream.write("\n")

# ----------------------------------------------------------------
# xxx rename

class RecordWriterVerticallyTabulated(RecordWriter):
    """Writes each record as one 'name value' line per field, with names
    left-justified to a common width, followed by a blank line."""
    def __init__(self, ostream):
        RecordWriter.__init__(self, ostream, None, None, None)

    def write(self, record):
        field_names = record.get_field_names()
        # Field names may be integers (from the integer-indexed reader), so
        # measure their string form.  The minimum column width is 1.
        max_field_name_width = max([1] + [len(str(field_name)) for field_name in field_names])
        lines = ["%-*s %s" % (max_field_name_width, field_name, record.get(field_name))
            for field_name in field_names]
        self.ostream.write("\n".join(lines))
        self.ostream.write("\n\n")

# ----------------------------------------------------------------
class RecordWriterIntegerIndexed(RecordWriter):
    """Writes only the field values, joined by ofs, one record per line."""
    def __init__(self, ostream, ors, ofs):
        RecordWriter.__init__(self, ostream, ors, ofs, None)
    def write(self, record):
        self.ostream.write(self.ofs.join([str(v) for [k,v] in record.get_pairs()]))
        self.ostream.write("\n")

# ================================================================
# Modulator protocol: modulate(record) returns a list of zero or more output
# records.  It is called with None once after the last input record so that
# modulators which retain records can drain them.

class CatModulator:
    """Passes each record through unchanged."""
    def __init__(self):
        pass
    def modulate(self, record):
        if record is None: # drain at end
            return []
        return [record]

class TacModulator:
    """Retains all records and emits them in reverse order at end of stream."""
    def __init__(self):
        self.records = []
    def modulate(self, record):
        if record is None: # drain at end
            self.records.reverse()
            rv = self.records
            self.records = []
            return rv
        else:
            self.records.append(record)
            return []

class SelectFieldsModulator:
    """Keeps only the named fields, in the order given by field_names; names
    absent from a record are skipped."""
    def __init__(self, field_names):
        self.field_names = field_names
    def modulate(self, record):
        if record is None: # drain at end
            return []
        return [Record([(field_name, record.get(field_name))
            for field_name in self.field_names if record.has_key(field_name)])]

# The field_names argument may be a list or hash-set -- as long as it supports
# the "in" operator as in "name in field_names".
# xxx to do: use a hash-set internally.
class DeselectFieldsModulator:
    """Drops the named fields, preserving the order of the remaining ones."""
    def __init__(self, field_names):
        self.field_names = field_names
    def modulate(self, record):
        if record is None: # drain at end
            return []
        return [Record([(field_name, record.get(field_name))
            for field_name in record.get_field_names() if field_name not in self.field_names])]

class SortFieldsInRecordModulator:
    """Reorders the fields of each record by field name, ascending or descending."""
    def __init__(self, do_ascending_sort=True):
        self.do_ascending_sort = do_ascending_sort
    def modulate(self, record):
        if record is None: # drain at end
            return []
        sorted_field_names = sorted(record.get_field_names(), reverse=not self.do_ascending_sort)
        return [Record([(field_name, record.get(field_name)) for field_name in sorted_field_names])]

class MeanKeeper:
    """Accumulates a running sum and count, from which a mean is derived."""
    def __init__(self):
        self.sum = 0.0
        self.count = 0
    def put(self, x):
        self.sum += x
        self.count += 1
    def get_sum(self):
        return self.sum
    def get_count(self):
        return self.count
    def get_mean(self):
        """Return sum/count, or None if nothing has been put yet."""
        # In IEEE-standard floating-point this would give NaN in the empty case.
        # But Python throws an exception on divide by zero instead.
        if self.count == 0:
            return None
        else:
            return self.sum / self.count

class MeanModulator:
    """Unfinished: validates incoming records but does not yet accumulate
    anything, and emits a placeholder record at end of stream."""
    def __init__(self, collate_field_names, key_field_names=None):
        self.collate_field_names = collate_field_names
        self.key_field_names = [] if key_field_names is None else key_field_names
        # map from key-field values to (map from collate-field names to MSCKeeper objects).
        self.collate_outputs = {}

    def modulate(self, record):
        if record is not None:

            # xxx optimize
            # Records missing any collate field or key field are ignored.
            for value_field_name in self.collate_field_names:
                if not record.has_key(value_field_name):
                    return []
            for key_field_name in self.key_field_names:
                if not record.has_key(key_field_name):
                    return []

            # xxx stub: these are computed but not yet accumulated anywhere.
            collate_field_values = [float(record.get(k)) for k in self.collate_field_names]
            key_string = ",".join([record.get(k) for k in self.key_field_names])

            return []
        else: # drain at end
            # xxx stub
            output_record = Record()
            output_record.put("foo", "bar")
            return [output_record]

# ================================================================
class StreamModulator:
    """Drives the read -> modulate -> write loop."""
    def __init__(self):
        pass
    def modulate(self, rreader, rmodulator, rwriter):
        """Pump records from rreader through rmodulator into rwriter.

        The end-of-input None from the reader is also passed to the modulator
        (so it can drain retained records) before the loop terminates.
        """
        while True:
            in_record = rreader.read()

            out_records = rmodulator.modulate(in_record)

            for out_record in out_records:
                rwriter.write(out_record)

            if in_record is None:
                break

# ================================================================
def set_up_namespace():
    """Return a MillerNamespace populated with default separators
    (RS newline, FS comma, PS equals) and zeroed record counters."""
    namespace = MillerNamespace()
    namespace.put("ORS", namespace.put("IRS", "\n"))
    namespace.put("OFS", namespace.put("IFS", ","))
    namespace.put("OPS", namespace.put("IPS", "="))

    # xxx CONVFMT

    namespace.put("FILENAME", None)
    namespace.iput("NF", None)
    namespace.iput("NR", 0)
    namespace.iput("FNR", 0)

    return namespace

# ================================================================
if __name__ == "__main__":
    main()