1#!/usr/bin/python
2
3import os, sys
4import getopt
5import string, re
6import collections
7
8# ================================================================
9#  o inclflds a,x,b
10#  o newflds '{$y:$x*$x, $z:$x/2, $n:-$z}'
11#  o greprecs '$x <= 2 && $y eq "zebra"'
12#
13#  o tabular pretty-print
14#  o mean
15#  o sort
16
17# absolute essentials:
18# * RECORD-LEVEL:
19#   k include/exclude fields
20#   o new field as function of old
21#   o vertical pretty-print
22# * STREAM-LEVEL:
23#   o include/exclude records
24#   o sort
25#   o summarizations: min, max, mean, count, sum, first, last
26#   o tabular pretty-print
27
28# ================================================================
def usage():
   """Write the command-line synopsis to stderr, then exit with status 1.

   This function does not return: it always ends with sys.exit(1), so it can
   be used as a terminal error path.
   """
   program_name = os.path.basename(sys.argv[0])
   lines = [
      "Usage: %s [options] {modulator-spec} {zero or more filenames}" % program_name,
      "Options:",
      "  -R {rs}   Input/output record separator",
      "  -F {fs}   Input/output field separator",
      "  -P {ps}   Input/output key-value-pair separator",
      "  -v {name=value} Set namespace variable {name} to {value}",
      "  --help    Show this message",
      "",
      "  --idkvp  Input  format is delimited by IRS,IFS,IPS",
      "  --odkvp  Output format is delimited by IRS,IFS,IPS",
      "  --icsv   Input  format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
      "  --ocsv   Output format is delimited by IRS,IFS,IPS, with header line followed by data lines (e.g. CSV)",
      "  --inidx  Input  format is implicitly integer-indexed (awk-style)",
      "  --onidx  Output format is implicitly integer-indexed (awk-style)",
      "  --ixtab  Input  format is transposed-tabular-pretty-print (not yet implemented)",
      "  --oxtab  Output format is transposed-tabular-pretty-print",
      "Modulator specs:",
      "  --cat             Pass records through unchanged",
      "  --tac             Emit records in reverse order",
      "  --cut {a,b,c}     Keep only the named fields (names separated by IFS)",
      "  --cutx {a,b,c}    Drop the named fields (names separated by IFS)",
      "  --sortfields      Sort each record's fields by name, ascending",
      "  --sortfieldsup    Same as --sortfields",
      "  --sortfieldsdown  Sort each record's fields by name, descending",
   ]
   # A single write works under both Python 2 and Python 3, unlike the
   # "print >> stream" statement.
   sys.stderr.write("\n".join(lines) + "\n")

   sys.exit(1)
55
56# ----------------------------------------------------------------
def parse_command_line():
   """Parse sys.argv[1:] and build the objects needed to run the record stream.

   The reader, the writer and the modulator are constructed only after every
   option has been seen, so -R/-F/-P (and -v assignments to the separator
   names) take effect wherever they appear relative to the format and
   modulator flags.  When a format or modulator flag is repeated, the last
   one wins.

   Returns:
      A dict with keys 'namespace', 'rreader', 'rwriter' and 'rmodulator'.
      Without format flags the reader and writer are the delimited
      key-value-pair ones on stdin/stdout; without a modulator flag the
      modulator is CatModulator.

   Exits with status 1 on a command line that getopt rejects, on -h/--help,
   on a -v argument lacking "=", and on an option that getopt accepts but
   which has no handler here (currently --ixtab).
   """
   namespace = set_up_namespace()

   try:
      optargs, non_option_args = getopt.getopt(sys.argv[1:], "R:F:P:v:h", [
         'help', 'idkvp', 'odkvp', 'icsv', 'ocsv', 'inidx', 'onidx', 'ixtab', 'oxtab',
         'cat', 'tac', 'cut=', 'cutx=', 'sortfields', 'sortfieldsup', 'sortfieldsdown'])
   except getopt.GetoptError as err:
      sys.stderr.write("%s\n" % err)
      usage()
      sys.exit(1) # Guard only: usage() itself exits.

   # Zero-argument factories.  They read the separators from the namespace
   # when called, which happens after the option loop below.
   def make_dkvp_reader():
      return RecordReaderDefault(istream=sys.stdin, namespace=namespace,
         irs=namespace.get("IRS"), ifs=namespace.get("IFS"), ips=namespace.get("IPS"))
   def make_csv_reader():
      return RecordReaderHeaderFirst(istream=sys.stdin, namespace=namespace,
         irs=namespace.get("IRS"), ifs=namespace.get("IFS"))
   def make_nidx_reader():
      return RecordReaderIntegerIndexed(istream=sys.stdin, namespace=namespace,
         irs=namespace.get("IRS"), ifs=namespace.get("IFS"))

   def make_dkvp_writer():
      return RecordWriterDefault(ostream=sys.stdout,
         ors=namespace.get("ORS"), ofs=namespace.get("OFS"), ops=namespace.get("OPS"))
   def make_csv_writer():
      return RecordWriterHeaderFirst(ostream=sys.stdout,
         ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
   def make_nidx_writer():
      return RecordWriterIntegerIndexed(ostream=sys.stdout,
         ors=namespace.get("ORS"), ofs=namespace.get("OFS"))
   def make_xtab_writer():
      return RecordWriterVerticallyTabulated(ostream=sys.stdout) # xxx args w/r/t/ RS/FS/PS?!?

   reader_makers = {
      '--idkvp': make_dkvp_reader,
      '--icsv':  make_csv_reader,
      '--inidx': make_nidx_reader,
   }
   writer_makers = {
      '--odkvp': make_dkvp_writer,
      '--ocsv':  make_csv_writer,
      '--onidx': make_nidx_writer,
      '--oxtab': make_xtab_writer,
   }
   plain_modulator_makers = {
      '--cat':            CatModulator,
      '--tac':            TacModulator,
      '--sortfields':     lambda: SortFieldsInRecordModulator(True),
      '--sortfieldsup':   lambda: SortFieldsInRecordModulator(True),
      '--sortfieldsdown': lambda: SortFieldsInRecordModulator(False),
   }

   make_reader    = make_dkvp_reader
   make_writer    = make_dkvp_writer
   make_modulator = CatModulator

   for opt, arg in optargs:
      if opt == '-R':
         namespace.put("ORS", namespace.put("IRS", arg))
      elif opt == '-F':
         namespace.put("OFS", namespace.put("IFS", arg))
      elif opt == '-P':
         namespace.put("OPS", namespace.put("IPS", arg))
      elif opt == '-v':
         name, equals_sign, value = arg.partition("=")
         if not equals_sign:
            sys.stderr.write("Option -v requires name=value; got \"%s\".\n" % arg)
            sys.exit(1)
         namespace.put(name, value)

      elif opt in reader_makers:
         make_reader = reader_makers[opt]
      elif opt in writer_makers:
         make_writer = writer_makers[opt]

      elif opt in plain_modulator_makers:
         make_modulator = plain_modulator_makers[opt]
      elif opt == '--cut':
         # The names=arg default binds this option's argument now; the split
         # on IFS is deferred until the factory is called.
         make_modulator = lambda names=arg: SelectFieldsModulator(names.split(namespace.get("IFS")))
      elif opt == '--cutx':
         make_modulator = lambda names=arg: DeselectFieldsModulator(names.split(namespace.get("IFS")))

      elif opt in ('-h', '--help'):
         usage()
      else:
         sys.stderr.write("Unhandled option \"%s\".\n" % opt)
         sys.exit(1)

   # xxx the filenames in non_option_args are not used yet: every reader
   # above is given sys.stdin.

   return {
      'namespace':  namespace,
      'rreader':    make_reader(),
      'rwriter':    make_writer(),
      'rmodulator': make_modulator(),
   }
140
def main():
   """Program entry point.

   Obtains the reader, record modulator and writer from parse_command_line()
   and hands them to a StreamModulator, which runs the stream.
   """
   settings = parse_command_line()

   # parse ARGV:
   # * --ifmt: dkvp,hdr1st,iidxed,align,xposealign
   # * --ofmt: dkvp,hdr1st,iidxed,align,xposealign
   # * which-control-language spec?!?
   # * modulators/script ... this is the key decision area for language(s) design.
   # * filenames

   # The namespace is looked up but not otherwise used here.
   namespace = settings['namespace']
   pipeline = [settings[key] for key in ('rreader', 'rmodulator', 'rwriter')]

   StreamModulator().modulate(*pipeline)
158
159# ================================================================
class MillerNamespace:
   """Two independent name-to-value tables.

   get/put operate on self.mapping; iget/iput operate on self.imapping.
   Both put methods return the value they stored, so calls can be nested to
   store one value under two names.  Both get methods raise KeyError for a
   name that was never stored.
   """
   def __init__(self):
      self.mapping, self.imapping = {}, {}

   @staticmethod
   def _store(table, name, value):
      # Shared body of put and iput: store, then hand the value back.
      table[name] = value
      return value

   def get(self, name):
      table = self.mapping
      return table[name]
   def iget(self, name):
      table = self.imapping
      return table[name]
   def put(self, name, value):
      return self._store(self.mapping, name, value)
   def iput(self, name, ivalue):
      return self._store(self.imapping, name, ivalue)
174
175# ================================================================
class Record:
   """An insertion-ordered collection of key-value fields.

   Wherever kvs is accepted it is an iterable of two-element sequences, each
   giving a key and its value.  Lists and tuples both work, since each
   element is simply unpacked into two names; an element of any other length
   raises ValueError.
   """
   def __init__(self, kvs=None):
      self.fields = collections.OrderedDict()
      # None rather than a shared [] default; both mean "start empty".
      if kvs is not None:
         self.mput(kvs)
   def put(self, k, v):
      self.fields[k] = v
   def mput(self, kvs):
      for k, v in kvs:
         self.fields[k] = v
   def get(self, k):
      # Raises KeyError when the record has no field k.
      return self.fields[k]
   def has_key(self, k):
      # The "in" operator works on every Python version; dict.has_key does not.
      return k in self.fields
   def get_field_names(self):
      # A new list, in field order.
      return list(self.fields.keys())
   def get_pairs(self):
      # A new list of (key, value) tuples, in field order.
      return list(self.fields.items())
   def num_pairs(self):
      return len(self.fields)
   # xxx xref to record-formatter classes
   def __str__(self):
      # Must return a string: call repr rather than returning the method.
      return repr(self.fields)
   def __repr__(self):
      return repr(self.fields)
201
202# ================================================================
203# Each record is a sequence of fields delimited by FS, each of which is a
204# key-value pair separated by PS.
205
class RecordReader:
   """Base for record readers.

   Only stores its constructor arguments as attributes of the same names:
   the input stream, the namespace, and the input record, field and pair
   separators.
   """
   def __init__(self, istream, namespace, irs, ifs, ips):
      self.istream, self.namespace = istream, namespace
      self.irs, self.ifs, self.ips = irs, ifs, ips
213
class RecordReaderDefault(RecordReader):
   """Reads one record per input line: fields are separated by IFS, and each
   field is a key and a value separated by the first occurrence of IPS."""
   def __init__(self, istream, namespace, irs, ifs, ips):
      RecordReader.__init__(self, istream, namespace, irs, ifs, ips)

   def read(self):
      """Read the next line and return it as a Record, or None at end of input.

      Also updates the namespace: NF is set to the number of fields in the
      record, NR and FNR are incremented, and FILENAME is set to None (stub).

      Raises ValueError when a field does not contain IPS; this includes the
      single empty field produced by a blank line.
      """
      line = self.istream.readline() # xxx use self.irs
      if line == '':
         return None

      line = line.strip() # Remove leading/trailing whitespace including carriage return from readline().
      kvs = []
      for field in line.split(self.ifs):
         kv = field.split(self.ips, 1)
         if len(kv) != 2:
            # Same exception type the unpacking in Record would raise, but
            # with a message that identifies the offending input.
            raise ValueError("Field \"%s\" lacks the key-value separator \"%s\" in line \"%s\"" % \
               (field, self.ips, line))
         kvs.append(kv)
      record = Record(kvs)

      # Store the count itself, not the bound method.
      self.namespace.iput("NF", record.num_pairs())
      self.namespace.iput("NR", self.namespace.iget("NR") + 1)

      # xxx stub
      self.namespace.put("FILENAME", None)
      self.namespace.iput("FNR", self.namespace.iget("FNR") + 1)

      return record
236
237# ----------------------------------------------------------------
238# awk-style
class RecordReaderIntegerIndexed(RecordReader):
   """Reads one record per input line, keying each field by its 1-up integer
   position.  IFS is used as a regular expression (re.split).  Unlike
   RecordReaderDefault, this reader does not update NF/NR/FNR."""
   # xxx ctor with istream context?!? or independent of that?!? for cskv, no matter.
   # csv reader of course needs context.
   def __init__(self, istream, namespace, irs, ifs):
      RecordReader.__init__(self, istream, namespace, irs, ifs, None)

   def read(self):
      """Return the next line as a Record, or None at end of input."""
      # xxx use self.irs
      raw_line = self.istream.readline()
      if raw_line == '':
         return None
      # Remove leading/trailing whitespace including carriage return from readline().
      values = re.split(self.ifs, raw_line.strip())
      return Record([[position, value] for position, value in enumerate(values, 1)])
258
259# ----------------------------------------------------------------
260# csv-style
class RecordReaderHeaderFirst(RecordReader):
   """Reads header-first input: the first line gives the field names, and each
   later line gives one record's values in the same order.  Both are split on
   IFS."""
   def __init__(self, istream, namespace, irs, ifs):
      RecordReader.__init__(self, istream, namespace, irs, ifs, None)
      self.field_names = None
      self.header_line = None

   def read(self):
      """Return the next data line as a Record, or None at end of input.

      The header line is consumed on the first call.

      Raises Exception when a data line does not have the same number of
      fields as the header.
      """
      if self.field_names is None:
         header_line = self.istream.readline()
         if header_line == '':
            return None
         # Remove leading/trailing whitespace including carriage return from readline().
         header_line = header_line.strip()
         self.field_names = header_line.split(self.ifs)
         self.header_line = header_line

      data_line = self.istream.readline()
      if data_line == '':
         return None
      # Remove leading/trailing whitespace including carriage return from readline().
      data_line = data_line.strip()
      field_values = data_line.split(self.ifs)
      if len(self.field_names) != len(field_values):
         # The field names live on self; a bare field_names is undefined here.
         raise Exception("Header/data length mismatch: %d != %d in \"%s\" and \"%s\"" % \
            (len(self.field_names), len(field_values), self.header_line, data_line))

      return Record(zip(self.field_names, field_values))
288
289# ================================================================
290# xxx ostream at ctor??  needs drain-at-end logic for prettyprint.
291
class RecordWriter:
   """Base for record writers.

   Only stores its constructor arguments as attributes of the same names:
   the output stream and the output record, field and pair separators.
   """
   def __init__(self, ostream, ors, ofs, ops):
      self.ostream = ostream
      self.ors, self.ofs, self.ops = ors, ofs, ops
298
class RecordWriterDefault(RecordWriter):
   """Writes each record as key-value pairs: key and value joined by OPS, pairs
   joined by OFS, record terminated by ORS."""
   def __init__(self, ostream, ors, ofs, ops):
      RecordWriter.__init__(self, ostream, ors, ofs, ops)

   def write(self, record):
      """Write one record to the output stream."""
      pairs = [str(k) + self.ops + str(v) for k, v in record.get_pairs()]
      self.ostream.write(self.ofs.join(pairs))
      # Honor the configured record separator; newline when none was given.
      self.ostream.write("\n" if self.ors is None else self.ors)
306
307# ----------------------------------------------------------------
class RecordWriterHeaderFirst(RecordWriter):
   """Writes header-first output: the field names of the first record written,
   then one line of values per record.  Items are joined by OFS and lines are
   terminated by ORS.  The header is written once only; the field names of
   later records are not compared against it."""
   def __init__(self, ostream, ors, ofs):
      RecordWriter.__init__(self, ostream, ors, ofs, None)
      self.field_names = None

   def write(self, record):
      """Write one record, preceded by the header line if it is the first."""
      # Honor the configured record separator; newline when none was given.
      ors = "\n" if self.ors is None else self.ors
      pairs = record.get_pairs()
      if self.field_names is None:
         self.field_names = record.get_field_names()
         self.ostream.write(self.ofs.join([str(k) for k, v in pairs]))
         self.ostream.write(ors)
      self.ostream.write(self.ofs.join([str(v) for k, v in pairs]))
      self.ostream.write(ors)
322
323# ----------------------------------------------------------------
324# xxx rename
325
class RecordWriterVerticallyTabulated(RecordWriter):
   """Writes each record as one "name value" line per field, with the names
   left-justified to a common width, followed by a blank line."""
   def __init__(self, ostream):
      RecordWriter.__init__(self, ostream, None, None, None)

   def write(self, record):
      """Write one record to the output stream."""
      field_names = record.get_field_names()
      # Measure the printed form of each name: names need not be strings
      # (the integer-indexed reader produces int keys, which have no len()).
      # The leading 1 keeps the minimum width the same as before.
      max_field_name_width = max([1] + [len(str(field_name)) for field_name in field_names])
      lines = ["%-*s %s" % (max_field_name_width, field_name, record.get(field_name))
         for field_name in field_names]
      self.ostream.write("\n".join(lines))
      self.ostream.write("\n\n")
342
343# ----------------------------------------------------------------
class RecordWriterIntegerIndexed:
   """Writes only the values of each record (keys are dropped), joined by OFS
   and terminated by ORS."""
   def __init__(self, ostream, ors, ofs):
      self.ostream = ostream
      self.ors = ors
      self.ofs = ofs
   def write(self, record):
      """Write one record's values to the output stream."""
      values = [str(v) for k, v in record.get_pairs()]
      self.ostream.write(self.ofs.join(values))
      # Honor the configured record separator; newline when none was given.
      self.ostream.write("\n" if self.ors is None else self.ors)
352
353# ================================================================
class CatModulator:
   """Pass-through modulator: each record comes back unchanged, as a
   one-element list."""
   def __init__(self):
      pass
   def modulate(self, record):
      # None marks the drain at end of stream; there is nothing to emit then.
      return [] if record is None else [record]
361
class TacModulator:
   """Holds every record until the drain at end of stream, then returns them
   all in reverse order of arrival."""
   def __init__(self):
      self.records = []
   def modulate(self, record):
      if record is not None:
         self.records.append(record)
         return []
      # Drain at end: hand back the held list, reversed in place, and start
      # over with a fresh empty one.
      held, self.records = self.records, []
      held.reverse()
      return held
374
class SelectFieldsModulator:
   """Keeps only the named fields of each record.  The output record lists
   them in the order of field_names; names the record lacks are skipped."""
   def __init__(self, field_names):
      self.field_names = field_names
   def modulate(self, record):
      if record is None: # drain at end
         return []
      selected = Record()
      selected.mput([(name, record.get(name))
         for name in self.field_names if record.has_key(name)])
      return [selected]
388
389# The field_names argument may be a list or hash-set -- as long as it supports
390# the "in" operator as in "name in field_names".
391# xxx to do: use a hash-set internally.
class DeselectFieldsModulator:
   """Drops the named fields from each record.  The remaining fields keep the
   order they had in the input record."""
   def __init__(self, field_names):
      self.field_names = field_names
   def modulate(self, record):
      if record is None: # drain at end
         return []
      kept = [(name, record.get(name))
         for name in record.get_field_names() if name not in self.field_names]
      remaining = Record()
      remaining.mput(kept)
      return [remaining]
405
class SortFieldsInRecordModulator:
   """Reorders the fields of each record by field name: ascending by default,
   descending when constructed with do_ascending_sort false."""
   def __init__(self, do_ascending_sort=True):
      self.do_ascending_sort = do_ascending_sort
   def modulate(self, record):
      if record is None: # drain at end
         return []
      ordered_names = sorted(record.get_field_names())
      if not self.do_ascending_sort:
         ordered_names = ordered_names[::-1] # xxx optimize
      reordered = Record()
      reordered.mput([(name, record.get(name)) for name in ordered_names])
      return [reordered]
421
class MeanKeeper:
   """Running sum and count of the values passed to put(), from which the
   mean is computed."""
   def __init__(self):
      self.sum   = 0.0
      self.count = 0
   def put(self, x):
      self.sum   += x
      self.count += 1
   def get_sum(self):
      return self.sum
   def get_count(self):
      return self.count
   def get_mean(self):
      """Return sum/count, or None when nothing has been put yet."""
      # In IEEE-standard floating-point this would give NaN in the empty case.
      # But Python throws an exception on divide by zero instead.
      if self.count == 0:
         return None
      return self.sum / self.count
440
class MeanModulator:
   """Stub of a per-group mean over selected fields.

   For now records are only checked and parsed; nothing is accumulated, and
   the drain at end of stream returns a placeholder record.

   collate_field_names: names of the fields whose values are parsed as floats.
   key_field_names: names of the grouping fields; defaults to none.
   """
   def __init__(self, collate_field_names, key_field_names=None):
      self.collate_field_names = collate_field_names
      # A fresh list per instance: a [] default in the signature would be one
      # list object shared by every instance that relies on the default.
      self.key_field_names     = [] if key_field_names is None else key_field_names
      # map from key-field values to (map from collate-field names to MSCKeeper objects).
      self.collate_outputs     = {}

   def modulate(self, record):
      """Return [] for every record, and the placeholder record at the drain.

      A record lacking any collate or key field is skipped.  Otherwise its
      collate values are passed to float(), which raises ValueError for
      non-numeric text.
      """
      if record is None: # drain at end
         # xxx stub
         output_record = Record()
         output_record.put("foo", "bar")
         return [output_record]

      # xxx optimize
      for value_field_name in self.collate_field_names:
         if not record.has_key(value_field_name):
            return []
      for key_field_name in self.key_field_names:
         if not record.has_key(key_field_name):
            return []

      # xxx stub: computed but not yet accumulated into self.collate_outputs.
      collate_field_values = [float(record.get(k)) for k in self.collate_field_names]
      key_string = ",".join([record.get(k) for k in self.key_field_names])

      return []
468
469# ================================================================
class StreamModulator:
   """Drives the pipeline: reads records, passes each to the record modulator,
   and writes whatever the modulator returns."""
   def __init__(self):
      pass
   def modulate(self, rreader, rmodulator, rwriter):
      """Run until the reader returns None.

      That final None is also passed to the modulator, so that it can emit
      anything it has been holding, before the loop ends.
      """
      at_end = False
      while not at_end:
         incoming = rreader.read()
         at_end = incoming is None
         for outgoing in rmodulator.modulate(incoming):
            rwriter.write(outgoing)
484
485# ================================================================
def set_up_namespace():
   """Return a new MillerNamespace holding the defaults.

   Record separators are newline, field separators comma, and pair
   separators "=", each stored under both its input and its output name.
   FILENAME and NF start as None; NR and FNR start at 0.
   """
   namespace = MillerNamespace()

   separator_defaults = (
      ("IRS", "ORS", "\n"),
      ("IFS", "OFS", ","),
      ("IPS", "OPS", "="),
   )
   for input_name, output_name, separator in separator_defaults:
      # put returns what it stored, so the output name gets the same value.
      namespace.put(output_name, namespace.put(input_name, separator))

   # xxx CONVFMT

   namespace.put("FILENAME", None)
   for counter_name, initial in (("NF", None), ("NR", 0), ("FNR", 0)):
      namespace.iput(counter_name, initial)

   return namespace
500
501# ================================================================
# Run only when executed as a script, so that importing this file does not
# start reading stdin.
if __name__ == "__main__":
   main()
503