#!/usr/bin/env python
"""
DFfeatureselect.py -
First step in the LD feature selection process, select features based on document
frequency.

Marco Lui January 2013

Copyright 2013 Marco Lui <saffsd@gmail.com>. All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are
permitted provided that the following conditions are met:

   1. Redistributions of source code must retain the above copyright notice, this list of
      conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright notice, this list
      of conditions and the following disclaimer in the documentation and/or other materials
      provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The views and conclusions contained in the software and documentation are those of the
authors and should not be interpreted as representing official policies, either expressed
or implied, of the copyright holder.
"""

######
# Default values
# Can be overridden with command-line options
######
MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider
TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order

import os, sys, argparse
import collections
import csv
import shutil
import tempfile
import marshal
import random
import numpy
import cPickle
import multiprocessing as mp
import atexit
import gzip
from itertools import tee, imap, islice
from collections import defaultdict
from datetime import datetime
from contextlib import closing

from common import Enumerator, unmarshal_iter, MapPool, write_features, write_weights

def pass_sum_df(bucket):
  """
  Compute document frequency (df) per key by summing up (key, domain, count)
  triplets over all domains in a bucket. The summed counts are written to a
  gzipped 'docfreq' file in the bucket, and the number of distinct keys is
  returned.
  """
  doc_count = defaultdict(int)
  with gzip.open(os.path.join(bucket, "docfreq"), 'wb') as docfreq:
    for path in os.listdir(bucket):
      # We use the domain buckets as there are usually fewer domains
      if path.endswith('.domain'):
        for key, _, value in unmarshal_iter(os.path.join(bucket, path)):
          doc_count[key] += value

    for item in doc_count.iteritems():
      docfreq.write(marshal.dumps(item))
  return len(doc_count)

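# A minimal illustration of the summation performed by pass_sum_df, using
# hypothetical in-memory triplets rather than marshalled bucket files:
#
#   >>> triplets = [('th', 'dom1', 3), ('th', 'dom2', 2), ('he', 'dom1', 1)]
#   >>> df = defaultdict(int)
#   >>> for key, _, value in triplets:
#   ...   df[key] += value
#   >>> sorted(df.items())
#   [('he', 1), ('th', 5)]
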
def tally(bucketlist, jobs=None):
  """
  Sum up the counts for each feature across all buckets. This
  builds a full mapping of feature->count. This is stored in-memory
  and thus could be an issue for large feature sets.
  """

  # MapPool applies pass_sum_df to each bucket, in parallel unless jobs=1
  with MapPool(jobs) as f:
    pass_sum_df_out = f(pass_sum_df, bucketlist)

    for i, keycount in enumerate(pass_sum_df_out):
      print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)

  # build the global term->df mapping. Features are partitioned across
  # buckets, so keys from different buckets are disjoint and plain
  # assignment (rather than summation) suffices here.
  doc_count = {}
  for bucket in bucketlist:
    for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')):
      doc_count[key] = value

  return doc_count


def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_PER_ORDER):
  """
  DF feature selection for byte n-gram tokenization: for each n-gram order
  up to max_order, keep the tokens_per_order features with the highest
  document frequency.
  """
  # Work out the set of features for which IG will be computed in the next step
  features = set()
  for i in range(1, max_order+1):
    d = dict((k, v) for k, v in doc_count.iteritems() if len(k) == i)
    features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order])
  features = sorted(features)

  return features
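
# A worked example of ngram_select on hypothetical counts: with max_order=2
# and tokens_per_order=1, the single highest-df feature of each length is
# kept, and the union is returned sorted.
#
#   >>> dc = {'a': 5, 'b': 3, 'ab': 4, 'ba': 9}
#   >>> ngram_select(dc, max_order=2, tokens_per_order=1)
#   ['a', 'ba']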


if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no parallelization)")
  parser.add_argument("-f","--features", metavar='FEATURE_FILE', help="output features to FEATURE_FILE")
  parser.add_argument("--tokens_per_order", metavar='N', type=int, help="consider top N tokens per ngram order")
  parser.add_argument("--tokens", metavar='N', type=int, help="consider top N tokens")
  parser.add_argument("--max_order", type=int, help="highest n-gram order to use", default=MAX_NGRAM_ORDER)
  parser.add_argument("--doc_count", nargs='?', const=True, metavar='DOC_COUNT_PATH', help="output full mapping of feature->frequency to DOC_COUNT_PATH")
  parser.add_argument("--bucketlist", help="read list of buckets from the given file (default: MODEL_DIR/bucketlist)")
  parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR")

  args = parser.parse_args()

  if args.tokens and args.tokens_per_order:
    parser.error("--tokens and --tokens_per_order are mutually exclusive")

  # if neither --tokens nor --tokens_per_order is given, default to tokens_per_order
  if not args.tokens and not args.tokens_per_order:
    args.tokens_per_order = TOKENS_PER_ORDER

  if args.features:
    feature_path = args.features
  else:
    feature_path = os.path.join(args.model, 'DFfeats')

  if args.bucketlist:
    bucketlist_path = args.bucketlist
  else:
    bucketlist_path = os.path.join(args.model, 'bucketlist')

  # display paths
  print "buckets path:", bucketlist_path
  print "features output path:", feature_path
  if args.tokens_per_order:
    print "max ngram order:", args.max_order
    print "tokens per order:", args.tokens_per_order
  else:
    print "tokens:", args.tokens

  with open(bucketlist_path) as f:
    bucketlist = map(str.strip, f)

  doc_count = tally(bucketlist, args.jobs)
  print "unique features:", len(doc_count)
  if args.doc_count:
    # The constant True indicates output to the default location
    doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count is True else args.doc_count
    write_weights(doc_count, doc_count_path)
    print "wrote DF counts for all features to:", doc_count_path

  if args.tokens_per_order:
    # Choose a number of features for each token length
    feats = ngram_select(doc_count, args.max_order, args.tokens_per_order)
  else:
    # Choose a number of features overall
    feats = sorted(sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens])
  print "selected features:", len(feats)

  write_features(feats, feature_path)
  print 'wrote features to "%s"' % feature_path