1#!/usr/bin/env python 2""" 3DFfeatureselect.py - 4First step in the LD feature selection process, select features based on document 5frequency. 6 7Marco Lui January 2013 8 9Copyright 2013 Marco Lui <saffsd@gmail.com>. All rights reserved. 10 11Redistribution and use in source and binary forms, with or without modification, are 12permitted provided that the following conditions are met: 13 14 1. Redistributions of source code must retain the above copyright notice, this list of 15 conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright notice, this list 18 of conditions and the following disclaimer in the documentation and/or other materials 19 provided with the distribution. 20 21THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 22WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 23FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 24CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31The views and conclusions contained in the software and documentation are those of the 32authors and should not be interpreted as representing official policies, either expressed 33or implied, of the copyright holder. 34""" 35 36###### 37# Default values 38# Can be overriden with command-line options 39###### 40MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider 41TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order 42 43import os, sys, argparse 44import collections 45import csv 46import shutil 47import tempfile 48import marshal 49import random 50import numpy 51import cPickle 52import multiprocessing as mp 53import atexit 54import gzip 55from itertools import tee, imap, islice 56from collections import defaultdict 57from datetime import datetime 58from contextlib import closing 59 60from common import Enumerator, unmarshal_iter, MapPool, write_features, write_weights 61 62def pass_sum_df(bucket): 63 """ 64 Compute document frequency (df) by summing up (key,domain,count) triplets 65 over all domains. 66 """ 67 doc_count = defaultdict(int) 68 count = 0 69 with gzip.open(os.path.join(bucket, "docfreq"),'wb') as docfreq: 70 for path in os.listdir(bucket): 71 # We use the domain buckets as there are usually less domains 72 if path.endswith('.domain'): 73 for key, _, value in unmarshal_iter(os.path.join(bucket,path)): 74 doc_count[key] += value 75 count += 1 76 77 for item in doc_count.iteritems(): 78 docfreq.write(marshal.dumps(item)) 79 return count 80 81def tally(bucketlist, jobs=None): 82 """ 83 Sum up the counts for each feature across all buckets. This 84 builds a full mapping of feature->count. This is stored in-memory 85 and thus could be an issue for large feature sets. 86 """ 87 88 with MapPool(jobs) as f: 89 pass_sum_df_out = f(pass_sum_df, bucketlist) 90 91 for i, keycount in enumerate(pass_sum_df_out): 92 print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount) 93 94 # build the global term->df mapping 95 doc_count = {} 96 for bucket in bucketlist: 97 for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')): 98 doc_count[key] = value 99 100 return doc_count 101 102 103 104def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_PER_ORDER): 105 """ 106 DF feature selection for byte-ngram tokenization 107 """ 108 # Work out the set of features to compute IG 109 features = set() 110 for i in range(1, max_order+1): 111 d = dict( (k, doc_count[k]) for k in doc_count if len(k) == i) 112 features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order]) 113 features = sorted(features) 114 115 return features 116 117 118 119if __name__ == "__main__": 120 parser = argparse.ArgumentParser() 121 parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no paralleization)") 122 parser.add_argument("-f","--features", metavar='FEATURE_FILE', help="output features to FEATURE_FILE") 123 parser.add_argument("--tokens_per_order", metavar='N', type=int, help="consider top N tokens per ngram order") 124 parser.add_argument("--tokens", metavar='N', type=int, help="consider top N tokens") 125 parser.add_argument("--max_order", type=int, help="highest n-gram order to use", default=MAX_NGRAM_ORDER) 126 parser.add_argument("--doc_count", nargs='?', const=True, metavar='DOC_COUNT_PATH', help="output full mapping of feature->frequency to DOC_COUNT_PATH") 127 parser.add_argument("--bucketlist", help="read list of buckets from") 128 parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") 129 130 args = parser.parse_args() 131 132 if args.tokens and args.tokens_per_order: 133 parser.error("--tokens and --tokens_per_order are mutually exclusive") 134 135 # if neither --tokens nor --tokens_per_order is given, default behaviour is tokens_per_order 136 if not(args.tokens) and not(args.tokens_per_order): 137 args.tokens_per_order = TOKENS_PER_ORDER 138 139 if args.features: 140 feature_path = args.features 141 else: 142 feature_path = os.path.join(args.model, 'DFfeats') 143 144 if args.bucketlist: 145 bucketlist_path = args.bucketlist 146 else: 147 bucketlist_path = os.path.join(args.model, 'bucketlist') 148 149 # display paths 150 print "buckets path:", bucketlist_path 151 print "features output path:", feature_path 152 if args.tokens_per_order: 153 print "max ngram order:", args.max_order 154 print "tokens per order:", args.tokens_per_order 155 else: 156 print "tokens:", args.tokens 157 158 with open(bucketlist_path) as f: 159 bucketlist = map(str.strip, f) 160 161 doc_count = tally(bucketlist, args.jobs) 162 print "unique features:", len(doc_count) 163 if args.doc_count: 164 # The constant true is used to indicate output to default location 165 doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count 166 write_weights(doc_count, doc_count_path) 167 print "wrote DF counts for all features to:", doc_count_path 168 169 if args.tokens_per_order: 170 # Choose a number of features for each length of token 171 feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) 172 else: 173 # Choose a number of features overall 174 feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) 175 print "selected features: ", len(feats) 176 177 write_features(feats, feature_path) 178 print 'wrote features to "%s"' % feature_path 179 180 181