1#!/usr/bin/env python 2""" 3scanner.py - 4Assemble a "feature scanner" using Aho-Corasick string matching. 5This takes a list of features (byte sequences) and builds a DFA 6that when run on a byte stream can identify how often each of 7the features is present in a single pass over the stream. 8 9Marco Lui, January 2013 10 11Copyright 2013 Marco Lui <saffsd@gmail.com>. All rights reserved. 12 13Redistribution and use in source and binary forms, with or without modification, are 14permitted provided that the following conditions are met: 15 16 1. Redistributions of source code must retain the above copyright notice, this list of 17 conditions and the following disclaimer. 18 19 2. Redistributions in binary form must reproduce the above copyright notice, this list 20 of conditions and the following disclaimer in the documentation and/or other materials 21 provided with the distribution. 22 23THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED 24WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 25FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 26CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 28SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 29ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 31ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 33The views and conclusions contained in the software and documentation are those of the 34authors and should not be interpreted as representing official policies, either expressed 35or implied, of the copyright holder. 36""" 37 38import cPickle 39import os, sys, argparse 40import array 41from collections import deque, defaultdict 42from common import read_features 43 44class Scanner(object): 45 alphabet = map(chr, range(1<<8)) 46 """ 47 Implementation of Aho-Corasick string matching. 48 This class should be instantiated with a set of keywords, which 49 will then be the only tokens generated by the class's search method, 50 """ 51 @classmethod 52 def from_file(cls, path): 53 with open(path) as f: 54 tk_nextmove, tk_output, feats = cPickle.load(f) 55 if isinstance(feats, dict): 56 # The old scanner format had two identical dictionaries as the last 57 # two items in the tuple. This format can still be used by langid.py, 58 # but it does not carry the feature list, and so cannot be unpacked 59 # back into a Scanner object. 60 raise ValueError("old format scanner - please retrain. see code for details.") 61 # tk_output is a mapping from state to a list of feature indices. 62 # because of the way the scanner class is written, it needs a mapping 63 # from state to the feature itself. We rebuild this here. 64 tk_output_f = dict( (k,[feats[i] for i in v]) for k,v in tk_output.iteritems() ) 65 scanner = cls.__new__(cls) 66 scanner.__setstate__((tk_nextmove, tk_output_f)) 67 return scanner 68 69 def __init__(self, keywords): 70 self.build(keywords) 71 72 def __call__(self, value): 73 return self.search(value) 74 75 def build(self, keywords): 76 goto = dict() 77 fail = dict() 78 output = defaultdict(set) 79 80 # Algorithm 2 81 newstate = 0 82 for a in keywords: 83 state = 0 84 j = 0 85 while (j < len(a)) and (state, a[j]) in goto: 86 state = goto[(state, a[j])] 87 j += 1 88 for p in range(j, len(a)): 89 newstate += 1 90 goto[(state, a[p])] = newstate 91 #print "(%d, %s) -> %d" % (state, a[p], newstate) 92 state = newstate 93 output[state].add(a) 94 for a in self.alphabet: 95 if (0,a) not in goto: 96 goto[(0,a)] = 0 97 98 # Algorithm 3 99 queue = deque() 100 for a in self.alphabet: 101 if goto[(0,a)] != 0: 102 s = goto[(0,a)] 103 queue.append(s) 104 fail[s] = 0 105 while queue: 106 r = queue.popleft() 107 for a in self.alphabet: 108 if (r,a) in goto: 109 s = goto[(r,a)] 110 queue.append(s) 111 state = fail[r] 112 while (state,a) not in goto: 113 state = fail[state] 114 fail[s] = goto[(state,a)] 115 #print "f(%d) -> %d" % (s, goto[(state,a)]), output[fail[s]] 116 if output[fail[s]]: 117 output[s].update(output[fail[s]]) 118 119 # Algorithm 4 120 self.nextmove = {} 121 for a in self.alphabet: 122 self.nextmove[(0,a)] = goto[(0,a)] 123 if goto[(0,a)] != 0: 124 queue.append(goto[(0,a)]) 125 while queue: 126 r = queue.popleft() 127 for a in self.alphabet: 128 if (r,a) in goto: 129 s = goto[(r,a)] 130 queue.append(s) 131 self.nextmove[(r,a)] = s 132 else: 133 self.nextmove[(r,a)] = self.nextmove[(fail[r],a)] 134 135 # convert the output to tuples, as tuple iteration is faster 136 # than set iteration 137 self.output = dict((k, tuple(output[k])) for k in output) 138 139 # Next move encoded as a single array. The index of the next state 140 # is located at current state * alphabet size + ord(c). 141 # The choice of 'H' array typecode limits us to 64k states. 142 def generate_nm_arr(typecode): 143 def nextstate_iter(): 144 # State count starts at 0, so the number of states is the number of i 145 # the last state (newstate) + 1 146 for state in xrange(newstate+1): 147 for letter in self.alphabet: 148 yield self.nextmove[(state, letter)] 149 return array.array(typecode, nextstate_iter()) 150 try: 151 self.nm_arr = generate_nm_arr('H') 152 except OverflowError: 153 # Could not fit in an unsigned short array, let's try an unsigned long array. 154 self.nm_arr = generate_nm_arr('L') 155 156 def __getstate__(self): 157 """ 158 Compiled nextmove and output. 159 """ 160 return (self.nm_arr, self.output) 161 162 def __setstate__(self, value): 163 nm_array, output = value 164 self.nm_arr = nm_array 165 self.output = output 166 self.nextmove = {} 167 for i, next_state in enumerate(nm_array): 168 state = i / 256 169 letter = chr(i % 256) 170 self.nextmove[(state, letter)] = next_state 171 172 def search(self, string): 173 state = 0 174 for letter in map(ord,string): 175 state = self.nm_arr[(state << 8) + letter] 176 for key in self.output.get(state, []): 177 yield key 178 179def build_scanner(features): 180 """ 181 In difference to the Scanner class, this function unwraps a layer of indirection in 182 the detection of features. It translates the string output of the scanner's output 183 mapping into the index values (positions in the list) of the features in the supplied 184 feature set. This is very useful where we are only interested in the relative frequencies 185 of features. 186 187 @param features a list of features (byte sequences) 188 @returns a compiled scanner model 189 """ 190 feat_index = index(features) 191 192 # Build the actual scanner 193 print "building scanner" 194 scanner = Scanner(features) 195 tk_nextmove, raw_output = scanner.__getstate__() 196 197 # tk_output is the output function of the scanner. It should generate indices into 198 # the feature space directly, as this saves a lookup 199 tk_output = {} 200 for k,v in raw_output.items(): 201 tk_output[k] = tuple(feat_index[f] for f in v) 202 return tk_nextmove, tk_output 203 204 205def index(seq): 206 """ 207 Build an index for a sequence of items. Assumes 208 that the items in the sequence are unique. 209 @param seq the sequence to index 210 @returns a dictionary from item to position in the sequence 211 """ 212 return dict((k,v) for (v,k) in enumerate(seq)) 213 214if __name__ == "__main__": 215 parser = argparse.ArgumentParser() 216 parser.add_argument("input", metavar="INPUT", help="build a scanner for INPUT. If input is a directory, read INPUT/LDfeats") 217 parser.add_argument("-o","--output", help="output scanner to OUTFILE", metavar="OUTFILE") 218 args = parser.parse_args() 219 220 if os.path.isdir(args.input): 221 input_path = os.path.join(args.input, 'LDfeats') 222 else: 223 input_path = args.input 224 225 if args.output: 226 output_path = args.output 227 else: 228 output_path = input_path + '.scanner' 229 230 # display paths 231 print "input path:", input_path 232 print "output path:", output_path 233 234 nb_features = read_features(input_path) 235 tk_nextmove, tk_output = build_scanner(nb_features) 236 scanner = tk_nextmove, tk_output, nb_features 237 238 with open(output_path, 'w') as f: 239 cPickle.dump(scanner, f) 240 print "wrote scanner to {0}".format(output_path) 241