1#!/usr/bin/env python
2"""
3scanner.py -
4Assemble a "feature scanner" using Aho-Corasick string matching.
5This takes a list of features (byte sequences) and builds a DFA
6that when run on a byte stream can identify how often each of
7the features is present in a single pass over the stream.
8
9Marco Lui, January 2013
10
11Copyright 2013 Marco Lui <saffsd@gmail.com>. All rights reserved.
12
13Redistribution and use in source and binary forms, with or without modification, are
14permitted provided that the following conditions are met:
15
16   1. Redistributions of source code must retain the above copyright notice, this list of
17      conditions and the following disclaimer.
18
19   2. Redistributions in binary form must reproduce the above copyright notice, this list
20      of conditions and the following disclaimer in the documentation and/or other materials
21      provided with the distribution.
22
23THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR IMPLIED
24WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
25FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
26CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33The views and conclusions contained in the software and documentation are those of the
34authors and should not be interpreted as representing official policies, either expressed
35or implied, of the copyright holder.
36"""
37
38import cPickle
39import os, sys, argparse
40import array
41from collections import deque, defaultdict
42from common import read_features
43
44class Scanner(object):
45  alphabet = map(chr, range(1<<8))
46  """
47  Implementation of Aho-Corasick string matching.
48  This class should be instantiated with a set of keywords, which
49  will then be the only tokens generated by the class's search method,
50  """
51  @classmethod
52  def from_file(cls, path):
53    with open(path) as f:
54      tk_nextmove, tk_output, feats = cPickle.load(f)
55    if isinstance(feats, dict):
56      # The old scanner format had two identical dictionaries as the last
57      # two items in the tuple. This format can still be used by langid.py,
58      # but it does not carry the feature list, and so cannot be unpacked
59      # back into a Scanner object.
60      raise ValueError("old format scanner - please retrain. see code for details.")
61    # tk_output is a mapping from state to a list of feature indices.
62    # because of the way the scanner class is written, it needs a mapping
63    # from state to the feature itself. We rebuild this here.
64    tk_output_f = dict( (k,[feats[i] for i in v]) for k,v in tk_output.iteritems() )
65    scanner = cls.__new__(cls)
66    scanner.__setstate__((tk_nextmove, tk_output_f))
67    return scanner
68
69  def __init__(self, keywords):
70    self.build(keywords)
71
72  def __call__(self, value):
73    return self.search(value)
74
75  def build(self, keywords):
76    goto = dict()
77    fail = dict()
78    output = defaultdict(set)
79
80    # Algorithm 2
81    newstate = 0
82    for a in keywords:
83      state = 0
84      j = 0
85      while (j < len(a)) and (state, a[j]) in goto:
86        state = goto[(state, a[j])]
87        j += 1
88      for p in range(j, len(a)):
89        newstate += 1
90        goto[(state, a[p])] = newstate
91        #print "(%d, %s) -> %d" % (state, a[p], newstate)
92        state = newstate
93      output[state].add(a)
94    for a in self.alphabet:
95      if (0,a) not in goto:
96        goto[(0,a)] = 0
97
98    # Algorithm 3
99    queue = deque()
100    for a in self.alphabet:
101      if goto[(0,a)] != 0:
102        s = goto[(0,a)]
103        queue.append(s)
104        fail[s] = 0
105    while queue:
106      r = queue.popleft()
107      for a in self.alphabet:
108        if (r,a) in goto:
109          s = goto[(r,a)]
110          queue.append(s)
111          state = fail[r]
112          while (state,a) not in goto:
113            state = fail[state]
114          fail[s] = goto[(state,a)]
115          #print "f(%d) -> %d" % (s, goto[(state,a)]), output[fail[s]]
116          if output[fail[s]]:
117            output[s].update(output[fail[s]])
118
119    # Algorithm 4
120    self.nextmove = {}
121    for a in self.alphabet:
122      self.nextmove[(0,a)] = goto[(0,a)]
123      if goto[(0,a)] != 0:
124        queue.append(goto[(0,a)])
125    while queue:
126      r = queue.popleft()
127      for a in self.alphabet:
128        if (r,a) in goto:
129          s = goto[(r,a)]
130          queue.append(s)
131          self.nextmove[(r,a)] = s
132        else:
133          self.nextmove[(r,a)] = self.nextmove[(fail[r],a)]
134
135    # convert the output to tuples, as tuple iteration is faster
136    # than set iteration
137    self.output = dict((k, tuple(output[k])) for k in output)
138
139    # Next move encoded as a single array. The index of the next state
140    # is located at current state * alphabet size  + ord(c).
141    # The choice of 'H' array typecode limits us to 64k states.
142    def generate_nm_arr(typecode):
143      def nextstate_iter():
144        # State count starts at 0, so the number of states is the number of i
145        # the last state (newstate) + 1
146        for state in xrange(newstate+1):
147          for letter in self.alphabet:
148            yield self.nextmove[(state, letter)]
149      return array.array(typecode, nextstate_iter())
150    try:
151      self.nm_arr = generate_nm_arr('H')
152    except OverflowError:
153      # Could not fit in an unsigned short array, let's try an unsigned long array.
154      self.nm_arr = generate_nm_arr('L')
155
156  def __getstate__(self):
157    """
158    Compiled nextmove and output.
159    """
160    return (self.nm_arr, self.output)
161
162  def __setstate__(self, value):
163    nm_array, output = value
164    self.nm_arr = nm_array
165    self.output = output
166    self.nextmove = {}
167    for i, next_state in enumerate(nm_array):
168      state = i / 256
169      letter = chr(i % 256)
170      self.nextmove[(state, letter)] = next_state
171
172  def search(self, string):
173    state = 0
174    for letter in map(ord,string):
175      state = self.nm_arr[(state << 8) + letter]
176      for key in self.output.get(state, []):
177        yield key
178
179def build_scanner(features):
180  """
181  In difference to the Scanner class, this function unwraps a layer of indirection in
182  the detection of features. It translates the string output of the scanner's output
183  mapping into the index values (positions in the list) of the features in the supplied
184  feature set. This is very useful where we are only interested in the relative frequencies
185  of features.
186
187  @param features a list of features (byte sequences)
188  @returns a compiled scanner model
189  """
190  feat_index = index(features)
191
192  # Build the actual scanner
193  print "building scanner"
194  scanner = Scanner(features)
195  tk_nextmove, raw_output = scanner.__getstate__()
196
197  # tk_output is the output function of the scanner. It should generate indices into
198  # the feature space directly, as this saves a lookup
199  tk_output = {}
200  for k,v in raw_output.items():
201    tk_output[k] = tuple(feat_index[f] for f in v)
202  return tk_nextmove, tk_output
203
204
205def index(seq):
206  """
207  Build an index for a sequence of items. Assumes
208  that the items in the sequence are unique.
209  @param seq the sequence to index
210  @returns a dictionary from item to position in the sequence
211  """
212  return dict((k,v) for (v,k) in enumerate(seq))
213
214if __name__ == "__main__":
215  parser = argparse.ArgumentParser()
216  parser.add_argument("input", metavar="INPUT", help="build a scanner for INPUT. If input is a directory, read INPUT/LDfeats")
217  parser.add_argument("-o","--output", help="output scanner to OUTFILE", metavar="OUTFILE")
218  args = parser.parse_args()
219
220  if os.path.isdir(args.input):
221    input_path = os.path.join(args.input, 'LDfeats')
222  else:
223    input_path = args.input
224
225  if args.output:
226    output_path = args.output
227  else:
228    output_path = input_path + '.scanner'
229
230  # display paths
231  print "input path:", input_path
232  print "output path:", output_path
233
234  nb_features = read_features(input_path)
235  tk_nextmove, tk_output = build_scanner(nb_features)
236  scanner = tk_nextmove, tk_output, nb_features
237
238  with open(output_path, 'w') as f:
239    cPickle.dump(scanner, f)
240  print "wrote scanner to {0}".format(output_path)
241