1 /*
2  * ahocorasick.h: Includes all types of ahocorasick library
3  * This file is part of multifast.
4  *
5  Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
6 
7  multifast is free software: you can redistribute it and/or modify
8  it under the terms of the GNU Lesser General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  multifast is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  GNU Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public License
18  along with multifast.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #ifndef _AC_TYPES_H_
22 #define _AC_TYPES_H_
23 
24 #define AC_AHOCORASICK_NEW
25 #define AC_PATTRN_MAX_LENGTH 256
26 
27 /* reallocation step for AC_NODE_t.matched_patterns */
28 #define REALLOC_CHUNK_MATCHSTR 8
29 
30 /* reallocation step for AC_NODE_t.outgoing array */
31 /* Must be a multiple of __SIZEOF_LONG__ ! */
32 #define REALLOC_CHUNK_OUTGOING 8
33 
34 /* AC_ALPHABET_t:
35  * defines the alphabet type.
36  * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal
37  * with streams of other (bigger) types e.g. integers, specific enum, objects.
38  * Although they consists of string of bytes (chars), but using their specific
39  * types for AC_ALPHABET_t will lead to a better performance. so instead of
40  * dealing with strings of chars, we assume dealing with strings of
41  * AC_ALPHABET_t and leave it optional for other developers to define their
42  * own alphabets.
43  **/
44 typedef char AC_ALPHABET_t;
45 
46 /* AC_REP_t:
47  * Provides a more readable representative for a pattern.
48  * because patterns themselves are not always suitable for displaying
49  * (e.g. for hex patterns), we offer this type to improve intelligibility
50  * of output. furthermore, sometimes it is useful, for example while
51  * retrieving patterns from a database, to maintain their identifiers in the
52  * automata for further reference. we provisioned two possible types as a
53  * union for this purpose. you can add your desired type in it.
54  **/
55 typedef struct {
56   uint32_t number; /* Often used to store procotolId */
57   uint64_t number64;
58   uint16_t breed, category;
59   uint16_t        level,        /* Domain level for comparison */
60                   from_start:1, /* match from start of string */
61                   at_end:1,     /* match at end of string */
62                   dot:1;        /* is domain name */
63 } AC_REP_t;
64 
65 /* AC_PATTERN_t:
66  * This is the pattern type that must be fed into AC automata.
67  * the 'astring' field is not null-terminated, due to it can contain zero
68  * value bytes. the 'length' field determines the number of AC_ALPHABET_t it
69  * carries. the 'representative' field is described in AC_REP_t. despite
70  * 'astring', 'representative' can have duplicate values for different given
71  * AC_PATTERN_t. it is an optional field and you can just fill it with 0.
72  * CAUTION:
73  * Not always the 'astring' points to the correct position in memory.
74  * it is the responsibility of your program to maintain a permanent allocation
75  * for astring field of the added pattern to automata.
76  **/
77 
78 typedef struct
79 {
80   AC_ALPHABET_t * astring; /* String of alphabets */
81   uint16_t        length,  /* Length of pattern */
82 	          is_existing; /* not union_matchstr */
83   AC_REP_t rep;   /* Representative string (optional) */
84 } AC_PATTERN_t;
85 
86 typedef struct {
87   unsigned short num; /* Number of matched patterns at this node */
88   unsigned short max; /* Max capacity of allocated memory for matched_patterns */
89   AC_PATTERN_t	patterns[];
90 } AC_PATTERNS_t;
91 
92 
93 /* AC_MATCH_t:
94  * Provides the structure for reporting a match event.
95  * a match event occurs when the automata reaches a final node. any final
96  * node can match one or more pattern at a position in a text. the
97  * 'patterns' field holds these matched patterns. obviously these
98  * matched patterns have same end-position in the text. there is a relationship
99  * between matched patterns: the shorter one is a factor (tail) of the longer
100  * one. the 'position' maintains the end position of matched patterns. the
101  * start position of patterns could be found by knowing their 'length' in
102  * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at
103  * position 40 in the text, then the start position of them are 34 and 36
104  * respectively. finally the field 'match_num' maintains the number of
105  * matched patterns.
106  **/
107 
108 typedef struct
109 {
110   AC_PATTERN_t *matched[4],   /* for ac_automata_exact_match() */
111 	       *last;         /* for callback              */
112   AC_PATTERN_t *patterns;     /* Array of matched pattern  */
113   unsigned int  match_map;    /* Matched patterns (bitmap) */
114   unsigned int  position;     /* The end position of matching pattern(s) in the text */
115   unsigned short int match_num;     /* Number of matched patterns */
116   unsigned short int match_counter; /* Counter of found matches */
117 } AC_MATCH_t;
118 
119 /* AC_TEXT_t:
120  * The input text type that is fed to ac_automata_search() to be searched.
121  * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input
122  * text, but for the purpose of being more readable, we defined this new type.
123  **/
124 typedef struct
125 {
126   AC_MATCH_t match;
127   AC_ALPHABET_t * astring;    /* String of alphabets */
128   unsigned short int  length, /* Length of string */
129 	              option; /* AC_FEATURE_LC | AC_FEATURE_DEBUG */;
130 } AC_TEXT_t;
131 
132 
133 /* AC_ERROR_t:
134  * Error that may occur while adding a pattern to the automata.
135  * it is returned by ac_automata_add().
136  **/
137 typedef enum
138   {
139     ACERR_SUCCESS = 0, /* No error occurred */
140     ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */
141     ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */
142     ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */
143     ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling
144 			      ac_automata_finalize() you can not add new patterns to the automata. */
145     ACERR_ERROR, /* common error */
146   } AC_ERROR_t;
147 
148 /* MATCH_CALLBACK_t:
149  * This is the call-back function type that must be given to automata at
150  * initialization to report match occurrence to the caller.
151  * at a match event, the automata will reach you using this function and sends
152  * you a pointer to AC_MATCH_t. using that pointer you can handle
153  * matches. you can send parameters to the call-back function when you call
154  * ac_automata_search(). at call-back, the automata will sent you those
155  * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside
156  * the call-back function you can cast it to whatever you want.
157  * If you return 0 from MATCH_CALLBACK_t function to the automata, it will
158  * continue searching, otherwise it will return from ac_automata_search()
159  * to your calling function.
160  **/
161 typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *);
162 
163 /* AC_PATTRN_MAX_LENGTH:
164  * Maximum acceptable pattern length in AC_PATTERN_t.length
165  **/
166 
167 /* Forward Declaration */
168 struct edge;
169 
170 /*
171  * automata node
172  * 4 pointers + 8 bytes : 40/24 bytes for 64/32 bit
173  */
174 typedef struct ac_node
175 {
176   int id;                              /* Node ID : set after finalize(), only for ac_automata_dump */
177   AC_ALPHABET_t  one_alpha,
178 		 one:1,                /* one_char: yes/no */
179 		 range:1,              /* range symbols start from one_alpha */
180 		 root:1,               /* is root node */
181 	  	 final:1,	       /* 0: no ; 1: yes, it is a final node */
182 		 use:1,	               /* use: yes/no */
183 		 ff:1;		       /* finalized node */
184   unsigned short depth;                /* depth: distance between this node and the root */
185 
186   AC_PATTERNS_t  *matched_patterns;   /* Array of matched patterns */
187   struct edge    *outgoing;           /* Array of outgoing edges */
188 
189   struct ac_node *failure_node;       /* The failure node of this node */
190   AC_ALPHABET_t  *a_ptr;
191 } AC_NODE_t;
192 
193 struct edge {
194   unsigned short degree;      /* Number of outgoing edges */
195   unsigned short max;         /* Max capacity of allocated memory for outgoing */
196   uint32_t       cmap[8];      /* 256 bit */
197   AC_NODE_t	 *next[];
198  /*
199   * first N elements used for 'next' pointers +
200   * M elements used for symbols storage
201   * M = (max + sizeof(void*)-1) & ( ~(sizeof(void*)-1))
202   *
203   * if sizeof(void*)==8
204   * for max = 8  we must alloc next[9];
205   * for max = 16 we must alloc next[18];
206   *
207   */
208 };
209 
210 struct ac_path {
211   AC_NODE_t * n;
212   unsigned short int idx,l;
213 };
214 
215 typedef struct
216 {
217   /* The root of the Aho-Corasick trie */
218   AC_NODE_t * root;
219   MATCH_CALLBACK_f match_handler;
220   unsigned int all_nodes_num; /* Number of all nodes in the automata */
221 
222   /* this flag indicates that if automata is finalized by
223    * ac_automata_finalize() or not. 1 means finalized and 0
224    * means not finalized (is open). after finalizing automata you can not
225    * add pattern to automata anymore. */
226   unsigned short automata_open,
227 		 to_lc:1, no_root_range:1,debug:1; /* lowercase match */
228 
229   /* Statistic Variables */
230   unsigned long total_patterns; /* Total patterns in the automata */
231 
232   unsigned long max_str_len; /* largest pattern length. Update by ac_automata_finalize() */
233 
234   struct ac_path ac_path[AC_PATTRN_MAX_LENGTH+4]; /* for ac_automata_walk */
235   int id;	/* node id */
236   int add_to_range; /* for convert to range */
237   int n_oc,n_range,n_find; /* statistics */
238   char name[32]; /* if debug != 0 */
239 } AC_AUTOMATA_t;
240 
241 typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *);
242 
243 typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *);
244 
245 #define AC_FEATURE_DEBUG 1
246 #define AC_FEATURE_LC 2
247 #define AC_FEATURE_NO_ROOT_RANGE 4
248 
249 AC_AUTOMATA_t * ac_automata_init     (MATCH_CALLBACK_f mc);
250 AC_ERROR_t      ac_automata_feature  (AC_AUTOMATA_t * thiz, unsigned int feature);
251 AC_ERROR_t      ac_automata_name     (AC_AUTOMATA_t * thiz, char *name, int debug);
252 AC_ERROR_t      ac_automata_add      (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str);
253 AC_ERROR_t      ac_automata_finalize (AC_AUTOMATA_t * thiz);
254 AC_ERROR_t      ac_automata_walk     (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb,
255 						ALPHA_CALLBACK_f alpha_cb, void *);
256 
257 int             ac_automata_search   (AC_AUTOMATA_t * thiz,
258 						AC_TEXT_t * str,
259 						AC_REP_t * param);
260 int             ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *);
261 void            ac_automata_clean    (AC_AUTOMATA_t * thiz);
262 void            ac_automata_release  (AC_AUTOMATA_t * thiz, uint8_t free_pattern);
263 #ifndef __KERNEL__
264 /* Global debug control. */
265 void            ac_automata_enable_debug (int debug);
266 /* See man open_memstream() for get result as string */
267 void            ac_automata_dump     (AC_AUTOMATA_t * thiz, FILE *);
268 #endif
269 #endif
270