1 /* 2 * ahocorasick.h: Includes all types of ahocorasick library 3 * This file is part of multifast. 4 * 5 Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> 6 7 multifast is free software: you can redistribute it and/or modify 8 it under the terms of the GNU Lesser General Public License as published by 9 the Free Software Foundation, either version 3 of the License, or 10 (at your option) any later version. 11 12 multifast is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU Lesser General Public License for more details. 16 17 You should have received a copy of the GNU Lesser General Public License 18 along with multifast. If not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #ifndef _AC_TYPES_H_ 22 #define _AC_TYPES_H_ 23 24 #define AC_AHOCORASICK_NEW 25 #define AC_PATTRN_MAX_LENGTH 256 26 27 /* reallocation step for AC_NODE_t.matched_patterns */ 28 #define REALLOC_CHUNK_MATCHSTR 8 29 30 /* reallocation step for AC_NODE_t.outgoing array */ 31 /* Must be a multiple of __SIZEOF_LONG__ ! */ 32 #define REALLOC_CHUNK_OUTGOING 8 33 34 /* AC_ALPHABET_t: 35 * defines the alphabet type. 36 * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal 37 * with streams of other (bigger) types e.g. integers, specific enum, objects. 38 * Although they consists of string of bytes (chars), but using their specific 39 * types for AC_ALPHABET_t will lead to a better performance. so instead of 40 * dealing with strings of chars, we assume dealing with strings of 41 * AC_ALPHABET_t and leave it optional for other developers to define their 42 * own alphabets. 43 **/ 44 typedef char AC_ALPHABET_t; 45 46 /* AC_REP_t: 47 * Provides a more readable representative for a pattern. 48 * because patterns themselves are not always suitable for displaying 49 * (e.g. for hex patterns), we offer this type to improve intelligibility 50 * of output. furthermore, sometimes it is useful, for example while 51 * retrieving patterns from a database, to maintain their identifiers in the 52 * automata for further reference. we provisioned two possible types as a 53 * union for this purpose. you can add your desired type in it. 54 **/ 55 typedef struct { 56 uint32_t number; /* Often used to store procotolId */ 57 uint64_t number64; 58 uint16_t breed, category; 59 uint16_t level, /* Domain level for comparison */ 60 from_start:1, /* match from start of string */ 61 at_end:1, /* match at end of string */ 62 dot:1; /* is domain name */ 63 } AC_REP_t; 64 65 /* AC_PATTERN_t: 66 * This is the pattern type that must be fed into AC automata. 67 * the 'astring' field is not null-terminated, due to it can contain zero 68 * value bytes. the 'length' field determines the number of AC_ALPHABET_t it 69 * carries. the 'representative' field is described in AC_REP_t. despite 70 * 'astring', 'representative' can have duplicate values for different given 71 * AC_PATTERN_t. it is an optional field and you can just fill it with 0. 72 * CAUTION: 73 * Not always the 'astring' points to the correct position in memory. 74 * it is the responsibility of your program to maintain a permanent allocation 75 * for astring field of the added pattern to automata. 76 **/ 77 78 typedef struct 79 { 80 AC_ALPHABET_t * astring; /* String of alphabets */ 81 uint16_t length, /* Length of pattern */ 82 is_existing; /* not union_matchstr */ 83 AC_REP_t rep; /* Representative string (optional) */ 84 } AC_PATTERN_t; 85 86 typedef struct { 87 unsigned short num; /* Number of matched patterns at this node */ 88 unsigned short max; /* Max capacity of allocated memory for matched_patterns */ 89 AC_PATTERN_t patterns[]; 90 } AC_PATTERNS_t; 91 92 93 /* AC_MATCH_t: 94 * Provides the structure for reporting a match event. 95 * a match event occurs when the automata reaches a final node. any final 96 * node can match one or more pattern at a position in a text. the 97 * 'patterns' field holds these matched patterns. obviously these 98 * matched patterns have same end-position in the text. there is a relationship 99 * between matched patterns: the shorter one is a factor (tail) of the longer 100 * one. the 'position' maintains the end position of matched patterns. the 101 * start position of patterns could be found by knowing their 'length' in 102 * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at 103 * position 40 in the text, then the start position of them are 34 and 36 104 * respectively. finally the field 'match_num' maintains the number of 105 * matched patterns. 106 **/ 107 108 typedef struct 109 { 110 AC_PATTERN_t *matched[4], /* for ac_automata_exact_match() */ 111 *last; /* for callback */ 112 AC_PATTERN_t *patterns; /* Array of matched pattern */ 113 unsigned int match_map; /* Matched patterns (bitmap) */ 114 unsigned int position; /* The end position of matching pattern(s) in the text */ 115 unsigned short int match_num; /* Number of matched patterns */ 116 unsigned short int match_counter; /* Counter of found matches */ 117 } AC_MATCH_t; 118 119 /* AC_TEXT_t: 120 * The input text type that is fed to ac_automata_search() to be searched. 121 * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input 122 * text, but for the purpose of being more readable, we defined this new type. 123 **/ 124 typedef struct 125 { 126 AC_MATCH_t match; 127 AC_ALPHABET_t * astring; /* String of alphabets */ 128 unsigned short int length, /* Length of string */ 129 option; /* AC_FEATURE_LC | AC_FEATURE_DEBUG */; 130 } AC_TEXT_t; 131 132 133 /* AC_ERROR_t: 134 * Error that may occur while adding a pattern to the automata. 135 * it is returned by ac_automata_add(). 136 **/ 137 typedef enum 138 { 139 ACERR_SUCCESS = 0, /* No error occurred */ 140 ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */ 141 ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */ 142 ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */ 143 ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling 144 ac_automata_finalize() you can not add new patterns to the automata. */ 145 ACERR_ERROR, /* common error */ 146 } AC_ERROR_t; 147 148 /* MATCH_CALLBACK_t: 149 * This is the call-back function type that must be given to automata at 150 * initialization to report match occurrence to the caller. 151 * at a match event, the automata will reach you using this function and sends 152 * you a pointer to AC_MATCH_t. using that pointer you can handle 153 * matches. you can send parameters to the call-back function when you call 154 * ac_automata_search(). at call-back, the automata will sent you those 155 * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside 156 * the call-back function you can cast it to whatever you want. 157 * If you return 0 from MATCH_CALLBACK_t function to the automata, it will 158 * continue searching, otherwise it will return from ac_automata_search() 159 * to your calling function. 160 **/ 161 typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *); 162 163 /* AC_PATTRN_MAX_LENGTH: 164 * Maximum acceptable pattern length in AC_PATTERN_t.length 165 **/ 166 167 /* Forward Declaration */ 168 struct edge; 169 170 /* 171 * automata node 172 * 4 pointers + 8 bytes : 40/24 bytes for 64/32 bit 173 */ 174 typedef struct ac_node 175 { 176 int id; /* Node ID : set after finalize(), only for ac_automata_dump */ 177 AC_ALPHABET_t one_alpha, 178 one:1, /* one_char: yes/no */ 179 range:1, /* range symbols start from one_alpha */ 180 root:1, /* is root node */ 181 final:1, /* 0: no ; 1: yes, it is a final node */ 182 use:1, /* use: yes/no */ 183 ff:1; /* finalized node */ 184 unsigned short depth; /* depth: distance between this node and the root */ 185 186 AC_PATTERNS_t *matched_patterns; /* Array of matched patterns */ 187 struct edge *outgoing; /* Array of outgoing edges */ 188 189 struct ac_node *failure_node; /* The failure node of this node */ 190 AC_ALPHABET_t *a_ptr; 191 } AC_NODE_t; 192 193 struct edge { 194 unsigned short degree; /* Number of outgoing edges */ 195 unsigned short max; /* Max capacity of allocated memory for outgoing */ 196 uint32_t cmap[8]; /* 256 bit */ 197 AC_NODE_t *next[]; 198 /* 199 * first N elements used for 'next' pointers + 200 * M elements used for symbols storage 201 * M = (max + sizeof(void*)-1) & ( ~(sizeof(void*)-1)) 202 * 203 * if sizeof(void*)==8 204 * for max = 8 we must alloc next[9]; 205 * for max = 16 we must alloc next[18]; 206 * 207 */ 208 }; 209 210 struct ac_path { 211 AC_NODE_t * n; 212 unsigned short int idx,l; 213 }; 214 215 typedef struct 216 { 217 /* The root of the Aho-Corasick trie */ 218 AC_NODE_t * root; 219 MATCH_CALLBACK_f match_handler; 220 unsigned int all_nodes_num; /* Number of all nodes in the automata */ 221 222 /* this flag indicates that if automata is finalized by 223 * ac_automata_finalize() or not. 1 means finalized and 0 224 * means not finalized (is open). after finalizing automata you can not 225 * add pattern to automata anymore. */ 226 unsigned short automata_open, 227 to_lc:1, no_root_range:1,debug:1; /* lowercase match */ 228 229 /* Statistic Variables */ 230 unsigned long total_patterns; /* Total patterns in the automata */ 231 232 unsigned long max_str_len; /* largest pattern length. Update by ac_automata_finalize() */ 233 234 struct ac_path ac_path[AC_PATTRN_MAX_LENGTH+4]; /* for ac_automata_walk */ 235 int id; /* node id */ 236 int add_to_range; /* for convert to range */ 237 int n_oc,n_range,n_find; /* statistics */ 238 char name[32]; /* if debug != 0 */ 239 } AC_AUTOMATA_t; 240 241 typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *); 242 243 typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *); 244 245 #define AC_FEATURE_DEBUG 1 246 #define AC_FEATURE_LC 2 247 #define AC_FEATURE_NO_ROOT_RANGE 4 248 249 AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc); 250 AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature); 251 AC_ERROR_t ac_automata_name (AC_AUTOMATA_t * thiz, char *name, int debug); 252 AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str); 253 AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz); 254 AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb, 255 ALPHA_CALLBACK_f alpha_cb, void *); 256 257 int ac_automata_search (AC_AUTOMATA_t * thiz, 258 AC_TEXT_t * str, 259 AC_REP_t * param); 260 int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *); 261 void ac_automata_clean (AC_AUTOMATA_t * thiz); 262 void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern); 263 #ifndef __KERNEL__ 264 /* Global debug control. */ 265 void ac_automata_enable_debug (int debug); 266 /* See man open_memstream() for get result as string */ 267 void ac_automata_dump (AC_AUTOMATA_t * thiz, FILE *); 268 #endif 269 #endif 270