1 2 /* 3 * Copyright 2017 Laurent Farhi 4 * Contact: lfarhi@sfr.fr 5 * 6 * This file is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU Lesser General Public License as published by 8 * the Free Software Foundation, version 3 of the License. 9 * 10 * This file is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public License 16 * along with this file. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #ifndef __ACM_TEMPLATE__ 20 21 # define __ACM_TEMPLATE__ 22 23 /// User interface ************************************************************************ 24 25 /// Texts and keywprds are composed of symbols of type T. 26 /// T can be any standard type (int, char, wchar_t, ...) or any user defined type (such as a structure). 27 /// It can be declared and defined in global scope by: 28 /// #include "aho_corasick_template_impl.h" 29 /// ACM_DECLARE (T) 30 /// ACM_DEFINE (T) 31 /// 32 /// A destructor, and a copy constructor can be declared for type T if required. 33 /// Type for destructor is: void (*destructor) (const T) 34 # define DESTRUCTOR_TYPE(T) DESTROY_##T##_TYPE 35 36 /// Type for constructor is: T (*constructor) (const T) 37 # define COPY_CONSTRUCTOR_TYPE(T) COPY_##T##_TYPE 38 39 /// Type for equality operator is: int (*equal_operator) (const T, const T) 40 # define EQ_OPERATOR_TYPE(T) EQ_##T##_TYPE 41 42 /// SET_DESTRUCTOR optionally declares a destructor for type T. 43 /// Example: SET_DESTRUCTOR (mytype, mydestructor); 44 # define SET_DESTRUCTOR(T, destructor) do { DESTROY_##T = (destructor) ; } while (0) 45 46 /// SET_COPY_CONSTRUCTOR optionally declares a copy constructor for type T. 47 /// Example: SET_COPY_CONSTRUCTOR (mytype, myconstructor); 48 # define SET_COPY_CONSTRUCTOR(T, constructor) do { COPY_##T = (constructor) ; } while (0) 49 50 /// SET_EQ_OPERATOR optionally declares equality operator for type T. 51 /// A user defined equality operator can be declared for type T if needed. 52 /// A default equality operator (memcmp) is used otherwise. 53 /// Example: static int nocaseeq (wchar_t k, wchar_t t) { return k == towlower (t); } 54 /// SET_EQ_OPERATOR (wchar_t, nocaseeq); 55 # define SET_EQ_OPERATOR(T, equal_operator) do { EQ_##T = (equal_operator) ; } while (0) 56 57 /// ACState (T) is the type of a Aho-Corasick state machine for type T 58 # define ACState(T) ACState_##T 59 60 /// ACMachine (T) is the type of the Aho-Corasick finite state machine for type T 61 # define ACMachine(T) ACMachine_##T 62 63 /// ACMachine (T) *ACM_create (T, [equality_operator], [copy constructor], [destructor]) 64 /// Creates a Aho-Corasick finite state machine for type T. 65 /// @param [in] T type of symbols composing keywords and text to be parsed. 66 /// @param [in, optional] equality_operator Equality operator of type EQ_OPERATOR_TYPE(T). 67 /// @param [in, optional] copy constructor Copy constructor of type COPY_CONSTRUCTOR_TYPE(T). 68 /// @param [in, optional] destructor Destructor of type DESTRUCTOR_TYPE(T). 69 /// @returns A pointer to a Aho-Corasick machine for type T. 70 /// Example: ACMachine (char) * M = ACM_create (char); 71 /// Note: ACM_create accepts optional arguments thanks to the use of the VFUNC macro (see below). 72 # define ACM_create(...) VFUNC(ACM_create, __VA_ARGS__) 73 74 /// void ACM_release (const ACMachine (T) *machine) 75 /// Releases the ressources of a Aho-Corasick machine created with ACM_create. 76 /// @param [in] machine A pointer to a Aho-Corasick machine to be realeased. 77 /// Example: ACM_release (M); 78 # define ACM_release(machine) (machine)->vtable->release ((machine)) 79 80 /// Keyword (T) is the type of a keyword composed of symbols of type T. 81 /// Exemple: Keyword (char) kw; 82 # define Keyword(T) Keyword_##T 83 84 /// void ACM_KEYWORD_SET (Keyword(T) kw, T* array, size_t length) 85 /// Initializes a keyword from an array of symbols 86 /// @param [in] kw Keyword of symbols of type T. 87 /// @param [in] array Array of symbols 88 /// @param [in] length Length of the array 89 /// Note: The array is NOT duplicated by ACM_KEYWORD_SET and should be allocated by the calling user program. 90 /// Exemple: ACM_KEYWORD_SET (kw, "Duck", 4); 91 # define ACM_KEYWORD_SET(keyword,symbols,length) do { ACM_MATCH_SYMBOLS (keyword) = (symbols); ACM_MATCH_LENGTH (keyword) = (length); } while (0) 92 93 /// int ACM_register_keyword(ACMachine(T) *machine, Keyword(T) kw, [void * value_ptr], [void (*destructor) (void *)]) 94 /// Registers a keyword in the Aho-Corasick machine. 95 /// @param [in] machine A pointer to a Aho-Corasick machine. 96 /// @param [in] kw Keyword of symbols of type T to be registered. 97 /// @param [in, optional] value_ptr Pointer to a previously allocated value to associate with keyword kw. 98 /// @param [in, optional] destructor A destructor to be used to free the value pointed by value_ptr. 99 /// The default destructor is the standard library function `free. 100 /// Use `0` if the allocated value need not be managed by the finite state machine 101 /// (in case of automatic or static values). 102 /// @return 1 if the keyword was successfully registered, 0 otherwise (if the keyword is empty). 103 /// Note: When returning 0, the destructor, if any, is called on value, if any. 104 /// Note: If the keywpord is already registered in the machine, its associated value is forgotten and replaced by the new value. 105 /// Note: Keyword kw is duplicated and can be released after its registration. 106 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared. 107 /// Note: The keyword is registered together with its rank. 108 /// The rank of the registered keyword is the number of times ACM_register_keyword was previously called 109 /// since the machine was created. The rank is a 0-based sequence number. 110 /// This rank can later be retrieved by ACM_get_match. 111 /// Example: ACM_register_keyword (M, kw); 112 /// ACM_register_keyword (M, kw, calloc (1, sizeof (int)), free); 113 # define ACM_register_keyword(...) VFUNC(ACM_register_keyword, __VA_ARGS__) 114 115 /// int ACM_is_registered_keyword (const ACMachine(T) * machine, Keyword(T) kw, [void **value_ptr]) 116 /// Checks whether a keyword is already registered in the machine. 117 /// @param [in] machine A pointer to a Aho-Corasick machine. 118 /// @param [in] kw Keyword of symbols of type T to be checked. 119 /// @param [out, optional] value_ptr *value_ptr is set to the pointer of the value associated to the keyword after the call. 120 /// @return 1 if the keyword is registered in the machine, 0 otherwise. 121 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared. 122 # define ACM_is_registered_keyword(...) VFUNC(ACM_is_registered_keyword, __VA_ARGS__) 123 124 /// int ACM_unregister_keyword (ACMachine(T) *machine, Keyword(T) kw) 125 /// Unregisters a keyword from the Aho-Corasick machine. 126 /// @param [in] machine A pointer to a Aho-Corasick machine. 127 /// @param [in] kw Keyword of symbols of type T to be registered. 128 /// @return 1 if the keyword was successfully unregistered, 0 otherwise (the keywpord is not registered in the machine). 129 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared. 130 # define ACM_unregister_keyword(machine, keyword) (machine)->vtable->unregister_keyword ( (machine), (keyword)) 131 132 /// size_t ACM_nb_keywords (const ACMachine(T) *machine) 133 /// Returns the number of keywords registered in the machine. 134 /// @param [in] machine A pointer to a Aho-Corasick machine. 135 /// @return The number of keywords registered in the machine. 136 # define ACM_nb_keywords(machine) (machine)->vtable->nb_keywords ((machine)) 137 138 /// MatchHolder (T) is the type of a match composed of symbols of type T. 139 /// Exemple: MatchHolder (char) match; 140 # define MatchHolder(T) MatchHolder_##T 141 142 /// size_t ACM_MATCH_LENGTH (MatchHolder(T) match) 143 /// Returns the length of a matching keyword. 144 /// @param [in] match A matching keyword. 145 /// @return The length of the matching keyword. 146 /// Note: This function can also be applied to a keyword of type Keyword(T). 147 # define ACM_MATCH_LENGTH(match) ((match).length) 148 149 /// T* ACM_MATCH_SYMBOLS (MatchHolder(T) match) 150 /// Returns the array to the symbols of a matching keyword. 151 /// @param [in] match A matching keyword. 152 /// @return The array to the symbols of the matching keyword. 153 /// Note: This function can also be applied to a keyword of type Keyword(T). 154 # define ACM_MATCH_SYMBOLS(match) ((match).letter) 155 156 /// size_t ACM_MATCH_UID (MatchHolder(T) match) 157 /// Returns the unique id of a matching keyword. 158 /// @param [in] match A matching keyword returned by a previous call to `ACM_get_match`. 159 /// @return The unique id of the matching keyword. 160 # define ACM_MATCH_UID(match) ((match).rank) 161 162 /// void ACM_foreach_keyword (const ACMachine(T) * machine, void (*operator) (MatchHolder(T) kw, void *value)) 163 /// Applies an operator to each registered keyword (by `ACM_register_keyword`) in the machine. 164 /// @param [in] machine A pointer to a Aho-Corasick machine. 165 /// @param [in] operator Function of type void (*operator) (Keyword (T), void *) 166 /// Note: The operator is called for each registered keyword and pointer to associated value successively. 167 /// Note: The order the keywords are processed in unspecified. 168 /// Exemple: static void print_match (MatchHolder (wchar_t) match, void *value) { /* user code here */ } 169 /// ACM_foreach_keyword (M, print_match); 170 # define ACM_foreach_keyword(machine, operator) (machine)->vtable->foreach_keyword ((machine), (operator)) 171 172 /// const ACState (T) * ACM_reset (ACMachine(T) * machine) 173 /// Get a valid state, ignoring all the symbols previously matched by ACM_match. 174 /// @param [in] machine A pointer to a Aho-Corasick machine. 175 /// @param [in] state A pointer to a valid Aho-Corasick machine state. 176 /// Note: Several calls to ACM_reset on the same machine can be used to 177 /// parse several texts concurrently (e.g. by several threads). 178 # define ACM_reset(machine) (machine)->vtable->reset ((machine)) 179 180 # define ACM_print(machine, stream, printer) (machine)->vtable->print ((machine), (stream), (printer)) 181 182 /// size_t ACM_match (const ACState(T) *& state, T letter) 183 /// This is the main function used to parse a text, one symbol after the other, and search for pattern matching. 184 /// Get the next state matching a symbol injected in the finite state machine. 185 /// @param [in, out] state A pointer to a valid Aho-Corasick machine state. Argument passed by reference. 186 /// @param [in] letter A symbol. 187 /// @return The number of registered keywords that match a sequence of last letters sent to the last calls to `ACM_match`. 188 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared. 189 /// Note: The optional argument `nb_matches` avoids the call to ACM_nb_matches. 190 /// Note: `state` is passed by reference. It is modified by the function. 191 /// Usage: size_t nb = ACM_match(state, letter); 192 # define ACM_match(state, letter) (state)->vtable->match(&(state), (letter)) 193 194 /// void ACM_MATCH_INIT (MatchHolder(T) match) 195 /// Initializes a match before its first use by ACM_get_match. 196 /// @param [in] match A match 197 /// Exemple: ACM_MATCH_INIT (match); 198 /// Note: this function should only be applied to a matching keyword which reference is passed to ACM_get_match. 199 # define ACM_MATCH_INIT(match) ACM_KEYWORD_SET((match), 0, ((match).rank = 0)) 200 201 /// size_t ACM_get_match (const ACState(T) * state, size_t index, [MatchHolder(T) * match], [void **value_ptr]) 202 /// Gets the ith keyword matching with the last symbols. 203 /// @param [in] state A pointer to a valid Aho-Corasick machine state. 204 /// @param [in] index Index (ith) of the ith matching keyword. 205 /// @param [out, optional] match *match is set to the ith matching keyword. 206 /// @param [out, optional] value_ptr *value_ptr is set to the pointer of the value associated to the keyword after the call. 207 /// @return The rank (unique id) of the ith matching keyword. 208 /// Note: index must be lower than value returned by the last call to ACM_match. 209 /// ?ote: *match should have been initialized by ACM_MATCH_INIT before use. 210 /// Exemple: size_t rank = ACM_get_match (state, j, &match, 0); 211 # define ACM_get_match(...) VFUNC(ACM_get_match, __VA_ARGS__) 212 213 /// void ACM_MATCH_RELEASE (MatchHolder(T) match) 214 /// Releases a match after its last use by ACM_get_match. 215 /// @param [in] match A match 216 /// Exemple: ACM_MATCH_RELEASE (match); 217 /// Note: This function should only be applied to a matching keyword which reference is passed to `ACM_get_match`. 218 /// It should not ne applied to a keyword of type Keyword(T). 219 # define ACM_MATCH_RELEASE(match) do { free (ACM_MATCH_SYMBOLS (match)); ACM_MATCH_INIT (match); } while (0) 220 221 /// Internal declarations ******************************************************************** 222 223 // BEGIN VFUNC 224 // Credits: VFUNC is a macro for overloading on number (but not types) of arguments. 225 // See https://stackoverflow.com/questions/11761703/overloading-macro-on-number-of-arguments 226 # define __NARG__(...) __NARG_I_(__VA_ARGS__,__RSEQ_N()) 227 # define __NARG_I_(...) __ARG_N(__VA_ARGS__) 228 # define __ARG_N( \ 229 _1, _2, _3, _4, _5, _6, _7, _8, _9,_10, \ 230 _11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \ 231 _21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \ 232 _31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \ 233 _41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \ 234 _51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \ 235 _61,_62,_63,N,...) N 236 # define __RSEQ_N() \ 237 63,62,61,60, \ 238 59,58,57,56,55,54,53,52,51,50, \ 239 49,48,47,46,45,44,43,42,41,40, \ 240 39,38,37,36,35,34,33,32,31,30, \ 241 29,28,27,26,25,24,23,22,21,20, \ 242 19,18,17,16,15,14,13,12,11,10, \ 243 9,8,7,6,5,4,3,2,1,0 244 245 # define _VFUNC_(name, n) name##n 246 # define _VFUNC(name, n) _VFUNC_(name, n) 247 # define VFUNC(func, ...) _VFUNC(func, __NARG__(__VA_ARGS__)) (__VA_ARGS__) 248 // END VFUNC 249 250 // BEGIN DECLARE_ACM 251 # define ACM_DECLARE(T) \ 252 \ 253 typedef T (*COPY_##T##_TYPE) (const T); \ 254 typedef void (*DESTROY_##T##_TYPE) (const T); \ 255 typedef int (*EQ_##T##_TYPE) (const T, const T); \ 256 \ 257 typedef struct \ 258 { \ 259 T *letter; /* An array of symbols */ \ 260 size_t length; /* Length of the array */ \ 261 } Keyword_##T; \ 262 \ 263 typedef struct \ 264 { \ 265 T *letter; /* An array of symbols */ \ 266 size_t length; /* Length of the array */ \ 267 size_t rank; /* Rank of the regidtered keyword */\ 268 } MatchHolder_##T; \ 269 \ 270 struct _ac_state_##T; \ 271 typedef struct _ac_state_##T ACState_##T; \ 272 struct _ac_machine_##T; \ 273 typedef struct _ac_machine_##T ACMachine_##T; \ 274 typedef int (*PRINT_##T##_TYPE) (FILE *, T); \ 275 struct _acs_vtable_##T \ 276 { \ 277 size_t (*match) (const ACState_##T ** state, T letter); \ 278 size_t (*get_match) (const ACState_##T * state, size_t index, MatchHolder_##T * match, void **value); \ 279 }; \ 280 /* A state of the state machine. */ \ 281 struct _ac_state_##T /* [state s] */ \ 282 { \ 283 /* A link to the next states */ \ 284 struct _ac_next_##T \ 285 { \ 286 T letter; /* [a symbol] */ \ 287 struct _ac_state_##T *state; /* [g(s, letter)] */\ 288 } *goto_array; /* next states in the tree of the goto function */\ 289 size_t nb_goto; \ 290 /* A link to the previous states */ \ 291 struct \ 292 { \ 293 size_t i_letter; /* Index of the letter in the goto_array */ \ 294 /* letter = previous.state->goto_array[previous.i_letter].letter */ \ 295 struct _ac_state_##T *state; \ 296 } previous; /* Previous state */\ 297 const struct _ac_state_##T *fail_state; /* [f(s)] */\ 298 int is_matching; /* true if the state matches a keyword. */\ 299 size_t nb_sequence; /* Number of matching keywords (Aho-Corasick : size (output (s)) */\ 300 size_t rank; /* Rank (0-based) of insertion of a keyword in the machine. */\ 301 size_t id; /* state UID */ \ 302 void *value; /* An optional value associated to a state. */\ 303 void (*value_dtor) (void *); /* Destrcutor of the associated value, called a state machine release. */\ 304 ACMachine_##T * machine; \ 305 const struct _acs_vtable_##T *vtable; \ 306 }; \ 307 \ 308 struct _acm_vtable_##T \ 309 { \ 310 int (*register_keyword) (ACMachine_##T * machine, Keyword_##T keyword, void *value, void (*dtor) (void *)); \ 311 int (*is_registered_keyword) (const ACMachine_##T * machine, Keyword_##T keyword, void **value); \ 312 int (*unregister_keyword) (ACMachine_##T * machine, Keyword_##T keyword); \ 313 size_t (*nb_keywords) (const ACMachine_##T * machine); \ 314 void (*foreach_keyword) (const ACMachine_##T * machine, void (*operator) (MatchHolder_##T, void *)); \ 315 void (*release) (const ACMachine_##T * machine); \ 316 const ACState_##T * (*reset) (const ACMachine_##T * machine); \ 317 void (*print) (ACMachine_##T * machine, FILE * stream, PRINT_##T##_TYPE printer); \ 318 }; \ 319 \ 320 struct _ac_machine_##T \ 321 { \ 322 struct _ac_state_##T *state_0; /* state 0 */ \ 323 size_t rank; /* Number of keywords registered in the machine. */\ 324 size_t nb_sequence; /* Number of keywords in the machine. */\ 325 size_t state_counter; \ 326 int reconstruct; \ 327 size_t size; \ 328 pthread_mutex_t lock; \ 329 const struct _acm_vtable_##T *vtable; \ 330 T (*copy) (const T); \ 331 void (*destroy) (const T); \ 332 int (*eq) (const T, const T); \ 333 }; \ 334 \ 335 __attribute__ ((unused)) ACMachine_##T *ACM_create_##T (EQ_##T##_TYPE eq, \ 336 COPY_##T##_TYPE copier, \ 337 DESTROY_##T##_TYPE dtor); \ 338 struct __useless_struct_to_allow_trailing_semicolon__##T##__ 339 // END DECLARE_ACM 340 341 // BEGIN MACROS 342 # define ACM_create4(T, eq, copy, dtor) ACM_create_##T((eq), (copy), (dtor)) 343 # define ACM_create2(T, eq) ACM_create4(T, (eq), 0, 0) 344 # define ACM_create1(T) ACM_create4(T, 0, 0, 0) 345 346 # define ACM_register_keyword4(machine, keyword, value, dtor) (machine)->vtable->register_keyword ((machine), (keyword), (value), (dtor)) 347 # define ACM_register_keyword3(machine, keyword, value) ACM_register_keyword4((machine), (keyword), (value), free) 348 # define ACM_register_keyword2(machine, keyword) ACM_register_keyword4((machine), (keyword), 0, 0) 349 350 # define ACM_is_registered_keyword3(machine, keyword, value) (machine)->vtable->is_registered_keyword ((machine), (keyword), (value)) 351 # define ACM_is_registered_keyword2(machine, keyword) ACM_is_registered_keyword3((machine), (keyword), 0) 352 353 # define ACM_get_match4(state, index, matchholder, value) (state)->vtable->get_match ((state), (index), (matchholder), (value)) 354 # define ACM_get_match3(state, index, matchholder) ACM_get_match4((state), (index), (matchholder), 0) 355 # define ACM_get_match2(state, index) ACM_get_match4((state), (index), 0, 0) 356 357 #if defined(__GNUC__) || defined (__clang__) 358 #define ACM_DECL5(var, T, eq, copy, dtor) \ 359 __attribute__ ((cleanup (ACM_cleanup_##T))) ACMachine_##T var; machine_init_##T (&(var), state_create_##T (), (eq), (copy), (dtor)) 360 #define ACM_DECL3(var, T, eq) ACM_DECL5(var, T, (eq), 0, 0) 361 #define ACM_DECL2(var, T) ACM_DECL3(var, T, 0) 362 #define ACM_DECL(...) VFUNC(ACM_DECL, __VA_ARGS__) 363 #endif 364 // END MACROS 365 366 #endif 367