1 
2 /*
3 * Copyright 2017 Laurent Farhi
4 * Contact: lfarhi@sfr.fr
5 *
6 *  This file is free software: you can redistribute it and/or modify
7 *  it under the terms of the GNU Lesser General Public License as published by
8 *  the Free Software Foundation, version 3 of the License.
9 *
10 *  This file is distributed in the hope that it will be useful,
11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 *  GNU Lesser General Public License for more details.
14 *
15 *  You should have received a copy of the GNU Lesser General Public License
16 *  along with this file.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #ifndef __ACM_TEMPLATE__
20 
21 #  define __ACM_TEMPLATE__
22 
23 /// User interface ************************************************************************
24 
25 /// Texts and keywprds are composed of symbols of type T.
26 /// T can be any standard type (int, char, wchar_t, ...) or any user defined type (such as a structure).
27 /// It can be declared and defined in global scope by:
28 ///   #include "aho_corasick_template_impl.h"
29 ///   ACM_DECLARE (T)
30 ///   ACM_DEFINE (T)
31 ///
32 /// A destructor, and a copy constructor can be declared for type T if required.
33 /// Type for destructor is: void (*destructor) (const T)
34 #  define DESTRUCTOR_TYPE(T)                        DESTROY_##T##_TYPE
35 
36 /// Type for constructor is: T (*constructor) (const T)
37 #  define COPY_CONSTRUCTOR_TYPE(T)                  COPY_##T##_TYPE
38 
39 /// Type for equality operator is: int (*equal_operator) (const T, const T)
40 #  define EQ_OPERATOR_TYPE(T)                       EQ_##T##_TYPE
41 
42 /// SET_DESTRUCTOR optionally declares a destructor for type T.
43 /// Example: SET_DESTRUCTOR (mytype, mydestructor);
44 #  define SET_DESTRUCTOR(T, destructor)             do { DESTROY_##T = (destructor) ; } while (0)
45 
46 /// SET_COPY_CONSTRUCTOR optionally declares a copy constructor for type T.
47 /// Example: SET_COPY_CONSTRUCTOR (mytype, myconstructor);
48 #  define SET_COPY_CONSTRUCTOR(T, constructor)      do { COPY_##T = (constructor) ; } while (0)
49 
50 /// SET_EQ_OPERATOR optionally declares equality operator for type T.
51 /// A user defined equality operator can be declared for type T if needed.
52 /// A default equality operator (memcmp) is used otherwise.
53 /// Example: static int nocaseeq (wchar_t k, wchar_t t) { return k == towlower (t); }
54 ///          SET_EQ_OPERATOR (wchar_t, nocaseeq);
55 #  define SET_EQ_OPERATOR(T, equal_operator)        do { EQ_##T = (equal_operator) ; } while (0)
56 
57 /// ACState (T) is the type of a Aho-Corasick state machine for type T
58 #  define ACState(T)                                ACState_##T
59 
60 /// ACMachine (T) is the type of the Aho-Corasick finite state machine for type T
61 #  define ACMachine(T)                              ACMachine_##T
62 
63 /// ACMachine (T) *ACM_create (T, [equality_operator], [copy constructor], [destructor])
64 /// Creates a Aho-Corasick finite state machine for type T.
65 /// @param [in] T type of symbols composing keywords and text to be parsed.
66 /// @param [in, optional] equality_operator Equality operator of type EQ_OPERATOR_TYPE(T).
67 /// @param [in, optional] copy constructor Copy constructor of type COPY_CONSTRUCTOR_TYPE(T).
68 /// @param [in, optional] destructor Destructor of type DESTRUCTOR_TYPE(T).
69 /// @returns A pointer to a Aho-Corasick machine for type T.
70 /// Example: ACMachine (char) * M = ACM_create (char);
71 /// Note: ACM_create accepts optional arguments thanks to the use of the VFUNC macro (see below).
72 #  define ACM_create(...)                           VFUNC(ACM_create, __VA_ARGS__)
73 
74 /// void ACM_release (const ACMachine (T) *machine)
75 /// Releases the ressources of a Aho-Corasick machine created with ACM_create.
76 /// @param [in] machine A pointer to a Aho-Corasick machine to be realeased.
77 /// Example: ACM_release (M);
78 #  define ACM_release(machine)                      (machine)->vtable->release ((machine))
79 
80 /// Keyword (T) is the type of a keyword composed of symbols of type T.
81 /// Exemple: Keyword (char) kw;
82 #  define Keyword(T)                                Keyword_##T
83 
84 /// void ACM_KEYWORD_SET (Keyword(T) kw, T* array, size_t length)
85 /// Initializes a keyword from an array of symbols
86 /// @param [in] kw Keyword of symbols of type T.
87 /// @param [in] array Array of symbols
88 /// @param [in] length Length of the array
89 /// Note: The array is NOT duplicated by ACM_KEYWORD_SET and should be allocated by the calling user program.
90 /// Exemple: ACM_KEYWORD_SET (kw, "Duck", 4);
91 #  define ACM_KEYWORD_SET(keyword,symbols,length)   do { ACM_MATCH_SYMBOLS (keyword) = (symbols); ACM_MATCH_LENGTH (keyword) = (length); } while (0)
92 
93 /// int ACM_register_keyword(ACMachine(T) *machine, Keyword(T) kw, [void * value_ptr], [void (*destructor) (void *)])
94 /// Registers a keyword in the Aho-Corasick machine.
95 /// @param [in] machine A pointer to a Aho-Corasick machine.
96 /// @param [in] kw Keyword of symbols of type T to be registered.
97 /// @param [in, optional] value_ptr Pointer to a previously allocated value to associate with keyword kw.
98 /// @param [in, optional] destructor A destructor to be used to free the value pointed by value_ptr.
99 ///                                  The default destructor is the standard library function `free.
100 ///                                  Use `0` if the allocated value need not be managed by the finite state machine
101 ///                                  (in case of automatic or static values).
102 /// @return 1 if the keyword was successfully registered, 0 otherwise (if the keyword is empty).
103 /// Note: When returning 0, the destructor, if any, is called on value, if any.
104 /// Note: If the keywpord is already registered in the machine, its associated value is forgotten and replaced by the new value.
105 /// Note: Keyword kw is duplicated and can be released after its registration.
106 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
107 /// Note: The keyword is registered together with its rank.
108 ///       The rank of the registered keyword is the number of times ACM_register_keyword was previously called
109 ///       since the machine was created. The rank is a 0-based sequence number.
110 ///       This rank can later be retrieved by ACM_get_match.
111 /// Example: ACM_register_keyword (M, kw);
112 ///          ACM_register_keyword (M, kw, calloc (1, sizeof (int)), free);
113 #  define ACM_register_keyword(...)                 VFUNC(ACM_register_keyword, __VA_ARGS__)
114 
115 /// int ACM_is_registered_keyword (const ACMachine(T) * machine, Keyword(T) kw, [void **value_ptr])
116 /// Checks whether a keyword is already registered in the machine.
117 /// @param [in] machine A pointer to a Aho-Corasick machine.
118 /// @param [in] kw Keyword of symbols of type T to be checked.
119 /// @param [out, optional] value_ptr *value_ptr is set to the pointer of the value associated to the keyword after the call.
120 /// @return 1 if the keyword is registered in the machine, 0 otherwise.
121 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
122 #  define ACM_is_registered_keyword(...)            VFUNC(ACM_is_registered_keyword, __VA_ARGS__)
123 
124 /// int ACM_unregister_keyword (ACMachine(T) *machine, Keyword(T) kw)
125 /// Unregisters a keyword from the Aho-Corasick machine.
126 /// @param [in] machine A pointer to a Aho-Corasick machine.
127 /// @param [in] kw Keyword of symbols of type T to be registered.
128 /// @return 1 if the keyword was successfully unregistered, 0 otherwise (the keywpord is not registered in the machine).
129 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
130 #  define ACM_unregister_keyword(machine, keyword)  (machine)->vtable->unregister_keyword ( (machine), (keyword))
131 
132 /// size_t ACM_nb_keywords (const ACMachine(T) *machine)
133 /// Returns the number of keywords registered in the machine.
134 /// @param [in] machine A pointer to a Aho-Corasick machine.
135 /// @return The number of keywords registered in the machine.
136 #  define ACM_nb_keywords(machine)                  (machine)->vtable->nb_keywords ((machine))
137 
138 /// MatchHolder (T) is the type of a match composed of symbols of type T.
139 /// Exemple: MatchHolder (char) match;
140 #  define MatchHolder(T)                            MatchHolder_##T
141 
142 /// size_t ACM_MATCH_LENGTH (MatchHolder(T) match)
143 /// Returns the length of a matching keyword.
144 /// @param [in] match A matching keyword.
145 /// @return The length of the matching keyword.
146 /// Note: This function can also be applied to a keyword of type Keyword(T).
147 #  define ACM_MATCH_LENGTH(match)                   ((match).length)
148 
149 /// T* ACM_MATCH_SYMBOLS (MatchHolder(T) match)
150 /// Returns the array to the symbols of a matching keyword.
151 /// @param [in] match A matching keyword.
152 /// @return The array to the symbols of the matching keyword.
153 /// Note: This function can also be applied to a keyword of type Keyword(T).
154 #  define ACM_MATCH_SYMBOLS(match)                  ((match).letter)
155 
156 /// size_t ACM_MATCH_UID (MatchHolder(T) match)
157 /// Returns the unique id of a matching keyword.
158 /// @param [in] match A matching keyword returned by a previous call to `ACM_get_match`.
159 /// @return The unique id of the matching keyword.
160 #  define ACM_MATCH_UID(match)                      ((match).rank)
161 
162 /// void ACM_foreach_keyword (const ACMachine(T) * machine, void (*operator) (MatchHolder(T) kw, void *value))
163 /// Applies an operator to each registered keyword (by `ACM_register_keyword`) in the machine.
164 /// @param [in] machine A pointer to a Aho-Corasick machine.
165 /// @param [in] operator Function of type void (*operator) (Keyword (T), void *)
166 /// Note: The operator is called for each registered keyword and pointer to associated value successively.
167 /// Note: The order the keywords are processed in unspecified.
168 /// Exemple: static void print_match (MatchHolder (wchar_t) match, void *value) { /* user code here */ }
169 ///          ACM_foreach_keyword (M, print_match);
170 #  define ACM_foreach_keyword(machine, operator)    (machine)->vtable->foreach_keyword ((machine), (operator))
171 
172 /// const ACState (T) * ACM_reset (ACMachine(T) * machine)
173 /// Get a valid state, ignoring all the symbols previously matched by ACM_match.
174 /// @param [in] machine A pointer to a Aho-Corasick machine.
175 /// @param [in] state A pointer to a valid Aho-Corasick machine state.
176 /// Note: Several calls to ACM_reset on the same machine can be used to
177 ///       parse several texts concurrently (e.g. by several threads).
178 #  define ACM_reset(machine)                        (machine)->vtable->reset ((machine))
179 
180 #  define ACM_print(machine, stream, printer)       (machine)->vtable->print ((machine), (stream), (printer))
181 
182 /// size_t ACM_match (const ACState(T) *& state, T letter)
183 /// This is the main function used to parse a text, one symbol after the other, and search for pattern matching.
184 /// Get the next state matching a symbol injected in the finite state machine.
185 /// @param [in, out] state A pointer to a valid Aho-Corasick machine state. Argument passed by reference.
186 /// @param [in] letter A symbol.
187 /// @return The number of registered keywords that match a sequence of last letters sent to the last calls to `ACM_match`.
188 /// Note: The equality operator, either associated to the machine, or associated to the type T, is used if declared.
189 /// Note: The optional argument `nb_matches` avoids the call to ACM_nb_matches.
190 /// Note: `state` is passed by reference. It is modified by the function.
191 /// Usage: size_t nb = ACM_match(state, letter);
192 #  define ACM_match(state, letter)                  (state)->vtable->match(&(state), (letter))
193 
194 /// void ACM_MATCH_INIT (MatchHolder(T) match)
195 /// Initializes a match before its first use by ACM_get_match.
196 /// @param [in] match A match
197 /// Exemple: ACM_MATCH_INIT (match);
198 /// Note: this function should only be applied to a matching keyword which reference is passed to ACM_get_match.
199 #  define ACM_MATCH_INIT(match)                     ACM_KEYWORD_SET((match), 0, ((match).rank = 0))
200 
201 /// size_t ACM_get_match (const ACState(T) * state, size_t index, [MatchHolder(T) * match], [void **value_ptr])
202 /// Gets the ith keyword matching with the last symbols.
203 /// @param [in] state A pointer to a valid Aho-Corasick machine state.
204 /// @param [in] index Index (ith) of the ith matching keyword.
205 /// @param [out, optional] match *match is set to the ith matching keyword.
206 /// @param [out, optional] value_ptr *value_ptr is set to the pointer of the value associated to the keyword after the call.
207 /// @return The rank (unique id) of the ith matching keyword.
208 /// Note: index must be lower than value returned by the last call to ACM_match.
209 /// ?ote: *match should have been initialized by ACM_MATCH_INIT before use.
210 /// Exemple: size_t rank = ACM_get_match (state, j, &match, 0);
211 #  define ACM_get_match(...)                        VFUNC(ACM_get_match, __VA_ARGS__)
212 
213 /// void ACM_MATCH_RELEASE (MatchHolder(T) match)
214 /// Releases a match after its last use by ACM_get_match.
215 /// @param [in] match A match
216 /// Exemple: ACM_MATCH_RELEASE (match);
217 /// Note: This function should only be applied to a matching keyword which reference is passed to `ACM_get_match`.
218 ///       It should not ne applied to a keyword of type Keyword(T).
219 #  define ACM_MATCH_RELEASE(match)                  do { free (ACM_MATCH_SYMBOLS (match)); ACM_MATCH_INIT (match); } while (0)
220 
221 /// Internal declarations ********************************************************************
222 
223 // BEGIN VFUNC
224 // Credits: VFUNC is a macro for overloading on number (but not types) of arguments.
225 // See https://stackoverflow.com/questions/11761703/overloading-macro-on-number-of-arguments
226 #  define __NARG__(...)  __NARG_I_(__VA_ARGS__,__RSEQ_N())
227 #  define __NARG_I_(...) __ARG_N(__VA_ARGS__)
228 #  define __ARG_N( \
229       _1, _2, _3, _4, _5, _6, _7, _8, _9,_10, \
230      _11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \
231      _21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \
232      _31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \
233      _41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \
234      _51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \
235      _61,_62,_63,N,...) N
236 #  define __RSEQ_N() \
237      63,62,61,60,                   \
238      59,58,57,56,55,54,53,52,51,50, \
239      49,48,47,46,45,44,43,42,41,40, \
240      39,38,37,36,35,34,33,32,31,30, \
241      29,28,27,26,25,24,23,22,21,20, \
242      19,18,17,16,15,14,13,12,11,10, \
243      9,8,7,6,5,4,3,2,1,0
244 
245 #  define _VFUNC_(name, n) name##n
246 #  define _VFUNC(name, n) _VFUNC_(name, n)
247 #  define VFUNC(func, ...) _VFUNC(func, __NARG__(__VA_ARGS__)) (__VA_ARGS__)
248 // END VFUNC
249 
250 // BEGIN DECLARE_ACM
251 #  define ACM_DECLARE(T)                             \
252 \
253 typedef T (*COPY_##T##_TYPE) (const T);              \
254 typedef void (*DESTROY_##T##_TYPE) (const T);        \
255 typedef int (*EQ_##T##_TYPE) (const T, const T);     \
256 \
257 typedef struct                                       \
258 {                                                    \
259   T *letter;      /* An array of symbols */          \
260   size_t length;  /* Length of the array */          \
261 } Keyword_##T;                                       \
262 \
263 typedef struct                                       \
264 {                                                    \
265   T *letter;      /* An array of symbols */          \
266   size_t length;  /* Length of the array */          \
267   size_t rank;    /* Rank of the regidtered keyword */\
268 } MatchHolder_##T;                                   \
269 \
270 struct _ac_state_##T;                                \
271 typedef struct _ac_state_##T ACState_##T;            \
272 struct _ac_machine_##T;                              \
273 typedef struct _ac_machine_##T ACMachine_##T;        \
274 typedef int (*PRINT_##T##_TYPE) (FILE *, T);         \
275 struct _acs_vtable_##T                               \
276 {                                                    \
277   size_t (*match) (const ACState_##T ** state, T letter);                                                    \
278   size_t (*get_match) (const ACState_##T * state, size_t index, MatchHolder_##T * match, void **value);      \
279 };                                                   \
280 /* A state of the state machine. */                  \
281 struct _ac_state_##T             /* [state s] */     \
282 {                                                    \
283   /* A link to the next states */                    \
284   struct _ac_next_##T                                \
285   {                                                  \
286     T letter;                    /* [a symbol] */    \
287     struct _ac_state_##T *state; /* [g(s, letter)] */\
288   } *goto_array;                 /* next states in the tree of the goto function */\
289   size_t nb_goto;                                    \
290   /* A link to the previous states */                \
291   struct                                             \
292   {                                                  \
293     size_t i_letter; /* Index of the letter in the goto_array */ \
294     /* letter = previous.state->goto_array[previous.i_letter].letter */ \
295     struct _ac_state_##T *state;                     \
296   } previous;                    /* Previous state */\
297   const struct _ac_state_##T *fail_state; /* [f(s)] */\
298   int is_matching; /* true if the state matches a keyword. */\
299   size_t nb_sequence; /* Number of matching keywords (Aho-Corasick : size (output (s)) */\
300   size_t rank; /* Rank (0-based) of insertion of a keyword in the machine. */\
301   size_t id;   /* state UID */                       \
302   void *value; /* An optional value associated to a state. */\
303   void (*value_dtor) (void *); /* Destrcutor of the associated value, called a state machine release. */\
304   ACMachine_##T * machine;                           \
305   const struct _acs_vtable_##T *vtable;              \
306 };                                                   \
307 \
308 struct _acm_vtable_##T                               \
309 {                                                    \
310   int (*register_keyword) (ACMachine_##T * machine, Keyword_##T keyword, void *value, void (*dtor) (void *)); \
311   int (*is_registered_keyword) (const ACMachine_##T * machine, Keyword_##T keyword, void **value);            \
312   int (*unregister_keyword) (ACMachine_##T * machine, Keyword_##T keyword);                                   \
313   size_t (*nb_keywords) (const ACMachine_##T * machine);                                                      \
314   void (*foreach_keyword) (const ACMachine_##T * machine, void (*operator) (MatchHolder_##T, void *));        \
315   void (*release) (const ACMachine_##T * machine);                                                            \
316   const ACState_##T * (*reset) (const ACMachine_##T * machine);                                               \
317   void (*print) (ACMachine_##T * machine, FILE * stream, PRINT_##T##_TYPE printer);                           \
318 };                                                   \
319 \
320 struct _ac_machine_##T                               \
321 {                                                    \
322   struct _ac_state_##T *state_0; /* state 0 */       \
323   size_t rank; /* Number of keywords registered in the machine. */\
324   size_t nb_sequence; /* Number of keywords in the machine. */\
325   size_t state_counter;                              \
326   int reconstruct;                                   \
327   size_t size;                                       \
328   pthread_mutex_t lock;                              \
329   const struct _acm_vtable_##T *vtable;              \
330   T (*copy) (const T);                               \
331   void (*destroy) (const T);                         \
332   int (*eq) (const T, const T);                      \
333 };                                                   \
334 \
335 __attribute__ ((unused)) ACMachine_##T *ACM_create_##T (EQ_##T##_TYPE eq,        \
336                                       COPY_##T##_TYPE copier,  \
337                                       DESTROY_##T##_TYPE dtor);  \
338 struct __useless_struct_to_allow_trailing_semicolon__##T##__
339 // END DECLARE_ACM
340 
341 // BEGIN MACROS
342 #  define ACM_create4(T, eq, copy, dtor)       ACM_create_##T((eq), (copy), (dtor))
343 #  define ACM_create2(T, eq)                   ACM_create4(T, (eq), 0, 0)
344 #  define ACM_create1(T)                       ACM_create4(T, 0, 0, 0)
345 
346 #  define ACM_register_keyword4(machine, keyword, value, dtor)  (machine)->vtable->register_keyword ((machine), (keyword), (value), (dtor))
347 #  define ACM_register_keyword3(machine, keyword, value)        ACM_register_keyword4((machine), (keyword), (value), free)
348 #  define ACM_register_keyword2(machine, keyword)               ACM_register_keyword4((machine), (keyword), 0, 0)
349 
350 #  define ACM_is_registered_keyword3(machine, keyword, value)   (machine)->vtable->is_registered_keyword ((machine), (keyword), (value))
351 #  define ACM_is_registered_keyword2(machine, keyword)          ACM_is_registered_keyword3((machine), (keyword), 0)
352 
353 #  define ACM_get_match4(state, index, matchholder, value)      (state)->vtable->get_match ((state), (index), (matchholder), (value))
354 #  define ACM_get_match3(state, index, matchholder)             ACM_get_match4((state), (index), (matchholder), 0)
355 #  define ACM_get_match2(state, index)                          ACM_get_match4((state), (index), 0, 0)
356 
357 #if defined(__GNUC__) || defined (__clang__)
358 #define ACM_DECL5(var, T, eq, copy, dtor)  \
359 __attribute__ ((cleanup (ACM_cleanup_##T))) ACMachine_##T var; machine_init_##T (&(var), state_create_##T (), (eq), (copy), (dtor))
360 #define ACM_DECL3(var, T, eq) ACM_DECL5(var, T, (eq), 0, 0)
361 #define ACM_DECL2(var, T) ACM_DECL3(var, T, 0)
362 #define ACM_DECL(...) VFUNC(ACM_DECL, __VA_ARGS__)
363 #endif
364 // END MACROS
365 
366 #endif
367