1 /* 2 * fa.h: finite automata 3 * 4 * Copyright (C) 2007-2016 David Lutterkort 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, write to the Free Software 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * 20 * Author: David Lutterkort <dlutter@redhat.com> 21 */ 22 23 #ifndef FA_H_ 24 #define FA_H_ 25 26 #include <stdio.h> 27 #include <regex.h> 28 #include <stdbool.h> 29 #include <stdint.h> 30 31 /* The type for a finite automaton. */ 32 struct fa; 33 34 /* The type of a state of a finite automaton. The fa_state functions return 35 * pointers to this struct. Those pointers are only valid as long as the 36 * only fa_* functions that are called are fa_state_* functions. For 37 * example, the following code will almost certainly result in a crash (or 38 * worse): 39 * 40 * struct state *s = fa_state_initial(fa); 41 * fa_minimize(fa); 42 * // Crashes as S will likely have been freed 43 * s = fa_state_next(s) 44 */ 45 struct state; 46 47 /* Denote some basic automata, used by fa_is_basic and fa_make_basic */ 48 enum fa_basic { 49 FA_EMPTY, /* Accepts the empty language, i.e. no strings */ 50 FA_EPSILON, /* Accepts only the empty word */ 51 FA_TOTAL /* Accepts all words */ 52 }; 53 54 /* Choice of minimization algorithm to use; either Hopcroft's O(n log(n)) 55 * algorithm or Brzozowski's reverse-determinize-reverse-determinize 56 * algorithm. While the latter has exponential complexity in theory, it 57 * works quite well for some cases. 58 */ 59 enum fa_minimization_algorithms { 60 FA_MIN_HOPCROFT, 61 FA_MIN_BRZOZOWSKI 62 }; 63 64 /* Which minimization algorithm to use in FA_MINIMIZE. The library 65 * minimizes internally at certain points, too. 66 * 67 * Defaults to FA_MIN_HOPCROFT 68 */ 69 extern int fa_minimization_algorithm; 70 71 /* Unless otherwise mentioned, automata passed into routines are never 72 * modified. It is the responsibility of the caller to free automata 73 * returned by any of these routines when they are no longer needed. 74 */ 75 76 /* 77 * Compile the regular expression RE of length SIZE into an automaton. The 78 * return value is the same as the return value for the POSIX function 79 * regcomp. The syntax for regular expressions is extended POSIX syntax, 80 * with the difference that '.' does not match newlines. 81 * 82 * On success, FA points to the newly allocated automaton constructed for 83 * RE, and the function returns REG_NOERROR. Otherwise, FA is NULL, and the 84 * return value indicates the error. 85 * 86 * The FA is case sensitive. Call FA_NOCASE to switch it to 87 * case-insensitive. 88 */ 89 int fa_compile(const char *re, size_t size, struct fa **fa); 90 91 /* Make a new automaton that accepts one of the basic languages defined in 92 * the enum FA_BASIC. 93 */ 94 struct fa *fa_make_basic(unsigned int basic); 95 96 /* Return 1 if FA accepts the basic language BASIC, which must be one of 97 * the constants from enum FA_BASIC. 98 */ 99 int fa_is_basic(struct fa *fa, unsigned int basic); 100 101 /* Minimize FA using the currently-set fa_minimization_algorithm. 102 * As a side effect, the automaton will also be deterministic after being 103 * minimized. Modifies the automaton in place. 104 */ 105 int fa_minimize(struct fa *fa); 106 107 /* Return a finite automaton that accepts the concatenation of the 108 * languages for FA1 and FA2, i.e. L(FA1).L(FA2) 109 */ 110 struct fa *fa_concat(struct fa *fa1, struct fa *fa2); 111 112 /* Return a finite automaton that accepts the union of the languages that 113 * FA1 and FA2 accept (the '|' operator in regular expressions). 114 */ 115 struct fa *fa_union(struct fa *fa1, struct fa *fa2); 116 117 /* Return a finite automaton that accepts the intersection of the languages 118 * of FA1 and FA2. 119 */ 120 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2); 121 122 /* Return a finite automaton that accepts the complement of the language of 123 * FA, i.e. the set of all words not accepted by FA 124 */ 125 struct fa *fa_complement(struct fa *fa); 126 127 /* Return a finite automaton that accepts the set difference of the 128 * languages of FA1 and FA2, i.e. L(FA1)\L(FA2) 129 */ 130 struct fa *fa_minus(struct fa *fa1, struct fa *fa2); 131 132 /* Return a finite automaton that accepts a repetition of the language that 133 * FA accepts. If MAX == -1, the returned automaton accepts arbitrarily 134 * long repetitions. MIN must be 0 or bigger, and unless MAX == -1, MIN 135 * must be less or equal to MAX. If MIN is greater than 0, the returned 136 * automaton accepts only words that have at least MIN repetitions of words 137 * from L(FA). 138 * 139 * The following common regexp repetitios are achieved by the following 140 * calls (using a lose notation equating automata and their languages): 141 * 142 * - FA* = FA_ITER(FA, 0, -1) 143 * - FA+ = FA_ITER(FA, 1, -1) 144 * - FA? = FA_ITER(FA, 0, 1) 145 * - FA{n,m} = FA_ITER(FA, n, m) with 0 <= n and m = -1 or n <= m 146 */ 147 struct fa *fa_iter(struct fa *fa, int min, int max); 148 149 /* If successful, returns 1 if the language of FA1 is contained in the language 150 * of FA2, 0 otherwise. Returns a negative number if an error occurred. 151 */ 152 int fa_contains(struct fa *fa1, struct fa *fa2); 153 154 /* If successful, returns 1 if the language of FA1 equals the language of FA2, 155 * 0 otherwise. Returns a negative number if an error occurred. 156 */ 157 int fa_equals(struct fa *fa1, struct fa *fa2); 158 159 /* Free all memory used by FA */ 160 void fa_free(struct fa *fa); 161 162 /* Print FA to OUT as a graphviz dot file */ 163 void fa_dot(FILE *out, struct fa *fa); 164 165 /* Return a finite automaton that accepts the overlap of the languages of 166 * FA1 and FA2. The overlap of two languages is the set of strings that can 167 * be split in more than one way into a left part accepted by FA1 and a 168 * right part accepted by FA2. 169 */ 170 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2); 171 172 /* Produce an example for the language of FA. The example is not 173 * necessarily the shortest possible. The implementation works very hard to 174 * have printable characters (preferably alphanumeric) in the example, and 175 * to avoid just an empty word. 176 * 177 * *EXAMPLE will be the example, which may be NULL. If it is non-NULL, 178 * EXAMPLE_LEN will hold the length of the example. 179 * 180 * Return 0 on success, and a negative numer on error. On error, *EXAMPLE 181 * will be NULL. 182 * 183 * If *EXAMPLE is set, it is the caller's responsibility to free the string 184 * by calling free(). 185 */ 186 int fa_example(struct fa *fa, char **example, size_t *example_len); 187 188 /* Produce an example of an ambiguous word for the concatenation of the 189 * languages of FA1 and FA2. The return value is such a word (which must be 190 * freed by the caller) if it exists. If none exists, NULL is returned. 191 * 192 * The returned word is of the form UPV and PV and V are set to the first 193 * character of P and V in the returned word. The word UPV has the property 194 * that U and UP are accepted by FA1 and that PV and V are accepted by FA2. 195 * 196 * Neither the language of FA1 or of FA2 may contain words with the 197 * characters '\001' and '\002', as they are used during construction of 198 * the ambiguous word. 199 * 200 * UPV_LEN will be set to the length of the entire string UPV 201 * 202 * Returns 0 on success, and a negative number on failure. On failure, UPV, 203 * PV, and V will be NULL 204 */ 205 int fa_ambig_example(struct fa *fa1, struct fa *fa2, 206 char **upv, size_t *upv_len, 207 char **pv, char **v); 208 209 /* Convert the finite automaton FA into a regular expression and set REGEXP 210 * to point to that. When REGEXP is compiled into another automaton, it is 211 * guaranteed that that automaton and FA accept the same language. 212 * 213 * The code tries to be semi-clever about keeping the generated regular 214 * expression short; to guarantee reasonably short regexps, the automaton 215 * should be minimized before passing it to this routine. 216 * 217 * On success, REGEXP_LEN is set to the length of REGEXP 218 * 219 * Return 0 on success, and a negative number on failure. The only reason 220 * for FA_AS_REGEXP to fail is running out of memory. 221 */ 222 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len); 223 224 /* Given the regular expression REGEXP construct a new regular expression 225 * NEWREGEXP that does not match strings containing any of the characters 226 * in the range FROM to TO, with the endpoints included. 227 * 228 * The new regular expression is constructed by removing the range FROM to 229 * TO from all character sets in REGEXP; if any of the characters [FROM, 230 * TO] appear outside a character set in REGEXP, return -2. 231 * 232 * Return 0 if NEWREGEXP was constructed successfully, -1 if an internal 233 * error happened (e.g., an allocation failed) and -2 if NEWREGEXP can not 234 * be constructed because any character in the range [FROM, TO] appears 235 * outside of a character set. 236 * 237 * Return a positive value if REGEXP is not syntactically valid; the value 238 * returned is one of the REG_ERRCODE_T POSIX error codes. Return 0 on 239 * success and -1 if an allocation fails. 240 */ 241 int fa_restrict_alphabet(const char *regexp, size_t regexp_len, 242 char **newregexp, size_t *newregexp_len, 243 char from, char to); 244 245 /* Convert REGEXP into one that does not use ranges inside character 246 * classes. 247 * 248 * Return a positive value if REGEXP is not syntactically valid; the value 249 * returned is one of the REG_ERRCODE_T POSIX error codes. Return 0 on 250 * success and -1 if an allocation fails. 251 */ 252 int fa_expand_char_ranges(const char *regexp, size_t regexp_len, 253 char **newregexp, size_t *newregexp_len); 254 255 /* Modify FA so that it matches ignoring case. 256 * 257 * Returns 0 on success, and -1 if an allocation fails. On failure, the 258 * automaton is not guaranteed to represent anything sensible. 259 */ 260 int fa_nocase(struct fa *fa); 261 262 /* Return 1 if FA matches ignoring case, 0 if matches are case sensitive */ 263 int fa_is_nocase(struct fa *fa); 264 265 /* Assume REGEXP is a case-insensitive regular expression, and convert it 266 * to one that matches the same strings when used case sensitively. All 267 * occurrences of individual letters c in the regular expression will be 268 * replaced by character sets [cC], and lower/upper case characters are 269 * added to character sets as needed. 270 * 271 * Return a positive value if REGEXP is not syntactically valid; the value 272 * returned is one of the REG_ERRCODE_T POSIX error codes. Return 0 on 273 * success and -1 if an allocation fails. 274 */ 275 int fa_expand_nocase(const char *regexp, size_t regexp_len, 276 char **newregexp, size_t *newregexp_len); 277 278 /* Generate up to LIMIT words from the language of FA, which is assumed to 279 * be finite. The words are returned in WORDS, which is allocated by this 280 * function and must be freed by the caller. 281 * 282 * If FA accepts the empty word, the empty string will be included in 283 * WORDS. 284 * 285 * Return the number of generated words on success, -1 if we run out of 286 * memory, and -2 if FA has more than LIMIT words. 287 */ 288 int fa_enumerate(struct fa *fa, int limit, char ***words); 289 290 /* Print FA to OUT as a JSON file. State 0 is always the initial one. 291 * Returns 0 on success, and -1 on failure. 292 */ 293 int fa_json(FILE *out, struct fa *fa); 294 295 /* Returns true if the FA is deterministic and 0 otherwise */ 296 bool fa_is_deterministic(struct fa *fa); 297 298 /* Return the initial state */ 299 struct state *fa_state_initial(struct fa *fa); 300 301 /* Return true if this is an accepting state */ 302 bool fa_state_is_accepting(struct state *st); 303 304 /* Return the next state; return NULL if there are no more states */ 305 struct state* fa_state_next(struct state *st); 306 307 /* Return the number of transitions for a state */ 308 size_t fa_state_num_trans(struct state *st); 309 310 /* Produce details about the i-th transition. 311 * 312 * On success, *to points to the destination state of the transition and 313 * the interval [min-max] is the label of the transition. 314 * 315 * On failure, *to, min and max are not modified. 316 * 317 * Return 0 on success and -1 when I is larger than the number of 318 * transitions of ST. 319 */ 320 int fa_state_trans(struct state *st, size_t i, 321 struct state **to, unsigned char *min, unsigned char *max); 322 323 #endif 324 325 326 /* 327 * Local variables: 328 * indent-tabs-mode: nil 329 * c-indent-level: 4 330 * c-basic-offset: 4 331 * tab-width: 4 332 * End: 333 */ 334