1 /* 2 * Copyright (c) 2002 by The XFree86 Project, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 * 22 * Except as contained in this notice, the name of the XFree86 Project shall 23 * not be used in advertising or otherwise to promote the sale, use or other 24 * dealings in this Software without prior written authorization from the 25 * XFree86 Project. 26 * 27 * Author: Paulo César Pereira de Andrade 28 */ 29 30 /* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.2 2002/11/15 07:01:33 paulo Exp $ */ 31 32 #include "re.h" 33 34 #ifndef _rep_h 35 #define _rep_h 36 37 /* 38 * Local defines 39 */ 40 41 #ifdef MIN 42 #undef MIN 43 #endif 44 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 45 46 #ifdef MAX 47 #undef MAX 48 #endif 49 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 50 51 /* This value can not be larger than 255, a depth value is the nesting of 52 * repetition operations and alternatives. The number of nested parenthesis 53 * does not matter, but a repetition on the pattern inside the parenthesis 54 * does. Note also that you cannot have more than 9 parenthesis pairs in 55 * an expression. 56 * Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed 57 * 7 complex repetitions. A complex repetition is a dot followed by an 58 * repetition operator. It is called a complex repetition because dot 59 * matches anything but the empty string, so the engine needs to test 60 * all possible combinations until the end of the string is found. 61 * Repetitions like .* use one depth until the end of the string is found, 62 * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2. 63 */ 64 #define MAX_DEPTH 8 65 66 /* Minimum number of strings to generate a "large" string list, that is, 67 * sort the strings and allocate 512 extra bytes to map the first string 68 * with a given initial byte. */ 69 #define LARGE_STL_COUNT 16 70 71 /* 72 * Local types 73 */ 74 /* Intermediate compilation types declaration */ 75 /* (r)egular (e)xpression (c)ompile (c)a(se) */ 76 typedef struct _rec_cse rec_cse; 77 78 /* (r)egular (e)xpression (c)ompile (r)a(ng)e */ 79 typedef struct _rec_rng rec_rng; 80 81 /* (r)egular (e)xpression (c)ompile (pat)tern */ 82 typedef struct _rec_pat rec_pat; 83 84 /* (r)egular (e)xpression (c)ompile (rep)etition */ 85 typedef struct _rec_rep rec_rep; 86 87 /* (r)egular (e)xpression (c)ompile (gr)ou(p) */ 88 typedef struct _rec_grp rec_grp; 89 90 /* (r)egular (e)xpression (c)ompile (alt)ernatives */ 91 typedef struct _rec_alt rec_alt; 92 93 94 /* Optimization types */ 95 /* (r)egular (e)xpression (c)ompile (st)ring (l)ist */ 96 typedef struct _rec_stl rec_stl; 97 98 /* Final compilation and execution types */ 99 /* (re)gular expression (inf)ormation */ 100 typedef struct _re_inf re_inf; 101 102 /* (re)gular expression (eng)ine */ 103 typedef struct _re_eng re_eng; 104 105 106 /* Codes used by the engine */ 107 typedef enum { 108 /* Grouping */ 109 Re_Open, /* ( */ 110 Re_Close, /* ) */ 111 Re_Update, /* Like Re_Close, but is inside a loop */ 112 113 /* Alternatives */ 114 Re_Alt, /* Start alternative list, + next offset */ 115 Re_AltNext, /* Next alternative, + next offset */ 116 Re_AltDone, /* Finish alternative list */ 117 118 /* Repetition */ 119 Re_AnyTimes, /* * */ 120 Re_Maybe, /* ? */ 121 Re_AtLeast, /* +, at least one */ 122 123 /* Repetition like */ 124 Re_AnyAnyTimes, /* .*<re> */ 125 Re_AnyMaybe, /* .?<re> */ 126 Re_AnyAtLeast, /* .+<re> */ 127 128 Re_AnyEatAnyTimes, /* Expression ends with .* */ 129 Re_AnyEatMaybe, /* Expression ends with .? */ 130 Re_AnyEatAtLeast, /* Expression ends with .+ */ 131 132 /* Repetition with arguments */ 133 Re_Exact, /* {e} */ 134 Re_Min, /* {n,} */ 135 Re_Max, /* {,m} */ 136 Re_MinMax, /* {n,m} */ 137 138 /* Repetition helper instruction */ 139 Re_RepJump, /* Special code, go back to repetition */ 140 Re_RepLongJump, /* Jump needs two bytes */ 141 /* After the repetition data, all repetitions have an offset 142 * to the code after the repetition */ 143 144 /* Matching */ 145 Re_Any, /* . */ 146 Re_Odigit, /* \o */ 147 Re_OdigitNot, /* \O */ 148 Re_Digit, /* \d */ 149 Re_DigitNot, /* \D */ 150 Re_Xdigit, /* \x */ 151 Re_XdigitNot, /* \x */ 152 Re_Space, /* \s */ 153 Re_SpaceNot, /* \S */ 154 Re_Tab, /* \t */ 155 Re_Newline, /* \n */ 156 Re_Lower, /* \l */ 157 Re_Upper, /* \u */ 158 Re_Alnum, /* \w */ 159 Re_AlnumNot, /* \W */ 160 Re_Control, /* \c */ 161 Re_ControlNot, /* \C */ 162 Re_Bol, /* ^ */ 163 Re_Eol, /* $ */ 164 Re_Bow, /* \< */ 165 Re_Eow, /* \> */ 166 167 /* Range matching information */ 168 Re_Range, /* + 256 bytes */ 169 Re_RangeNot, /* + 256 bytes */ 170 171 /* Matching with arguments */ 172 Re_Literal, /* + character */ 173 Re_CaseLiteral, /* + lower + upper */ 174 Re_LiteralNot, /* + character */ 175 Re_CaseLiteralNot, /* + lower + upper */ 176 Re_String, /* + length + string */ 177 Re_CaseString, /* + length + string in format lower-upper */ 178 179 /* These are useful to start matching, or when RE_NOSPEC is used. */ 180 Re_SearchLiteral, 181 Re_SearchCaseLiteral, 182 Re_SearchString, 183 Re_SearchCaseString, 184 185 Re_StringList, /* + total-length + lengths + strings */ 186 Re_CaseStringList, /* + total-length + lengths + strings */ 187 188 Re_LargeStringList, /* + total-length + lengths + map + strings */ 189 Re_LargeCaseStringList, /* + total-length + lengths + map + strings */ 190 191 /* Backreference */ 192 Re_Backref, /* + reference number */ 193 194 /* The last codes */ 195 Re_DoneIf, /* Done if at end of input */ 196 Re_MaybeDone, /* Done */ 197 Re_Done /* If this code found, finished execution */ 198 } ReCode; 199 200 201 /* (r)egular (e)xpresssion (pat)rern (t)ype */ 202 typedef enum _rec_pat_t { 203 Rep_Literal = Re_Literal, 204 Rep_CaseLiteral = Re_CaseLiteral, 205 Rep_LiteralNot = Re_LiteralNot, 206 Rep_CaseLiteralNot = Re_CaseLiteralNot, 207 Rep_Range = Re_Range, 208 Rep_RangeNot = Re_RangeNot, 209 Rep_String = Re_String, 210 Rep_CaseString = Re_CaseString, 211 Rep_SearchLiteral = Re_SearchLiteral, 212 Rep_SearchCaseLiteral = Re_SearchCaseLiteral, 213 Rep_SearchString = Re_SearchString, 214 Rep_SearchCaseString = Re_SearchCaseString, 215 Rep_Any = Re_Any, 216 Rep_AnyAnyTimes = Re_AnyAnyTimes, 217 Rep_AnyEatAnyTimes = Re_AnyEatAnyTimes, 218 Rep_AnyMaybe = Re_AnyMaybe, 219 Rep_AnyEatMaybe = Re_AnyEatMaybe, 220 Rep_AnyAtLeast = Re_AnyAtLeast, 221 Rep_AnyEatAtLeast = Re_AnyEatAtLeast, 222 Rep_Odigit = Re_Odigit, 223 Rep_OdigitNot = Re_OdigitNot, 224 Rep_Digit = Re_Digit, 225 Rep_DigitNot = Re_DigitNot, 226 Rep_Xdigit = Re_Xdigit, 227 Rep_XdigitNot = Re_XdigitNot, 228 Rep_Space = Re_Space, 229 Rep_SpaceNot = Re_SpaceNot, 230 Rep_Tab = Re_Tab, 231 Rep_Newline = Re_Newline, 232 Rep_Lower = Re_Lower, 233 Rep_Upper = Re_Upper, 234 Rep_Alnum = Re_Alnum, 235 Rep_AlnumNot = Re_AlnumNot, 236 Rep_Control = Re_Control, 237 Rep_ControlNot = Re_ControlNot, 238 Rep_Bol = Re_Bol, 239 Rep_Eol = Re_Eol, 240 Rep_Bow = Re_Bow, 241 Rep_Eow = Re_Eow, 242 Rep_Backref = Re_Backref, 243 Rep_StringList = Re_StringList, 244 Rep_Group = Re_Open 245 } rec_pat_t; 246 247 248 /* (r)egular (e)xpression (rep)etition (t)ype */ 249 typedef enum _rec_rep_t { 250 Rer_AnyTimes = Re_AnyTimes, 251 Rer_AtLeast = Re_AtLeast, 252 Rer_Maybe = Re_Maybe, 253 Rer_Exact = Re_Exact, 254 Rer_Min = Re_Min, 255 Rer_Max = Re_Max, 256 Rer_MinMax = Re_MinMax 257 } rec_rep_t; 258 259 260 /* Decide at re compilation time what is lowercase and what is uppercase */ 261 struct _rec_cse { 262 unsigned char lower; 263 unsigned char upper; 264 }; 265 266 267 /* A rec_rng is used only during compilation, just a character map */ 268 struct _rec_rng { 269 unsigned char range[256]; 270 }; 271 272 273 /* A rec_pat is used only during compilation, and can be viewed as 274 * a regular expression element like a match to any character, a match 275 * to the beginning or end of the line, etc. 276 * It is implemented as a linked list, and does not have nesting. 277 * The data field can contain: 278 * chr: the value of a single character to match. 279 * cse: the upper and lower case value of a character to match. 280 * rng: a character map to match or not match. 281 * str: a simple string or a string where every two bytes 282 * represents the character to match, in lower/upper 283 * case sequence. 284 * The rep field is not used for strings, strings are broken in the 285 * last character in this case. That is, strings are just a concatenation 286 * of several character matches. 287 */ 288 struct _rec_pat { 289 rec_pat_t type; 290 rec_pat *next, *prev; /* Linked list information */ 291 union { 292 unsigned char chr; 293 rec_cse cse; 294 rec_rng *rng; 295 rec_grp *grp; 296 unsigned char *str; 297 rec_stl *stl; 298 } data; 299 rec_rep *rep; /* Pattern repetition information */ 300 }; 301 302 303 /* A rec_rep is used only during compilation, and can be viewed as: 304 * 305 * ? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>} 306 * 307 * where <e> is "exact", <m> is "minimum" and <M> is "maximum". 308 * In the compiled step it can also be just a NULL pointer, that 309 * is actually equivalent to {1}. 310 */ 311 struct _rec_rep { 312 rec_rep_t type; 313 short mine; /* minimum or exact number of matches */ 314 short maxc; /* maximum number of matches */ 315 }; 316 317 318 /* A rec_alt is used only during compilation, and can be viewed as: 319 * 320 * <re>|<re> 321 * 322 * where <re> is any regular expression. The expressions are nested 323 * using the grp field of the rec_pat structure. 324 */ 325 struct _rec_alt { 326 rec_alt *next, *prev; /* Linked list information */ 327 rec_pat *pat; 328 }; 329 330 331 /* A rec_grp is a place holder for expressions enclosed in parenthesis 332 * and is linked to the compilation data by an rec_pat structure. */ 333 struct _rec_grp { 334 rec_pat *parent; /* Reference to parent pattern */ 335 rec_alt *alt; /* The pattern information */ 336 rec_alt *palt; /* Parent alternative */ 337 rec_grp *pgrp; /* Nested groups */ 338 int comp; /* (comp)lex repetition pattern inside group */ 339 }; 340 341 342 /* Optimization compilation types definition */ 343 /* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */ 344 typedef enum { 345 Resl_StringList = Re_StringList, 346 Resl_CaseStringList = Re_CaseStringList 347 } rec_stl_t; 348 349 struct _rec_stl { 350 rec_stl_t type; 351 int nstrs; /* Number of strings in list */ 352 int tlen; /* Total length of all strings */ 353 unsigned char *lens; /* Vector of string lengths */ 354 unsigned char **strs; /* The strings */ 355 }; 356 357 358 /* 359 * Prototypes 360 */ 361 /* rep.c */ 362 rec_alt *irec_comp(const char*, const char*, int, int*); 363 void irec_free_alt(rec_alt*); 364 365 /* reo.c */ 366 int orec_comp(rec_alt*, int); 367 void orec_free_stl(rec_stl*); 368 369 #endif /* _rep_h */ 370