1 #ifndef REGPARSE_H 2 #define REGPARSE_H 3 /********************************************************************** 4 regparse.h - Oniguruma (regular expression library) 5 **********************************************************************/ 6 /*- 7 * Copyright (c) 2002-2021 K.Kosako 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include "regint.h" 33 34 #define NODE_STRING_MARGIN 16 35 #define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ 36 #define NODE_BACKREFS_SIZE 6 37 38 /* node type */ 39 typedef enum { 40 NODE_STRING = 0, 41 NODE_CCLASS = 1, 42 NODE_CTYPE = 2, 43 NODE_BACKREF = 3, 44 NODE_QUANT = 4, 45 NODE_BAG = 5, 46 NODE_ANCHOR = 6, 47 NODE_LIST = 7, 48 NODE_ALT = 8, 49 NODE_CALL = 9, 50 NODE_GIMMICK = 10 51 } NodeType; 52 53 enum BagType { 54 BAG_MEMORY = 0, 55 BAG_OPTION = 1, 56 BAG_STOP_BACKTRACK = 2, 57 BAG_IF_ELSE = 3, 58 }; 59 60 enum GimmickType { 61 GIMMICK_FAIL = 0, 62 GIMMICK_SAVE = 1, 63 GIMMICK_UPDATE_VAR = 2, 64 #ifdef USE_CALLOUT 65 GIMMICK_CALLOUT = 3, 66 #endif 67 }; 68 69 enum BodyEmptyType { 70 BODY_IS_NOT_EMPTY = 0, 71 BODY_MAY_BE_EMPTY = 1, 72 BODY_MAY_BE_EMPTY_MEM = 2, 73 BODY_MAY_BE_EMPTY_REC = 3 74 }; 75 76 /* bytes buffer */ 77 typedef struct _BBuf { 78 UChar* p; 79 unsigned int used; 80 unsigned int alloc; 81 } BBuf; 82 83 84 struct _Node; 85 86 typedef struct { 87 NodeType node_type; 88 int status; 89 struct _Node* parent; 90 91 UChar* s; 92 UChar* end; 93 unsigned int flag; 94 UChar buf[NODE_STRING_BUF_SIZE]; 95 int capacity; /* (allocated size - 1) or 0: use buf[] */ 96 } StrNode; 97 98 typedef struct { 99 NodeType node_type; 100 int status; 101 struct _Node* parent; 102 103 unsigned int flags; 104 BitSet bs; 105 BBuf* mbuf; /* multi-byte info or NULL */ 106 } CClassNode; 107 108 typedef struct { 109 NodeType node_type; 110 int status; 111 struct _Node* parent; 112 struct _Node* body; 113 114 int lower; 115 int upper; 116 int greedy; 117 enum BodyEmptyType emptiness; 118 struct _Node* head_exact; 119 struct _Node* next_head_exact; 120 int include_referred; /* include called node. don't eliminate even if {0} */ 121 MemStatusType empty_status_mem; 122 } QuantNode; 123 124 typedef struct { 125 NodeType node_type; 126 int status; 127 struct _Node* parent; 128 struct _Node* body; 129 130 enum BagType type; 131 union { 132 struct { 133 int regnum; 134 AbsAddrType called_addr; 135 int entry_count; 136 int called_state; 137 } m; 138 struct { 139 OnigOptionType options; 140 } o; 141 struct { 142 /* body is condition */ 143 struct _Node* Then; 144 struct _Node* Else; 145 } te; 146 }; 147 /* for multiple call reference */ 148 OnigLen min_len; /* min length (byte) */ 149 OnigLen max_len; /* max length (byte) */ 150 OnigLen min_char_len; 151 OnigLen max_char_len; 152 int opt_count; /* referenced count in optimize_nodes() */ 153 } BagNode; 154 155 #ifdef USE_CALL 156 157 typedef struct { 158 int offset; 159 struct _Node* target; 160 } UnsetAddr; 161 162 typedef struct { 163 int num; 164 int alloc; 165 UnsetAddr* us; 166 } UnsetAddrList; 167 168 typedef struct { 169 NodeType node_type; 170 int status; 171 struct _Node* parent; 172 struct _Node* body; /* to BagNode : BAG_MEMORY */ 173 174 int by_number; 175 int called_gnum; 176 UChar* name; 177 UChar* name_end; 178 int entry_count; 179 } CallNode; 180 181 #endif 182 183 typedef struct { 184 NodeType node_type; 185 int status; 186 struct _Node* parent; 187 188 int back_num; 189 int back_static[NODE_BACKREFS_SIZE]; 190 int* back_dynamic; 191 int nest_level; 192 } BackRefNode; 193 194 typedef struct { 195 NodeType node_type; 196 int status; 197 struct _Node* parent; 198 struct _Node* body; 199 200 int type; 201 OnigLen char_min_len; 202 OnigLen char_max_len; 203 int ascii_mode; 204 struct _Node* lead_node; 205 } AnchorNode; 206 207 typedef struct { 208 NodeType node_type; 209 int status; 210 struct _Node* parent; 211 212 struct _Node* car; 213 struct _Node* cdr; 214 } ConsAltNode; 215 216 typedef struct { 217 NodeType node_type; 218 int status; 219 struct _Node* parent; 220 221 int ctype; 222 int not; 223 int ascii_mode; 224 } CtypeNode; 225 226 typedef struct { 227 NodeType node_type; 228 int status; 229 struct _Node* parent; 230 231 enum GimmickType type; 232 int detail_type; 233 int num; 234 int id; 235 } GimmickNode; 236 237 typedef struct _Node { 238 union { 239 struct { 240 NodeType node_type; 241 int status; 242 struct _Node* parent; 243 struct _Node* body; 244 } base; 245 246 StrNode str; 247 CClassNode cclass; 248 QuantNode quant; 249 BagNode bag; 250 BackRefNode backref; 251 AnchorNode anchor; 252 ConsAltNode cons; 253 CtypeNode ctype; 254 #ifdef USE_CALL 255 CallNode call; 256 #endif 257 GimmickNode gimmick; 258 } u; 259 } Node; 260 261 typedef struct { 262 int new_val; 263 } GroupNumMap; 264 265 266 #define NULL_NODE ((Node* )0) 267 268 269 /* node type bit */ 270 #define NODE_TYPE2BIT(type) (1<<(type)) 271 272 #define NODE_BIT_STRING NODE_TYPE2BIT(NODE_STRING) 273 #define NODE_BIT_CCLASS NODE_TYPE2BIT(NODE_CCLASS) 274 #define NODE_BIT_CTYPE NODE_TYPE2BIT(NODE_CTYPE) 275 #define NODE_BIT_BACKREF NODE_TYPE2BIT(NODE_BACKREF) 276 #define NODE_BIT_QUANT NODE_TYPE2BIT(NODE_QUANT) 277 #define NODE_BIT_BAG NODE_TYPE2BIT(NODE_BAG) 278 #define NODE_BIT_ANCHOR NODE_TYPE2BIT(NODE_ANCHOR) 279 #define NODE_BIT_LIST NODE_TYPE2BIT(NODE_LIST) 280 #define NODE_BIT_ALT NODE_TYPE2BIT(NODE_ALT) 281 #define NODE_BIT_CALL NODE_TYPE2BIT(NODE_CALL) 282 #define NODE_BIT_GIMMICK NODE_TYPE2BIT(NODE_GIMMICK) 283 284 #define NODE_TYPE(node) ((node)->u.base.node_type) 285 #define NODE_SET_TYPE(node, ntype) (node)->u.base.node_type = (ntype) 286 287 #define STR_(node) (&((node)->u.str)) 288 #define CCLASS_(node) (&((node)->u.cclass)) 289 #define CTYPE_(node) (&((node)->u.ctype)) 290 #define BACKREF_(node) (&((node)->u.backref)) 291 #define QUANT_(node) (&((node)->u.quant)) 292 #define BAG_(node) (&((node)->u.bag)) 293 #define ANCHOR_(node) (&((node)->u.anchor)) 294 #define CONS_(node) (&((node)->u.cons)) 295 #define CALL_(node) (&((node)->u.call)) 296 #define GIMMICK_(node) (&((node)->u.gimmick)) 297 298 #define NODE_CAR(node) (CONS_(node)->car) 299 #define NODE_CDR(node) (CONS_(node)->cdr) 300 301 #define CTYPE_ANYCHAR -1 302 #define NODE_IS_ANYCHAR(node) \ 303 (NODE_TYPE(node) == NODE_CTYPE && CTYPE_(node)->ctype == CTYPE_ANYCHAR) 304 305 306 #define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) 307 #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) 308 309 #define NODE_STRING_CRUDE (1<<0) 310 #define NODE_STRING_CASE_EXPANDED (1<<1) 311 312 #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) 313 #define NODE_STRING_SET_CRUDE(node) (node)->u.str.flag |= NODE_STRING_CRUDE 314 #define NODE_STRING_CLEAR_CRUDE(node) (node)->u.str.flag &= ~NODE_STRING_CRUDE 315 #define NODE_STRING_SET_CASE_EXPANDED(node) (node)->u.str.flag |= NODE_STRING_CASE_EXPANDED 316 #define NODE_STRING_IS_CRUDE(node) \ 317 (((node)->u.str.flag & NODE_STRING_CRUDE) != 0) 318 #define NODE_STRING_IS_CASE_EXPANDED(node) \ 319 (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0) 320 321 #define BACKREFS_P(br) \ 322 (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static) 323 324 /* node status bits */ 325 #define NODE_ST_FIXED_MIN (1<<0) 326 #define NODE_ST_FIXED_MAX (1<<1) 327 #define NODE_ST_FIXED_CLEN (1<<2) 328 #define NODE_ST_MARK1 (1<<3) 329 #define NODE_ST_MARK2 (1<<4) 330 #define NODE_ST_STRICT_REAL_REPEAT (1<<5) 331 #define NODE_ST_RECURSION (1<<6) 332 #define NODE_ST_CALLED (1<<7) 333 #define NODE_ST_FIXED_ADDR (1<<8) 334 #define NODE_ST_NAMED_GROUP (1<<9) 335 #define NODE_ST_IN_REAL_REPEAT (1<<10) /* STK_REPEAT is nested in stack. */ 336 #define NODE_ST_IN_ZERO_REPEAT (1<<11) /* (....){0} */ 337 #define NODE_ST_IN_MULTI_ENTRY (1<<12) 338 #define NODE_ST_NEST_LEVEL (1<<13) 339 #define NODE_ST_BY_NUMBER (1<<14) /* {n,m} */ 340 #define NODE_ST_BY_NAME (1<<15) /* backref by name */ 341 #define NODE_ST_BACKREF (1<<16) 342 #define NODE_ST_CHECKER (1<<17) 343 #define NODE_ST_PROHIBIT_RECURSION (1<<18) 344 #define NODE_ST_SUPER (1<<19) 345 #define NODE_ST_EMPTY_STATUS_CHECK (1<<20) 346 #define NODE_ST_IGNORECASE (1<<21) 347 #define NODE_ST_MULTILINE (1<<22) 348 #define NODE_ST_TEXT_SEGMENT_WORD (1<<23) 349 #define NODE_ST_ABSENT_WITH_SIDE_EFFECTS (1<<24) /* stopper or clear */ 350 #define NODE_ST_FIXED_CLEN_MIN_SURE (1<<25) 351 #define NODE_ST_REFERENCED (1<<26) 352 #define NODE_ST_INPEEK (1<<27) 353 354 355 #define NODE_STATUS(node) (((Node* )node)->u.base.status) 356 #define NODE_STATUS_ADD(node,f) (NODE_STATUS(node) |= (NODE_ST_ ## f)) 357 #define NODE_STATUS_REMOVE(node,f) (NODE_STATUS(node) &= ~(NODE_ST_ ## f)) 358 359 #define NODE_IS_BY_NUMBER(node) ((NODE_STATUS(node) & NODE_ST_BY_NUMBER) != 0) 360 #define NODE_IS_IN_REAL_REPEAT(node) ((NODE_STATUS(node) & NODE_ST_IN_REAL_REPEAT) != 0) 361 #define NODE_IS_CALLED(node) ((NODE_STATUS(node) & NODE_ST_CALLED) != 0) 362 #define NODE_IS_IN_MULTI_ENTRY(node) ((NODE_STATUS(node) & NODE_ST_IN_MULTI_ENTRY) != 0) 363 #define NODE_IS_RECURSION(node) ((NODE_STATUS(node) & NODE_ST_RECURSION) != 0) 364 #define NODE_IS_IN_ZERO_REPEAT(node) ((NODE_STATUS(node) & NODE_ST_IN_ZERO_REPEAT) != 0) 365 #define NODE_IS_NAMED_GROUP(node) ((NODE_STATUS(node) & NODE_ST_NAMED_GROUP) != 0) 366 #define NODE_IS_FIXED_ADDR(node) ((NODE_STATUS(node) & NODE_ST_FIXED_ADDR) != 0) 367 #define NODE_IS_FIXED_CLEN(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN) != 0) 368 #define NODE_IS_FIXED_MIN(node) ((NODE_STATUS(node) & NODE_ST_FIXED_MIN) != 0) 369 #define NODE_IS_FIXED_MAX(node) ((NODE_STATUS(node) & NODE_ST_FIXED_MAX) != 0) 370 #define NODE_IS_MARK1(node) ((NODE_STATUS(node) & NODE_ST_MARK1) != 0) 371 #define NODE_IS_MARK2(node) ((NODE_STATUS(node) & NODE_ST_MARK2) != 0) 372 #define NODE_IS_NEST_LEVEL(node) ((NODE_STATUS(node) & NODE_ST_NEST_LEVEL) != 0) 373 #define NODE_IS_BY_NAME(node) ((NODE_STATUS(node) & NODE_ST_BY_NAME) != 0) 374 #define NODE_IS_BACKREF(node) ((NODE_STATUS(node) & NODE_ST_BACKREF) != 0) 375 #define NODE_IS_CHECKER(node) ((NODE_STATUS(node) & NODE_ST_CHECKER) != 0) 376 #define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0) 377 #define NODE_IS_PROHIBIT_RECURSION(node) \ 378 ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) 379 #define NODE_IS_STRICT_REAL_REPEAT(node) \ 380 ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) 381 #define NODE_IS_EMPTY_STATUS_CHECK(node) \ 382 ((NODE_STATUS(node) & NODE_ST_EMPTY_STATUS_CHECK) != 0) 383 #define NODE_IS_IGNORECASE(node) ((NODE_STATUS(node) & NODE_ST_IGNORECASE) != 0) 384 #define NODE_IS_MULTILINE(node) ((NODE_STATUS(node) & NODE_ST_MULTILINE) != 0) 385 #define NODE_IS_TEXT_SEGMENT_WORD(node) ((NODE_STATUS(node) & NODE_ST_TEXT_SEGMENT_WORD) != 0) 386 #define NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0) 387 #define NODE_IS_FIXED_CLEN_MIN_SURE(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0) 388 #define NODE_IS_REFERENCED(node) ((NODE_STATUS(node) & NODE_ST_REFERENCED) != 0) 389 #define NODE_IS_INPEEK(node) ((NODE_STATUS(node) & NODE_ST_INPEEK) != 0) 390 391 #define NODE_PARENT(node) ((node)->u.base.parent) 392 #define NODE_BODY(node) ((node)->u.base.body) 393 #define NODE_QUANT_BODY(node) ((node)->body) 394 #define NODE_BAG_BODY(node) ((node)->body) 395 #define NODE_CALL_BODY(node) ((node)->body) 396 #define NODE_ANCHOR_BODY(node) ((node)->body) 397 398 #define PARSEENV_MEMENV_SIZE 8 399 #define PARSEENV_MEMENV(senv) \ 400 (IS_NOT_NULL((senv)->mem_env_dynamic) ? \ 401 (senv)->mem_env_dynamic : (senv)->mem_env_static) 402 403 #define IS_SYNTAX_OP(syn, opm) (((syn)->op & (opm)) != 0) 404 #define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0) 405 #define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0) 406 407 #define ID_ENTRY(env, id) do {\ 408 id = (env)->id_num++;\ 409 } while(0) 410 411 412 typedef struct { 413 Node* mem_node; 414 Node* empty_repeat_node; 415 } MemEnv; 416 417 typedef struct { 418 enum SaveType type; 419 } SaveItem; 420 421 typedef struct { 422 OnigOptionType options; 423 OnigCaseFoldType case_fold_flag; 424 OnigEncoding enc; 425 OnigSyntaxType* syntax; 426 MemStatusType cap_history; 427 MemStatusType backtrack_mem; /* backtrack/recursion */ 428 MemStatusType backrefed_mem; 429 UChar* pattern; 430 UChar* pattern_end; 431 UChar* error; 432 UChar* error_end; 433 regex_t* reg; /* for reg->names only */ 434 int num_call; 435 int num_mem; 436 int num_named; 437 int mem_alloc; 438 MemEnv mem_env_static[PARSEENV_MEMENV_SIZE]; 439 MemEnv* mem_env_dynamic; 440 int backref_num; 441 int keep_num; 442 int id_num; 443 int save_alloc_num; 444 SaveItem* saves; 445 #ifdef USE_CALL 446 UnsetAddrList* unset_addr_list; 447 int has_call_zero; 448 #endif 449 unsigned int parse_depth; 450 #ifdef ONIG_DEBUG_PARSE 451 unsigned int max_parse_depth; 452 #endif 453 } ParseEnv; 454 455 456 extern int onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map)); 457 458 extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); 459 extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); 460 extern void onig_scan_env_set_error_string P_((ParseEnv* env, int ecode, UChar* arg, UChar* arg_end)); 461 extern int onig_reduce_nested_quantifier P_((Node* pnode)); 462 extern int onig_node_copy(Node** rcopy, Node* from); 463 extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); 464 extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end, int need_free)); 465 extern void onig_node_str_clear P_((Node* node, int need_free)); 466 extern void onig_node_free P_((Node* node)); 467 extern int onig_node_reset_empty P_((Node* node)); 468 extern int onig_node_reset_fail P_((Node* node)); 469 extern Node* onig_node_new_bag P_((enum BagType type)); 470 extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); 471 extern Node* onig_node_new_list P_((Node* left, Node* right)); 472 extern Node* onig_node_new_alt P_((Node* left, Node* right)); 473 extern int onig_names_free P_((regex_t* reg)); 474 extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ParseEnv* env)); 475 extern int onig_free_shared_cclass_table P_((void)); 476 extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); 477 extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); 478 extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node); 479 480 #ifdef USE_CALLOUT 481 extern int onig_global_callout_names_free(void); 482 #endif 483 484 #ifdef ONIG_DEBUG 485 extern int onig_print_names(FILE*, regex_t*); 486 #endif 487 488 #endif /* REGPARSE_H */ 489