1 #ifndef REGPARSE_H 2 #define REGPARSE_H 3 /********************************************************************** 4 regparse.h - Oniguruma (regular expression library) 5 **********************************************************************/ 6 /*- 7 * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include "regint.h" 33 34 #define NODE_STRING_MARGIN 16 35 #define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ 36 #define NODE_BACKREFS_SIZE 6 37 38 /* node type */ 39 typedef enum { 40 NODE_STRING = 0, 41 NODE_CCLASS = 1, 42 NODE_CTYPE = 2, 43 NODE_BACKREF = 3, 44 NODE_QUANT = 4, 45 NODE_BAG = 5, 46 NODE_ANCHOR = 6, 47 NODE_LIST = 7, 48 NODE_ALT = 8, 49 NODE_CALL = 9, 50 NODE_GIMMICK = 10 51 } NodeType; 52 53 enum BagType { 54 BAG_MEMORY = 0, 55 BAG_OPTION = 1, 56 BAG_STOP_BACKTRACK = 2, 57 BAG_IF_ELSE = 3, 58 }; 59 60 enum GimmickType { 61 GIMMICK_FAIL = 0, 62 GIMMICK_SAVE = 1, 63 GIMMICK_UPDATE_VAR = 2, 64 #ifdef USE_CALLOUT 65 GIMMICK_CALLOUT = 3, 66 #endif 67 }; 68 69 enum BodyEmptyType { 70 BODY_IS_NOT_EMPTY = 0, 71 BODY_IS_EMPTY_POSSIBILITY = 1, 72 BODY_IS_EMPTY_POSSIBILITY_MEM = 2, 73 BODY_IS_EMPTY_POSSIBILITY_REC = 3 74 }; 75 76 typedef struct { 77 NodeType node_type; 78 int status; 79 80 UChar* s; 81 UChar* end; 82 unsigned int flag; 83 int capacity; /* (allocated size - 1) or 0: use buf[] */ 84 UChar buf[NODE_STRING_BUF_SIZE]; 85 } StrNode; 86 87 typedef struct { 88 NodeType node_type; 89 int status; 90 91 unsigned int flags; 92 BitSet bs; 93 BBuf* mbuf; /* multi-byte info or NULL */ 94 } CClassNode; 95 96 typedef struct { 97 NodeType node_type; 98 int status; 99 struct _Node* body; 100 101 int lower; 102 int upper; 103 int greedy; 104 enum BodyEmptyType emptiness; 105 struct _Node* head_exact; 106 struct _Node* next_head_exact; 107 int is_refered; /* include called node. don't eliminate even if {0} */ 108 } QuantNode; 109 110 typedef struct { 111 NodeType node_type; 112 int status; 113 struct _Node* body; 114 115 enum BagType type; 116 union { 117 struct { 118 int regnum; 119 AbsAddrType called_addr; 120 int entry_count; 121 int called_state; 122 } m; 123 struct { 124 OnigOptionType options; 125 } o; 126 struct { 127 /* body is condition */ 128 struct _Node* Then; 129 struct _Node* Else; 130 } te; 131 }; 132 /* for multiple call reference */ 133 OnigLen min_len; /* min length (byte) */ 134 OnigLen max_len; /* max length (byte) */ 135 int char_len; /* character length */ 136 int opt_count; /* referenced count in optimize_nodes() */ 137 } BagNode; 138 139 #ifdef USE_CALL 140 141 typedef struct { 142 int offset; 143 struct _Node* target; 144 } UnsetAddr; 145 146 typedef struct { 147 int num; 148 int alloc; 149 UnsetAddr* us; 150 } UnsetAddrList; 151 152 typedef struct { 153 NodeType node_type; 154 int status; 155 struct _Node* body; /* to BagNode : BAG_MEMORY */ 156 157 int by_number; 158 int group_num; 159 UChar* name; 160 UChar* name_end; 161 int entry_count; 162 } CallNode; 163 164 #endif 165 166 typedef struct { 167 NodeType node_type; 168 int status; 169 170 int back_num; 171 int back_static[NODE_BACKREFS_SIZE]; 172 int* back_dynamic; 173 int nest_level; 174 } BackRefNode; 175 176 typedef struct { 177 NodeType node_type; 178 int status; 179 struct _Node* body; 180 181 int type; 182 int char_len; 183 int ascii_mode; 184 } AnchorNode; 185 186 typedef struct { 187 NodeType node_type; 188 int status; 189 190 struct _Node* car; 191 struct _Node* cdr; 192 } ConsAltNode; 193 194 typedef struct { 195 NodeType node_type; 196 int status; 197 198 int ctype; 199 int not; 200 OnigOptionType options; 201 int ascii_mode; 202 } CtypeNode; 203 204 typedef struct { 205 NodeType node_type; 206 int status; 207 208 enum GimmickType type; 209 int detail_type; 210 int num; 211 int id; 212 } GimmickNode; 213 214 typedef struct _Node { 215 union { 216 struct { 217 NodeType node_type; 218 int status; 219 struct _Node* body; 220 } base; 221 222 StrNode str; 223 CClassNode cclass; 224 QuantNode quant; 225 BagNode bag; 226 BackRefNode backref; 227 AnchorNode anchor; 228 ConsAltNode cons; 229 CtypeNode ctype; 230 #ifdef USE_CALL 231 CallNode call; 232 #endif 233 GimmickNode gimmick; 234 } u; 235 } Node; 236 237 #define NULL_NODE ((Node* )0) 238 239 240 /* node type bit */ 241 #define NODE_TYPE2BIT(type) (1<<(type)) 242 243 #define NODE_BIT_STRING NODE_TYPE2BIT(NODE_STRING) 244 #define NODE_BIT_CCLASS NODE_TYPE2BIT(NODE_CCLASS) 245 #define NODE_BIT_CTYPE NODE_TYPE2BIT(NODE_CTYPE) 246 #define NODE_BIT_BACKREF NODE_TYPE2BIT(NODE_BACKREF) 247 #define NODE_BIT_QUANT NODE_TYPE2BIT(NODE_QUANT) 248 #define NODE_BIT_BAG NODE_TYPE2BIT(NODE_BAG) 249 #define NODE_BIT_ANCHOR NODE_TYPE2BIT(NODE_ANCHOR) 250 #define NODE_BIT_LIST NODE_TYPE2BIT(NODE_LIST) 251 #define NODE_BIT_ALT NODE_TYPE2BIT(NODE_ALT) 252 #define NODE_BIT_CALL NODE_TYPE2BIT(NODE_CALL) 253 #define NODE_BIT_GIMMICK NODE_TYPE2BIT(NODE_GIMMICK) 254 255 #define NODE_TYPE(node) ((node)->u.base.node_type) 256 #define NODE_SET_TYPE(node, ntype) (node)->u.base.node_type = (ntype) 257 258 #define STR_(node) (&((node)->u.str)) 259 #define CCLASS_(node) (&((node)->u.cclass)) 260 #define CTYPE_(node) (&((node)->u.ctype)) 261 #define BACKREF_(node) (&((node)->u.backref)) 262 #define QUANT_(node) (&((node)->u.quant)) 263 #define BAG_(node) (&((node)->u.bag)) 264 #define ANCHOR_(node) (&((node)->u.anchor)) 265 #define CONS_(node) (&((node)->u.cons)) 266 #define CALL_(node) (&((node)->u.call)) 267 #define GIMMICK_(node) (&((node)->u.gimmick)) 268 269 #define NODE_CAR(node) (CONS_(node)->car) 270 #define NODE_CDR(node) (CONS_(node)->cdr) 271 272 #define CTYPE_ANYCHAR -1 273 #define NODE_IS_ANYCHAR(node) \ 274 (NODE_TYPE(node) == NODE_CTYPE && CTYPE_(node)->ctype == CTYPE_ANYCHAR) 275 276 #define CTYPE_OPTION(node, reg) \ 277 (NODE_IS_FIXED_OPTION(node) ? CTYPE_(node)->options : reg->options) 278 279 280 #define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) 281 #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) 282 283 #define NODE_STRING_RAW (1<<0) /* by backslashed number */ 284 #define NODE_STRING_AMBIG (1<<1) 285 #define NODE_STRING_GOOD_AMBIG (1<<2) 286 #define NODE_STRING_DONT_GET_OPT_INFO (1<<3) 287 288 #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) 289 #define NODE_STRING_SET_RAW(node) (node)->u.str.flag |= NODE_STRING_RAW 290 #define NODE_STRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NODE_STRING_RAW 291 #define NODE_STRING_SET_AMBIG(node) (node)->u.str.flag |= NODE_STRING_AMBIG 292 #define NODE_STRING_SET_GOOD_AMBIG(node) (node)->u.str.flag |= NODE_STRING_GOOD_AMBIG 293 #define NODE_STRING_SET_DONT_GET_OPT_INFO(node) \ 294 (node)->u.str.flag |= NODE_STRING_DONT_GET_OPT_INFO 295 #define NODE_STRING_IS_RAW(node) \ 296 (((node)->u.str.flag & NODE_STRING_RAW) != 0) 297 #define NODE_STRING_IS_AMBIG(node) \ 298 (((node)->u.str.flag & NODE_STRING_AMBIG) != 0) 299 #define NODE_STRING_IS_GOOD_AMBIG(node) \ 300 (((node)->u.str.flag & NODE_STRING_GOOD_AMBIG) != 0) 301 #define NODE_STRING_IS_DONT_GET_OPT_INFO(node) \ 302 (((node)->u.str.flag & NODE_STRING_DONT_GET_OPT_INFO) != 0) 303 304 #define BACKREFS_P(br) \ 305 (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static) 306 307 /* node status bits */ 308 #define NODE_ST_MIN_FIXED (1<<0) 309 #define NODE_ST_MAX_FIXED (1<<1) 310 #define NODE_ST_CLEN_FIXED (1<<2) 311 #define NODE_ST_MARK1 (1<<3) 312 #define NODE_ST_MARK2 (1<<4) 313 #define NODE_ST_STRICT_REAL_REPEAT (1<<5) 314 #define NODE_ST_RECURSION (1<<6) 315 #define NODE_ST_CALLED (1<<7) 316 #define NODE_ST_ADDR_FIXED (1<<8) 317 #define NODE_ST_NAMED_GROUP (1<<9) 318 #define NODE_ST_IN_REAL_REPEAT (1<<10) /* STK_REPEAT is nested in stack. */ 319 #define NODE_ST_IN_ZERO_REPEAT (1<<11) /* (....){0} */ 320 #define NODE_ST_IN_MULTI_ENTRY (1<<12) 321 #define NODE_ST_NEST_LEVEL (1<<13) 322 #define NODE_ST_BY_NUMBER (1<<14) /* {n,m} */ 323 #define NODE_ST_BY_NAME (1<<15) /* backref by name */ 324 #define NODE_ST_BACKREF (1<<16) 325 #define NODE_ST_CHECKER (1<<17) 326 #define NODE_ST_FIXED_OPTION (1<<18) 327 #define NODE_ST_PROHIBIT_RECURSION (1<<19) 328 #define NODE_ST_SUPER (1<<20) 329 330 331 #define NODE_STATUS(node) (((Node* )node)->u.base.status) 332 #define NODE_STATUS_ADD(node,f) (NODE_STATUS(node) |= (NODE_ST_ ## f)) 333 #define NODE_STATUS_REMOVE(node,f) (NODE_STATUS(node) &= ~(NODE_ST_ ## f)) 334 335 #define NODE_IS_BY_NUMBER(node) ((NODE_STATUS(node) & NODE_ST_BY_NUMBER) != 0) 336 #define NODE_IS_IN_REAL_REPEAT(node) ((NODE_STATUS(node) & NODE_ST_IN_REAL_REPEAT) != 0) 337 #define NODE_IS_CALLED(node) ((NODE_STATUS(node) & NODE_ST_CALLED) != 0) 338 #define NODE_IS_IN_MULTI_ENTRY(node) ((NODE_STATUS(node) & NODE_ST_IN_MULTI_ENTRY) != 0) 339 #define NODE_IS_RECURSION(node) ((NODE_STATUS(node) & NODE_ST_RECURSION) != 0) 340 #define NODE_IS_IN_ZERO_REPEAT(node) ((NODE_STATUS(node) & NODE_ST_IN_ZERO_REPEAT) != 0) 341 #define NODE_IS_NAMED_GROUP(node) ((NODE_STATUS(node) & NODE_ST_NAMED_GROUP) != 0) 342 #define NODE_IS_ADDR_FIXED(node) ((NODE_STATUS(node) & NODE_ST_ADDR_FIXED) != 0) 343 #define NODE_IS_CLEN_FIXED(node) ((NODE_STATUS(node) & NODE_ST_CLEN_FIXED) != 0) 344 #define NODE_IS_MIN_FIXED(node) ((NODE_STATUS(node) & NODE_ST_MIN_FIXED) != 0) 345 #define NODE_IS_MAX_FIXED(node) ((NODE_STATUS(node) & NODE_ST_MAX_FIXED) != 0) 346 #define NODE_IS_MARK1(node) ((NODE_STATUS(node) & NODE_ST_MARK1) != 0) 347 #define NODE_IS_MARK2(node) ((NODE_STATUS(node) & NODE_ST_MARK2) != 0) 348 #define NODE_IS_NEST_LEVEL(node) ((NODE_STATUS(node) & NODE_ST_NEST_LEVEL) != 0) 349 #define NODE_IS_BY_NAME(node) ((NODE_STATUS(node) & NODE_ST_BY_NAME) != 0) 350 #define NODE_IS_BACKREF(node) ((NODE_STATUS(node) & NODE_ST_BACKREF) != 0) 351 #define NODE_IS_CHECKER(node) ((NODE_STATUS(node) & NODE_ST_CHECKER) != 0) 352 #define NODE_IS_FIXED_OPTION(node) ((NODE_STATUS(node) & NODE_ST_FIXED_OPTION) != 0) 353 #define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0) 354 #define NODE_IS_PROHIBIT_RECURSION(node) \ 355 ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) 356 #define NODE_IS_STRICT_REAL_REPEAT(node) \ 357 ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) 358 359 #define NODE_BODY(node) ((node)->u.base.body) 360 #define NODE_QUANT_BODY(node) ((node)->body) 361 #define NODE_BAG_BODY(node) ((node)->body) 362 #define NODE_CALL_BODY(node) ((node)->body) 363 #define NODE_ANCHOR_BODY(node) ((node)->body) 364 365 #define SCANENV_MEMENV_SIZE 8 366 #define SCANENV_MEMENV(senv) \ 367 (IS_NOT_NULL((senv)->mem_env_dynamic) ? \ 368 (senv)->mem_env_dynamic : (senv)->mem_env_static) 369 370 typedef struct { 371 Node* node; 372 #if 0 373 int in; 374 int recursion; 375 #endif 376 } MemEnv; 377 378 typedef struct { 379 enum SaveType type; 380 } SaveItem; 381 382 typedef struct { 383 OnigOptionType options; 384 OnigCaseFoldType case_fold_flag; 385 OnigEncoding enc; 386 OnigSyntaxType* syntax; 387 MemStatusType capture_history; 388 MemStatusType bt_mem_start; 389 MemStatusType bt_mem_end; 390 MemStatusType backrefed_mem; 391 UChar* pattern; 392 UChar* pattern_end; 393 UChar* error; 394 UChar* error_end; 395 regex_t* reg; /* for reg->names only */ 396 int num_call; 397 #ifdef USE_CALL 398 UnsetAddrList* unset_addr_list; 399 int has_call_zero; 400 #endif 401 int num_mem; 402 int num_named; 403 int mem_alloc; 404 MemEnv mem_env_static[SCANENV_MEMENV_SIZE]; 405 MemEnv* mem_env_dynamic; 406 unsigned int parse_depth; 407 408 int keep_num; 409 int save_num; 410 int save_alloc_num; 411 SaveItem* saves; 412 } ScanEnv; 413 414 415 #define IS_SYNTAX_OP(syn, opm) (((syn)->op & (opm)) != 0) 416 #define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0) 417 #define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0) 418 419 typedef struct { 420 int new_val; 421 } GroupNumRemap; 422 423 extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); 424 425 extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); 426 extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); 427 extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); 428 extern int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); 429 extern void onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode)); 430 extern void onig_node_conv_to_str_node P_((Node* node, int raw)); 431 extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); 432 extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); 433 extern void onig_node_free P_((Node* node)); 434 extern Node* onig_node_new_bag P_((enum BagType type)); 435 extern Node* onig_node_new_anchor P_((int type, int ascii_mode)); 436 extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); 437 extern Node* onig_node_new_list P_((Node* left, Node* right)); 438 extern Node* onig_node_list_add P_((Node* list, Node* x)); 439 extern Node* onig_node_new_alt P_((Node* left, Node* right)); 440 extern void onig_node_str_clear P_((Node* node)); 441 extern int onig_names_free P_((regex_t* reg)); 442 extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); 443 extern int onig_free_shared_cclass_table P_((void)); 444 extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); 445 extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node); 446 447 #ifdef USE_CALLOUT 448 extern int onig_global_callout_names_free(void); 449 #endif 450 451 #ifdef ONIG_DEBUG 452 extern int onig_print_names(FILE*, regex_t*); 453 #endif 454 455 #endif /* REGPARSE_H */ 456