1 #ifndef REGPARSE_H
2 #define REGPARSE_H
3 /**********************************************************************
4   regparse.h -  Oniguruma (regular expression library)
5 **********************************************************************/
6 /*-
7  * Copyright (c) 2002-2021  K.Kosako
8  * All rights reserved.
9  *
 * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include "regint.h"
33 
/* Sizing constants for node storage. */
#define NODE_STRING_MARGIN      16
/* Length of StrNode.buf[]:  sizeof(CClassNode) - sizeof(int)*4 */
#define NODE_STRING_BUF_SIZE    24
/* Length of BackRefNode.back_static[] */
#define NODE_BACKREFS_SIZE       6
37 
38 /* node type */
/*
 * Type tag stored in the node_type member of every node struct.
 * The numeric values are also used as shift counts by NODE_TYPE2BIT(),
 * so each tag corresponds to exactly one NODE_BIT_* mask.
 */
typedef enum {
  NODE_STRING  =  0,   /* NODE_BIT_STRING  */
  NODE_CCLASS  =  1,   /* NODE_BIT_CCLASS  */
  NODE_CTYPE   =  2,   /* NODE_BIT_CTYPE   */
  NODE_BACKREF =  3,   /* NODE_BIT_BACKREF */
  NODE_QUANT   =  4,   /* NODE_BIT_QUANT   */
  NODE_BAG     =  5,   /* NODE_BIT_BAG     */
  NODE_ANCHOR  =  6,   /* NODE_BIT_ANCHOR  */
  NODE_LIST    =  7,   /* NODE_BIT_LIST    */
  NODE_ALT     =  8,   /* NODE_BIT_ALT     */
  NODE_CALL    =  9,   /* NODE_BIT_CALL    */
  NODE_GIMMICK = 10    /* NODE_BIT_GIMMICK */
} NodeType;
52 
/*
 * Kind of a BagNode; stored in BagNode.type.
 * The enumerator list has no trailing comma, matching NodeType and
 * BodyEmptyType and staying valid for compilers that reject the
 * C99 trailing-comma form.
 */
enum BagType {
  BAG_MEMORY         = 0,
  BAG_OPTION         = 1,
  BAG_STOP_BACKTRACK = 2,
  BAG_IF_ELSE        = 3
};
59 
/*
 * Kind of a GimmickNode; stored in GimmickNode.type.
 * GIMMICK_CALLOUT exists only when USE_CALLOUT is defined.  The separating
 * comma lives inside the #ifdef so that the enumerator list never ends in
 * a trailing comma, whichever way the library is configured.
 */
enum GimmickType {
  GIMMICK_FAIL       = 0,
  GIMMICK_SAVE       = 1,
  GIMMICK_UPDATE_VAR = 2
#ifdef USE_CALLOUT
  ,
  GIMMICK_CALLOUT    = 3
#endif
};
68 
/* Emptiness classification of a quantifier body; stored in QuantNode.emptiness. */
enum BodyEmptyType {
  BODY_IS_NOT_EMPTY     = 0,
  BODY_MAY_BE_EMPTY     = 1,
  BODY_MAY_BE_EMPTY_MEM = 2,   /* NOTE(review): the _MEM / _REC suffixes are */
  BODY_MAY_BE_EMPTY_REC = 3    /* not explained here -- see the users in regcomp */
};
75 
76 /* bytes buffer */
77 typedef struct _BBuf {
78   UChar* p;
79   unsigned int used;
80   unsigned int alloc;
81 } BBuf;
82 
83 
84 struct _Node;
85 
86 typedef struct {
87   NodeType node_type;
88   int status;
89   struct _Node* parent;
90 
91   UChar* s;
92   UChar* end;
93   unsigned int flag;
94   UChar  buf[NODE_STRING_BUF_SIZE];
95   int    capacity;  /* (allocated size - 1) or 0: use buf[] */
96 } StrNode;
97 
98 typedef struct {
99   NodeType node_type;
100   int status;
101   struct _Node* parent;
102 
103   unsigned int flags;
104   BitSet bs;
105   BBuf*  mbuf;   /* multi-byte info or NULL */
106 } CClassNode;
107 
108 typedef struct {
109   NodeType node_type;
110   int status;
111   struct _Node* parent;
112   struct _Node* body;
113 
114   int lower;
115   int upper;
116   int greedy;
117   enum BodyEmptyType emptiness;
118   struct _Node* head_exact;
119   struct _Node* next_head_exact;
120   int include_referred;  /* include called node. don't eliminate even if {0} */
121   MemStatusType empty_status_mem;
122 } QuantNode;
123 
124 typedef struct {
125   NodeType node_type;
126   int status;
127   struct _Node* parent;
128   struct _Node* body;
129 
130   enum BagType type;
131   union {
132     struct {
133       int regnum;
134       AbsAddrType called_addr;
135       int entry_count;
136       int called_state;
137     } m;
138     struct {
139       OnigOptionType options;
140     } o;
141     struct {
142       /* body is condition */
143       struct _Node* Then;
144       struct _Node* Else;
145     } te;
146   };
147   /* for multiple call reference */
148   OnigLen min_len;   /* min length (byte) */
149   OnigLen max_len;   /* max length (byte) */
150   OnigLen min_char_len;
151   OnigLen max_char_len;
152   int opt_count;     /* referenced count in optimize_nodes() */
153 } BagNode;
154 
155 #ifdef USE_CALL
156 
/* One element of UnsetAddrList: an integer offset paired with a target node. */
typedef struct {
  int            offset;
  struct _Node*  target;
} UnsetAddr;
161 
162 typedef struct {
163   int        num;
164   int        alloc;
165   UnsetAddr* us;
166 } UnsetAddrList;
167 
168 typedef struct {
169   NodeType node_type;
170   int status;
171   struct _Node* parent;
172   struct _Node* body; /* to BagNode : BAG_MEMORY */
173 
174   int     by_number;
175   int     called_gnum;
176   UChar*  name;
177   UChar*  name_end;
178   int     entry_count;
179 } CallNode;
180 
181 #endif
182 
183 typedef struct {
184   NodeType node_type;
185   int status;
186   struct _Node* parent;
187 
188   int  back_num;
189   int  back_static[NODE_BACKREFS_SIZE];
190   int* back_dynamic;
191   int  nest_level;
192 } BackRefNode;
193 
194 typedef struct {
195   NodeType node_type;
196   int status;
197   struct _Node* parent;
198   struct _Node* body;
199 
200   int type;
201   OnigLen char_min_len;
202   OnigLen char_max_len;
203   int ascii_mode;
204   struct _Node* lead_node;
205 } AnchorNode;
206 
207 typedef struct {
208   NodeType node_type;
209   int status;
210   struct _Node* parent;
211 
212   struct _Node* car;
213   struct _Node* cdr;
214 } ConsAltNode;
215 
216 typedef struct {
217   NodeType node_type;
218   int status;
219   struct _Node* parent;
220 
221   int ctype;
222   int not;
223   int ascii_mode;
224 } CtypeNode;
225 
226 typedef struct {
227   NodeType node_type;
228   int status;
229   struct _Node* parent;
230 
231   enum GimmickType type;
232   int  detail_type;
233   int  num;
234   int  id;
235 } GimmickNode;
236 
237 typedef struct _Node {
238   union {
239     struct {
240       NodeType node_type;
241       int status;
242       struct _Node* parent;
243       struct _Node* body;
244     } base;
245 
246     StrNode       str;
247     CClassNode    cclass;
248     QuantNode     quant;
249     BagNode       bag;
250     BackRefNode   backref;
251     AnchorNode    anchor;
252     ConsAltNode   cons;
253     CtypeNode     ctype;
254 #ifdef USE_CALL
255     CallNode      call;
256 #endif
257     GimmickNode   gimmick;
258   } u;
259 } Node;
260 
/* Element of the map passed to onig_renumber_name_table(): one new value per entry. */
typedef struct {
  int  new_val;
} GroupNumMap;
264 
265 
/* Null pointer typed as Node*. */
#define NULL_NODE  ((Node* )0)


/* node type bit: one bit per NodeType value, for building type sets */
#define NODE_TYPE2BIT(type)  (1 << (type))

#define NODE_BIT_STRING   NODE_TYPE2BIT(NODE_STRING)
#define NODE_BIT_CCLASS   NODE_TYPE2BIT(NODE_CCLASS)
#define NODE_BIT_CTYPE    NODE_TYPE2BIT(NODE_CTYPE)
#define NODE_BIT_BACKREF  NODE_TYPE2BIT(NODE_BACKREF)
#define NODE_BIT_QUANT    NODE_TYPE2BIT(NODE_QUANT)
#define NODE_BIT_BAG      NODE_TYPE2BIT(NODE_BAG)
#define NODE_BIT_ANCHOR   NODE_TYPE2BIT(NODE_ANCHOR)
#define NODE_BIT_LIST     NODE_TYPE2BIT(NODE_LIST)
#define NODE_BIT_ALT      NODE_TYPE2BIT(NODE_ALT)
#define NODE_BIT_CALL     NODE_TYPE2BIT(NODE_CALL)
#define NODE_BIT_GIMMICK  NODE_TYPE2BIT(NODE_GIMMICK)
283 
/* Read the type tag through the `base` view of the node. */
#define NODE_TYPE(node)             ((node)->u.base.node_type)
/*
 * Store a new type tag.  The whole expansion is parenthesized so the
 * assignment stays one operand when the macro is used inside a larger
 * expression; it still works as a plain statement.
 */
#define NODE_SET_TYPE(node, ntype)  ((node)->u.base.node_type = (ntype))

/*
 * Typed views: each macro yields a pointer to one member of the Node union.
 * CALL_() is usable only when USE_CALL is defined, because Node has no
 * `call` member otherwise.
 */
#define STR_(node)         (&((node)->u.str))
#define CCLASS_(node)      (&((node)->u.cclass))
#define CTYPE_(node)       (&((node)->u.ctype))
#define BACKREF_(node)     (&((node)->u.backref))
#define QUANT_(node)       (&((node)->u.quant))
#define BAG_(node)         (&((node)->u.bag))
#define ANCHOR_(node)      (&((node)->u.anchor))
#define CONS_(node)        (&((node)->u.cons))
#define CALL_(node)        (&((node)->u.call))
#define GIMMICK_(node)     (&((node)->u.gimmick))

/* The two links of a ConsAltNode. */
#define NODE_CAR(node)     (CONS_(node)->car)
#define NODE_CDR(node)     (CONS_(node)->cdr)
300 
/*
 * Marker value stored in CtypeNode.ctype, tested by NODE_IS_ANYCHAR().
 * Parenthesized so the negative constant is always a single operand.
 */
#define CTYPE_ANYCHAR      (-1)
/* True when node is a NODE_CTYPE node whose ctype is CTYPE_ANYCHAR. */
#define NODE_IS_ANYCHAR(node) \
  (NODE_TYPE(node) == NODE_CTYPE && CTYPE_(node)->ctype == CTYPE_ANYCHAR)


/* Combined anchor masks. */
#define ANCR_ANYCHAR_INF_MASK  (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML)
#define ANCR_END_BUF_MASK      (ANCR_END_BUF | ANCR_SEMI_END_BUF)
308 
/* Bits of StrNode.flag. */
#define NODE_STRING_CRUDE           (1<<0)
#define NODE_STRING_CASE_EXPANDED   (1<<1)

/*
 * String node helpers.  Every expansion is wrapped in parentheses so each
 * macro behaves as one operand wherever it is used.
 */
/* Distance from s to end, converted to int. */
#define NODE_STRING_LEN(node) \
  ((int )((node)->u.str.end - (node)->u.str.s))
#define NODE_STRING_SET_CRUDE(node) \
  ((node)->u.str.flag |= NODE_STRING_CRUDE)
#define NODE_STRING_CLEAR_CRUDE(node) \
  ((node)->u.str.flag &= ~NODE_STRING_CRUDE)
#define NODE_STRING_SET_CASE_EXPANDED(node) \
  ((node)->u.str.flag |= NODE_STRING_CASE_EXPANDED)
#define NODE_STRING_IS_CRUDE(node) \
  (((node)->u.str.flag & NODE_STRING_CRUDE) != 0)
#define NODE_STRING_IS_CASE_EXPANDED(node) \
  (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0)
320 
/*
 * Select the back reference array of a BackRefNode: back_dynamic when it
 * is non-NULL, the embedded back_static[] otherwise.
 */
#define BACKREFS_P(br)                  \
  (IS_NOT_NULL((br)->back_dynamic)      \
     ? (br)->back_dynamic               \
     : (br)->back_static)
323 
324 /* node status bits */
/*
 * Bits of the `status` member shared by all nodes.  They are set, cleared
 * and tested through NODE_STATUS_ADD / NODE_STATUS_REMOVE / NODE_IS_*.
 */
#define NODE_ST_FIXED_MIN                 (1 << 0)
#define NODE_ST_FIXED_MAX                 (1 << 1)
#define NODE_ST_FIXED_CLEN                (1 << 2)
#define NODE_ST_MARK1                     (1 << 3)
#define NODE_ST_MARK2                     (1 << 4)
#define NODE_ST_STRICT_REAL_REPEAT        (1 << 5)
#define NODE_ST_RECURSION                 (1 << 6)
#define NODE_ST_CALLED                    (1 << 7)
#define NODE_ST_FIXED_ADDR                (1 << 8)
#define NODE_ST_NAMED_GROUP               (1 << 9)
#define NODE_ST_IN_REAL_REPEAT            (1 << 10)  /* STK_REPEAT is nested in stack. */
#define NODE_ST_IN_ZERO_REPEAT            (1 << 11)  /* (....){0} */
#define NODE_ST_IN_MULTI_ENTRY            (1 << 12)
#define NODE_ST_NEST_LEVEL                (1 << 13)
#define NODE_ST_BY_NUMBER                 (1 << 14)  /* {n,m} */
#define NODE_ST_BY_NAME                   (1 << 15)  /* backref by name */
#define NODE_ST_BACKREF                   (1 << 16)
#define NODE_ST_CHECKER                   (1 << 17)
#define NODE_ST_PROHIBIT_RECURSION        (1 << 18)
#define NODE_ST_SUPER                     (1 << 19)
#define NODE_ST_EMPTY_STATUS_CHECK        (1 << 20)
#define NODE_ST_IGNORECASE                (1 << 21)
#define NODE_ST_MULTILINE                 (1 << 22)
#define NODE_ST_TEXT_SEGMENT_WORD         (1 << 23)
#define NODE_ST_ABSENT_WITH_SIDE_EFFECTS  (1 << 24)  /* stopper or clear */
#define NODE_ST_FIXED_CLEN_MIN_SURE       (1 << 25)
#define NODE_ST_REFERENCED                (1 << 26)
#define NODE_ST_INPEEK                    (1 << 27)
353 
354 
/*
 * Access the status word of any node struct by viewing it as a Node.
 * The argument is parenthesized before the cast so that an expression
 * argument (e.g. `p + n`) is converted as a whole rather than having the
 * cast bind to its first operand only.
 */
#define NODE_STATUS(node)           (((Node* )(node))->u.base.status)
/* Set / clear one status bit; f is the NODE_ST_ name without its prefix. */
#define NODE_STATUS_ADD(node,f)     (NODE_STATUS(node) |= (NODE_ST_ ## f))
#define NODE_STATUS_REMOVE(node,f)  (NODE_STATUS(node) &= ~(NODE_ST_ ## f))
358 
/*
 * Status predicates: each one is non-zero exactly when the matching
 * NODE_ST_* bit is set in NODE_STATUS(node).
 */
#define NODE_IS_BY_NUMBER(node) \
    ((NODE_STATUS(node) & NODE_ST_BY_NUMBER) != 0)
#define NODE_IS_IN_REAL_REPEAT(node) \
    ((NODE_STATUS(node) & NODE_ST_IN_REAL_REPEAT) != 0)
#define NODE_IS_CALLED(node) \
    ((NODE_STATUS(node) & NODE_ST_CALLED) != 0)
#define NODE_IS_IN_MULTI_ENTRY(node) \
    ((NODE_STATUS(node) & NODE_ST_IN_MULTI_ENTRY) != 0)
#define NODE_IS_RECURSION(node) \
    ((NODE_STATUS(node) & NODE_ST_RECURSION) != 0)
#define NODE_IS_IN_ZERO_REPEAT(node) \
    ((NODE_STATUS(node) & NODE_ST_IN_ZERO_REPEAT) != 0)
#define NODE_IS_NAMED_GROUP(node) \
    ((NODE_STATUS(node) & NODE_ST_NAMED_GROUP) != 0)
#define NODE_IS_FIXED_ADDR(node) \
    ((NODE_STATUS(node) & NODE_ST_FIXED_ADDR) != 0)
#define NODE_IS_FIXED_CLEN(node) \
    ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN) != 0)
#define NODE_IS_FIXED_MIN(node) \
    ((NODE_STATUS(node) & NODE_ST_FIXED_MIN) != 0)
#define NODE_IS_FIXED_MAX(node) \
    ((NODE_STATUS(node) & NODE_ST_FIXED_MAX) != 0)
#define NODE_IS_MARK1(node) \
    ((NODE_STATUS(node) & NODE_ST_MARK1) != 0)
#define NODE_IS_MARK2(node) \
    ((NODE_STATUS(node) & NODE_ST_MARK2) != 0)
#define NODE_IS_NEST_LEVEL(node) \
    ((NODE_STATUS(node) & NODE_ST_NEST_LEVEL) != 0)
#define NODE_IS_BY_NAME(node) \
    ((NODE_STATUS(node) & NODE_ST_BY_NAME) != 0)
#define NODE_IS_BACKREF(node) \
    ((NODE_STATUS(node) & NODE_ST_BACKREF) != 0)
#define NODE_IS_CHECKER(node) \
    ((NODE_STATUS(node) & NODE_ST_CHECKER) != 0)
#define NODE_IS_SUPER(node) \
    ((NODE_STATUS(node) & NODE_ST_SUPER) != 0)
#define NODE_IS_PROHIBIT_RECURSION(node) \
    ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0)
#define NODE_IS_STRICT_REAL_REPEAT(node) \
    ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0)
#define NODE_IS_EMPTY_STATUS_CHECK(node) \
    ((NODE_STATUS(node) & NODE_ST_EMPTY_STATUS_CHECK) != 0)
#define NODE_IS_IGNORECASE(node) \
    ((NODE_STATUS(node) & NODE_ST_IGNORECASE) != 0)
#define NODE_IS_MULTILINE(node) \
    ((NODE_STATUS(node) & NODE_ST_MULTILINE) != 0)
#define NODE_IS_TEXT_SEGMENT_WORD(node) \
    ((NODE_STATUS(node) & NODE_ST_TEXT_SEGMENT_WORD) != 0)
#define NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) \
    ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0)
#define NODE_IS_FIXED_CLEN_MIN_SURE(node) \
    ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0)
#define NODE_IS_REFERENCED(node) \
    ((NODE_STATUS(node) & NODE_ST_REFERENCED) != 0)
#define NODE_IS_INPEEK(node) \
    ((NODE_STATUS(node) & NODE_ST_INPEEK) != 0)
390 
/* Links reached through the `base` view of a Node. */
#define NODE_PARENT(node)       ((node)->u.base.parent)
#define NODE_BODY(node)         ((node)->u.base.body)
/*
 * Body link of a per-type struct pointer (QuantNode*, BagNode*, CallNode*,
 * AnchorNode*): these read the `body` member directly, not via the union.
 */
#define NODE_QUANT_BODY(node)   ((node)->body)
#define NODE_BAG_BODY(node)     ((node)->body)
#define NODE_CALL_BODY(node)    ((node)->body)
#define NODE_ANCHOR_BODY(node)  ((node)->body)
397 
/* Length of ParseEnv.mem_env_static[]. */
#define PARSEENV_MEMENV_SIZE  8
/*
 * Select the MemEnv array of a ParseEnv: mem_env_dynamic when it is
 * non-NULL, the embedded mem_env_static[] otherwise.
 */
#define PARSEENV_MEMENV(senv)               \
  (IS_NOT_NULL((senv)->mem_env_dynamic)     \
     ? (senv)->mem_env_dynamic              \
     : (senv)->mem_env_static)
402 
/* Non-zero when any bit of the mask is set in the syntax's op / op2 / behavior word. */
#define IS_SYNTAX_OP(syn, opm) \
  (((syn)->op & (opm)) != 0)
#define IS_SYNTAX_OP2(syn, opm) \
  (((syn)->op2 & (opm)) != 0)
#define IS_SYNTAX_BV(syn, bvm) \
  (((syn)->behavior & (bvm)) != 0)
406 
/*
 * Hand out the next id: stores the current (env)->id_num into id, then
 * increments the counter.  Wrapped in do/while(0) so it acts as a single
 * statement.
 */
#define ID_ENTRY(env, id)         \
  do {                            \
    (id) = (env)->id_num++;       \
  } while (0)
410 
411 
412 typedef struct {
413   Node* mem_node;
414   Node* empty_repeat_node;
415 } MemEnv;
416 
417 typedef struct {
418   enum SaveType type;
419 } SaveItem;
420 
421 typedef struct {
422   OnigOptionType   options;
423   OnigCaseFoldType case_fold_flag;
424   OnigEncoding     enc;
425   OnigSyntaxType*  syntax;
426   MemStatusType    cap_history;
427   MemStatusType    backtrack_mem; /* backtrack/recursion */
428   MemStatusType    backrefed_mem;
429   UChar*           pattern;
430   UChar*           pattern_end;
431   UChar*           error;
432   UChar*           error_end;
433   regex_t*         reg;       /* for reg->names only */
434   int              num_call;
435   int              num_mem;
436   int              num_named;
437   int              mem_alloc;
438   MemEnv           mem_env_static[PARSEENV_MEMENV_SIZE];
439   MemEnv*          mem_env_dynamic;
440   int              backref_num;
441   int              keep_num;
442   int              id_num;
443   int              save_alloc_num;
444   SaveItem*        saves;
445 #ifdef USE_CALL
446   UnsetAddrList*   unset_addr_list;
447   int              has_call_zero;
448 #endif
449   unsigned int     parse_depth;
450 #ifdef ONIG_DEBUG_PARSE
451   unsigned int     max_parse_depth;
452 #endif
453 } ParseEnv;
454 
455 
456 extern int    onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map));
457 
458 extern int    onig_strncmp P_((const UChar* s1, const UChar* s2, int n));
459 extern void   onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end));
460 extern void   onig_scan_env_set_error_string P_((ParseEnv* env, int ecode, UChar* arg, UChar* arg_end));
461 extern int    onig_reduce_nested_quantifier P_((Node* pnode));
462 extern int    onig_node_copy(Node** rcopy, Node* from);
463 extern int    onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end));
464 extern int    onig_node_str_set P_((Node* node, const UChar* s, const UChar* end, int need_free));
465 extern void   onig_node_str_clear P_((Node* node, int need_free));
466 extern void   onig_node_free P_((Node* node));
467 extern int    onig_node_reset_empty P_((Node* node));
468 extern int    onig_node_reset_fail P_((Node* node));
469 extern Node*  onig_node_new_bag P_((enum BagType type));
470 extern Node*  onig_node_new_str P_((const UChar* s, const UChar* end));
471 extern Node*  onig_node_new_list P_((Node* left, Node* right));
472 extern Node*  onig_node_new_alt P_((Node* left, Node* right));
473 extern int    onig_names_free P_((regex_t* reg));
474 extern int    onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ParseEnv* env));
475 extern int    onig_free_shared_cclass_table P_((void));
476 extern int    onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc));
477 extern int    onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]);
478 extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node);
479 
480 #ifdef USE_CALLOUT
481 extern int onig_global_callout_names_free(void);
482 #endif
483 
484 #ifdef ONIG_DEBUG
485 extern int onig_print_names(FILE*, regex_t*);
486 #endif
487 
488 #endif /* REGPARSE_H */
489