1 #ifndef REGPARSE_H
2 #define REGPARSE_H
3 /**********************************************************************
4   regparse.h -  Oniguruma (regular expression library)
5 **********************************************************************/
6 /*-
7  * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include "regint.h"
33 
/* NOTE(review): NODE_STRING_MARGIN is not referenced anywhere in this
   header; its use must be confirmed in the implementation file. */
#define NODE_STRING_MARGIN         16
/* Length of the inline array StrNode.buf[]. */
#define NODE_STRING_BUF_SIZE       24  /* sizeof(CClassNode) - sizeof(int)*4 */
/* Length of the inline array BackRefNode.back_static[]. */
#define NODE_BACKREFS_SIZE          6
37 
/* node type */
/* Type tag stored in the leading node_type member of every node struct.
   The numeric values are also used as shift counts by NODE_TYPE2BIT(),
   which builds the NODE_BIT_* masks from them. */
typedef enum {
  NODE_STRING    =  0,
  NODE_CCLASS    =  1,
  NODE_CTYPE     =  2,
  NODE_BACKREF   =  3,
  NODE_QUANT     =  4,
  NODE_BAG       =  5,
  NODE_ANCHOR    =  6,
  NODE_LIST      =  7,
  NODE_ALT       =  8,
  NODE_CALL      =  9,
  NODE_GIMMICK   = 10
} NodeType;
52 
/* Kind of a bag node; stored in BagNode.type and taken as the argument of
   onig_node_new_bag(). */
enum BagType {
  BAG_MEMORY         = 0,
  BAG_OPTION         = 1,
  BAG_STOP_BACKTRACK = 2,
  BAG_IF_ELSE        = 3,
};
59 
/* Kind of a gimmick node; stored in GimmickNode.type.
   GIMMICK_CALLOUT is only declared when USE_CALLOUT is defined. */
enum GimmickType {
  GIMMICK_FAIL       = 0,
  GIMMICK_SAVE       = 1,
  GIMMICK_UPDATE_VAR = 2,
#ifdef USE_CALLOUT
  GIMMICK_CALLOUT    = 3,
#endif
};
68 
/* Value type of QuantNode.emptiness.
   NOTE(review): the names suggest a classification of whether the
   quantifier body may match the empty string (plain / involving captures /
   involving recursion) -- confirm against the code that sets it. */
enum BodyEmptyType {
  BODY_IS_NOT_EMPTY             = 0,
  BODY_IS_EMPTY_POSSIBILITY     = 1,
  BODY_IS_EMPTY_POSSIBILITY_MEM = 2,
  BODY_IS_EMPTY_POSSIBILITY_REC = 3
};
75 
/* String node (the Node.u.str member).  Begins with the same
   node_type/status pair as every other node struct. */
typedef struct {
  NodeType node_type;
  int status;

  UChar* s;           /* start of the string data */
  UChar* end;         /* end of the data; NODE_STRING_LEN() computes end - s */
  unsigned int flag;  /* NODE_STRING_* bits */
  int    capacity;    /* (allocated size - 1) or 0: use buf[] */
  UChar  buf[NODE_STRING_BUF_SIZE];  /* inline storage, see capacity */
} StrNode;
86 
/* Character class node (the Node.u.cclass member).  Also the type taken
   by onig_is_code_in_cc(). */
typedef struct {
  NodeType node_type;
  int status;

  unsigned int flags;
  BitSet bs;     /* NOTE(review): BitSet is declared outside this header; presumably the single-byte membership set -- confirm */
  BBuf*  mbuf;   /* multi-byte info or NULL */
} CClassNode;
95 
/* Quantifier node (the Node.u.quant member).  `body` occupies the same
   position as Node.u.base.body, so both NODE_BODY() on the Node and
   NODE_QUANT_BODY() on the QuantNode reach it. */
typedef struct {
  NodeType node_type;
  int status;
  struct _Node* body;

  int lower;    /* NOTE(review): presumably the minimum repeat count -- confirm */
  int upper;    /* NOTE(review): presumably the maximum repeat count -- confirm */
  int greedy;
  enum BodyEmptyType emptiness;
  struct _Node* head_exact;
  struct _Node* next_head_exact;
  int is_refered;     /* include called node. don't eliminate even if {0} */
} QuantNode;
109 
/* Bag node (the Node.u.bag member).  `type` holds the enum BagType kind.
   The per-kind data lives in an anonymous union, so its members are
   reached directly as bag->m, bag->o and bag->te.
   NOTE(review): m / o / te appear to belong to BAG_MEMORY / BAG_OPTION /
   BAG_IF_ELSE respectively -- confirm against the users of this struct. */
typedef struct {
  NodeType node_type;
  int status;
  struct _Node* body;

  enum BagType type;
  union {
    struct {
      int regnum;
      AbsAddrType called_addr;
      int entry_count;
      int called_state;
    } m;
    struct {
      OnigOptionType options;
    } o;
    struct {
      /* body is condition */
      struct _Node* Then;
      struct _Node* Else;
    } te;
  };
  /* for multiple call reference */
  OnigLen min_len;   /* min length (byte) */
  OnigLen max_len;   /* max length (byte) */
  int char_len;      /* character length  */
  int opt_count;     /* referenced count in optimize_nodes() */
} BagNode;
138 
139 #ifdef USE_CALL
140 
/* One element of UnsetAddrList: an integer offset paired with a node
   pointer.
   NOTE(review): presumably a call site whose target address is not yet
   known -- confirm in the compiler code that fills the list. */
typedef struct {
  int           offset;
  struct _Node* target;
} UnsetAddr;
145 
/* Array of UnsetAddr entries; referenced from ScanEnv.unset_addr_list.
   NOTE(review): num / alloc look like used / allocated element counts of
   `us` -- confirm against the code that manages the list. */
typedef struct {
  int        num;
  int        alloc;
  UnsetAddr* us;
} UnsetAddrList;
151 
/* Call node (the Node.u.call member); only declared when USE_CALL is
   defined. */
typedef struct {
  NodeType node_type;
  int status;
  struct _Node* body; /* to BagNode : BAG_MEMORY */

  int     by_number;
  int     group_num;
  UChar*  name;       /* start of the name text */
  UChar*  name_end;   /* end of the name text */
  int     entry_count;
} CallNode;
163 
164 #endif
165 
/* Back-reference node (the Node.u.backref member).  The reference list is
   read through BACKREFS_P(), which yields back_dynamic when it is
   non-NULL and the inline back_static[] array otherwise. */
typedef struct {
  NodeType node_type;
  int status;

  int  back_num;
  int  back_static[NODE_BACKREFS_SIZE];
  int* back_dynamic;
  int  nest_level;
} BackRefNode;
175 
/* Anchor node (the Node.u.anchor member).  `type` and `ascii_mode`
   correspond to the two arguments of onig_node_new_anchor(). */
typedef struct {
  NodeType node_type;
  int status;
  struct _Node* body;

  int type;       /* NOTE(review): presumably an ANCR_* value -- confirm */
  int char_len;
  int ascii_mode;
} AnchorNode;
185 
/* Cons cell (the Node.u.cons member), accessed through NODE_CAR() and
   NODE_CDR().
   NOTE(review): the name indicates it backs both NODE_LIST and NODE_ALT
   nodes -- confirm in the node constructors. */
typedef struct {
  NodeType node_type;
  int status;

  struct _Node* car;
  struct _Node* cdr;
} ConsAltNode;
193 
/* Character type node (the Node.u.ctype member). */
typedef struct {
  NodeType node_type;
  int status;

  int ctype;               /* compared against CTYPE_ANYCHAR by NODE_IS_ANYCHAR() */
  int not;
  OnigOptionType options;  /* selected by CTYPE_OPTION() when the node has NODE_ST_FIXED_OPTION */
  int ascii_mode;
} CtypeNode;
203 
/* Gimmick node (the Node.u.gimmick member).  `type` holds the enum
   GimmickType kind.
   NOTE(review): the meaning of detail_type / num / id depends on `type`
   and is not visible from this header. */
typedef struct {
  NodeType node_type;
  int status;

  enum GimmickType type;
  int  detail_type;
  int  num;
  int  id;
} GimmickNode;
213 
/* Parse tree node: a union over every node kind.  Each member struct
   starts with `NodeType node_type; int status;`, so u.base can read those
   two fields whatever the actual kind is.  u.base.body is only meaningful
   for the kinds that declare `body` as their third member (QuantNode,
   BagNode, AnchorNode, CallNode).  The `call` member exists only when
   USE_CALL is defined. */
typedef struct _Node {
  union {
    struct {
      NodeType node_type;
      int status;
      struct _Node* body;
    } base;

    StrNode       str;
    CClassNode    cclass;
    QuantNode     quant;
    BagNode       bag;
    BackRefNode   backref;
    AnchorNode    anchor;
    ConsAltNode   cons;
    CtypeNode     ctype;
#ifdef USE_CALL
    CallNode      call;
#endif
    GimmickNode   gimmick;
  } u;
} Node;
236 
/* Null pointer constant typed as Node*. */
#define NULL_NODE  ((Node* )0)
238 
239 
/* node type bit */
/* NODE_TYPE2BIT() turns a NodeType value into a single-bit mask (1 shifted
   left by the type value); NODE_BIT_* are the precomputed masks, one per
   node type. */
#define NODE_TYPE2BIT(type)      (1<<(type))

#define NODE_BIT_STRING     NODE_TYPE2BIT(NODE_STRING)
#define NODE_BIT_CCLASS     NODE_TYPE2BIT(NODE_CCLASS)
#define NODE_BIT_CTYPE      NODE_TYPE2BIT(NODE_CTYPE)
#define NODE_BIT_BACKREF    NODE_TYPE2BIT(NODE_BACKREF)
#define NODE_BIT_QUANT      NODE_TYPE2BIT(NODE_QUANT)
#define NODE_BIT_BAG        NODE_TYPE2BIT(NODE_BAG)
#define NODE_BIT_ANCHOR     NODE_TYPE2BIT(NODE_ANCHOR)
#define NODE_BIT_LIST       NODE_TYPE2BIT(NODE_LIST)
#define NODE_BIT_ALT        NODE_TYPE2BIT(NODE_ALT)
#define NODE_BIT_CALL       NODE_TYPE2BIT(NODE_CALL)
#define NODE_BIT_GIMMICK    NODE_TYPE2BIT(NODE_GIMMICK)
254 
/* Type tag of a node, read through the u.base view (usable as an lvalue). */
#define NODE_TYPE(node)             ((node)->u.base.node_type)
/* Store `ntype` as the node's type tag.  The assignment is wrapped in
   parentheses so the macro expands to one self-contained expression (its
   value is the stored tag) and cannot be re-associated with neighbouring
   operators at the expansion site. */
#define NODE_SET_TYPE(node, ntype)   ((node)->u.base.node_type = (ntype))
257 
/* Typed views of a Node*: each macro yields a pointer to the corresponding
   member of Node.u.  The macros themselves do not look at NODE_TYPE().
   CALL_() can only be expanded when USE_CALL is defined, because
   Node.u.call is declared under that #ifdef. */
#define STR_(node)         (&((node)->u.str))
#define CCLASS_(node)      (&((node)->u.cclass))
#define CTYPE_(node)       (&((node)->u.ctype))
#define BACKREF_(node)     (&((node)->u.backref))
#define QUANT_(node)       (&((node)->u.quant))
#define BAG_(node)         (&((node)->u.bag))
#define ANCHOR_(node)      (&((node)->u.anchor))
#define CONS_(node)        (&((node)->u.cons))
#define CALL_(node)        (&((node)->u.call))
#define GIMMICK_(node)     (&((node)->u.gimmick))

/* car / cdr of the ConsAltNode view of a Node* (both are lvalues). */
#define NODE_CAR(node)         (CONS_(node)->car)
#define NODE_CDR(node)         (CONS_(node)->cdr)
271 
/* Special CtypeNode.ctype value tested by NODE_IS_ANYCHAR().  Parenthesized
   so the negative constant stays a single operand wherever it is expanded. */
#define CTYPE_ANYCHAR      (-1)
/* True when `node` is a NODE_CTYPE node whose ctype is CTYPE_ANYCHAR. */
#define NODE_IS_ANYCHAR(node) \
  (NODE_TYPE(node) == NODE_CTYPE && CTYPE_(node)->ctype == CTYPE_ANYCHAR)

/* Options that apply to a ctype node: the node's own `options` when it
   carries NODE_ST_FIXED_OPTION, otherwise the options of `reg`.
   `reg` is parenthesized so that any pointer-valued expression (for
   example `base + i`) can be passed, not just a plain identifier. */
#define CTYPE_OPTION(node, reg) \
  (NODE_IS_FIXED_OPTION(node) ? CTYPE_(node)->options : (reg)->options)
278 
279 
/* Combined masks: each ORs together two ANCR_* bits (the ANCR_* constants
   themselves are defined outside this header). */
#define ANCR_ANYCHAR_INF_MASK  (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML)
#define ANCR_END_BUF_MASK      (ANCR_END_BUF | ANCR_SEMI_END_BUF)
282 
/* Bits of StrNode.flag. */
#define NODE_STRING_RAW                (1<<0) /* by backslashed number */
#define NODE_STRING_AMBIG              (1<<1)
#define NODE_STRING_GOOD_AMBIG         (1<<2)
#define NODE_STRING_DONT_GET_OPT_INFO  (1<<3)

/* Distance from s to end of a string node, converted to int.
   The whole expansion is parenthesized so it is a single operand. */
#define NODE_STRING_LEN(node)            ((int )((node)->u.str.end - (node)->u.str.s))

/* Set / clear one flag bit on a string node (`node` is a Node*).
   Each expansion is a fully parenthesized assignment expression, so it
   can be used as a statement or embedded in a larger expression; its
   value is the updated flag word. */
#define NODE_STRING_SET_RAW(node)        ((node)->u.str.flag |= NODE_STRING_RAW)
#define NODE_STRING_CLEAR_RAW(node)      ((node)->u.str.flag &= ~NODE_STRING_RAW)
#define NODE_STRING_SET_AMBIG(node)      ((node)->u.str.flag |= NODE_STRING_AMBIG)
#define NODE_STRING_SET_GOOD_AMBIG(node) ((node)->u.str.flag |= NODE_STRING_GOOD_AMBIG)
#define NODE_STRING_SET_DONT_GET_OPT_INFO(node) \
  ((node)->u.str.flag |= NODE_STRING_DONT_GET_OPT_INFO)

/* Flag tests: evaluate to 1 when the bit is set, 0 otherwise. */
#define NODE_STRING_IS_RAW(node) \
  (((node)->u.str.flag & NODE_STRING_RAW) != 0)
#define NODE_STRING_IS_AMBIG(node) \
  (((node)->u.str.flag & NODE_STRING_AMBIG) != 0)
#define NODE_STRING_IS_GOOD_AMBIG(node) \
  (((node)->u.str.flag & NODE_STRING_GOOD_AMBIG) != 0)
#define NODE_STRING_IS_DONT_GET_OPT_INFO(node) \
  (((node)->u.str.flag & NODE_STRING_DONT_GET_OPT_INFO) != 0)
303 
/* Pointer to the active back-reference array of a BackRefNode*: the
   back_dynamic pointer when IS_NOT_NULL() holds for it, otherwise the
   inline back_static[] array.  `br` is evaluated more than once. */
#define BACKREFS_P(br) \
  (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static)
306 
/* node status bits */
/* Flags kept in the `status` member shared by all node structs.  They are
   read and modified through NODE_STATUS(), NODE_STATUS_ADD(),
   NODE_STATUS_REMOVE() and the NODE_IS_*() predicates; the ADD/REMOVE
   macros take the name without the NODE_ST_ prefix. */
#define NODE_ST_MIN_FIXED             (1<<0)
#define NODE_ST_MAX_FIXED             (1<<1)
#define NODE_ST_CLEN_FIXED            (1<<2)
#define NODE_ST_MARK1                 (1<<3)
#define NODE_ST_MARK2                 (1<<4)
#define NODE_ST_STRICT_REAL_REPEAT    (1<<5)
#define NODE_ST_RECURSION             (1<<6)
#define NODE_ST_CALLED                (1<<7)
#define NODE_ST_ADDR_FIXED            (1<<8)
#define NODE_ST_NAMED_GROUP           (1<<9)
#define NODE_ST_IN_REAL_REPEAT        (1<<10) /* STK_REPEAT is nested in stack. */
#define NODE_ST_IN_ZERO_REPEAT        (1<<11) /* (....){0} */
#define NODE_ST_IN_MULTI_ENTRY        (1<<12)
#define NODE_ST_NEST_LEVEL            (1<<13)
#define NODE_ST_BY_NUMBER             (1<<14) /* {n,m} */
#define NODE_ST_BY_NAME               (1<<15) /* backref by name */
#define NODE_ST_BACKREF               (1<<16)
#define NODE_ST_CHECKER               (1<<17)
#define NODE_ST_FIXED_OPTION          (1<<18)
#define NODE_ST_PROHIBIT_RECURSION    (1<<19)
#define NODE_ST_SUPER                 (1<<20)
329 
330 
/* Status word of a node.  The argument may be a Node* or a pointer to any
   of the per-kind node structs (they all start with node_type/status); it
   is cast to Node* first.  The argument is parenthesized before the cast
   so that a compound expression such as `p + 1` or `c ? a : b` is
   converted as a whole instead of having the cast bind to its first
   operand only. */
#define NODE_STATUS(node)           (((Node* )(node))->u.base.status)
/* Set / clear a status bit; `f` is the flag name without the NODE_ST_
   prefix, e.g. NODE_STATUS_ADD(node, CALLED). */
#define NODE_STATUS_ADD(node,f)     (NODE_STATUS(node) |= (NODE_ST_ ## f))
#define NODE_STATUS_REMOVE(node,f)  (NODE_STATUS(node) &= ~(NODE_ST_ ## f))

/* Status predicates: each evaluates to 1 when the corresponding
   NODE_ST_* bit is set in the node's status, 0 otherwise. */
#define NODE_IS_BY_NUMBER(node)       ((NODE_STATUS(node) & NODE_ST_BY_NUMBER)      != 0)
#define NODE_IS_IN_REAL_REPEAT(node)  ((NODE_STATUS(node) & NODE_ST_IN_REAL_REPEAT) != 0)
#define NODE_IS_CALLED(node)          ((NODE_STATUS(node) & NODE_ST_CALLED)         != 0)
#define NODE_IS_IN_MULTI_ENTRY(node)  ((NODE_STATUS(node) & NODE_ST_IN_MULTI_ENTRY) != 0)
#define NODE_IS_RECURSION(node)       ((NODE_STATUS(node) & NODE_ST_RECURSION)      != 0)
#define NODE_IS_IN_ZERO_REPEAT(node)  ((NODE_STATUS(node) & NODE_ST_IN_ZERO_REPEAT) != 0)
#define NODE_IS_NAMED_GROUP(node)     ((NODE_STATUS(node) & NODE_ST_NAMED_GROUP)  != 0)
#define NODE_IS_ADDR_FIXED(node)      ((NODE_STATUS(node) & NODE_ST_ADDR_FIXED)   != 0)
#define NODE_IS_CLEN_FIXED(node)      ((NODE_STATUS(node) & NODE_ST_CLEN_FIXED)   != 0)
#define NODE_IS_MIN_FIXED(node)       ((NODE_STATUS(node) & NODE_ST_MIN_FIXED)    != 0)
#define NODE_IS_MAX_FIXED(node)       ((NODE_STATUS(node) & NODE_ST_MAX_FIXED)    != 0)
#define NODE_IS_MARK1(node)           ((NODE_STATUS(node) & NODE_ST_MARK1)        != 0)
#define NODE_IS_MARK2(node)           ((NODE_STATUS(node) & NODE_ST_MARK2)        != 0)
#define NODE_IS_NEST_LEVEL(node)      ((NODE_STATUS(node) & NODE_ST_NEST_LEVEL)   != 0)
#define NODE_IS_BY_NAME(node)         ((NODE_STATUS(node) & NODE_ST_BY_NAME)      != 0)
#define NODE_IS_BACKREF(node)         ((NODE_STATUS(node) & NODE_ST_BACKREF)      != 0)
#define NODE_IS_CHECKER(node)         ((NODE_STATUS(node) & NODE_ST_CHECKER)      != 0)
#define NODE_IS_FIXED_OPTION(node)    ((NODE_STATUS(node) & NODE_ST_FIXED_OPTION) != 0)
#define NODE_IS_SUPER(node)           ((NODE_STATUS(node) & NODE_ST_SUPER)        != 0)
#define NODE_IS_PROHIBIT_RECURSION(node) \
    ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0)
#define NODE_IS_STRICT_REAL_REPEAT(node) \
    ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0)
358 
/* Body accessors.  NODE_BODY() takes a Node* and goes through the u.base
   view.  The other four expand to (node)->body and therefore take a
   pointer to a struct that has a direct `body` member (QuantNode*,
   BagNode*, CallNode*, AnchorNode*), not a Node*. */
#define NODE_BODY(node)           ((node)->u.base.body)
#define NODE_QUANT_BODY(node)     ((node)->body)
#define NODE_BAG_BODY(node)       ((node)->body)
#define NODE_CALL_BODY(node)      ((node)->body)
#define NODE_ANCHOR_BODY(node)    ((node)->body)
364 
/* Length of the inline array ScanEnv.mem_env_static[]. */
#define SCANENV_MEMENV_SIZE               8
/* Active MemEnv array of a ScanEnv*: mem_env_dynamic when IS_NOT_NULL()
   holds for it, otherwise the inline mem_env_static[] array.  `senv` is
   evaluated more than once. */
#define SCANENV_MEMENV(senv) \
 (IS_NOT_NULL((senv)->mem_env_dynamic) ? \
    (senv)->mem_env_dynamic : (senv)->mem_env_static)
369 
/* Element type of ScanEnv.mem_env_static[] / mem_env_dynamic: holds a
   single Node pointer.  The fields inside `#if 0` are compiled out. */
typedef struct {
  Node* node;
#if 0
  int in;
  int recursion;
#endif
} MemEnv;
377 
/* Element type of ScanEnv.saves: records an enum SaveType value (the enum
   is declared outside this header). */
typedef struct {
  enum SaveType type;
} SaveItem;
381 
/* Environment passed to onig_parse_tree() and
   onig_scan_env_set_error_string().
   NOTE(review): the per-field notes below are limited to what this header
   shows; the remaining fields are maintained by code outside this file. */
typedef struct {
  OnigOptionType   options;
  OnigCaseFoldType case_fold_flag;
  OnigEncoding     enc;
  OnigSyntaxType*  syntax;
  MemStatusType    capture_history;
  MemStatusType    bt_mem_start;
  MemStatusType    bt_mem_end;
  MemStatusType    backrefed_mem;
  UChar*           pattern;
  UChar*           pattern_end;
  UChar*           error;
  UChar*           error_end;
  regex_t*         reg;       /* for reg->names only */
  int              num_call;
#ifdef USE_CALL
  /* these two members exist only when USE_CALL is defined */
  UnsetAddrList*   unset_addr_list;
  int              has_call_zero;
#endif
  int              num_mem;
  int              num_named;
  int              mem_alloc;
  /* SCANENV_MEMENV() selects mem_env_dynamic when it is non-NULL and this
     inline array otherwise */
  MemEnv           mem_env_static[SCANENV_MEMENV_SIZE];
  MemEnv*          mem_env_dynamic;
  unsigned int     parse_depth;

  int keep_num;
  int save_num;
  int save_alloc_num;
  SaveItem* saves;
} ScanEnv;
413 
414 
/* Syntax predicates: test whether any bit of the given mask is set in the
   op, op2 or behavior member of the syntax object `syn` points to.  Each
   evaluates to 1 or 0. */
#define IS_SYNTAX_OP(syn, opm)    (((syn)->op  & (opm)) != 0)
#define IS_SYNTAX_OP2(syn, opm)   (((syn)->op2 & (opm)) != 0)
#define IS_SYNTAX_BV(syn, bvm)    (((syn)->behavior & (bvm)) != 0)
418 
/* Map entry passed (as an array pointer) to onig_renumber_name_table().
   NOTE(review): presumably indexed by the old group number, with new_val
   holding the replacement number -- confirm at the call site. */
typedef struct {
  int new_val;
} GroupNumRemap;
422 
/* Functions declared for use by other translation units of the library;
   their definitions are not part of this header.  Parameter lists are
   wrapped in P_(( )) (a macro defined outside this header), except for
   onig_get_tiny_min_len, which is declared with a plain prototype. */
extern int    onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map));

extern int    onig_strncmp P_((const UChar* s1, const UChar* s2, int n));
extern void   onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end));
extern void   onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end));
extern int    onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc));
extern void   onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode));
extern void   onig_node_conv_to_str_node P_((Node* node, int raw));
extern int    onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end));
extern int    onig_node_str_set P_((Node* node, const UChar* s, const UChar* end));
extern void   onig_node_free P_((Node* node));
extern Node*  onig_node_new_bag P_((enum BagType type));
extern Node*  onig_node_new_anchor P_((int type, int ascii_mode));
extern Node*  onig_node_new_str P_((const UChar* s, const UChar* end));
extern Node*  onig_node_new_list P_((Node* left, Node* right));
extern Node*  onig_node_list_add P_((Node* list, Node* x));
extern Node*  onig_node_new_alt P_((Node* left, Node* right));
extern void   onig_node_str_clear P_((Node* node));
extern int    onig_names_free P_((regex_t* reg));
extern int    onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env));
extern int    onig_free_shared_cclass_table P_((void));
extern int    onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc));
extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node);
446 
/* Declared only when USE_CALLOUT is defined; the definition is not part
   of this header. */
#ifdef USE_CALLOUT
extern int onig_global_callout_names_free(void);
#endif
450 
/* Declared only when ONIG_DEBUG is defined; takes a FILE* and a regex_t*.
   The definition is not part of this header. */
#ifdef ONIG_DEBUG
extern int onig_print_names(FILE*, regex_t*);
#endif
454 
455 #endif /* REGPARSE_H */
456