1 /*
2   tre-internal.h - TRE internal definitions
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #ifndef TRE_INTERNAL_H
10 #define TRE_INTERNAL_H 1
11 
12 #ifdef HAVE_WCHAR_H
13 #include <wchar.h>
14 #endif /* HAVE_WCHAR_H */
15 
16 #ifdef HAVE_WCTYPE_H
17 #include <wctype.h>
18 #endif /* !HAVE_WCTYPE_H */
19 
20 #include <ctype.h>
21 #include <stdint.h>
22 #include "tre.h"
23 
24 #ifdef TRE_DEBUG
25 #include <stdio.h>
26 #define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/(void)0,0)
27 #else /* !TRE_DEBUG */
28 #define DPRINT(msg) do { } while(/*CONSTCOND*/(void)0,0)
29 #endif /* !TRE_DEBUG */
30 
31 #define elementsof(x)	( sizeof(x) / sizeof(x[0]) )
32 
33 #ifdef HAVE_MBRTOWC
34 #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
35 #else /* !HAVE_MBRTOWC */
36 #ifdef HAVE_MBTOWC
37 #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n)))
38 #endif /* HAVE_MBTOWC */
39 #endif /* !HAVE_MBRTOWC */
40 
41 #ifdef TRE_MULTIBYTE
42 #ifdef HAVE_MBSTATE_T
43 #define TRE_MBSTATE
44 #endif /* TRE_MULTIBYTE */
45 #endif /* HAVE_MBSTATE_T */
46 
47 /* Define the character types and functions. */
48 #ifdef TRE_WCHAR
49 
50 /* Wide characters. */
51 typedef wint_t tre_cint_t;
52 /* Workaround problem seen on AIX, (2010 & 2015), e.g.,
53     https://stat.ethz.ch/pipermail/r-devel/2015-October/071902.html
54   WCHAR_MAX = UINT32_MAX on AIX and that is "not possible to work"
55   Solaris-sparcv9   WCHAR_MAX = INT32_MAX
56   Linux amd64       WCHAR_MAX = INT32_MAX
57 */
58 /*
59    [U]INT32_MAX need to be declared: this is a C99 header which we assume
60  */
61 #include <stdint.h>
62 #if WCHAR_MAX == UINT32_MAX
63 # define TRE_CHAR_MAX INT32_MAX
64 #else
65 # define TRE_CHAR_MAX WCHAR_MAX
66 #endif
67 
68 #ifdef TRE_MULTIBYTE
69 #define TRE_MB_CUR_MAX MB_CUR_MAX
70 #else /* !TRE_MULTIBYTE */
71 #define TRE_MB_CUR_MAX 1
72 #endif /* !TRE_MULTIBYTE */
73 
74 #include "rlocale.h"
75 
76 #define tre_isalnum iswalnum
77 #define tre_isalpha iswalpha
78 #define tre_isblank iswblank
79 #define tre_iscntrl iswcntrl
80 #define tre_isdigit iswdigit
81 #define tre_isgraph iswgraph
82 #define tre_islower iswlower
83 #define tre_isprint iswprint
84 #define tre_ispunct iswpunct
85 #define tre_isspace iswspace
86 #define tre_isupper iswupper
87 #define tre_isxdigit iswxdigit
88 
89 #define tre_tolower towlower
90 #define tre_toupper towupper
91 #define tre_strlen  wcslen
92 
93 #else /* !TRE_WCHAR */
94 
95 /* 8 bit characters. */
96 typedef short tre_cint_t;
97 #define TRE_CHAR_MAX 255
98 #define TRE_MB_CUR_MAX 1
99 
100 #define tre_isalnum isalnum
101 #define tre_isalpha isalpha
102 #ifdef HAVE_ISASCII
103 #define tre_isascii isascii
104 #endif /* HAVE_ISASCII */
105 #ifdef HAVE_ISBLANK
106 #define tre_isblank isblank
107 #endif /* HAVE_ISBLANK */
108 #define tre_iscntrl iscntrl
109 #define tre_isdigit isdigit
110 #define tre_isgraph isgraph
111 #define tre_islower islower
112 #define tre_isprint isprint
113 #define tre_ispunct ispunct
114 #define tre_isspace isspace
115 #define tre_isupper isupper
116 #define tre_isxdigit isxdigit
117 
118 #define tre_tolower(c) (tre_cint_t)(tolower(c))
119 #define tre_toupper(c) (tre_cint_t)(toupper(c))
120 #define tre_strlen(s)  (strlen((const char*)s))
121 
122 #endif /* !TRE_WCHAR */
123 
124 /* _WIN32 opt-out is R addition - iswctype was missing "blank"
125    R requires iswctype and wctype */
126 #if !defined(_WIN32) && defined(TRE_WCHAR) && defined(HAVE_ISWCTYPE) && defined(HAVE_WCTYPE)
127 #define TRE_USE_SYSTEM_WCTYPE 1
128 #endif
129 
130 #ifdef TRE_USE_SYSTEM_WCTYPE
131 /* Use system provided iswctype() and wctype(). */
132 typedef wctype_t tre_ctype_t;
133 #define tre_isctype(c, type) iswctype(c, type)
134 #define tre_ctype(s)   wctype(s)
135 #else /* !TRE_USE_SYSTEM_WCTYPE */
136 /* Define our own versions of iswctype() and wctype(). */
137 typedef int (*tre_ctype_t)(tre_cint_t);
138 #define tre_isctype(c, type) ( (type)(c) )
139 tre_ctype_t tre_ctype(const char *name);
140 #endif /* !TRE_USE_SYSTEM_WCTYPE */
141 
142 typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
143 
144 /* Returns number of bytes to add to (char *)ptr to make it
145    properly aligned for the type. */
146 /* R change:  was (long) but that is shorter than pointer on Win64 */
147 #define ALIGN(ptr, type) \
148   ((((size_t)ptr) % sizeof(type)) \
149    ? (sizeof(type) - (((size_t)ptr) % sizeof(type))) \
150    : 0)
151 
152 #undef MAX
153 #undef MIN
154 #define MAX(a, b) (((a) >= (b)) ? (a) : (b))
155 #define MIN(a, b) (((a) <= (b)) ? (a) : (b))
156 
157 /* Define STRF to the correct printf formatter for strings. */
158 #ifdef TRE_WCHAR
159 #define STRF "ls"
160 #else /* !TRE_WCHAR */
161 #define STRF "s"
162 #endif /* !TRE_WCHAR */
163 
164 /* TNFA transition type. A TNFA state is an array of transitions,
165    the terminator is a transition with NULL `state'. */
166 typedef struct tnfa_transition tre_tnfa_transition_t;
167 
168 struct tnfa_transition {
169   /* Range of accepted characters. */
170   tre_cint_t code_min;
171   tre_cint_t code_max;
172   /* Pointer to the destination state. */
173   tre_tnfa_transition_t *state;
174   /* ID number of the destination state. */
175   int state_id;
176   /* -1 terminated array of tags (or NULL). */
177   int *tags;
178   /* Matching parameters settings (or NULL). */
179   int *params;
180   /* Assertion bitmap. */
181   int assertions;
182   /* Assertion parameters. */
183   union {
184     /* Character class assertion. */
185     tre_ctype_t class;
186     /* Back reference assertion. */
187     int backref;
188   } u;
189   /* Negative character class assertions. */
190   tre_ctype_t *neg_classes;
191 };
192 
193 
194 /* Assertions. */
195 #define ASSERT_AT_BOL		  1   /* Beginning of line. */
196 #define ASSERT_AT_EOL		  2   /* End of line. */
197 #define ASSERT_CHAR_CLASS	  4   /* Character class in `class'. */
198 #define ASSERT_CHAR_CLASS_NEG	  8   /* Character classes in `neg_classes'. */
199 #define ASSERT_AT_BOW		 16   /* Beginning of word. */
200 #define ASSERT_AT_EOW		 32   /* End of word. */
201 #define ASSERT_AT_WB		 64   /* Word boundary. */
202 #define ASSERT_AT_WB_NEG	128   /* Not a word boundary. */
203 #define ASSERT_BACKREF		256   /* A back reference in `backref'. */
204 #define ASSERT_LAST		256
205 
206 /* define R_assert() which can replace assert() */
207 
208 /* fake definition (important: jsut const char* str is not enough!) */
209 extern void Rf_error(const char *str, ...);
210 
211 #ifdef NDEBUG
212 #define R_assert(e) ((void) 0)
213 #else
214 /* The line below requires an ANSI C preprocessor (stringify operator) */
215 #define R_assert(e) ((e) ? (void) 0 : Rf_error("assertion '%s' failed in executing regexp: file '%s', line %d\n", #e, __FILE__, __LINE__))
216 #endif /* NDEBUG */
217 
218 /* Tag directions. */
219 typedef enum {
220   TRE_TAG_MINIMIZE = 0,
221   TRE_TAG_MAXIMIZE = 1
222 } tre_tag_direction_t;
223 
224 /* Parameters that can be changed dynamically while matching. */
225 typedef enum {
226   TRE_PARAM_COST_INS	    = 0,
227   TRE_PARAM_COST_DEL	    = 1,
228   TRE_PARAM_COST_SUBST	    = 2,
229   TRE_PARAM_COST_MAX	    = 3,
230   TRE_PARAM_MAX_INS	    = 4,
231   TRE_PARAM_MAX_DEL	    = 5,
232   TRE_PARAM_MAX_SUBST	    = 6,
233   TRE_PARAM_MAX_ERR	    = 7,
234   TRE_PARAM_DEPTH	    = 8,
235   TRE_PARAM_LAST	    = 9
236 } tre_param_t;
237 
238 /* Unset matching parameter */
239 #define TRE_PARAM_UNSET -1
240 
241 /* Signifies the default matching parameter value. */
242 #define TRE_PARAM_DEFAULT -2
243 
244 /* Instructions to compute submatch register values from tag values
245    after a successful match.  */
246 struct tre_submatch_data {
247   /* Tag that gives the value for rm_so (submatch start offset). */
248   int so_tag;
249   /* Tag that gives the value for rm_eo (submatch end offset). */
250   int eo_tag;
251   /* List of submatches this submatch is contained in. */
252   int *parents;
253 };
254 
255 typedef struct tre_submatch_data tre_submatch_data_t;
256 
257 
258 /* TNFA definition. */
259 typedef struct tnfa tre_tnfa_t;
260 
261 struct tnfa {
262   tre_tnfa_transition_t *transitions;
263   unsigned int num_transitions;
264   tre_tnfa_transition_t *initial;
265   tre_tnfa_transition_t *final;
266   tre_submatch_data_t *submatch_data;
267   char *firstpos_chars;
268   int first_char;
269   unsigned int num_submatches;
270   tre_tag_direction_t *tag_directions;
271   int *minimal_tags;
272   int num_tags;
273   int num_minimals;
274   int end_tag;
275   int num_states;
276   int cflags;
277   int have_backrefs;
278   int have_approx;
279   int params_depth;
280 };
281 
282 int
283 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags);
284 
285 void
286 tre_free(regex_t *preg);
287 
288 void
289 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
290 		const tre_tnfa_t *tnfa, int *tags, int match_eo);
291 
292 reg_errcode_t
293 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
294 		      tre_str_type_t type, int *match_tags, int eflags,
295 		      int *match_end_ofs);
296 
297 reg_errcode_t
298 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
299 		      tre_str_type_t type, int *match_tags, int eflags,
300 		      int *match_end_ofs);
301 
302 reg_errcode_t
303 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
304 		       int len, tre_str_type_t type, int *match_tags,
305 		       int eflags, int *match_end_ofs);
306 
307 #ifdef TRE_APPROX
308 reg_errcode_t
309 tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
310 		    tre_str_type_t type, int *match_tags,
311 		    regamatch_t *match, regaparams_t params,
312 		    int eflags, int *match_end_ofs);
313 #endif /* TRE_APPROX */
314 
315 #endif /* TRE_INTERNAL_H */
316 
317 /* EOF */
318