1 /*
2   tre-internal.h - TRE internal definitions
3 
4   Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>.
5 
6   This library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10 
11   This library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15 
16   You should have received a copy of the GNU Lesser General Public
17   License along with this library; if not, write to the Free Software
18   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 
22 #ifndef TRE_INTERNAL_H
23 #define TRE_INTERNAL_H 1
24 
25 #ifdef HAVE_WCHAR_H
26 #include <wchar.h>
27 #endif /* HAVE_WCHAR_H */
28 
29 #ifdef HAVE_WCTYPE_H
30 #include <wctype.h>
#endif /* HAVE_WCTYPE_H */
32 
33 #include <ctype.h>
34 #include "regex.h"
35 
/* DPRINT(msg): debug trace hook.  `msg' must be a complete, parenthesized
   printf() argument list, e.g. DPRINT(("state %d\n", id)).  With
   TRE_DEBUG defined the message is printed and stdout is flushed; without
   it the macro expands to an empty statement and `msg' is not evaluated. */
#ifndef TRE_DEBUG
#define DPRINT(msg) do { } while(0)
#else /* TRE_DEBUG */
#include <stdio.h>
#define DPRINT(msg) do {printf msg; fflush(stdout);} while(0)
#endif /* TRE_DEBUG */
42 
/* Number of elements in the array `x'.  Only meaningful for real arrays:
   applied to a pointer (including an array function parameter) it yields
   sizeof(pointer) / sizeof(element) instead.  The argument is fully
   parenthesized so that any array-valued expression can be passed. */
#define elementsof(x)	( sizeof(x) / sizeof((x)[0]) )
44 
/* tre_mbrtowc(pwc, s, n, ps): multibyte to wide character conversion.
   Maps to mbrtowc() when HAVE_MBRTOWC is defined; otherwise to mbtowc()
   when HAVE_MBTOWC is defined, in which case `ps' is dropped because
   mbtowc() takes no conversion-state argument.  If neither macro is
   defined, tre_mbrtowc is left undefined. */
#if defined(HAVE_MBRTOWC)
#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
#elif defined(HAVE_MBTOWC)
#define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n)))
#endif /* HAVE_MBRTOWC / HAVE_MBTOWC */
52 
/* TRE_MBSTATE is defined only when both TRE_MULTIBYTE and HAVE_MBSTATE_T
   are defined. */
#ifdef TRE_MULTIBYTE
#ifdef HAVE_MBSTATE_T
#define TRE_MBSTATE
#endif /* HAVE_MBSTATE_T */
#endif /* TRE_MULTIBYTE */
58 
/* Define the character types and functions.

   Depending on TRE_WCHAR, the tre_is*(), tre_tolower(), tre_toupper() and
   tre_strlen() names map either to the wide character (<wctype.h>,
   <wchar.h>) or to the 8 bit (<ctype.h>, <string.h>) library functions.

   Not every name exists in every configuration:
     - tre_isblank is defined only when HAVE_ISWBLANK (wide) or
       HAVE_ISBLANK (8 bit) is defined;
     - tre_isascii is defined only in the 8 bit branch, and only when
       HAVE_ISASCII is defined.
   Users of those two names have to check for them with #ifdef. */
#ifdef TRE_WCHAR

/* Wide characters. */
typedef wint_t tre_cint_t;
#define TRE_CHAR_MAX WCHAR_MAX

#ifdef TRE_MULTIBYTE
#define TRE_MB_CUR_MAX MB_CUR_MAX
#else /* !TRE_MULTIBYTE */
#define TRE_MB_CUR_MAX 1
#endif /* !TRE_MULTIBYTE */

#define tre_isalnum iswalnum
#define tre_isalpha iswalpha
#ifdef HAVE_ISWBLANK
#define tre_isblank iswblank
#endif /* HAVE_ISWBLANK */
#define tre_iscntrl iswcntrl
#define tre_isdigit iswdigit
#define tre_isgraph iswgraph
#define tre_islower iswlower
#define tre_isprint iswprint
#define tre_ispunct iswpunct
#define tre_isspace iswspace
#define tre_isupper iswupper
#define tre_isxdigit iswxdigit

#define tre_tolower towlower
#define tre_toupper towupper
#define tre_strlen  wcslen

#else /* !TRE_WCHAR */

/* 8 bit characters. */
typedef short tre_cint_t;
#define TRE_CHAR_MAX 255
#define TRE_MB_CUR_MAX 1

#define tre_isalnum isalnum
#define tre_isalpha isalpha
#ifdef HAVE_ISASCII
#define tre_isascii isascii
#endif /* HAVE_ISASCII */
#ifdef HAVE_ISBLANK
#define tre_isblank isblank
#endif /* HAVE_ISBLANK */
#define tre_iscntrl iscntrl
#define tre_isdigit isdigit
#define tre_isgraph isgraph
#define tre_islower islower
#define tre_isprint isprint
#define tre_ispunct ispunct
#define tre_isspace isspace
#define tre_isupper isupper
#define tre_isxdigit isxdigit

/* The expansions are wrapped in parentheses so that each one is a single
   primary expression, and the tre_strlen() argument is parenthesized so
   the cast applies to the whole argument expression, not just to its
   first operand. */
#define tre_tolower(c) ((tre_cint_t)(tolower(c)))
#define tre_toupper(c) ((tre_cint_t)(toupper(c)))
#define tre_strlen(s)  (strlen((const char *)(s)))

#endif /* !TRE_WCHAR */
121 
/* TRE_USE_SYSTEM_WCTYPE is set to 1 only when TRE_WCHAR, HAVE_ISWCTYPE and
   HAVE_WCTYPE are all defined. */
#ifdef TRE_WCHAR
#ifdef HAVE_ISWCTYPE
#ifdef HAVE_WCTYPE
#define TRE_USE_SYSTEM_WCTYPE 1
#endif /* HAVE_WCTYPE */
#endif /* HAVE_ISWCTYPE */
#endif /* TRE_WCHAR */
125 
#ifndef TRE_USE_SYSTEM_WCTYPE
/* Own replacements for iswctype() and wctype(): a character class is a
   pointer to a predicate function taking a tre_cint_t, tre_isctype() is a
   call through that pointer, and tre_ctype() is a real function declared
   here that looks a class up by name. */
typedef int (*tre_ctype_t)(tre_cint_t);
#define tre_isctype(c, type) ((type)(c))
tre_ctype_t tre_ctype(const char *name);
#else /* TRE_USE_SYSTEM_WCTYPE */
/* Map straight onto the system provided iswctype() and wctype(). */
typedef wctype_t tre_ctype_t;
#define tre_isctype iswctype
#define tre_ctype   wctype
#endif /* TRE_USE_SYSTEM_WCTYPE */
137 
/* Kind of input string handed to the tre_tnfa_run_*() matchers through
   their `type' argument.
   NOTE(review): judging by the names these select wide character, plain
   byte, multibyte and user-supplied input respectively -- confirm against
   the matcher sources. */
typedef enum {
  STR_WIDE,	/* = 0 */
  STR_BYTE,	/* = 1 */
  STR_MBS,	/* = 2 */
  STR_USER	/* = 3 */
} tre_str_type_t;
139 
/* Returns the number of bytes to add to (char *)ptr to make it
   properly aligned for the type, i.e. to bring the address up to the next
   multiple of sizeof(type) (0 if it already is one).

   `ptr' is parenthesized before the cast so that pointer expressions such
   as `p + 1' are evaluated in their own type first, and the address is
   converted to size_t rather than long so that it is not narrowed on
   platforms where long is smaller than a pointer.  `ptr' is evaluated more
   than once; do not pass an expression with side effects. */
#define ALIGN(ptr, type) \
  ((((size_t)(ptr)) % sizeof(type)) \
   ? (sizeof(type) - (((size_t)(ptr)) % sizeof(type))) \
   : 0)
146 
/* MAX/MIN: discard any definition supplied by earlier headers and install
   our own.  When the operands compare equal the first one is returned.
   Each argument may be evaluated twice, so avoid side effects. */
#ifdef MAX
#undef MAX
#endif
#ifdef MIN
#undef MIN
#endif
#define MAX(a, b) (((a) >= (b)) ? (a) : (b))
#define MIN(a, b) (((a) <= (b)) ? (a) : (b))
151 
/* STRF is the printf conversion for TRE strings, without the leading '%':
   "ls" in wide character builds, "s" otherwise. */
#ifndef TRE_WCHAR
#define STRF "s"
#else /* TRE_WCHAR */
#define STRF "ls"
#endif /* TRE_WCHAR */
158 
159 /* TNFA transition type. A TNFA state is an array of transitions,
160    the terminator is a transition with NULL `state'. */
161 typedef struct tnfa_transition tre_tnfa_transition_t;
162 
163 struct tnfa_transition {
164   /* Range of accepted characters. */
165   tre_cint_t code_min;
166   tre_cint_t code_max;
167   /* Pointer to the destination state. */
168   tre_tnfa_transition_t *state;
169   /* ID number of the destination state. */
170   int state_id;
171   /* -1 terminated array of tags (or NULL). */
172   int *tags;
173   /* Matching parameters settings (or NULL). */
174   int *params;
175   /* Assertion bitmap. */
176   int assertions;
177   /* Assertion parameters. */
178   union {
179     /* Character class assertion. */
180     tre_ctype_t class;
181     /* Back reference assertion. */
182     int backref;
183   } u;
184   /* Negative character class assertions. */
185   tre_ctype_t *neg_classes;
186 };
187 
188 
/* Assertion flags, one bit each, combined in the `assertions' bitmap of a
   transition.  ASSERT_LAST equals the highest flag value. */
#define ASSERT_AT_BOL		0x001	/* Beginning of line. */
#define ASSERT_AT_EOL		0x002	/* End of line. */
#define ASSERT_CHAR_CLASS	0x004	/* Character class in `u.class'. */
#define ASSERT_CHAR_CLASS_NEG	0x008	/* Character classes in `neg_classes'. */
#define ASSERT_AT_BOW		0x010	/* Beginning of word. */
#define ASSERT_AT_EOW		0x020	/* End of word. */
#define ASSERT_AT_WB		0x040	/* Word boundary. */
#define ASSERT_AT_WB_NEG	0x080	/* Not a word boundary. */
#define ASSERT_BACKREF		0x100	/* A back reference in `u.backref'. */
#define ASSERT_LAST		0x100
200 
/* Direction of a tag: TRE_TAG_MINIMIZE is 0, TRE_TAG_MAXIMIZE is 1. */
typedef enum {
  TRE_TAG_MINIMIZE,
  TRE_TAG_MAXIMIZE
} tre_tag_direction_t;
206 
/* Matching parameters that can be changed dynamically while matching.
   The constants are consecutive starting from 0; TRE_PARAM_LAST is one
   past the highest real parameter and so equals their count.
   NOTE(review): presumably these index the `params' arrays attached to
   transitions -- confirm against the code that builds them. */
typedef enum {
  TRE_PARAM_COST_INS = 0,
  TRE_PARAM_COST_DEL,		/* 1 */
  TRE_PARAM_COST_SUBST,		/* 2 */
  TRE_PARAM_COST_MAX,		/* 3 */
  TRE_PARAM_MAX_INS,		/* 4 */
  TRE_PARAM_MAX_DEL,		/* 5 */
  TRE_PARAM_MAX_SUBST,		/* 6 */
  TRE_PARAM_MAX_ERR,		/* 7 */
  TRE_PARAM_DEPTH,		/* 8 */
  TRE_PARAM_LAST		/* 9 */
} tre_param_t;
220 
/* Special matching parameter values.  Both are negative and the
   replacement lists are parenthesized so that each macro always expands
   to a single operand, whatever expression it is used in. */

/* Unset matching parameter */
#define TRE_PARAM_UNSET (-1)

/* Signifies the default matching parameter value. */
#define TRE_PARAM_DEFAULT (-2)
226 
/* Describes how the registers of one submatch are derived from tag values
   once a match has succeeded. */
typedef struct tre_submatch_data {
  /* Tag holding the submatch start offset (rm_so). */
  int so_tag;
  /* Tag holding the submatch end offset (rm_eo). */
  int eo_tag;
  /* List of the submatches that contain this submatch. */
  int *parents;
} tre_submatch_data_t;
239 
240 
241 /* TNFA definition. */
242 typedef struct tnfa tre_tnfa_t;
243 
244 struct tnfa {
245   tre_tnfa_transition_t *transitions;
246   unsigned int num_transitions;
247   tre_tnfa_transition_t *initial;
248   tre_tnfa_transition_t *final;
249   tre_submatch_data_t *submatch_data;
250   char *firstpos_chars;
251   int first_char;
252   unsigned int num_submatches;
253   tre_tag_direction_t *tag_directions;
254   int *minimal_tags;
255   int num_tags;
256   int num_minimals;
257   int end_tag;
258   int num_states;
259   int cflags;
260   int have_backrefs;
261   int have_approx;
262   int params_depth;
263 };
264 
265 int
266 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags);
267 
268 void
269 tre_free(regex_t *preg);
270 
271 void
272 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
273 		const tre_tnfa_t *tnfa, int *tags, int match_eo);
274 
275 reg_errcode_t
276 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
277 		      tre_str_type_t type, int *match_tags, int eflags,
278 		      int *match_end_ofs);
279 
280 reg_errcode_t
281 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
282 		      tre_str_type_t type, int *match_tags, int eflags,
283 		      int *match_end_ofs);
284 
285 reg_errcode_t
286 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
287 		       int len, tre_str_type_t type, int *match_tags,
288 		       int eflags, int *match_end_ofs);
289 
#ifdef TRE_APPROX
/* Declared only when TRE_APPROX is defined.  Takes the same arguments as
   the other tre_tnfa_run_*() matchers plus `match' and `params'.
   NOTE(review): presumably the approximate matcher, with `params' giving
   the costs/limits and `match' receiving the result -- confirm against
   the definition. */
reg_errcode_t tre_tnfa_run_approx(const tre_tnfa_t *tnfa,
				  const void *string, int len,
				  tre_str_type_t type, int *match_tags,
				  regamatch_t *match, regaparams_t params,
				  int eflags, int *match_end_ofs);
#endif /* TRE_APPROX */
297 
298 #endif /* TRE_INTERNAL_H */
299 
300 /* EOF */
301