1 /**********************************************************************
2 regposix.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2021 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #define regex_t onig_regex_t
31 #include "regint.h"
32 #undef regex_t
33
34 #include "onigposix.h"
35
36 #undef regex_t
37 #undef regmatch_t
38 #undef regoff_t
39 #undef regcomp
40 #undef regexec
41 #undef regfree
42 #undef regerror
43 #undef reg_set_encoding
44 #undef reg_name_to_group_numbers
45 #undef reg_foreach_name
46 #undef reg_number_of_names
47
/* The Oniguruma regex object kept in the `onig` member of a POSIX wrapper. */
#define ONIG_C(reg)    ((onig_regex_t* )((reg)->onig))
/* Address of that member, typed as onig_regex_t** (what onig_new() takes). */
#define PONIG_C(reg)   ((onig_regex_t** )(&(reg)->onig))

/*
 * ENC_STRING_LEN(enc, s, len)
 *
 * Assign to `len` the length in bytes of the terminated string `s` under
 * encoding `enc`.
 *
 * When the encoding's minimum character length is 1 the string is scanned
 * byte by byte up to the first zero byte (the same result strlen(s) gives).
 * For every other encoding the work is delegated to
 * onigenc_str_bytelen_null(); presumably a lone zero byte does not terminate
 * such strings, which is why a plain strlen(s) is not used here.
 *
 * `s` and `enc` may be evaluated more than once.
 */
#define ENC_STRING_LEN(enc,s,len) do { \
  if (ONIGENC_MBC_MINLEN(enc) != 1) { \
    len = onigenc_str_bytelen_null(enc, (UChar* )s); \
  } \
  else { \
    UChar* scan_ = (UChar* )(s); \
    while (*scan_ != 0) scan_++; \
    len = (int )(scan_ - (UChar* )(s)); \
  } \
} while(0)
62
63 typedef struct {
64 int onig_err;
65 int posix_err;
66 } O2PERR;
67
68 static int
onig2posix_error_code(int code)69 onig2posix_error_code(int code)
70 {
71 static const O2PERR o2p[] = {
72 { ONIG_MISMATCH, REG_NOMATCH },
73 { ONIG_NO_SUPPORT_CONFIG, REG_EONIG_INTERNAL },
74 { ONIG_ABORT, REG_EONIG_INTERNAL },
75 { ONIGERR_MEMORY, REG_ESPACE },
76 { ONIGERR_MATCH_STACK_LIMIT_OVER, REG_EONIG_INTERNAL },
77 { ONIGERR_RETRY_LIMIT_IN_MATCH_OVER, REG_EONIG_INTERNAL },
78 { ONIGERR_RETRY_LIMIT_IN_SEARCH_OVER, REG_EONIG_INTERNAL },
79 { ONIGERR_SUBEXP_CALL_LIMIT_IN_SEARCH_OVER, REG_EONIG_INTERNAL },
80 { ONIGERR_TYPE_BUG, REG_EONIG_INTERNAL },
81 { ONIGERR_PARSER_BUG, REG_EONIG_INTERNAL },
82 { ONIGERR_STACK_BUG, REG_EONIG_INTERNAL },
83 { ONIGERR_UNDEFINED_BYTECODE, REG_EONIG_INTERNAL },
84 { ONIGERR_UNEXPECTED_BYTECODE, REG_EONIG_INTERNAL },
85 { ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED, REG_EONIG_BADARG },
86 { ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR, REG_EONIG_BADARG },
87 { ONIGERR_FAIL_TO_INITIALIZE, REG_EONIG_INTERNAL },
88 { ONIGERR_INVALID_ARGUMENT, REG_EONIG_BADARG },
89 { ONIGERR_END_PATTERN_AT_LEFT_BRACE, REG_EBRACE },
90 { ONIGERR_END_PATTERN_AT_LEFT_BRACKET, REG_EBRACK },
91 { ONIGERR_EMPTY_CHAR_CLASS, REG_ECTYPE },
92 { ONIGERR_PREMATURE_END_OF_CHAR_CLASS, REG_ECTYPE },
93 { ONIGERR_END_PATTERN_AT_ESCAPE, REG_EESCAPE },
94 { ONIGERR_END_PATTERN_AT_META, REG_EESCAPE },
95 { ONIGERR_END_PATTERN_AT_CONTROL, REG_EESCAPE },
96 { ONIGERR_META_CODE_SYNTAX, REG_BADPAT },
97 { ONIGERR_CONTROL_CODE_SYNTAX, REG_BADPAT },
98 { ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE, REG_ECTYPE },
99 { ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE, REG_ECTYPE },
100 { ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS, REG_ECTYPE },
101 { ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED, REG_BADRPT },
102 { ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID, REG_BADRPT },
103 { ONIGERR_NESTED_REPEAT_OPERATOR, REG_BADRPT },
104 { ONIGERR_UNMATCHED_CLOSE_PARENTHESIS, REG_EPAREN },
105 { ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS, REG_EPAREN },
106 { ONIGERR_END_PATTERN_IN_GROUP, REG_BADPAT },
107 { ONIGERR_UNDEFINED_GROUP_OPTION, REG_BADPAT },
108 { ONIGERR_INVALID_POSIX_BRACKET_TYPE, REG_BADPAT },
109 { ONIGERR_INVALID_LOOK_BEHIND_PATTERN, REG_BADPAT },
110 { ONIGERR_INVALID_REPEAT_RANGE_PATTERN, REG_BADPAT },
111 { ONIGERR_TOO_BIG_NUMBER, REG_BADPAT },
112 { ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE, REG_BADBR },
113 { ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE, REG_BADBR },
114 { ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS, REG_ECTYPE },
115 { ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE, REG_ECTYPE },
116 { ONIGERR_TOO_MANY_MULTI_BYTE_RANGES, REG_ECTYPE },
117 { ONIGERR_TOO_SHORT_MULTI_BYTE_STRING, REG_BADPAT },
118 { ONIGERR_TOO_BIG_BACKREF_NUMBER, REG_ESUBREG },
119 { ONIGERR_INVALID_BACKREF, REG_ESUBREG },
120 { ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT },
121 { ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
122 { ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
123 { ONIGERR_UNDEFINED_OPERATOR, REG_BADPAT },
124 { ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC },
125 { ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT },
126 { ONIGERR_INVALID_GROUP_NAME, REG_BADPAT },
127 { ONIGERR_INVALID_CHAR_IN_GROUP_NAME, REG_BADPAT },
128 { ONIGERR_UNDEFINED_NAME_REFERENCE, REG_BADPAT },
129 { ONIGERR_UNDEFINED_GROUP_REFERENCE, REG_BADPAT },
130 { ONIGERR_MULTIPLEX_DEFINED_NAME, REG_BADPAT },
131 { ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, REG_BADPAT },
132 { ONIGERR_NEVER_ENDING_RECURSION, REG_BADPAT },
133 { ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY, REG_BADPAT },
134 { ONIGERR_INVALID_CHAR_PROPERTY_NAME, REG_BADPAT },
135 { ONIGERR_INVALID_IF_ELSE_SYNTAX, REG_BADPAT },
136 { ONIGERR_INVALID_ABSENT_GROUP_PATTERN, REG_BADPAT },
137 { ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN, REG_BADPAT },
138 { ONIGERR_INVALID_CALLOUT_PATTERN, REG_BADPAT },
139 { ONIGERR_INVALID_CALLOUT_NAME, REG_BADPAT },
140 { ONIGERR_UNDEFINED_CALLOUT_NAME, REG_BADPAT },
141 { ONIGERR_INVALID_CALLOUT_BODY, REG_BADPAT },
142 { ONIGERR_INVALID_CALLOUT_TAG_NAME, REG_BADPAT },
143 { ONIGERR_INVALID_CALLOUT_ARG, REG_BADPAT },
144 { ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION, REG_EONIG_BADARG },
145 { ONIGERR_VERY_INEFFICIENT_PATTERN, REG_BADPAT },
146 { ONIGERR_LIBRARY_IS_NOT_INITIALIZED, REG_EONIG_INTERNAL }
147 };
148
149 int i;
150
151 if (code >= 0) return 0;
152
153 for (i = 0; i < (int )(sizeof(o2p) / sizeof(o2p[0])); i++) {
154 if (code == o2p[i].onig_err)
155 return o2p[i].posix_err;
156 }
157
158 return REG_EONIG_INTERNAL; /* but, unknown error code */
159 }
160
161 extern int
onig_posix_regcomp(onig_posix_regex_t * reg,const char * pattern,int posix_options)162 onig_posix_regcomp(onig_posix_regex_t* reg, const char* pattern, int posix_options)
163 {
164 int r, len;
165 OnigSyntaxType* syntax = OnigDefaultSyntax;
166 OnigOptionType options;
167
168 reg->onig = (void* )0;
169
170 if ((posix_options & REG_EXTENDED) == 0)
171 syntax = ONIG_SYNTAX_POSIX_BASIC;
172
173 options = syntax->options;
174 if ((posix_options & REG_ICASE) != 0)
175 ONIG_OPTION_ON(options, ONIG_OPTION_IGNORECASE);
176 if ((posix_options & REG_NEWLINE) != 0) {
177 ONIG_OPTION_ON( options, ONIG_OPTION_NEGATE_SINGLELINE);
178 ONIG_OPTION_OFF(options, ONIG_OPTION_SINGLELINE);
179 }
180
181 reg->comp_options = posix_options;
182
183 ENC_STRING_LEN(OnigEncDefaultCharEncoding, pattern, len);
184 r = onig_new(PONIG_C(reg), (UChar* )pattern, (UChar* )(pattern + len),
185 options, OnigEncDefaultCharEncoding, syntax,
186 (OnigErrorInfo* )NULL);
187 if (r != ONIG_NORMAL) {
188 return onig2posix_error_code(r);
189 }
190
191 reg->re_nsub = ONIG_C(reg)->num_mem;
192 return 0;
193 }
194
195 extern int
onig_posix_regexec(onig_posix_regex_t * reg,const char * str,size_t nmatch,onig_posix_regmatch_t pmatch[],int posix_options)196 onig_posix_regexec(onig_posix_regex_t* reg, const char* str, size_t nmatch,
197 onig_posix_regmatch_t pmatch[], int posix_options)
198 {
199 int r, i, len;
200 UChar* end;
201 onig_posix_regmatch_t* pm;
202 OnigOptionType options;
203
204 options = ONIG_OPTION_POSIX_REGION;
205 if ((posix_options & REG_NOTBOL) != 0) options |= ONIG_OPTION_NOTBOL;
206 if ((posix_options & REG_NOTEOL) != 0) options |= ONIG_OPTION_NOTEOL;
207
208 if (nmatch == 0 || (reg->comp_options & REG_NOSUB) != 0) {
209 pm = (onig_posix_regmatch_t* )NULL;
210 nmatch = 0;
211 }
212 else if ((int )nmatch < ONIG_C(reg)->num_mem + 1) {
213 pm = (onig_posix_regmatch_t* )xmalloc(sizeof(onig_posix_regmatch_t)
214 * (ONIG_C(reg)->num_mem + 1));
215 if (pm == NULL)
216 return REG_ESPACE;
217 }
218 else {
219 pm = pmatch;
220 }
221
222 ENC_STRING_LEN(ONIG_C(reg)->enc, str, len);
223 end = (UChar* )(str + len);
224 r = onig_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end,
225 (OnigRegion* )pm, options);
226
227 if (r >= 0) {
228 r = 0; /* Match */
229 if (pm != pmatch && pm != NULL) {
230 xmemcpy(pmatch, pm, sizeof(onig_posix_regmatch_t) * nmatch);
231 }
232 }
233 else if (r == ONIG_MISMATCH) {
234 r = REG_NOMATCH;
235 for (i = 0; i < (int )nmatch; i++)
236 pmatch[i].rm_so = pmatch[i].rm_eo = ONIG_REGION_NOTPOS;
237 }
238 else {
239 r = onig2posix_error_code(r);
240 }
241
242 if (pm != pmatch && pm != NULL)
243 xfree(pm);
244
245 #if 0
246 if (reg->re_nsub > nmatch - 1)
247 reg->re_nsub = (nmatch <= 1 ? 0 : nmatch - 1);
248 #endif
249
250 return r;
251 }
252
253 extern void
onig_posix_regfree(onig_posix_regex_t * reg)254 onig_posix_regfree(onig_posix_regex_t* reg)
255 {
256 onig_free(ONIG_C(reg));
257 reg->onig = (void* )0;
258 }
259
260
261 extern void
onig_posix_reg_set_encoding(int mb_code)262 onig_posix_reg_set_encoding(int mb_code)
263 {
264 OnigEncoding enc;
265
266 switch (mb_code) {
267 case REG_POSIX_ENCODING_ASCII:
268 enc = ONIG_ENCODING_ASCII;
269 break;
270 case REG_POSIX_ENCODING_EUC_JP:
271 enc = ONIG_ENCODING_EUC_JP;
272 break;
273 case REG_POSIX_ENCODING_SJIS:
274 enc = ONIG_ENCODING_SJIS;
275 break;
276 case REG_POSIX_ENCODING_UTF8:
277 enc = ONIG_ENCODING_UTF8;
278 break;
279 case REG_POSIX_ENCODING_UTF16_BE:
280 enc = ONIG_ENCODING_UTF16_BE;
281 break;
282 case REG_POSIX_ENCODING_UTF16_LE:
283 enc = ONIG_ENCODING_UTF16_LE;
284 break;
285
286 default:
287 return ;
288 break;
289 }
290
291 onig_initialize(&enc, 1);
292
293 onigenc_set_default_encoding(enc);
294 }
295
296 extern int
onig_posix_reg_name_to_group_numbers(onig_posix_regex_t * reg,const unsigned char * name,const unsigned char * name_end,int ** nums)297 onig_posix_reg_name_to_group_numbers(onig_posix_regex_t* reg,
298 const unsigned char* name, const unsigned char* name_end, int** nums)
299 {
300 return onig_name_to_group_numbers(ONIG_C(reg), name, name_end, nums);
301 }
302
303 typedef struct {
304 int (*func)(const unsigned char*, const unsigned char*,int,int*,onig_posix_regex_t*,void*);
305 onig_posix_regex_t* reg;
306 void* arg;
307 } i_wrap;
308
309 static int
i_wrapper(const UChar * name,const UChar * name_end,int ng,int * gs,onig_regex_t * reg ARG_UNUSED,void * arg)310 i_wrapper(const UChar* name, const UChar* name_end, int ng, int* gs,
311 onig_regex_t* reg ARG_UNUSED, void* arg)
312 {
313 i_wrap* warg = (i_wrap* )arg;
314
315 return (*warg->func)(name, name_end, ng, gs, warg->reg, warg->arg);
316 }
317
318 extern int
onig_posix_reg_foreach_name(onig_posix_regex_t * reg,int (* func)(const unsigned char *,const unsigned char *,int,int *,onig_posix_regex_t *,void *),void * arg)319 onig_posix_reg_foreach_name(onig_posix_regex_t* reg,
320 int (*func)(const unsigned char*, const unsigned char*,int,int*,onig_posix_regex_t*,void*),
321 void* arg)
322 {
323 i_wrap warg;
324
325 warg.func = func;
326 warg.reg = reg;
327 warg.arg = arg;
328
329 return onig_foreach_name(ONIG_C(reg), i_wrapper, &warg);
330 }
331
332 extern int
onig_posix_reg_number_of_names(onig_posix_regex_t * reg)333 onig_posix_reg_number_of_names(onig_posix_regex_t* reg)
334 {
335 return onig_number_of_names(ONIG_C(reg));
336 }
337
338
#ifdef USE_BINARY_COMPATIBLE_POSIX_API

/*
 * Unprefixed entry points.  Each one forwards its arguments unchanged to the
 * onig_posix_ function of the same name and returns that function's result.
 * They are compiled only when USE_BINARY_COMPATIBLE_POSIX_API is defined.
 */

/* Forwards to onig_posix_regcomp(). */
extern int
regcomp(onig_posix_regex_t* reg, const char* pattern, int posix_options)
{
  return onig_posix_regcomp(reg, pattern, posix_options);
}

/* Forwards to onig_posix_regexec(). */
extern int
regexec(onig_posix_regex_t* reg, const char* str, size_t nmatch,
        onig_posix_regmatch_t pmatch[], int posix_options)
{
  return onig_posix_regexec(reg, str, nmatch, pmatch, posix_options);
}

/* Forwards to onig_posix_regfree(). */
extern void
regfree(onig_posix_regex_t* reg)
{
  onig_posix_regfree(reg);
}

/* Forwards to onig_posix_reg_set_encoding(). */
extern void
reg_set_encoding(int mb_code)
{
  onig_posix_reg_set_encoding(mb_code);
}

/* Forwards to onig_posix_reg_name_to_group_numbers(). */
extern int
reg_name_to_group_numbers(onig_posix_regex_t* reg,
  const unsigned char* name, const unsigned char* name_end, int** nums)
{
  return onig_posix_reg_name_to_group_numbers(reg, name, name_end, nums);
}

/* Forwards to onig_posix_reg_foreach_name(). */
extern int
reg_foreach_name(onig_posix_regex_t* reg,
  int (*func)(const unsigned char*, const unsigned char*,int,int*,onig_posix_regex_t*,void*),
  void* arg)
{
  return onig_posix_reg_foreach_name(reg, func, arg);
}

/* Forwards to onig_posix_reg_number_of_names(). */
extern int
reg_number_of_names(onig_posix_regex_t* reg)
{
  return onig_posix_reg_number_of_names(reg);
}

#endif /* USE_BINARY_COMPATIBLE_POSIX_API */
388