1 /**********************************************************************
2 regenc.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2020 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regint.h"
31
32 #define LARGE_S 0x53
33 #define SMALL_S 0x73
34
35 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
36
37 #define INITED_LIST_SIZE 20
38
39 static int InitedListNum;
40
41 static struct {
42 OnigEncoding enc;
43 int inited;
44 } InitedList[INITED_LIST_SIZE];
45
46 static int
enc_inited_entry(OnigEncoding enc)47 enc_inited_entry(OnigEncoding enc)
48 {
49 int i;
50
51 for (i = 0; i < InitedListNum; i++) {
52 if (InitedList[i].enc == enc) {
53 InitedList[i].inited = 1;
54 return i;
55 }
56 }
57
58 i = InitedListNum;
59 if (i < INITED_LIST_SIZE - 1) {
60 InitedList[i].enc = enc;
61 InitedList[i].inited = 1;
62 InitedListNum++;
63 return i;
64 }
65
66 return -1;
67 }
68
69 static int
enc_is_inited(OnigEncoding enc)70 enc_is_inited(OnigEncoding enc)
71 {
72 int i;
73
74 for (i = 0; i < InitedListNum; i++) {
75 if (InitedList[i].enc == enc) {
76 return InitedList[i].inited;
77 }
78 }
79
80 return 0;
81 }
82
/* Non-zero once onigenc_init() has run; reset to 0 by onigenc_end(). */
static int OnigEncInited;

/*
 * Flag the encoding module as initialized. Repeated calls change nothing.
 * Always returns 0.
 */
extern int
onigenc_init(void)
{
  if (OnigEncInited == 0)
    OnigEncInited = 1;

  return 0;
}
93
94 extern int
onigenc_end(void)95 onigenc_end(void)
96 {
97 int i;
98
99 for (i = 0; i < InitedListNum; i++) {
100 InitedList[i].enc = 0;
101 InitedList[i].inited = 0;
102 }
103 InitedListNum = 0;
104
105 OnigEncInited = 0;
106 return ONIG_NORMAL;
107 }
108
109 extern int
onig_initialize_encoding(OnigEncoding enc)110 onig_initialize_encoding(OnigEncoding enc)
111 {
112 int r;
113
114 if (enc != ONIG_ENCODING_ASCII &&
115 ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
116 OnigEncoding ascii = ONIG_ENCODING_ASCII;
117 if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
118 r = ascii->init();
119 if (r != ONIG_NORMAL) return r;
120 enc_inited_entry(ascii);
121 }
122 }
123
124 if (enc->init != 0 &&
125 enc_is_inited(enc) == 0) {
126 r = (enc->init)();
127 if (r == ONIG_NORMAL)
128 enc_inited_entry(enc);
129 return r;
130 }
131
132 return 0;
133 }
134
135 extern OnigEncoding
onigenc_get_default_encoding(void)136 onigenc_get_default_encoding(void)
137 {
138 return OnigEncDefaultCharEncoding;
139 }
140
141 extern int
onigenc_set_default_encoding(OnigEncoding enc)142 onigenc_set_default_encoding(OnigEncoding enc)
143 {
144 OnigEncDefaultCharEncoding = enc;
145 return 0;
146 }
147
148 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)149 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
150 {
151 int slen, term_len, i;
152 UChar *r;
153
154 slen = (int )(end - s);
155 term_len = ONIGENC_MBC_MINLEN(enc);
156
157 r = (UChar* )xmalloc(slen + term_len);
158 CHECK_NULL_RETURN(r);
159 xmemcpy(r, s, slen);
160
161 for (i = 0; i < term_len; i++)
162 r[slen + i] = (UChar )0;
163
164 return r;
165 }
166
167 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)168 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
169 {
170 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
171 if (p < s) {
172 p += enclen(enc, p);
173 }
174 return p;
175 }
176
177 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)178 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
179 const UChar* start, const UChar* s, const UChar** prev)
180 {
181 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
182
183 if (p < s) {
184 if (prev) *prev = (const UChar* )p;
185 p += enclen(enc, p);
186 }
187 else {
188 if (prev)
189 *prev = onigenc_get_prev_char_head(enc, start, p);
190 }
191 return p;
192 }
193
194 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)195 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
196 {
197 if (s <= start)
198 return (UChar* )NULL;
199
200 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
201 }
202
203 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)204 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
205 {
206 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
207 if (s <= start)
208 return (UChar* )NULL;
209
210 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
211 }
212 return (UChar* )s;
213 }
214
215 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)216 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
217 {
218 UChar* q = (UChar* )p;
219 while (n-- > 0) {
220 q += ONIGENC_MBC_ENC_LEN(enc, q);
221 }
222 return (q <= end ? q : NULL);
223 }
224
225 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)226 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
227 {
228 int n = 0;
229 UChar* q = (UChar* )p;
230
231 while (q < end) {
232 q += ONIGENC_MBC_ENC_LEN(enc, q);
233 n++;
234 }
235 return n;
236 }
237
238 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)239 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
240 {
241 int n = 0;
242 UChar* p = (UChar* )s;
243
244 while (1) {
245 if (*p == '\0') {
246 UChar* q;
247 int len = ONIGENC_MBC_MINLEN(enc);
248
249 if (len == 1) return n;
250 q = p + 1;
251 while (len > 1) {
252 if (*q != '\0') break;
253 q++;
254 len--;
255 }
256 if (len == 1) return n;
257 }
258 p += ONIGENC_MBC_ENC_LEN(enc, p);
259 n++;
260 }
261 }
262
263 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)264 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
265 {
266 const UChar* start = s;
267 const UChar* p = s;
268
269 while (1) {
270 if (*p == '\0') {
271 const UChar* q;
272 int len = ONIGENC_MBC_MINLEN(enc);
273
274 if (len == 1) return (int )(p - start);
275 q = p + 1;
276 while (len > 1) {
277 if (*q != '\0') break;
278 q++;
279 len--;
280 }
281 if (len == 1) return (int )(p - start);
282 }
283 p += ONIGENC_MBC_ENC_LEN(enc, p);
284 }
285 }
286
287 const UChar OnigEncAsciiToLowerCaseTable[] = {
288 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
289 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
290 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
291 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
292 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
293 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
294 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
295 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
296 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
297 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
298 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
299 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
300 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
301 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
302 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
303 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
304 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
305 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
306 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
307 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
308 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
309 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
310 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
311 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
312 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
313 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
314 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
315 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
316 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
317 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
318 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
319 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
320 };
321
#ifdef USE_UPPER_CASE_TABLE
/*
 * Byte -> upper-case byte, 256 entries (16 per row; row k starts at index
 * 16*k). Every byte maps to itself except 0x61-0x7a, which map to 0x41-0x5a.
 */
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
  0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
  0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
  0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
#endif
358
/*
 * Per-byte character-type bit sets, indexed by byte value (256 entries).
 * The meaning of the individual bits is defined outside this file.
 * Only 0x00-0x7f carry bits; 0x80-0xff are all zero and are left to the
 * implicit zero-initialization of the remaining array elements.
 */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,  /* 0x00 */
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,  /* 0x08 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,  /* 0x10 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,  /* 0x18 */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,  /* 0x20 */
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,  /* 0x28 */
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,  /* 0x30 */
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,  /* 0x38 */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,  /* 0x40 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,  /* 0x48 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,  /* 0x50 */
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,  /* 0x58 */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,  /* 0x60 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,  /* 0x68 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,  /* 0x70 */
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008   /* 0x78 */
  /* 0x80 - 0xff: 0x0000 (implicitly zero-initialized) */
};
393
394 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
395 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
396 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
397 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
398 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
399 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
400 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
401 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
402 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
403 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
404 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
405 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
406 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
407 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
408 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
409 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
410 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
411 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
412 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
413 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
414 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
415 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
416 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
417 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
418 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
419 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
420 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
421 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
422 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
423 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
424 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
425 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
426 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
427 };
428
#ifdef USE_UPPER_CASE_TABLE
/*
 * Byte -> upper-case byte, 256 entries (16 per row; row k starts at index
 * 16*k). Identity except:
 *   0x61-0x7a -> 0x41-0x5a
 *   0xe0-0xf6 -> 0xc0-0xd6
 *   0xf8-0xfe -> 0xd8-0xde
 * (0xf7 and 0xff map to themselves).
 */
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
  0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
  0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff
};
#endif
465
466 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)467 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
468 {
469 /* nothing */
470 /* obsoleted. */
471 }
472
473 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)474 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
475 {
476 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
477 }
478
479 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
480 { 0x41, 0x61 },
481 { 0x42, 0x62 },
482 { 0x43, 0x63 },
483 { 0x44, 0x64 },
484 { 0x45, 0x65 },
485 { 0x46, 0x66 },
486 { 0x47, 0x67 },
487 { 0x48, 0x68 },
488 { 0x49, 0x69 },
489 { 0x4a, 0x6a },
490 { 0x4b, 0x6b },
491 { 0x4c, 0x6c },
492 { 0x4d, 0x6d },
493 { 0x4e, 0x6e },
494 { 0x4f, 0x6f },
495 { 0x50, 0x70 },
496 { 0x51, 0x71 },
497 { 0x52, 0x72 },
498 { 0x53, 0x73 },
499 { 0x54, 0x74 },
500 { 0x55, 0x75 },
501 { 0x56, 0x76 },
502 { 0x57, 0x77 },
503 { 0x58, 0x78 },
504 { 0x59, 0x79 },
505 { 0x5a, 0x7a }
506 };
507
508 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)509 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
510 OnigApplyAllCaseFoldFunc f, void* arg)
511 {
512 OnigCodePoint code;
513 int i, r;
514
515 for (i = 0;
516 i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
517 i++) {
518 code = OnigAsciiLowerMap[i].to;
519 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
520 if (r != 0) return r;
521
522 code = OnigAsciiLowerMap[i].from;
523 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
524 if (r != 0) return r;
525 }
526
527 return 0;
528 }
529
530 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])531 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
532 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
533 OnigCaseFoldCodeItem items[])
534 {
535 if (0x41 <= *p && *p <= 0x5a) {
536 items[0].byte_len = 1;
537 items[0].code_len = 1;
538 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
539 return 1;
540 }
541 else if (0x61 <= *p && *p <= 0x7a) {
542 items[0].byte_len = 1;
543 items[0].code_len = 1;
544 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
545 return 1;
546 }
547 else
548 return 0;
549 }
550
551 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)552 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
553 OnigApplyAllCaseFoldFunc f, void* arg)
554 {
555 static OnigCodePoint ss[] = { SMALL_S, SMALL_S };
556
557 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
558 }
559
560 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)561 onigenc_apply_all_case_fold_with_map(int map_size,
562 const OnigPairCaseFoldCodes map[],
563 int ess_tsett_flag, OnigCaseFoldType flag,
564 OnigApplyAllCaseFoldFunc f, void* arg)
565 {
566 OnigCodePoint code;
567 int i, r;
568
569 r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
570 if (r != 0) return r;
571
572 if (CASE_FOLD_IS_ASCII_ONLY(flag))
573 return 0;
574
575 for (i = 0; i < map_size; i++) {
576 code = map[i].to;
577 r = (*f)(map[i].from, &code, 1, arg);
578 if (r != 0) return r;
579
580 code = map[i].from;
581 r = (*f)(map[i].to, &code, 1, arg);
582 if (r != 0) return r;
583 }
584
585 if (ess_tsett_flag != 0)
586 return ss_apply_all_case_fold(flag, f, arg);
587
588 return 0;
589 }
590
591 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])592 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
593 const OnigPairCaseFoldCodes map[],
594 int ess_tsett_flag, OnigCaseFoldType flag,
595 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
596 {
597 int i, j, n;
598 static OnigUChar sa[] = { LARGE_S, SMALL_S };
599
600 if (0x41 <= *p && *p <= 0x5a) { /* A - Z */
601 if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1
602 && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) /* SS */
603 && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
604 ss_combination:
605 items[0].byte_len = 2;
606 items[0].code_len = 1;
607 items[0].code[0] = (OnigCodePoint )0xdf;
608
609 n = 1;
610 for (i = 0; i < 2; i++) {
611 for (j = 0; j < 2; j++) {
612 if (sa[i] == *p && sa[j] == *(p+1))
613 continue;
614
615 items[n].byte_len = 2;
616 items[n].code_len = 2;
617 items[n].code[0] = (OnigCodePoint )sa[i];
618 items[n].code[1] = (OnigCodePoint )sa[j];
619 n++;
620 }
621 }
622 return 4;
623 }
624
625 items[0].byte_len = 1;
626 items[0].code_len = 1;
627 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
628 return 1;
629 }
630 else if (0x61 <= *p && *p <= 0x7a) { /* a - z */
631 if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1
632 && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)
633 && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
634 goto ss_combination;
635 }
636
637 items[0].byte_len = 1;
638 items[0].code_len = 1;
639 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
640 return 1;
641 }
642 else if (*p == 0xdf && ess_tsett_flag != 0
643 && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
644 items[0].byte_len = 1;
645 items[0].code_len = 2;
646 items[0].code[0] = (OnigCodePoint )'s';
647 items[0].code[1] = (OnigCodePoint )'s';
648
649 items[1].byte_len = 1;
650 items[1].code_len = 2;
651 items[1].code[0] = (OnigCodePoint )'S';
652 items[1].code[1] = (OnigCodePoint )'S';
653
654 items[2].byte_len = 1;
655 items[2].code_len = 2;
656 items[2].code[0] = (OnigCodePoint )'s';
657 items[2].code[1] = (OnigCodePoint )'S';
658
659 items[3].byte_len = 1;
660 items[3].code_len = 2;
661 items[3].code[0] = (OnigCodePoint )'S';
662 items[3].code[1] = (OnigCodePoint )'s';
663
664 return 4;
665 }
666 else {
667 int i;
668
669 if (CASE_FOLD_IS_ASCII_ONLY(flag))
670 return 0;
671
672 for (i = 0; i < map_size; i++) {
673 if (*p == map[i].from) {
674 items[0].byte_len = 1;
675 items[0].code_len = 1;
676 items[0].code[0] = map[i].to;
677 return 1;
678 }
679 else if (*p == map[i].to) {
680 items[0].byte_len = 1;
681 items[0].code_len = 1;
682 items[0].code[0] = map[i].from;
683 return 1;
684 }
685 }
686 }
687
688 return 0;
689 }
690
691
692 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)693 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
694 OnigCodePoint* sb_out ARG_UNUSED,
695 const OnigCodePoint* ranges[] ARG_UNUSED)
696 {
697 return ONIG_NO_SUPPORT_CONFIG;
698 }
699
700 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)701 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
702 {
703 if (p < end) {
704 if (*p == NEWLINE_CODE) return 1;
705 }
706 return 0;
707 }
708
709 /* for single byte encodings */
710 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)711 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
712 const UChar*end ARG_UNUSED, UChar* lower)
713 {
714 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
715
716 (*p)++;
717 return 1; /* return byte length of converted char to lower */
718 }
719
720 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)721 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
722 {
723 return 1;
724 }
725
726 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)727 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
728 {
729 return (OnigCodePoint )(*p);
730 }
731
732 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)733 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
734 {
735 return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
736 }
737
738 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)739 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
740 {
741 *buf = (UChar )(code & 0xff);
742 return 1;
743 }
744
745 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)746 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
747 const UChar* s)
748 {
749 return (UChar* )s;
750 }
751
752 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)753 onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
754 const UChar* end ARG_UNUSED)
755 {
756 return TRUE;
757 }
758
759 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)760 onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
761 const UChar* end ARG_UNUSED)
762 {
763 return FALSE;
764 }
765
766 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)767 onigenc_always_true_is_valid_mbc_string(const UChar* s ARG_UNUSED,
768 const UChar* end ARG_UNUSED)
769 {
770 return TRUE;
771 }
772
773 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)774 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
775 const UChar* p, const UChar* end)
776 {
777 while (p < end) {
778 p += enclen(enc, p);
779 }
780
781 if (p != end)
782 return FALSE;
783 else
784 return TRUE;
785 }
786
787 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)788 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
789 {
790 return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
791 }
792
793 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)794 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
795 {
796 int c, i, len;
797 OnigCodePoint n;
798
799 len = enclen(enc, p);
800 n = (OnigCodePoint )(*p++);
801 if (len == 1) return n;
802
803 for (i = 1; i < len; i++) {
804 if (p >= end) break;
805 c = *p++;
806 n <<= 8; n += c;
807 }
808 return n;
809 }
810
811 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)812 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
813 const UChar** pp, const UChar* end ARG_UNUSED,
814 UChar* lower)
815 {
816 int len;
817 const UChar *p = *pp;
818
819 if (ONIGENC_IS_MBC_ASCII(p)) {
820 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
821 (*pp)++;
822 return 1;
823 }
824 else {
825 int i;
826
827 len = enclen(enc, p);
828 for (i = 0; i < len; i++) {
829 *lower++ = *p++;
830 }
831 (*pp) += len;
832 return len; /* return byte length of converted to lower char */
833 }
834 }
835
836 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)837 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
838 {
839 UChar *p = buf;
840
841 if ((code & 0xff00) != 0) {
842 *p++ = (UChar )((code >> 8) & 0xff);
843 }
844 *p++ = (UChar )(code & 0xff);
845
846 #if 1
847 if (enclen(enc, buf) != (p - buf))
848 return ONIGERR_INVALID_CODE_POINT_VALUE;
849 #endif
850 return (int )(p - buf);
851 }
852
853 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)854 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
855 {
856 UChar *p = buf;
857
858 if ((code & 0xff000000) != 0) {
859 *p++ = (UChar )((code >> 24) & 0xff);
860 }
861 if ((code & 0xff0000) != 0 || p != buf) {
862 *p++ = (UChar )((code >> 16) & 0xff);
863 }
864 if ((code & 0xff00) != 0 || p != buf) {
865 *p++ = (UChar )((code >> 8) & 0xff);
866 }
867 *p++ = (UChar )(code & 0xff);
868
869 #if 1
870 if (enclen(enc, buf) != (p - buf))
871 return ONIGERR_INVALID_CODE_POINT_VALUE;
872 #endif
873 return (int )(p - buf);
874 }
875
876 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)877 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
878 {
879 static PosixBracketEntryType PBS[] = {
880 { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
881 { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
882 { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
883 { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
884 { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
885 { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
886 { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
887 { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
888 { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
889 { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
890 { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
891 { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
892 { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
893 { (UChar* )"Word", ONIGENC_CTYPE_WORD, 4 },
894 { (UChar* )NULL, -1, 0 }
895 };
896
897 PosixBracketEntryType *pb;
898 int len;
899
900 len = onigenc_strlen(enc, p, end);
901 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
902 if (len == pb->len &&
903 onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
904 return pb->ctype;
905 }
906
907 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
908 }
909
910 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)911 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
912 {
913 OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
914
915 if (code > ASCII_LIMIT) return 0;
916
917 return ONIGENC_IS_ASCII_CODE_WORD(code);
918 }
919
920 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)921 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
922 unsigned int ctype)
923 {
924 if (code < 128)
925 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
926 else {
927 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
928 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
929 }
930 }
931
932 return FALSE;
933 }
934
935 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)936 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
937 unsigned int ctype)
938 {
939 if (code < 128)
940 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
941 else {
942 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
943 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
944 }
945 }
946
947 return FALSE;
948 }
949
950 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)951 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
952 const UChar* sascii /* ascii */, int n)
953 {
954 int x, c;
955
956 while (n-- > 0) {
957 if (p >= end) return (int )(*sascii);
958
959 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
960 x = *sascii - c;
961 if (x) return x;
962
963 sascii++;
964 p += enclen(enc, p);
965 }
966 return 0;
967 }
968
969 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)970 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
971 {
972 int i;
973
974 for (i = 0; i < n; i++) {
975 if (a[i] != b[i])
976 return -1;
977 }
978
979 return 0;
980 }
981
982 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)983 onig_codes_byte_at(OnigCodePoint codes[], int at)
984 {
985 int index;
986 int b;
987 OnigCodePoint code;
988
989 index = at / 3;
990 b = at % 3;
991 code = codes[index];
992
993 return ((code >> ((2 - b) * 8)) & 0xff);
994 }
995