1 /**********************************************************************
2 regenc.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2019 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regint.h"
31
/* ASCII byte values of 'S' and 's'; used further down in this file for the
   "ss" <-> 0xdf case-fold handling. */
32 #define LARGE_S 0x53
33 #define SMALL_S 0x73
34
/* Encoding handed out by onigenc_get_default_encoding() and replaced by
   onigenc_set_default_encoding(). */
35 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
36
/* Number of slots in InitedList. */
37 #define INITED_LIST_SIZE 20
38
/* Number of InitedList slots currently in use. */
39 static int InitedListNum;
40
/* Registry of encodings recorded by enc_inited_entry():
   enc    - the encoding this slot belongs to
   inited - non-zero once the encoding has been marked as initialized */
41 static struct {
42 OnigEncoding enc;
43 int inited;
44 } InitedList[INITED_LIST_SIZE];
45
46 static int
enc_inited_entry(OnigEncoding enc)47 enc_inited_entry(OnigEncoding enc)
48 {
49 int i;
50
51 for (i = 0; i < InitedListNum; i++) {
52 if (InitedList[i].enc == enc) {
53 InitedList[i].inited = 1;
54 return i;
55 }
56 }
57
58 i = InitedListNum;
59 if (i < INITED_LIST_SIZE - 1) {
60 InitedList[i].enc = enc;
61 InitedList[i].inited = 1;
62 InitedListNum++;
63 return i;
64 }
65
66 return -1;
67 }
68
69 static int
enc_is_inited(OnigEncoding enc)70 enc_is_inited(OnigEncoding enc)
71 {
72 int i;
73
74 for (i = 0; i < InitedListNum; i++) {
75 if (InitedList[i].enc == enc) {
76 return InitedList[i].inited;
77 }
78 }
79
80 return 0;
81 }
82
/* Non-zero once onigenc_init() has run; reset to 0 by onigenc_end(). */
static int OnigEncInited;

/*
 * Flag the encoding subsystem as initialized.
 * Calling it again is harmless.  Always returns 0.
 */
extern int
onigenc_init(void)
{
  if (OnigEncInited == 0)
    OnigEncInited = 1;

  return 0;
}
93
94 extern int
onigenc_end(void)95 onigenc_end(void)
96 {
97 int i;
98
99 for (i = 0; i < InitedListNum; i++) {
100 InitedList[i].enc = 0;
101 InitedList[i].inited = 0;
102 }
103 InitedListNum = 0;
104
105 OnigEncInited = 0;
106 return ONIG_NORMAL;
107 }
108
109 extern int
onig_initialize_encoding(OnigEncoding enc)110 onig_initialize_encoding(OnigEncoding enc)
111 {
112 int r;
113
114 if (enc != ONIG_ENCODING_ASCII &&
115 ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
116 OnigEncoding ascii = ONIG_ENCODING_ASCII;
117 if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
118 r = ascii->init();
119 if (r != ONIG_NORMAL) return r;
120 enc_inited_entry(ascii);
121 }
122 }
123
124 if (enc->init != 0 &&
125 enc_is_inited(enc) == 0) {
126 r = (enc->init)();
127 if (r == ONIG_NORMAL)
128 enc_inited_entry(enc);
129 return r;
130 }
131
132 return 0;
133 }
134
135 extern OnigEncoding
onigenc_get_default_encoding(void)136 onigenc_get_default_encoding(void)
137 {
138 return OnigEncDefaultCharEncoding;
139 }
140
141 extern int
onigenc_set_default_encoding(OnigEncoding enc)142 onigenc_set_default_encoding(OnigEncoding enc)
143 {
144 OnigEncDefaultCharEncoding = enc;
145 return 0;
146 }
147
148 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)149 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
150 {
151 int slen, term_len, i;
152 UChar *r;
153
154 slen = (int )(end - s);
155 term_len = ONIGENC_MBC_MINLEN(enc);
156
157 r = (UChar* )xmalloc(slen + term_len);
158 CHECK_NULL_RETURN(r);
159 xmemcpy(r, s, slen);
160
161 for (i = 0; i < term_len; i++)
162 r[slen + i] = (UChar )0;
163
164 return r;
165 }
166
167 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)168 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
169 {
170 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
171 if (p < s) {
172 p += enclen(enc, p);
173 }
174 return p;
175 }
176
177 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)178 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
179 const UChar* start, const UChar* s, const UChar** prev)
180 {
181 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
182
183 if (p < s) {
184 if (prev) *prev = (const UChar* )p;
185 p += enclen(enc, p);
186 }
187 else {
188 if (prev)
189 *prev = onigenc_get_prev_char_head(enc, start, p);
190 }
191 return p;
192 }
193
194 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)195 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
196 {
197 if (s <= start)
198 return (UChar* )NULL;
199
200 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
201 }
202
203 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)204 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
205 {
206 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
207 if (s <= start)
208 return (UChar* )NULL;
209
210 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
211 }
212 return (UChar* )s;
213 }
214
215 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)216 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
217 {
218 UChar* q = (UChar* )p;
219 while (n-- > 0) {
220 q += ONIGENC_MBC_ENC_LEN(enc, q);
221 }
222 return (q <= end ? q : NULL);
223 }
224
225 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)226 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
227 {
228 int n = 0;
229 UChar* q = (UChar* )p;
230
231 while (q < end) {
232 q += ONIGENC_MBC_ENC_LEN(enc, q);
233 n++;
234 }
235 return n;
236 }
237
238 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)239 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
240 {
241 int n = 0;
242 UChar* p = (UChar* )s;
243
244 while (1) {
245 if (*p == '\0') {
246 UChar* q;
247 int len = ONIGENC_MBC_MINLEN(enc);
248
249 if (len == 1) return n;
250 q = p + 1;
251 while (len > 1) {
252 if (*q != '\0') break;
253 q++;
254 len--;
255 }
256 if (len == 1) return n;
257 }
258 p += ONIGENC_MBC_ENC_LEN(enc, p);
259 n++;
260 }
261 }
262
263 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)264 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
265 {
266 UChar* start = (UChar* )s;
267 UChar* p = (UChar* )s;
268
269 while (1) {
270 if (*p == '\0') {
271 UChar* q;
272 int len = ONIGENC_MBC_MINLEN(enc);
273
274 if (len == 1) return (int )(p - start);
275 q = p + 1;
276 while (len > 1) {
277 if (*q != '\0') break;
278 q++;
279 len--;
280 }
281 if (len == 1) return (int )(p - start);
282 }
283 p += ONIGENC_MBC_ENC_LEN(enc, p);
284 }
285 }
286
/* Byte -> lower-case byte lookup table, indexed by byte value (256 entries,
   8 per row).  Entries 0x41..0x5a ('\101'..'\132') hold '\141'..'\172';
   every other entry holds its own index. */
287 const UChar OnigEncAsciiToLowerCaseTable[] = {
288 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
289 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
290 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
291 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
292 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
293 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
294 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
295 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
296 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
297 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
298 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
299 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
300 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
301 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
302 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
303 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
304 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
305 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
306 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
307 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
308 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
309 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
310 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
311 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
312 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
313 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
314 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
315 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
316 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
317 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
318 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
319 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
320 };
321
/* Byte -> upper-case byte lookup table, compiled only when
   USE_UPPER_CASE_TABLE is defined.  Indexed by byte value (256 entries,
   8 per row).  Entries 0x61..0x7a ('\141'..'\172') hold '\101'..'\132';
   every other entry holds its own index. */
322 #ifdef USE_UPPER_CASE_TABLE
323 const UChar OnigEncAsciiToUpperCaseTable[256] = {
324 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
325 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
326 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
327 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
328 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
329 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
330 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
331 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
332 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
333 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
334 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
335 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
336 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
337 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
338 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
339 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
340 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
341 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
342 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
343 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
344 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
345 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
346 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
347 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
348 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
349 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
350 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
351 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
352 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
353 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
354 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
355 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
356 };
357 #endif
358
/* Per-byte character-class value, indexed by byte value (256 entries,
   8 per row).  All entries for 0x80..0xff are 0x0000.
   NOTE(review): the values look like bit masks with one bit per
   ONIGENC_CTYPE_* class -- confirm against the header that defines the
   lookup macros. */
359 const unsigned short OnigEncAsciiCtypeTable[256] = {
360 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
361 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
362 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
363 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
364 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
365 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
366 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
367 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
368 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
369 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
370 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
371 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
372 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
373 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
374 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
375 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
376 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
377 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
378 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
379 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
380 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
381 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
382 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
383 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
384 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
385 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
386 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
387 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
388 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
389 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
390 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
391 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
392 };
393
/* Byte -> lower-case byte lookup table for ISO 8859-1, indexed by byte value
   (256 entries, 8 per row).  Entries 0x41..0x5a and 0xc0..0xde, except 0xd7,
   hold their index + 0x20; every other entry (including 0xd7 and 0xdf) holds
   its own index. */
394 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
395 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
396 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
397 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
398 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
399 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
400 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
401 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
402 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
403 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
404 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
405 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
406 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
407 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
408 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
409 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
410 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
411 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
412 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
413 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
414 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
415 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
416 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
417 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
418 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
419 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
420 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
421 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
422 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
423 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
424 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
425 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
426 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
427 };
428
/* Byte -> upper-case byte lookup table for ISO 8859-1, compiled only when
   USE_UPPER_CASE_TABLE is defined.  Indexed by byte value (256 entries,
   8 per row).  Entries 0x61..0x7a and 0xe0..0xfe, except 0xf7, hold their
   index - 0x20; every other entry (including 0xf7 and 0xff) holds its own
   index. */
429 #ifdef USE_UPPER_CASE_TABLE
430 const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
431 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
432 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
433 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
434 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
435 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
436 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
437 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
438 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
439 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
440 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
441 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
442 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
443 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
444 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
445 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
446 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
447 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
448 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
449 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
450 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
451 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
452 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
453 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
454 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
455 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
456 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
457 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
458 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
459 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
460 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
461 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
462 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
463 };
464 #endif
465
466 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)467 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
468 {
469 /* nothing */
470 /* obsoleted. */
471 }
472
473 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)474 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
475 {
476 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
477 }
478
/* The 26 ASCII letter pairs as { from, to } = { upper-case code, lower-case
   code }: 0x41..0x5a paired with 0x61..0x7a.  Walked in both directions by
   onigenc_ascii_apply_all_case_fold(). */
479 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
480 { 0x41, 0x61 },
481 { 0x42, 0x62 },
482 { 0x43, 0x63 },
483 { 0x44, 0x64 },
484 { 0x45, 0x65 },
485 { 0x46, 0x66 },
486 { 0x47, 0x67 },
487 { 0x48, 0x68 },
488 { 0x49, 0x69 },
489 { 0x4a, 0x6a },
490 { 0x4b, 0x6b },
491 { 0x4c, 0x6c },
492 { 0x4d, 0x6d },
493 { 0x4e, 0x6e },
494 { 0x4f, 0x6f },
495 { 0x50, 0x70 },
496 { 0x51, 0x71 },
497 { 0x52, 0x72 },
498 { 0x53, 0x73 },
499 { 0x54, 0x74 },
500 { 0x55, 0x75 },
501 { 0x56, 0x76 },
502 { 0x57, 0x77 },
503 { 0x58, 0x78 },
504 { 0x59, 0x79 },
505 { 0x5a, 0x7a }
506 };
507
508 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)509 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
510 OnigApplyAllCaseFoldFunc f, void* arg)
511 {
512 OnigCodePoint code;
513 int i, r;
514
515 for (i = 0;
516 i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
517 i++) {
518 code = OnigAsciiLowerMap[i].to;
519 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
520 if (r != 0) return r;
521
522 code = OnigAsciiLowerMap[i].from;
523 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
524 if (r != 0) return r;
525 }
526
527 return 0;
528 }
529
530 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])531 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
532 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
533 OnigCaseFoldCodeItem items[])
534 {
535 if (0x41 <= *p && *p <= 0x5a) {
536 items[0].byte_len = 1;
537 items[0].code_len = 1;
538 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
539 return 1;
540 }
541 else if (0x61 <= *p && *p <= 0x7a) {
542 items[0].byte_len = 1;
543 items[0].code_len = 1;
544 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
545 return 1;
546 }
547 else
548 return 0;
549 }
550
551 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)552 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
553 OnigApplyAllCaseFoldFunc f, void* arg)
554 {
555 static OnigCodePoint ss[] = { SMALL_S, SMALL_S };
556
557 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
558 }
559
560 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)561 onigenc_apply_all_case_fold_with_map(int map_size,
562 const OnigPairCaseFoldCodes map[],
563 int ess_tsett_flag, OnigCaseFoldType flag,
564 OnigApplyAllCaseFoldFunc f, void* arg)
565 {
566 OnigCodePoint code;
567 int i, r;
568
569 r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
570 if (r != 0) return r;
571
572 for (i = 0; i < map_size; i++) {
573 code = map[i].to;
574 r = (*f)(map[i].from, &code, 1, arg);
575 if (r != 0) return r;
576
577 code = map[i].from;
578 r = (*f)(map[i].to, &code, 1, arg);
579 if (r != 0) return r;
580 }
581
582 if (ess_tsett_flag != 0)
583 return ss_apply_all_case_fold(flag, f, arg);
584
585 return 0;
586 }
587
588 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])589 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
590 const OnigPairCaseFoldCodes map[],
591 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
592 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
593 {
594 int i, j, n;
595 static OnigUChar sa[] = { LARGE_S, SMALL_S };
596
597 if (0x41 <= *p && *p <= 0x5a) { /* A - Z */
598 if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1
599 && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */
600 ss_combination:
601 items[0].byte_len = 2;
602 items[0].code_len = 1;
603 items[0].code[0] = (OnigCodePoint )0xdf;
604
605 n = 1;
606 for (i = 0; i < 2; i++) {
607 for (j = 0; j < 2; j++) {
608 if (sa[i] == *p && sa[j] == *(p+1))
609 continue;
610
611 items[n].byte_len = 2;
612 items[n].code_len = 2;
613 items[n].code[0] = (OnigCodePoint )sa[i];
614 items[n].code[1] = (OnigCodePoint )sa[j];
615 n++;
616 }
617 }
618 return 4;
619 }
620
621 items[0].byte_len = 1;
622 items[0].code_len = 1;
623 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
624 return 1;
625 }
626 else if (0x61 <= *p && *p <= 0x7a) { /* a - z */
627 if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1
628 && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) {
629 goto ss_combination;
630 }
631
632 items[0].byte_len = 1;
633 items[0].code_len = 1;
634 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
635 return 1;
636 }
637 else if (*p == 0xdf && ess_tsett_flag != 0) {
638 items[0].byte_len = 1;
639 items[0].code_len = 2;
640 items[0].code[0] = (OnigCodePoint )'s';
641 items[0].code[1] = (OnigCodePoint )'s';
642
643 items[1].byte_len = 1;
644 items[1].code_len = 2;
645 items[1].code[0] = (OnigCodePoint )'S';
646 items[1].code[1] = (OnigCodePoint )'S';
647
648 items[2].byte_len = 1;
649 items[2].code_len = 2;
650 items[2].code[0] = (OnigCodePoint )'s';
651 items[2].code[1] = (OnigCodePoint )'S';
652
653 items[3].byte_len = 1;
654 items[3].code_len = 2;
655 items[3].code[0] = (OnigCodePoint )'S';
656 items[3].code[1] = (OnigCodePoint )'s';
657
658 return 4;
659 }
660 else {
661 int i;
662
663 for (i = 0; i < map_size; i++) {
664 if (*p == map[i].from) {
665 items[0].byte_len = 1;
666 items[0].code_len = 1;
667 items[0].code[0] = map[i].to;
668 return 1;
669 }
670 else if (*p == map[i].to) {
671 items[0].byte_len = 1;
672 items[0].code_len = 1;
673 items[0].code[0] = map[i].from;
674 return 1;
675 }
676 }
677 }
678
679 return 0;
680 }
681
682
683 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)684 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
685 OnigCodePoint* sb_out ARG_UNUSED,
686 const OnigCodePoint* ranges[] ARG_UNUSED)
687 {
688 return ONIG_NO_SUPPORT_CONFIG;
689 }
690
691 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)692 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
693 {
694 if (p < end) {
695 if (*p == NEWLINE_CODE) return 1;
696 }
697 return 0;
698 }
699
700 /* for single byte encodings */
701 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)702 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
703 const UChar*end ARG_UNUSED, UChar* lower)
704 {
705 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
706
707 (*p)++;
708 return 1; /* return byte length of converted char to lower */
709 }
710
711 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)712 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
713 {
714 return 1;
715 }
716
717 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)718 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
719 {
720 return (OnigCodePoint )(*p);
721 }
722
723 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)724 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
725 {
726 return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
727 }
728
729 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)730 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
731 {
732 *buf = (UChar )(code & 0xff);
733 return 1;
734 }
735
736 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)737 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
738 const UChar* s)
739 {
740 return (UChar* )s;
741 }
742
743 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)744 onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
745 const UChar* end ARG_UNUSED)
746 {
747 return TRUE;
748 }
749
750 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)751 onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
752 const UChar* end ARG_UNUSED)
753 {
754 return FALSE;
755 }
756
757 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)758 onigenc_always_true_is_valid_mbc_string(const UChar* s ARG_UNUSED,
759 const UChar* end ARG_UNUSED)
760 {
761 return TRUE;
762 }
763
764 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)765 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
766 const UChar* p, const UChar* end)
767 {
768 while (p < end) {
769 p += enclen(enc, p);
770 }
771
772 if (p != end)
773 return FALSE;
774 else
775 return TRUE;
776 }
777
778 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)779 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
780 {
781 return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
782 }
783
784 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)785 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
786 {
787 int c, i, len;
788 OnigCodePoint n;
789
790 len = enclen(enc, p);
791 n = (OnigCodePoint )(*p++);
792 if (len == 1) return n;
793
794 for (i = 1; i < len; i++) {
795 if (p >= end) break;
796 c = *p++;
797 n <<= 8; n += c;
798 }
799 return n;
800 }
801
802 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)803 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
804 const UChar** pp, const UChar* end ARG_UNUSED,
805 UChar* lower)
806 {
807 int len;
808 const UChar *p = *pp;
809
810 if (ONIGENC_IS_MBC_ASCII(p)) {
811 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
812 (*pp)++;
813 return 1;
814 }
815 else {
816 int i;
817
818 len = enclen(enc, p);
819 for (i = 0; i < len; i++) {
820 *lower++ = *p++;
821 }
822 (*pp) += len;
823 return len; /* return byte length of converted to lower char */
824 }
825 }
826
827 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)828 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
829 {
830 UChar *p = buf;
831
832 if ((code & 0xff00) != 0) {
833 *p++ = (UChar )((code >> 8) & 0xff);
834 }
835 *p++ = (UChar )(code & 0xff);
836
837 #if 1
838 if (enclen(enc, buf) != (p - buf))
839 return ONIGERR_INVALID_CODE_POINT_VALUE;
840 #endif
841 return (int )(p - buf);
842 }
843
844 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)845 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
846 {
847 UChar *p = buf;
848
849 if ((code & 0xff000000) != 0) {
850 *p++ = (UChar )((code >> 24) & 0xff);
851 }
852 if ((code & 0xff0000) != 0 || p != buf) {
853 *p++ = (UChar )((code >> 16) & 0xff);
854 }
855 if ((code & 0xff00) != 0 || p != buf) {
856 *p++ = (UChar )((code >> 8) & 0xff);
857 }
858 *p++ = (UChar )(code & 0xff);
859
860 #if 1
861 if (enclen(enc, buf) != (p - buf))
862 return ONIGERR_INVALID_CODE_POINT_VALUE;
863 #endif
864 return (int )(p - buf);
865 }
866
867 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)868 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
869 {
870 static PosixBracketEntryType PBS[] = {
871 { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
872 { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
873 { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
874 { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
875 { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
876 { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
877 { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
878 { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
879 { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
880 { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
881 { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
882 { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
883 { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
884 { (UChar* )"Word", ONIGENC_CTYPE_WORD, 4 },
885 { (UChar* )NULL, -1, 0 }
886 };
887
888 PosixBracketEntryType *pb;
889 int len;
890
891 len = onigenc_strlen(enc, p, end);
892 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
893 if (len == pb->len &&
894 onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
895 return pb->ctype;
896 }
897
898 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
899 }
900
901 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)902 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
903 {
904 OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
905
906 if (code > ASCII_LIMIT) return 0;
907
908 return ONIGENC_IS_ASCII_CODE_WORD(code);
909 }
910
911 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)912 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
913 unsigned int ctype)
914 {
915 if (code < 128)
916 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
917 else {
918 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
919 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
920 }
921 }
922
923 return FALSE;
924 }
925
926 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)927 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
928 unsigned int ctype)
929 {
930 if (code < 128)
931 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
932 else {
933 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
934 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
935 }
936 }
937
938 return FALSE;
939 }
940
941 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)942 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
943 const UChar* sascii /* ascii */, int n)
944 {
945 int x, c;
946
947 while (n-- > 0) {
948 if (p >= end) return (int )(*sascii);
949
950 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
951 x = *sascii - c;
952 if (x) return x;
953
954 sascii++;
955 p += enclen(enc, p);
956 }
957 return 0;
958 }
959
960 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)961 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
962 {
963 int i;
964
965 for (i = 0; i < n; i++) {
966 if (a[i] != b[i])
967 return -1;
968 }
969
970 return 0;
971 }
972
973 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)974 onig_codes_byte_at(OnigCodePoint codes[], int at)
975 {
976 int index;
977 int b;
978 OnigCodePoint code;
979
980 index = at / 3;
981 b = at % 3;
982 code = codes[index];
983
984 return ((code >> ((2 - b) * 8)) & 0xff);
985 }
986