1 /**********************************************************************
2   regenc.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2019  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/* Default character encoding; returned by onigenc_get_default_encoding()
   and replaced by onigenc_set_default_encoding(). */
32 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
33 
/* Number of slots in InitedList. */
34 #define INITED_LIST_SIZE  20
35 
/* Count of InitedList entries currently in use. */
36 static int InitedListNum;
37 
/* Table of encodings recorded by enc_inited_entry().  `inited` is set to 1
   when an encoding is recorded and reset to 0 by onigenc_end(). */
38 static struct {
39   OnigEncoding enc;
40   int          inited;
41 } InitedList[INITED_LIST_SIZE];
42 
43 static int
enc_inited_entry(OnigEncoding enc)44 enc_inited_entry(OnigEncoding enc)
45 {
46   int i;
47 
48   for (i = 0; i < InitedListNum; i++) {
49     if (InitedList[i].enc == enc) {
50       InitedList[i].inited = 1;
51       return i;
52     }
53   }
54 
55   i = InitedListNum;
56   if (i < INITED_LIST_SIZE - 1) {
57     InitedList[i].enc    = enc;
58     InitedList[i].inited = 1;
59     InitedListNum++;
60     return i;
61   }
62 
63   return -1;
64 }
65 
66 static int
enc_is_inited(OnigEncoding enc)67 enc_is_inited(OnigEncoding enc)
68 {
69   int i;
70 
71   for (i = 0; i < InitedListNum; i++) {
72     if (InitedList[i].enc == enc) {
73       return InitedList[i].inited;
74     }
75   }
76 
77   return 0;
78 }
79 
/* Non-zero once onigenc_init() has run; reset by onigenc_end(). */
static int OnigEncInited;

/*
 * Mark the encoding subsystem as initialized.
 * Calling it again is harmless.  Always returns 0.
 */
extern int
onigenc_init(void)
{
  if (OnigEncInited == 0) {
    OnigEncInited = 1;
  }
  return 0;
}
90 
91 extern int
onigenc_end(void)92 onigenc_end(void)
93 {
94   int i;
95 
96   for (i = 0; i < InitedListNum; i++) {
97     InitedList[i].enc    = 0;
98     InitedList[i].inited = 0;
99   }
100   InitedListNum = 0;
101 
102   OnigEncInited = 0;
103   return ONIG_NORMAL;
104 }
105 
106 extern int
onig_initialize_encoding(OnigEncoding enc)107 onig_initialize_encoding(OnigEncoding enc)
108 {
109   int r;
110 
111   if (enc != ONIG_ENCODING_ASCII &&
112       ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
113     OnigEncoding ascii = ONIG_ENCODING_ASCII;
114     if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
115       r = ascii->init();
116       if (r != ONIG_NORMAL) return r;
117       enc_inited_entry(ascii);
118     }
119   }
120 
121   if (enc->init != 0 &&
122       enc_is_inited(enc) == 0) {
123     r = (enc->init)();
124     if (r == ONIG_NORMAL)
125       enc_inited_entry(enc);
126     return r;
127   }
128 
129   return 0;
130 }
131 
132 extern OnigEncoding
onigenc_get_default_encoding(void)133 onigenc_get_default_encoding(void)
134 {
135   return OnigEncDefaultCharEncoding;
136 }
137 
138 extern int
onigenc_set_default_encoding(OnigEncoding enc)139 onigenc_set_default_encoding(OnigEncoding enc)
140 {
141   OnigEncDefaultCharEncoding = enc;
142   return 0;
143 }
144 
145 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)146 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
147 {
148   int slen, term_len, i;
149   UChar *r;
150 
151   slen = (int )(end - s);
152   term_len = ONIGENC_MBC_MINLEN(enc);
153 
154   r = (UChar* )xmalloc(slen + term_len);
155   CHECK_NULL_RETURN(r);
156   xmemcpy(r, s, slen);
157 
158   for (i = 0; i < term_len; i++)
159     r[slen + i] = (UChar )0;
160 
161   return r;
162 }
163 
164 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)165 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
166 {
167   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
168   if (p < s) {
169     p += enclen(enc, p);
170   }
171   return p;
172 }
173 
174 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)175 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
176             const UChar* start, const UChar* s, const UChar** prev)
177 {
178   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
179 
180   if (p < s) {
181     if (prev) *prev = (const UChar* )p;
182     p += enclen(enc, p);
183   }
184   else {
185     if (prev)
186       *prev = onigenc_get_prev_char_head(enc, start, p);
187   }
188   return p;
189 }
190 
191 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)192 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
193 {
194   if (s <= start)
195     return (UChar* )NULL;
196 
197   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
198 }
199 
200 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)201 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
202 {
203   while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
204     if (s <= start)
205       return (UChar* )NULL;
206 
207     s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
208   }
209   return (UChar* )s;
210 }
211 
212 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)213 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
214 {
215   UChar* q = (UChar* )p;
216   while (n-- > 0) {
217     q += ONIGENC_MBC_ENC_LEN(enc, q);
218   }
219   return (q <= end ? q : NULL);
220 }
221 
222 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)223 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
224 {
225   int n = 0;
226   UChar* q = (UChar* )p;
227 
228   while (q < end) {
229     q += ONIGENC_MBC_ENC_LEN(enc, q);
230     n++;
231   }
232   return n;
233 }
234 
235 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)236 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
237 {
238   int n = 0;
239   UChar* p = (UChar* )s;
240 
241   while (1) {
242     if (*p == '\0') {
243       UChar* q;
244       int len = ONIGENC_MBC_MINLEN(enc);
245 
246       if (len == 1) return n;
247       q = p + 1;
248       while (len > 1) {
249         if (*q != '\0') break;
250         q++;
251         len--;
252       }
253       if (len == 1) return n;
254     }
255     p += ONIGENC_MBC_ENC_LEN(enc, p);
256     n++;
257   }
258 }
259 
260 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)261 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
262 {
263   UChar* start = (UChar* )s;
264   UChar* p = (UChar* )s;
265 
266   while (1) {
267     if (*p == '\0') {
268       UChar* q;
269       int len = ONIGENC_MBC_MINLEN(enc);
270 
271       if (len == 1) return (int )(p - start);
272       q = p + 1;
273       while (len > 1) {
274         if (*q != '\0') break;
275         q++;
276         len--;
277       }
278       if (len == 1) return (int )(p - start);
279     }
280     p += ONIGENC_MBC_ENC_LEN(enc, p);
281   }
282 }
283 
/*
 * Lowercase lookup table with 256 entries (32 rows of 8), one per byte value.
 * Entries 0x41-0x5A ('A'-'Z') hold 0x61-0x7A ('a'-'z'); every other entry
 * holds its own index.
 */
284 const UChar OnigEncAsciiToLowerCaseTable[] = {
285   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
286   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
287   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
288   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
289   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
290   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
291   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
292   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
293   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
294   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
295   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
296   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
297   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
298   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
299   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
300   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
301   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
302   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
303   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
304   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
305   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
306   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
307   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
308   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
309   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
310   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
311   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
312   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
313   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
314   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
315   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
316   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
317 };
318 
/*
 * Uppercase lookup table with 256 entries, one per byte value; compiled only
 * when USE_UPPER_CASE_TABLE is defined.
 * Entries 0x61-0x7A ('a'-'z') hold 0x41-0x5A ('A'-'Z'); every other entry
 * holds its own index.
 */
319 #ifdef USE_UPPER_CASE_TABLE
320 const UChar OnigEncAsciiToUpperCaseTable[256] = {
321   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
322   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
323   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
324   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
325   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
326   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
327   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
328   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
329   '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
330   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
331   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
332   '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
333   '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
334   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
335   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
336   '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
337   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
338   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
339   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
340   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
341   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
342   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
343   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
344   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
345   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
346   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
347   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
348   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
349   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
350   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
351   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
352   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
353 };
354 #endif
355 
/*
 * Table of 256 unsigned short values, one per byte value.
 * Entries 0x80-0xFF are all 0x0000.
 * NOTE(review): each value is presumably a bit mask of character classes
 * (ONIGENC_CTYPE_*) -- confirm against the ctype macros in the headers.
 */
356 const unsigned short OnigEncAsciiCtypeTable[256] = {
357   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
358   0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
359   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
360   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
361   0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
362   0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
363   0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
364   0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
365   0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
366   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
367   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
368   0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
369   0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
370   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
371   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
372   0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
373   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
374   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
375   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
376   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
377   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
378   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
379   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
380   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
381   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
382   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
383   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
384   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
385   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
386   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
387   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
388   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
389 };
390 
/*
 * Lowercase lookup table with 256 entries, one per byte value.
 * Entries 0x41-0x5A hold 0x61-0x7A, and entries 0xC0-0xDE hold 0xE0-0xFE
 * except 0xD7, which holds itself.  Every other entry (including 0xDF)
 * holds its own index.
 */
391 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
392   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
393   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
394   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
395   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
396   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
397   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
398   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
399   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
400   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
401   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
402   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
403   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
404   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
405   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
406   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
407   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
408   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
409   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
410   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
411   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
412   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
413   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
414   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
415   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
416   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
417   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
418   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
419   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
420   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
421   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
422   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
423   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
424 };
425 
/*
 * Uppercase lookup table with 256 entries, one per byte value; compiled only
 * when USE_UPPER_CASE_TABLE is defined.
 * Entries 0x61-0x7A hold 0x41-0x5A, and entries 0xE0-0xFE hold 0xC0-0xDE
 * except 0xF7, which holds itself.  Every other entry (including 0xFF)
 * holds its own index.
 */
426 #ifdef USE_UPPER_CASE_TABLE
427 const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
428   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
429   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
430   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
431   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
432   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
433   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
434   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
435   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
436   '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
437   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
438   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
439   '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
440   '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
441   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
442   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
443   '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
444   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
445   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
446   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
447   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
448   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
449   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
450   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
451   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
452   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
453   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
454   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
455   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
456   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
457   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
458   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
459   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
460 };
461 #endif
462 
463 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)464 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
465 {
466   /* nothing */
467   /* obsoleted. */
468 }
469 
470 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)471 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
472 {
473   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
474 }
475 
/*
 * Case-fold pairs for the 26 ASCII letters: the first value of each pair is
 * the uppercase code (0x41-0x5A), the second the matching lowercase code
 * (0x61-0x7A).  onigenc_ascii_apply_all_case_fold() reads the pairs through
 * the `from` and `to` members.
 */
476 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
477   { 0x41, 0x61 },
478   { 0x42, 0x62 },
479   { 0x43, 0x63 },
480   { 0x44, 0x64 },
481   { 0x45, 0x65 },
482   { 0x46, 0x66 },
483   { 0x47, 0x67 },
484   { 0x48, 0x68 },
485   { 0x49, 0x69 },
486   { 0x4a, 0x6a },
487   { 0x4b, 0x6b },
488   { 0x4c, 0x6c },
489   { 0x4d, 0x6d },
490   { 0x4e, 0x6e },
491   { 0x4f, 0x6f },
492   { 0x50, 0x70 },
493   { 0x51, 0x71 },
494   { 0x52, 0x72 },
495   { 0x53, 0x73 },
496   { 0x54, 0x74 },
497   { 0x55, 0x75 },
498   { 0x56, 0x76 },
499   { 0x57, 0x77 },
500   { 0x58, 0x78 },
501   { 0x59, 0x79 },
502   { 0x5a, 0x7a }
503 };
504 
505 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)506 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
507                                   OnigApplyAllCaseFoldFunc f, void* arg)
508 {
509   OnigCodePoint code;
510   int i, r;
511 
512   for (i = 0;
513        i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
514        i++) {
515     code = OnigAsciiLowerMap[i].to;
516     r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
517     if (r != 0) return r;
518 
519     code = OnigAsciiLowerMap[i].from;
520     r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
521     if (r != 0) return r;
522   }
523 
524   return 0;
525 }
526 
527 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])528 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
529     const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
530     OnigCaseFoldCodeItem items[])
531 {
532   if (0x41 <= *p && *p <= 0x5a) {
533     items[0].byte_len = 1;
534     items[0].code_len = 1;
535     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
536     return 1;
537   }
538   else if (0x61 <= *p && *p <= 0x7a) {
539     items[0].byte_len = 1;
540     items[0].code_len = 1;
541     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
542     return 1;
543   }
544   else
545     return 0;
546 }
547 
548 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)549 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
550                        OnigApplyAllCaseFoldFunc f, void* arg)
551 {
552   static OnigCodePoint ss[] = { 0x73, 0x73 };
553 
554   return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
555 }
556 
557 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)558 onigenc_apply_all_case_fold_with_map(int map_size,
559     const OnigPairCaseFoldCodes map[],
560     int ess_tsett_flag, OnigCaseFoldType flag,
561     OnigApplyAllCaseFoldFunc f, void* arg)
562 {
563   OnigCodePoint code;
564   int i, r;
565 
566   r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
567   if (r != 0) return r;
568 
569   for (i = 0; i < map_size; i++) {
570     code = map[i].to;
571     r = (*f)(map[i].from, &code, 1, arg);
572     if (r != 0) return r;
573 
574     code = map[i].from;
575     r = (*f)(map[i].to, &code, 1, arg);
576     if (r != 0) return r;
577   }
578 
579   if (ess_tsett_flag != 0)
580     return ss_apply_all_case_fold(flag, f, arg);
581 
582   return 0;
583 }
584 
585 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])586 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
587     const OnigPairCaseFoldCodes map[],
588     int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
589     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
590 {
591   if (0x41 <= *p && *p <= 0x5a) {
592     items[0].byte_len = 1;
593     items[0].code_len = 1;
594     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
595     if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
596         && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
597       /* SS */
598       items[1].byte_len = 2;
599       items[1].code_len = 1;
600       items[1].code[0] = (OnigCodePoint )0xdf;
601       return 2;
602     }
603     else
604       return 1;
605   }
606   else if (0x61 <= *p && *p <= 0x7a) {
607     items[0].byte_len = 1;
608     items[0].code_len = 1;
609     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
610     if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
611         && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
612       /* ss */
613       items[1].byte_len = 2;
614       items[1].code_len = 1;
615       items[1].code[0] = (OnigCodePoint )0xdf;
616       return 2;
617     }
618     else
619       return 1;
620   }
621   else if (*p == 0xdf && ess_tsett_flag != 0) {
622     items[0].byte_len = 1;
623     items[0].code_len = 2;
624     items[0].code[0] = (OnigCodePoint )'s';
625     items[0].code[1] = (OnigCodePoint )'s';
626 
627     items[1].byte_len = 1;
628     items[1].code_len = 2;
629     items[1].code[0] = (OnigCodePoint )'S';
630     items[1].code[1] = (OnigCodePoint )'S';
631 
632     items[2].byte_len = 1;
633     items[2].code_len = 2;
634     items[2].code[0] = (OnigCodePoint )'s';
635     items[2].code[1] = (OnigCodePoint )'S';
636 
637     items[3].byte_len = 1;
638     items[3].code_len = 2;
639     items[3].code[0] = (OnigCodePoint )'S';
640     items[3].code[1] = (OnigCodePoint )'s';
641 
642     return 4;
643   }
644   else {
645     int i;
646 
647     for (i = 0; i < map_size; i++) {
648       if (*p == map[i].from) {
649         items[0].byte_len = 1;
650         items[0].code_len = 1;
651         items[0].code[0] = map[i].to;
652         return 1;
653       }
654       else if (*p == map[i].to) {
655         items[0].byte_len = 1;
656         items[0].code_len = 1;
657         items[0].code[0] = map[i].from;
658         return 1;
659       }
660     }
661   }
662 
663   return 0;
664 }
665 
666 
667 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)668 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
669                                          OnigCodePoint* sb_out ARG_UNUSED,
670                                          const OnigCodePoint* ranges[] ARG_UNUSED)
671 {
672   return ONIG_NO_SUPPORT_CONFIG;
673 }
674 
675 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)676 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
677 {
678   if (p < end) {
679     if (*p == 0x0a) return 1;
680   }
681   return 0;
682 }
683 
684 /* for single byte encodings */
685 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)686 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
687                             const UChar*end ARG_UNUSED, UChar* lower)
688 {
689   *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
690 
691   (*p)++;
692   return 1; /* return byte length of converted char to lower */
693 }
694 
695 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)696 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
697 {
698   return 1;
699 }
700 
701 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)702 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
703 {
704   return (OnigCodePoint )(*p);
705 }
706 
707 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)708 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
709 {
710   return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
711 }
712 
713 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)714 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
715 {
716   *buf = (UChar )(code & 0xff);
717   return 1;
718 }
719 
720 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)721 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
722                                           const UChar* s)
723 {
724   return (UChar* )s;
725 }
726 
727 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)728 onigenc_always_true_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
729                                              const UChar* end ARG_UNUSED)
730 {
731   return TRUE;
732 }
733 
734 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)735 onigenc_always_false_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
736                                               const UChar* end ARG_UNUSED)
737 {
738   return FALSE;
739 }
740 
741 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)742 onigenc_always_true_is_valid_mbc_string(const UChar* s   ARG_UNUSED,
743                                         const UChar* end ARG_UNUSED)
744 {
745   return TRUE;
746 }
747 
748 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)749 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
750                                          const UChar* p, const UChar* end)
751 {
752   while (p < end) {
753     p += enclen(enc, p);
754   }
755 
756   if (p != end)
757     return FALSE;
758   else
759     return TRUE;
760 }
761 
762 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)763 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
764 {
765   return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
766 }
767 
768 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)769 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
770 {
771   int c, i, len;
772   OnigCodePoint n;
773 
774   len = enclen(enc, p);
775   n = (OnigCodePoint )(*p++);
776   if (len == 1) return n;
777 
778   for (i = 1; i < len; i++) {
779     if (p >= end) break;
780     c = *p++;
781     n <<= 8;  n += c;
782   }
783   return n;
784 }
785 
786 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)787 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
788                           const UChar** pp, const UChar* end ARG_UNUSED,
789                           UChar* lower)
790 {
791   int len;
792   const UChar *p = *pp;
793 
794   if (ONIGENC_IS_MBC_ASCII(p)) {
795     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
796     (*pp)++;
797     return 1;
798   }
799   else {
800     int i;
801 
802     len = enclen(enc, p);
803     for (i = 0; i < len; i++) {
804       *lower++ = *p++;
805     }
806     (*pp) += len;
807     return len; /* return byte length of converted to lower char */
808   }
809 }
810 
811 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)812 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
813 {
814   UChar *p = buf;
815 
816   if ((code & 0xff00) != 0) {
817     *p++ = (UChar )((code >>  8) & 0xff);
818   }
819   *p++ = (UChar )(code & 0xff);
820 
821 #if 1
822   if (enclen(enc, buf) != (p - buf))
823     return ONIGERR_INVALID_CODE_POINT_VALUE;
824 #endif
825   return (int )(p - buf);
826 }
827 
828 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)829 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
830 {
831   UChar *p = buf;
832 
833   if ((code & 0xff000000) != 0) {
834     *p++ = (UChar )((code >> 24) & 0xff);
835   }
836   if ((code & 0xff0000) != 0 || p != buf) {
837     *p++ = (UChar )((code >> 16) & 0xff);
838   }
839   if ((code & 0xff00) != 0 || p != buf) {
840     *p++ = (UChar )((code >> 8) & 0xff);
841   }
842   *p++ = (UChar )(code & 0xff);
843 
844 #if 1
845   if (enclen(enc, buf) != (p - buf))
846     return ONIGERR_INVALID_CODE_POINT_VALUE;
847 #endif
848   return (int )(p - buf);
849 }
850 
851 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)852 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
853 {
854   static PosixBracketEntryType PBS[] = {
855     { (UChar* )"Alnum",  ONIGENC_CTYPE_ALNUM,  5 },
856     { (UChar* )"Alpha",  ONIGENC_CTYPE_ALPHA,  5 },
857     { (UChar* )"Blank",  ONIGENC_CTYPE_BLANK,  5 },
858     { (UChar* )"Cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
859     { (UChar* )"Digit",  ONIGENC_CTYPE_DIGIT,  5 },
860     { (UChar* )"Graph",  ONIGENC_CTYPE_GRAPH,  5 },
861     { (UChar* )"Lower",  ONIGENC_CTYPE_LOWER,  5 },
862     { (UChar* )"Print",  ONIGENC_CTYPE_PRINT,  5 },
863     { (UChar* )"Punct",  ONIGENC_CTYPE_PUNCT,  5 },
864     { (UChar* )"Space",  ONIGENC_CTYPE_SPACE,  5 },
865     { (UChar* )"Upper",  ONIGENC_CTYPE_UPPER,  5 },
866     { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
867     { (UChar* )"ASCII",  ONIGENC_CTYPE_ASCII,  5 },
868     { (UChar* )"Word",   ONIGENC_CTYPE_WORD,   4 },
869     { (UChar* )NULL, -1, 0 }
870   };
871 
872   PosixBracketEntryType *pb;
873   int len;
874 
875   len = onigenc_strlen(enc, p, end);
876   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
877     if (len == pb->len &&
878         onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
879       return pb->ctype;
880   }
881 
882   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
883 }
884 
885 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)886 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
887 {
888   OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
889 
890   if (code > 127) return 0;
891 
892   return ONIGENC_IS_ASCII_CODE_WORD(code);
893 }
894 
895 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)896 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
897                           unsigned int ctype)
898 {
899   if (code < 128)
900     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
901   else {
902     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
903       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
904     }
905   }
906 
907   return FALSE;
908 }
909 
910 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)911 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
912                           unsigned int ctype)
913 {
914   if (code < 128)
915     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
916   else {
917     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
918       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
919     }
920   }
921 
922   return FALSE;
923 }
924 
925 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)926 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
927                            const UChar* sascii /* ascii */, int n)
928 {
929   int x, c;
930 
931   while (n-- > 0) {
932     if (p >= end) return (int )(*sascii);
933 
934     c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
935     x = *sascii - c;
936     if (x) return x;
937 
938     sascii++;
939     p += enclen(enc, p);
940   }
941   return 0;
942 }
943 
944 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)945 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
946 {
947   int i;
948 
949   for (i = 0; i < n; i++) {
950     if (a[i] != b[i])
951       return -1;
952   }
953 
954   return 0;
955 }
956 
957 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)958 onig_codes_byte_at(OnigCodePoint codes[], int at)
959 {
960   int index;
961   int b;
962   OnigCodePoint code;
963 
964   index = at / 3;
965   b     = at % 3;
966   code = codes[index];
967 
968   return ((code >> ((2 - b) * 8)) & 0xff);
969 }
970