1 /**********************************************************************
2   regenc.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
32 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
33 
34 #define INITED_LIST_SIZE  20
35 
36 static int InitedListNum;
37 
38 static struct {
39   OnigEncoding enc;
40   int          inited;
41 } InitedList[INITED_LIST_SIZE];
42 
43 static int
enc_inited_entry(OnigEncoding enc)44 enc_inited_entry(OnigEncoding enc)
45 {
46   int i;
47 
48   for (i = 0; i < InitedListNum; i++) {
49     if (InitedList[i].enc == enc) {
50       InitedList[i].inited = 1;
51       return i;
52     }
53   }
54 
55   i = InitedListNum;
56   if (i < INITED_LIST_SIZE - 1) {
57     InitedList[i].enc    = enc;
58     InitedList[i].inited = 1;
59     InitedListNum++;
60     return i;
61   }
62 
63   return -1;
64 }
65 
66 static int
enc_is_inited(OnigEncoding enc)67 enc_is_inited(OnigEncoding enc)
68 {
69   int i;
70 
71   for (i = 0; i < InitedListNum; i++) {
72     if (InitedList[i].enc == enc) {
73       return InitedList[i].inited;
74     }
75   }
76 
77   return 0;
78 }
79 
80 extern int
onigenc_end(void)81 onigenc_end(void)
82 {
83   int i;
84 
85   for (i = 0; i < InitedListNum; i++) {
86     InitedList[i].enc    = 0;
87     InitedList[i].inited = 0;
88   }
89 
90   InitedListNum = 0;
91   return ONIG_NORMAL;
92 }
93 
/* Encoding-module initialization hook.  Does no work and always returns 0. */
extern int
onigenc_init(void)
{
  return 0;
}
99 
100 extern int
onig_initialize_encoding(OnigEncoding enc)101 onig_initialize_encoding(OnigEncoding enc)
102 {
103   int r;
104 
105   if (enc != ONIG_ENCODING_ASCII &&
106       ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
107     OnigEncoding ascii = ONIG_ENCODING_ASCII;
108     if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
109       r = ascii->init();
110       if (r != ONIG_NORMAL) return r;
111       enc_inited_entry(ascii);
112     }
113   }
114 
115   if (enc->init != 0 &&
116       enc_is_inited(enc) == 0) {
117     r = (enc->init)();
118     if (r == ONIG_NORMAL)
119       enc_inited_entry(enc);
120     return r;
121   }
122 
123   return 0;
124 }
125 
126 extern OnigEncoding
onigenc_get_default_encoding(void)127 onigenc_get_default_encoding(void)
128 {
129   return OnigEncDefaultCharEncoding;
130 }
131 
132 extern int
onigenc_set_default_encoding(OnigEncoding enc)133 onigenc_set_default_encoding(OnigEncoding enc)
134 {
135   OnigEncDefaultCharEncoding = enc;
136   return 0;
137 }
138 
139 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)140 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
141 {
142   int slen, term_len, i;
143   UChar *r;
144 
145   slen = (int )(end - s);
146   term_len = ONIGENC_MBC_MINLEN(enc);
147 
148   r = (UChar* )xmalloc(slen + term_len);
149   CHECK_NULL_RETURN(r);
150   xmemcpy(r, s, slen);
151 
152   for (i = 0; i < term_len; i++)
153     r[slen + i] = (UChar )0;
154 
155   return r;
156 }
157 
158 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)159 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
160 {
161   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
162   if (p < s) {
163     p += enclen(enc, p);
164   }
165   return p;
166 }
167 
168 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)169 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
170 				   const UChar* start, const UChar* s, const UChar** prev)
171 {
172   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
173 
174   if (p < s) {
175     if (prev) *prev = (const UChar* )p;
176     p += enclen(enc, p);
177   }
178   else {
179     if (prev) *prev = (const UChar* )NULL; /* Sorry */
180   }
181   return p;
182 }
183 
184 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)185 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
186 {
187   if (s <= start)
188     return (UChar* )NULL;
189 
190   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
191 }
192 
193 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)194 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
195 {
196   while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
197     if (s <= start)
198       return (UChar* )NULL;
199 
200     s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
201   }
202   return (UChar* )s;
203 }
204 
/* Compiled out by "#if 0".  Returns the length reported by
   ONIGENC_MBC_ENC_LEN for the character at p, limited to the number of
   bytes between p and end. */
#if 0
extern int
onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end)
{
  int len;
  int n;

  len = ONIGENC_MBC_ENC_LEN(enc, p);
  n = (int )(end - p);

  return (n < len ? n : len);
}
#endif
218 
219 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)220 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
221 {
222   UChar* q = (UChar* )p;
223   while (n-- > 0) {
224     q += ONIGENC_MBC_ENC_LEN(enc, q);
225   }
226   return (q <= end ? q : NULL);
227 }
228 
229 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)230 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
231 {
232   int n = 0;
233   UChar* q = (UChar* )p;
234 
235   while (q < end) {
236     q += ONIGENC_MBC_ENC_LEN(enc, q);
237     n++;
238   }
239   return n;
240 }
241 
242 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)243 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
244 {
245   int n = 0;
246   UChar* p = (UChar* )s;
247 
248   while (1) {
249     if (*p == '\0') {
250       UChar* q;
251       int len = ONIGENC_MBC_MINLEN(enc);
252 
253       if (len == 1) return n;
254       q = p + 1;
255       while (len > 1) {
256         if (*q != '\0') break;
257         q++;
258         len--;
259       }
260       if (len == 1) return n;
261     }
262     p += ONIGENC_MBC_ENC_LEN(enc, p);
263     n++;
264   }
265 }
266 
267 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)268 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
269 {
270   UChar* start = (UChar* )s;
271   UChar* p = (UChar* )s;
272 
273   while (1) {
274     if (*p == '\0') {
275       UChar* q;
276       int len = ONIGENC_MBC_MINLEN(enc);
277 
278       if (len == 1) return (int )(p - start);
279       q = p + 1;
280       while (len > 1) {
281         if (*q != '\0') break;
282         q++;
283         len--;
284       }
285       if (len == 1) return (int )(p - start);
286     }
287     p += ONIGENC_MBC_ENC_LEN(enc, p);
288   }
289 }
290 
291 const UChar OnigEncAsciiToLowerCaseTable[] = {
292   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
293   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
294   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
295   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
296   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
297   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
298   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
299   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
300   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
301   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
302   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
303   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
304   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
305   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
306   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
307   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
308   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
309   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
310   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
311   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
312   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
313   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
314   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
315   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
316   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
317   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
318   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
319   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
320   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
321   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
322   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
323   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
324 };
325 
#ifdef USE_UPPER_CASE_TABLE
/*
 * Byte -> upper-case byte.  0x61-0x7a map to 0x41-0x5a; every other value
 * maps to itself.  One row per high nibble.
 */
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
  0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
  0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
  0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
#endif
362 
/*
 * Per-byte 16-bit flag words for byte values 0x00-0x7f, one row per high
 * nibble.  Entries 0x80-0xff carry no flags (0x0000); they are not spelled
 * out and are zero-filled by the initializer rules for the fixed size of 256.
 */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  /* 0x00 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  /* 0x10 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x20 */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x30 */
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x40 */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  /* 0x50 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  /* 0x60 */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  /* 0x70 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008
  /* 0x80 - 0xff: implicitly 0x0000 */
};
397 
398 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
399   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
400   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
401   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
402   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
403   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
404   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
405   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
406   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
407   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
408   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
409   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
410   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
411   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
412   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
413   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
414   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
415   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
416   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
417   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
418   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
419   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
420   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
421   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
422   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
423   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
424   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
425   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
426   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
427   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
428   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
429   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
430   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
431 };
432 
#ifdef USE_UPPER_CASE_TABLE
/*
 * Byte -> upper-case byte for ISO-8859-1.  0x61-0x7a map to 0x41-0x5a and
 * 0xe0-0xfe map to 0xc0-0xde, except 0xf7 which maps to itself.  Every
 * other value (including 0xff) maps to itself.  One row per high nibble.
 */
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
  0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
  0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
  0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
  0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff
};
#endif
469 
470 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)471 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
472 {
473   /* nothing */
474   /* obsoleted. */
475 }
476 
477 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)478 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
479 {
480   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
481 }
482 
483 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
484   { 0x41, 0x61 },
485   { 0x42, 0x62 },
486   { 0x43, 0x63 },
487   { 0x44, 0x64 },
488   { 0x45, 0x65 },
489   { 0x46, 0x66 },
490   { 0x47, 0x67 },
491   { 0x48, 0x68 },
492   { 0x49, 0x69 },
493   { 0x4a, 0x6a },
494   { 0x4b, 0x6b },
495   { 0x4c, 0x6c },
496   { 0x4d, 0x6d },
497   { 0x4e, 0x6e },
498   { 0x4f, 0x6f },
499   { 0x50, 0x70 },
500   { 0x51, 0x71 },
501   { 0x52, 0x72 },
502   { 0x53, 0x73 },
503   { 0x54, 0x74 },
504   { 0x55, 0x75 },
505   { 0x56, 0x76 },
506   { 0x57, 0x77 },
507   { 0x58, 0x78 },
508   { 0x59, 0x79 },
509   { 0x5a, 0x7a }
510 };
511 
512 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)513 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
514 				  OnigApplyAllCaseFoldFunc f, void* arg)
515 {
516   OnigCodePoint code;
517   int i, r;
518 
519   for (i = 0;
520        i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
521        i++) {
522     code = OnigAsciiLowerMap[i].to;
523     r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
524     if (r != 0) return r;
525 
526     code = OnigAsciiLowerMap[i].from;
527     r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
528     if (r != 0) return r;
529   }
530 
531   return 0;
532 }
533 
534 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])535 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
536 	 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
537 	 OnigCaseFoldCodeItem items[])
538 {
539   if (0x41 <= *p && *p <= 0x5a) {
540     items[0].byte_len = 1;
541     items[0].code_len = 1;
542     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
543     return 1;
544   }
545   else if (0x61 <= *p && *p <= 0x7a) {
546     items[0].byte_len = 1;
547     items[0].code_len = 1;
548     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
549     return 1;
550   }
551   else
552     return 0;
553 }
554 
555 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)556 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
557 		       OnigApplyAllCaseFoldFunc f, void* arg)
558 {
559   static OnigCodePoint ss[] = { 0x73, 0x73 };
560 
561   return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
562 }
563 
564 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)565 onigenc_apply_all_case_fold_with_map(int map_size,
566     const OnigPairCaseFoldCodes map[],
567     int ess_tsett_flag, OnigCaseFoldType flag,
568     OnigApplyAllCaseFoldFunc f, void* arg)
569 {
570   OnigCodePoint code;
571   int i, r;
572 
573   r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
574   if (r != 0) return r;
575 
576   for (i = 0; i < map_size; i++) {
577     code = map[i].to;
578     r = (*f)(map[i].from, &code, 1, arg);
579     if (r != 0) return r;
580 
581     code = map[i].from;
582     r = (*f)(map[i].to, &code, 1, arg);
583     if (r != 0) return r;
584   }
585 
586   if (ess_tsett_flag != 0)
587     return ss_apply_all_case_fold(flag, f, arg);
588 
589   return 0;
590 }
591 
592 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])593 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
594     const OnigPairCaseFoldCodes map[],
595     int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
596     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
597 {
598   if (0x41 <= *p && *p <= 0x5a) {
599     items[0].byte_len = 1;
600     items[0].code_len = 1;
601     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
602     if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
603 	&& (*(p+1) == 0x53 || *(p+1) == 0x73)) {
604       /* SS */
605       items[1].byte_len = 2;
606       items[1].code_len = 1;
607       items[1].code[0] = (OnigCodePoint )0xdf;
608       return 2;
609     }
610     else
611       return 1;
612   }
613   else if (0x61 <= *p && *p <= 0x7a) {
614     items[0].byte_len = 1;
615     items[0].code_len = 1;
616     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
617     if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
618 	&& (*(p+1) == 0x73 || *(p+1) == 0x53)) {
619       /* ss */
620       items[1].byte_len = 2;
621       items[1].code_len = 1;
622       items[1].code[0] = (OnigCodePoint )0xdf;
623       return 2;
624     }
625     else
626       return 1;
627   }
628   else if (*p == 0xdf && ess_tsett_flag != 0) {
629     items[0].byte_len = 1;
630     items[0].code_len = 2;
631     items[0].code[0] = (OnigCodePoint )'s';
632     items[0].code[1] = (OnigCodePoint )'s';
633 
634     items[1].byte_len = 1;
635     items[1].code_len = 2;
636     items[1].code[0] = (OnigCodePoint )'S';
637     items[1].code[1] = (OnigCodePoint )'S';
638 
639     items[2].byte_len = 1;
640     items[2].code_len = 2;
641     items[2].code[0] = (OnigCodePoint )'s';
642     items[2].code[1] = (OnigCodePoint )'S';
643 
644     items[3].byte_len = 1;
645     items[3].code_len = 2;
646     items[3].code[0] = (OnigCodePoint )'S';
647     items[3].code[1] = (OnigCodePoint )'s';
648 
649     return 4;
650   }
651   else {
652     int i;
653 
654     for (i = 0; i < map_size; i++) {
655       if (*p == map[i].from) {
656 	items[0].byte_len = 1;
657 	items[0].code_len = 1;
658 	items[0].code[0] = map[i].to;
659 	return 1;
660       }
661       else if (*p == map[i].to) {
662 	items[0].byte_len = 1;
663 	items[0].code_len = 1;
664 	items[0].code[0] = map[i].from;
665 	return 1;
666       }
667     }
668   }
669 
670   return 0;
671 }
672 
673 
674 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)675 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
676 	 OnigCodePoint* sb_out ARG_UNUSED,
677 	 const OnigCodePoint* ranges[] ARG_UNUSED)
678 {
679   return ONIG_NO_SUPPORT_CONFIG;
680 }
681 
682 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)683 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
684 {
685   if (p < end) {
686     if (*p == 0x0a) return 1;
687   }
688   return 0;
689 }
690 
691 /* for single byte encodings */
692 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)693 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
694 	    const UChar*end ARG_UNUSED, UChar* lower)
695 {
696   *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
697 
698   (*p)++;
699   return 1; /* return byte length of converted char to lower */
700 }
701 
/* Compiled out by "#if 0".  Advances *pp by one byte and returns
   ONIGENC_IS_ASCII_CODE_CASE_AMBIG for the byte that was skipped. */
#if 0
extern int
onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag,
			       const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  (*pp)++;
  return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
#endif
713 
714 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)715 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
716 {
717   return 1;
718 }
719 
720 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)721 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
722 {
723   return (OnigCodePoint )(*p);
724 }
725 
726 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)727 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
728 {
729   return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
730 }
731 
732 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)733 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
734 {
735   *buf = (UChar )(code & 0xff);
736   return 1;
737 }
738 
739 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)740 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
741 					  const UChar* s)
742 {
743   return (UChar* )s;
744 }
745 
746 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)747 onigenc_always_true_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
748 					     const UChar* end ARG_UNUSED)
749 {
750   return TRUE;
751 }
752 
753 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)754 onigenc_always_false_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
755 					      const UChar* end ARG_UNUSED)
756 {
757   return FALSE;
758 }
759 
760 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)761 onigenc_always_true_is_valid_mbc_string(const UChar* s   ARG_UNUSED,
762 					const UChar* end ARG_UNUSED)
763 {
764   return TRUE;
765 }
766 
767 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)768 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
769 					 const UChar* p, const UChar* end)
770 {
771   while (p < end) {
772     p += enclen(enc, p);
773   }
774 
775   if (p != end)
776     return FALSE;
777   else
778     return TRUE;
779 }
780 
781 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)782 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
783 {
784   return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
785 }
786 
787 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)788 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
789 {
790   int c, i, len;
791   OnigCodePoint n;
792 
793   len = enclen(enc, p);
794   n = (OnigCodePoint )(*p++);
795   if (len == 1) return n;
796 
797   for (i = 1; i < len; i++) {
798     if (p >= end) break;
799     c = *p++;
800     n <<= 8;  n += c;
801   }
802   return n;
803 }
804 
805 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)806 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
807                           const UChar** pp, const UChar* end ARG_UNUSED,
808 			  UChar* lower)
809 {
810   int len;
811   const UChar *p = *pp;
812 
813   if (ONIGENC_IS_MBC_ASCII(p)) {
814     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
815     (*pp)++;
816     return 1;
817   }
818   else {
819     int i;
820 
821     len = enclen(enc, p);
822     for (i = 0; i < len; i++) {
823       *lower++ = *p++;
824     }
825     (*pp) += len;
826     return len; /* return byte length of converted to lower char */
827   }
828 }
829 
/* Compiled out by "#if 0".  Advances *pp past one character; for a
   character satisfying ONIGENC_IS_MBC_ASCII it returns
   ONIGENC_IS_ASCII_CODE_CASE_AMBIG of that byte, otherwise FALSE. */
#if 0
extern int
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
                             const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  if (ONIGENC_IS_MBC_ASCII(p)) {
    (*pp)++;
    return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
  }

  (*pp) += enclen(enc, p);
  return FALSE;
}
#endif
846 
847 extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code)848 onigenc_mb2_code_to_mbclen(OnigCodePoint code)
849 {
850   if ((code & 0xff00) != 0) return 2;
851   else return 1;
852 }
853 
854 extern int
onigenc_mb4_code_to_mbclen(OnigCodePoint code)855 onigenc_mb4_code_to_mbclen(OnigCodePoint code)
856 {
857        if ((code & 0xff000000) != 0) return 4;
858   else if ((code & 0xff0000) != 0) return 3;
859   else if ((code & 0xff00) != 0) return 2;
860   else return 1;
861 }
862 
863 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)864 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
865 {
866   UChar *p = buf;
867 
868   if ((code & 0xff00) != 0) {
869     *p++ = (UChar )((code >>  8) & 0xff);
870   }
871   *p++ = (UChar )(code & 0xff);
872 
873 #if 1
874   if (enclen(enc, buf) != (p - buf))
875     return ONIGERR_INVALID_CODE_POINT_VALUE;
876 #endif
877   return (int )(p - buf);
878 }
879 
880 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)881 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
882 {
883   UChar *p = buf;
884 
885   if ((code & 0xff000000) != 0) {
886     *p++ = (UChar )((code >> 24) & 0xff);
887   }
888   if ((code & 0xff0000) != 0 || p != buf) {
889     *p++ = (UChar )((code >> 16) & 0xff);
890   }
891   if ((code & 0xff00) != 0 || p != buf) {
892     *p++ = (UChar )((code >> 8) & 0xff);
893   }
894   *p++ = (UChar )(code & 0xff);
895 
896 #if 1
897   if (enclen(enc, buf) != (p - buf))
898     return ONIGERR_INVALID_CODE_POINT_VALUE;
899 #endif
900   return (int )(p - buf);
901 }
902 
903 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)904 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
905 {
906   static PosixBracketEntryType PBS[] = {
907     { (UChar* )"Alnum",  ONIGENC_CTYPE_ALNUM,  5 },
908     { (UChar* )"Alpha",  ONIGENC_CTYPE_ALPHA,  5 },
909     { (UChar* )"Blank",  ONIGENC_CTYPE_BLANK,  5 },
910     { (UChar* )"Cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
911     { (UChar* )"Digit",  ONIGENC_CTYPE_DIGIT,  5 },
912     { (UChar* )"Graph",  ONIGENC_CTYPE_GRAPH,  5 },
913     { (UChar* )"Lower",  ONIGENC_CTYPE_LOWER,  5 },
914     { (UChar* )"Print",  ONIGENC_CTYPE_PRINT,  5 },
915     { (UChar* )"Punct",  ONIGENC_CTYPE_PUNCT,  5 },
916     { (UChar* )"Space",  ONIGENC_CTYPE_SPACE,  5 },
917     { (UChar* )"Upper",  ONIGENC_CTYPE_UPPER,  5 },
918     { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
919     { (UChar* )"ASCII",  ONIGENC_CTYPE_ASCII,  5 },
920     { (UChar* )"Word",   ONIGENC_CTYPE_WORD,   4 },
921     { (UChar* )NULL, -1, 0 }
922   };
923 
924   PosixBracketEntryType *pb;
925   int len;
926 
927   len = onigenc_strlen(enc, p, end);
928   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
929     if (len == pb->len &&
930         onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
931       return pb->ctype;
932   }
933 
934   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
935 }
936 
937 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)938 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
939 {
940   OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
941 
942   if (code > 127) return 0;
943 
944   return ONIGENC_IS_ASCII_CODE_WORD(code);
945 }
946 
947 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)948 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
949 			  unsigned int ctype)
950 {
951   if (code < 128)
952     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
953   else {
954     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
955       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
956     }
957   }
958 
959   return FALSE;
960 }
961 
962 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)963 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
964 			  unsigned int ctype)
965 {
966   if (code < 128)
967     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
968   else {
969     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
970       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
971     }
972   }
973 
974   return FALSE;
975 }
976 
977 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)978 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
979                            const UChar* sascii /* ascii */, int n)
980 {
981   int x, c;
982 
983   while (n-- > 0) {
984     if (p >= end) return (int )(*sascii);
985 
986     c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
987     x = *sascii - c;
988     if (x) return x;
989 
990     sascii++;
991     p += enclen(enc, p);
992   }
993   return 0;
994 }
995 
996 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)997 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
998 {
999   int i;
1000 
1001   for (i = 0; i < n; i++) {
1002     if (a[i] != b[i])
1003       return -1;
1004   }
1005 
1006   return 0;
1007 }
1008 
1009 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)1010 onig_codes_byte_at(OnigCodePoint codes[], int at)
1011 {
1012   int index;
1013   int b;
1014   OnigCodePoint code;
1015 
1016   index = at / 3;
1017   b     = at % 3;
1018   code = codes[index];
1019 
1020   return ((code >> ((2 - b) * 8)) & 0xff);
1021 }
1022