1 /**********************************************************************
2   regenc.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/* Default encoding; read by onigenc_get_default_encoding() and
   replaced by onigenc_set_default_encoding(). */
OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;

/* Number of elements in InitedList. */
#define INITED_LIST_SIZE  20

/* Count of InitedList entries currently in use. */
static int InitedListNum;

/* Encodings recorded by enc_inited_entry(); `inited` is set to 1 when an
   entry is recorded and cleared by onigenc_end(). */
static struct {
  OnigEncoding enc;
  int          inited;
} InitedList[INITED_LIST_SIZE];
42 
43 static int
enc_inited_entry(OnigEncoding enc)44 enc_inited_entry(OnigEncoding enc)
45 {
46   int i;
47 
48   for (i = 0; i < InitedListNum; i++) {
49     if (InitedList[i].enc == enc) {
50       InitedList[i].inited = 1;
51       return i;
52     }
53   }
54 
55   i = InitedListNum;
56   if (i < INITED_LIST_SIZE - 1) {
57     InitedList[i].enc    = enc;
58     InitedList[i].inited = 1;
59     InitedListNum++;
60     return i;
61   }
62 
63   return -1;
64 }
65 
66 static int
enc_is_inited(OnigEncoding enc)67 enc_is_inited(OnigEncoding enc)
68 {
69   int i;
70 
71   for (i = 0; i < InitedListNum; i++) {
72     if (InitedList[i].enc == enc) {
73       return InitedList[i].inited;
74     }
75   }
76 
77   return 0;
78 }
79 
/* Set to 1 by onigenc_init(), reset to 0 by onigenc_end(). */
static int OnigEncInited;
81 
82 extern int
onigenc_init(void)83 onigenc_init(void)
84 {
85   if (OnigEncInited != 0) return 0;
86 
87   OnigEncInited = 1;
88   return 0;
89 }
90 
91 extern int
onigenc_end(void)92 onigenc_end(void)
93 {
94   int i;
95 
96   for (i = 0; i < InitedListNum; i++) {
97     InitedList[i].enc    = 0;
98     InitedList[i].inited = 0;
99   }
100   InitedListNum = 0;
101 
102   OnigEncInited = 0;
103   return ONIG_NORMAL;
104 }
105 
106 extern int
onig_initialize_encoding(OnigEncoding enc)107 onig_initialize_encoding(OnigEncoding enc)
108 {
109   int r;
110 
111   if (enc != ONIG_ENCODING_ASCII &&
112       ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
113     OnigEncoding ascii = ONIG_ENCODING_ASCII;
114     if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
115       r = ascii->init();
116       if (r != ONIG_NORMAL) return r;
117       enc_inited_entry(ascii);
118     }
119   }
120 
121   if (enc->init != 0 &&
122       enc_is_inited(enc) == 0) {
123     r = (enc->init)();
124     if (r == ONIG_NORMAL)
125       enc_inited_entry(enc);
126     return r;
127   }
128 
129   return 0;
130 }
131 
132 extern OnigEncoding
onigenc_get_default_encoding(void)133 onigenc_get_default_encoding(void)
134 {
135   return OnigEncDefaultCharEncoding;
136 }
137 
138 extern int
onigenc_set_default_encoding(OnigEncoding enc)139 onigenc_set_default_encoding(OnigEncoding enc)
140 {
141   OnigEncDefaultCharEncoding = enc;
142   return 0;
143 }
144 
145 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)146 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
147 {
148   int slen, term_len, i;
149   UChar *r;
150 
151   slen = (int )(end - s);
152   term_len = ONIGENC_MBC_MINLEN(enc);
153 
154   r = (UChar* )xmalloc(slen + term_len);
155   CHECK_NULL_RETURN(r);
156   xmemcpy(r, s, slen);
157 
158   for (i = 0; i < term_len; i++)
159     r[slen + i] = (UChar )0;
160 
161   return r;
162 }
163 
164 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)165 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
166 {
167   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
168   if (p < s) {
169     p += enclen(enc, p);
170   }
171   return p;
172 }
173 
174 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)175 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
176             const UChar* start, const UChar* s, const UChar** prev)
177 {
178   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
179 
180   if (p < s) {
181     if (prev) *prev = (const UChar* )p;
182     p += enclen(enc, p);
183   }
184   else {
185     if (prev) *prev = (const UChar* )NULL; /* Sorry */
186   }
187   return p;
188 }
189 
190 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)191 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
192 {
193   if (s <= start)
194     return (UChar* )NULL;
195 
196   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
197 }
198 
199 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)200 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
201 {
202   while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
203     if (s <= start)
204       return (UChar* )NULL;
205 
206     s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
207   }
208   return (UChar* )s;
209 }
210 
/* Compiled out.  Returns the encoded length of the character at p, capped at
   the number of bytes between p and end. */
#if 0
extern int
onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end)
{
  int len;
  int n;

  len = ONIGENC_MBC_ENC_LEN(enc, p);
  n = (int )(end - p);

  return (n < len ? n : len);
}
#endif
224 
225 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)226 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
227 {
228   UChar* q = (UChar* )p;
229   while (n-- > 0) {
230     q += ONIGENC_MBC_ENC_LEN(enc, q);
231   }
232   return (q <= end ? q : NULL);
233 }
234 
235 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)236 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
237 {
238   int n = 0;
239   UChar* q = (UChar* )p;
240 
241   while (q < end) {
242     q += ONIGENC_MBC_ENC_LEN(enc, q);
243     n++;
244   }
245   return n;
246 }
247 
248 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)249 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
250 {
251   int n = 0;
252   UChar* p = (UChar* )s;
253 
254   while (1) {
255     if (*p == '\0') {
256       UChar* q;
257       int len = ONIGENC_MBC_MINLEN(enc);
258 
259       if (len == 1) return n;
260       q = p + 1;
261       while (len > 1) {
262         if (*q != '\0') break;
263         q++;
264         len--;
265       }
266       if (len == 1) return n;
267     }
268     p += ONIGENC_MBC_ENC_LEN(enc, p);
269     n++;
270   }
271 }
272 
273 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)274 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
275 {
276   UChar* start = (UChar* )s;
277   UChar* p = (UChar* )s;
278 
279   while (1) {
280     if (*p == '\0') {
281       UChar* q;
282       int len = ONIGENC_MBC_MINLEN(enc);
283 
284       if (len == 1) return (int )(p - start);
285       q = p + 1;
286       while (len > 1) {
287         if (*q != '\0') break;
288         q++;
289         len--;
290       }
291       if (len == 1) return (int )(p - start);
292     }
293     p += ONIGENC_MBC_ENC_LEN(enc, p);
294   }
295 }
296 
/* Byte-indexed lower-case table: entries '\101'-'\132' ('A'-'Z') hold
   '\141'-'\172' ('a'-'z'); every other entry holds its own index. */
const UChar OnigEncAsciiToLowerCaseTable[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
331 
/* Byte-indexed upper-case table, built only when USE_UPPER_CASE_TABLE is
   defined: entries '\141'-'\172' ('a'-'z') hold '\101'-'\132' ('A'-'Z');
   every other entry holds its own index. */
#ifdef USE_UPPER_CASE_TABLE
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
368 
/* 256 flag words indexed by byte value; entries 0x80-0xff are all zero.
   NOTE(review): the individual bits presumably correspond to the ctype
   classes defined in the project headers -- confirm there. */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
403 
/* Byte-indexed lower-case table for ISO-8859-1: 'A'-'Z' hold 'a'-'z', and
   '\300'-'\336' hold '\340'-'\376' except '\327', which holds itself.
   '\337' and every remaining entry hold their own index. */
const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
438 
/* Byte-indexed upper-case table for ISO-8859-1, built only when
   USE_UPPER_CASE_TABLE is defined: 'a'-'z' hold 'A'-'Z', and '\340'-'\376'
   hold '\300'-'\336' except '\367', which holds itself.  '\377' and every
   remaining entry hold their own index. */
#ifdef USE_UPPER_CASE_TABLE
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
475 
476 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)477 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
478 {
479   /* nothing */
480   /* obsoleted. */
481 }
482 
483 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)484 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
485 {
486   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
487 }
488 
/* Case pairs for the 26 ASCII letters: first member is the upper-case code
   (0x41-0x5a), second member the matching lower-case code (0x61-0x7a). */
const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
  { 0x41, 0x61 },
  { 0x42, 0x62 },
  { 0x43, 0x63 },
  { 0x44, 0x64 },
  { 0x45, 0x65 },
  { 0x46, 0x66 },
  { 0x47, 0x67 },
  { 0x48, 0x68 },
  { 0x49, 0x69 },
  { 0x4a, 0x6a },
  { 0x4b, 0x6b },
  { 0x4c, 0x6c },
  { 0x4d, 0x6d },
  { 0x4e, 0x6e },
  { 0x4f, 0x6f },
  { 0x50, 0x70 },
  { 0x51, 0x71 },
  { 0x52, 0x72 },
  { 0x53, 0x73 },
  { 0x54, 0x74 },
  { 0x55, 0x75 },
  { 0x56, 0x76 },
  { 0x57, 0x77 },
  { 0x58, 0x78 },
  { 0x59, 0x79 },
  { 0x5a, 0x7a }
};
517 
518 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)519 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
520                                   OnigApplyAllCaseFoldFunc f, void* arg)
521 {
522   OnigCodePoint code;
523   int i, r;
524 
525   for (i = 0;
526        i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
527        i++) {
528     code = OnigAsciiLowerMap[i].to;
529     r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
530     if (r != 0) return r;
531 
532     code = OnigAsciiLowerMap[i].from;
533     r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
534     if (r != 0) return r;
535   }
536 
537   return 0;
538 }
539 
540 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])541 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
542     const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
543     OnigCaseFoldCodeItem items[])
544 {
545   if (0x41 <= *p && *p <= 0x5a) {
546     items[0].byte_len = 1;
547     items[0].code_len = 1;
548     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
549     return 1;
550   }
551   else if (0x61 <= *p && *p <= 0x7a) {
552     items[0].byte_len = 1;
553     items[0].code_len = 1;
554     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
555     return 1;
556   }
557   else
558     return 0;
559 }
560 
561 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)562 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
563                        OnigApplyAllCaseFoldFunc f, void* arg)
564 {
565   static OnigCodePoint ss[] = { 0x73, 0x73 };
566 
567   return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
568 }
569 
570 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)571 onigenc_apply_all_case_fold_with_map(int map_size,
572     const OnigPairCaseFoldCodes map[],
573     int ess_tsett_flag, OnigCaseFoldType flag,
574     OnigApplyAllCaseFoldFunc f, void* arg)
575 {
576   OnigCodePoint code;
577   int i, r;
578 
579   r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
580   if (r != 0) return r;
581 
582   for (i = 0; i < map_size; i++) {
583     code = map[i].to;
584     r = (*f)(map[i].from, &code, 1, arg);
585     if (r != 0) return r;
586 
587     code = map[i].from;
588     r = (*f)(map[i].to, &code, 1, arg);
589     if (r != 0) return r;
590   }
591 
592   if (ess_tsett_flag != 0)
593     return ss_apply_all_case_fold(flag, f, arg);
594 
595   return 0;
596 }
597 
598 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])599 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
600     const OnigPairCaseFoldCodes map[],
601     int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
602     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
603 {
604   if (0x41 <= *p && *p <= 0x5a) {
605     items[0].byte_len = 1;
606     items[0].code_len = 1;
607     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
608     if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
609         && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
610       /* SS */
611       items[1].byte_len = 2;
612       items[1].code_len = 1;
613       items[1].code[0] = (OnigCodePoint )0xdf;
614       return 2;
615     }
616     else
617       return 1;
618   }
619   else if (0x61 <= *p && *p <= 0x7a) {
620     items[0].byte_len = 1;
621     items[0].code_len = 1;
622     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
623     if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
624         && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
625       /* ss */
626       items[1].byte_len = 2;
627       items[1].code_len = 1;
628       items[1].code[0] = (OnigCodePoint )0xdf;
629       return 2;
630     }
631     else
632       return 1;
633   }
634   else if (*p == 0xdf && ess_tsett_flag != 0) {
635     items[0].byte_len = 1;
636     items[0].code_len = 2;
637     items[0].code[0] = (OnigCodePoint )'s';
638     items[0].code[1] = (OnigCodePoint )'s';
639 
640     items[1].byte_len = 1;
641     items[1].code_len = 2;
642     items[1].code[0] = (OnigCodePoint )'S';
643     items[1].code[1] = (OnigCodePoint )'S';
644 
645     items[2].byte_len = 1;
646     items[2].code_len = 2;
647     items[2].code[0] = (OnigCodePoint )'s';
648     items[2].code[1] = (OnigCodePoint )'S';
649 
650     items[3].byte_len = 1;
651     items[3].code_len = 2;
652     items[3].code[0] = (OnigCodePoint )'S';
653     items[3].code[1] = (OnigCodePoint )'s';
654 
655     return 4;
656   }
657   else {
658     int i;
659 
660     for (i = 0; i < map_size; i++) {
661       if (*p == map[i].from) {
662         items[0].byte_len = 1;
663         items[0].code_len = 1;
664         items[0].code[0] = map[i].to;
665         return 1;
666       }
667       else if (*p == map[i].to) {
668         items[0].byte_len = 1;
669         items[0].code_len = 1;
670         items[0].code[0] = map[i].from;
671         return 1;
672       }
673     }
674   }
675 
676   return 0;
677 }
678 
679 
680 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)681 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
682                                          OnigCodePoint* sb_out ARG_UNUSED,
683                                          const OnigCodePoint* ranges[] ARG_UNUSED)
684 {
685   return ONIG_NO_SUPPORT_CONFIG;
686 }
687 
688 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)689 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
690 {
691   if (p < end) {
692     if (*p == 0x0a) return 1;
693   }
694   return 0;
695 }
696 
697 /* for single byte encodings */
698 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)699 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
700                             const UChar*end ARG_UNUSED, UChar* lower)
701 {
702   *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
703 
704   (*p)++;
705   return 1; /* return byte length of converted char to lower */
706 }
707 
/* Compiled out.  Advances *pp by one byte and evaluates
   ONIGENC_IS_ASCII_CODE_CASE_AMBIG on the byte that was skipped. */
#if 0
extern int
onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag,
                               const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  (*pp)++;
  return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
#endif
719 
720 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)721 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
722 {
723   return 1;
724 }
725 
726 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)727 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
728 {
729   return (OnigCodePoint )(*p);
730 }
731 
732 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)733 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
734 {
735   return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
736 }
737 
738 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)739 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
740 {
741   *buf = (UChar )(code & 0xff);
742   return 1;
743 }
744 
745 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)746 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
747                                           const UChar* s)
748 {
749   return (UChar* )s;
750 }
751 
752 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)753 onigenc_always_true_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
754                                              const UChar* end ARG_UNUSED)
755 {
756   return TRUE;
757 }
758 
759 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)760 onigenc_always_false_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
761                                               const UChar* end ARG_UNUSED)
762 {
763   return FALSE;
764 }
765 
766 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)767 onigenc_always_true_is_valid_mbc_string(const UChar* s   ARG_UNUSED,
768                                         const UChar* end ARG_UNUSED)
769 {
770   return TRUE;
771 }
772 
773 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)774 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
775                                          const UChar* p, const UChar* end)
776 {
777   while (p < end) {
778     p += enclen(enc, p);
779   }
780 
781   if (p != end)
782     return FALSE;
783   else
784     return TRUE;
785 }
786 
787 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)788 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
789 {
790   return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
791 }
792 
793 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)794 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
795 {
796   int c, i, len;
797   OnigCodePoint n;
798 
799   len = enclen(enc, p);
800   n = (OnigCodePoint )(*p++);
801   if (len == 1) return n;
802 
803   for (i = 1; i < len; i++) {
804     if (p >= end) break;
805     c = *p++;
806     n <<= 8;  n += c;
807   }
808   return n;
809 }
810 
811 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)812 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
813                           const UChar** pp, const UChar* end ARG_UNUSED,
814                           UChar* lower)
815 {
816   int len;
817   const UChar *p = *pp;
818 
819   if (ONIGENC_IS_MBC_ASCII(p)) {
820     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
821     (*pp)++;
822     return 1;
823   }
824   else {
825     int i;
826 
827     len = enclen(enc, p);
828     for (i = 0; i < len; i++) {
829       *lower++ = *p++;
830     }
831     (*pp) += len;
832     return len; /* return byte length of converted to lower char */
833   }
834 }
835 
#if 0
/*
 * Compiled out by the enclosing #if 0.
 *
 * Advances *pp past the character it points at.  For a character for
 * which ONIGENC_IS_MBC_ASCII() holds, returns
 * ONIGENC_IS_ASCII_CODE_CASE_AMBIG() of that byte; otherwise returns FALSE.
 */
extern int
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
                             const UChar** pp, const UChar* end)
{
  const UChar* cur = *pp;

  if (! (ONIGENC_IS_MBC_ASCII(cur))) {
    (*pp) += enclen(enc, cur);
    return FALSE;
  }

  (*pp)++;
  return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*cur);
}
#endif
852 
853 extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code)854 onigenc_mb2_code_to_mbclen(OnigCodePoint code)
855 {
856   if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
857 
858   if ((code & 0xff00) != 0) return 2;
859   else return 1;
860 }
861 
862 extern int
onigenc_mb4_code_to_mbclen(OnigCodePoint code)863 onigenc_mb4_code_to_mbclen(OnigCodePoint code)
864 {
865        if ((code & 0xff000000) != 0) return 4;
866   else if ((code & 0xff0000) != 0) return 3;
867   else if ((code & 0xff00) != 0) return 2;
868   else return 1;
869 }
870 
871 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)872 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
873 {
874   UChar *p = buf;
875 
876   if ((code & 0xff00) != 0) {
877     *p++ = (UChar )((code >>  8) & 0xff);
878   }
879   *p++ = (UChar )(code & 0xff);
880 
881 #if 1
882   if (enclen(enc, buf) != (p - buf))
883     return ONIGERR_INVALID_CODE_POINT_VALUE;
884 #endif
885   return (int )(p - buf);
886 }
887 
888 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)889 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
890 {
891   UChar *p = buf;
892 
893   if ((code & 0xff000000) != 0) {
894     *p++ = (UChar )((code >> 24) & 0xff);
895   }
896   if ((code & 0xff0000) != 0 || p != buf) {
897     *p++ = (UChar )((code >> 16) & 0xff);
898   }
899   if ((code & 0xff00) != 0 || p != buf) {
900     *p++ = (UChar )((code >> 8) & 0xff);
901   }
902   *p++ = (UChar )(code & 0xff);
903 
904 #if 1
905   if (enclen(enc, buf) != (p - buf))
906     return ONIGERR_INVALID_CODE_POINT_VALUE;
907 #endif
908   return (int )(p - buf);
909 }
910 
911 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)912 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
913 {
914   static PosixBracketEntryType PBS[] = {
915     { (UChar* )"Alnum",  ONIGENC_CTYPE_ALNUM,  5 },
916     { (UChar* )"Alpha",  ONIGENC_CTYPE_ALPHA,  5 },
917     { (UChar* )"Blank",  ONIGENC_CTYPE_BLANK,  5 },
918     { (UChar* )"Cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
919     { (UChar* )"Digit",  ONIGENC_CTYPE_DIGIT,  5 },
920     { (UChar* )"Graph",  ONIGENC_CTYPE_GRAPH,  5 },
921     { (UChar* )"Lower",  ONIGENC_CTYPE_LOWER,  5 },
922     { (UChar* )"Print",  ONIGENC_CTYPE_PRINT,  5 },
923     { (UChar* )"Punct",  ONIGENC_CTYPE_PUNCT,  5 },
924     { (UChar* )"Space",  ONIGENC_CTYPE_SPACE,  5 },
925     { (UChar* )"Upper",  ONIGENC_CTYPE_UPPER,  5 },
926     { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
927     { (UChar* )"ASCII",  ONIGENC_CTYPE_ASCII,  5 },
928     { (UChar* )"Word",   ONIGENC_CTYPE_WORD,   4 },
929     { (UChar* )NULL, -1, 0 }
930   };
931 
932   PosixBracketEntryType *pb;
933   int len;
934 
935   len = onigenc_strlen(enc, p, end);
936   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
937     if (len == pb->len &&
938         onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
939       return pb->ctype;
940   }
941 
942   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
943 }
944 
945 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)946 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
947 {
948   OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
949 
950   if (code > 127) return 0;
951 
952   return ONIGENC_IS_ASCII_CODE_WORD(code);
953 }
954 
955 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)956 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
957                           unsigned int ctype)
958 {
959   if (code < 128)
960     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
961   else {
962     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
963       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
964     }
965   }
966 
967   return FALSE;
968 }
969 
970 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)971 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
972                           unsigned int ctype)
973 {
974   if (code < 128)
975     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
976   else {
977     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
978       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
979     }
980   }
981 
982   return FALSE;
983 }
984 
985 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)986 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
987                            const UChar* sascii /* ascii */, int n)
988 {
989   int x, c;
990 
991   while (n-- > 0) {
992     if (p >= end) return (int )(*sascii);
993 
994     c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
995     x = *sascii - c;
996     if (x) return x;
997 
998     sascii++;
999     p += enclen(enc, p);
1000   }
1001   return 0;
1002 }
1003 
1004 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)1005 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
1006 {
1007   int i;
1008 
1009   for (i = 0; i < n; i++) {
1010     if (a[i] != b[i])
1011       return -1;
1012   }
1013 
1014   return 0;
1015 }
1016 
1017 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)1018 onig_codes_byte_at(OnigCodePoint codes[], int at)
1019 {
1020   int index;
1021   int b;
1022   OnigCodePoint code;
1023 
1024   index = at / 3;
1025   b     = at % 3;
1026   code = codes[index];
1027 
1028   return ((code >> ((2 - b) * 8)) & 0xff);
1029 }
1030