1 /**********************************************************************
2   regenc.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2020  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/* ASCII code points of 'S' and 's'; used by the "ss" <-> 0xdf case-fold
   handling further down in this file. */
#define LARGE_S   0x53
#define SMALL_S   0x73

/* Default character encoding; read and replaced through
   onigenc_get_default_encoding() / onigenc_set_default_encoding(). */
OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;

/* Capacity of the table that records initialized encodings. */
#define INITED_LIST_SIZE  20

/* Number of leading InitedList slots currently in use. */
static int InitedListNum;

/* Registry of encodings that have been marked as initialized. */
static struct {
  OnigEncoding enc;     /* encoding described by this slot */
  int          inited;  /* set to 1 when the encoding is registered */
} InitedList[INITED_LIST_SIZE];
45 
46 static int
enc_inited_entry(OnigEncoding enc)47 enc_inited_entry(OnigEncoding enc)
48 {
49   int i;
50 
51   for (i = 0; i < InitedListNum; i++) {
52     if (InitedList[i].enc == enc) {
53       InitedList[i].inited = 1;
54       return i;
55     }
56   }
57 
58   i = InitedListNum;
59   if (i < INITED_LIST_SIZE - 1) {
60     InitedList[i].enc    = enc;
61     InitedList[i].inited = 1;
62     InitedListNum++;
63     return i;
64   }
65 
66   return -1;
67 }
68 
69 static int
enc_is_inited(OnigEncoding enc)70 enc_is_inited(OnigEncoding enc)
71 {
72   int i;
73 
74   for (i = 0; i < InitedListNum; i++) {
75     if (InitedList[i].enc == enc) {
76       return InitedList[i].inited;
77     }
78   }
79 
80   return 0;
81 }
82 
/* Non-zero once onigenc_init() has run. */
static int OnigEncInited;

/*
 * One-time initialization of the encoding layer.
 * Only records that initialization happened; repeated calls are harmless.
 * Always returns 0.
 */
extern int
onigenc_init(void)
{
  if (OnigEncInited == 0)
    OnigEncInited = 1;

  return 0;
}
93 
94 extern int
onigenc_end(void)95 onigenc_end(void)
96 {
97   int i;
98 
99   for (i = 0; i < InitedListNum; i++) {
100     InitedList[i].enc    = 0;
101     InitedList[i].inited = 0;
102   }
103   InitedListNum = 0;
104 
105   OnigEncInited = 0;
106   return ONIG_NORMAL;
107 }
108 
109 extern int
onig_initialize_encoding(OnigEncoding enc)110 onig_initialize_encoding(OnigEncoding enc)
111 {
112   int r;
113 
114   if (enc != ONIG_ENCODING_ASCII &&
115       ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
116     OnigEncoding ascii = ONIG_ENCODING_ASCII;
117     if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
118       r = ascii->init();
119       if (r != ONIG_NORMAL) return r;
120       enc_inited_entry(ascii);
121     }
122   }
123 
124   if (enc->init != 0 &&
125       enc_is_inited(enc) == 0) {
126     r = (enc->init)();
127     if (r == ONIG_NORMAL)
128       enc_inited_entry(enc);
129     return r;
130   }
131 
132   return 0;
133 }
134 
135 extern OnigEncoding
onigenc_get_default_encoding(void)136 onigenc_get_default_encoding(void)
137 {
138   return OnigEncDefaultCharEncoding;
139 }
140 
141 extern int
onigenc_set_default_encoding(OnigEncoding enc)142 onigenc_set_default_encoding(OnigEncoding enc)
143 {
144   OnigEncDefaultCharEncoding = enc;
145   return 0;
146 }
147 
148 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)149 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
150 {
151   int slen, term_len, i;
152   UChar *r;
153 
154   slen = (int )(end - s);
155   term_len = ONIGENC_MBC_MINLEN(enc);
156 
157   r = (UChar* )xmalloc(slen + term_len);
158   CHECK_NULL_RETURN(r);
159   xmemcpy(r, s, slen);
160 
161   for (i = 0; i < term_len; i++)
162     r[slen + i] = (UChar )0;
163 
164   return r;
165 }
166 
167 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)168 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
169 {
170   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
171   if (p < s) {
172     p += enclen(enc, p);
173   }
174   return p;
175 }
176 
177 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)178 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
179             const UChar* start, const UChar* s, const UChar** prev)
180 {
181   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
182 
183   if (p < s) {
184     if (prev) *prev = (const UChar* )p;
185     p += enclen(enc, p);
186   }
187   else {
188     if (prev)
189       *prev = onigenc_get_prev_char_head(enc, start, p);
190   }
191   return p;
192 }
193 
194 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)195 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
196 {
197   if (s <= start)
198     return (UChar* )NULL;
199 
200   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
201 }
202 
203 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)204 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
205 {
206   while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
207     if (s <= start)
208       return (UChar* )NULL;
209 
210     s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
211   }
212   return (UChar* )s;
213 }
214 
215 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)216 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
217 {
218   UChar* q = (UChar* )p;
219   while (n-- > 0) {
220     q += ONIGENC_MBC_ENC_LEN(enc, q);
221   }
222   return (q <= end ? q : NULL);
223 }
224 
225 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)226 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
227 {
228   int n = 0;
229   UChar* q = (UChar* )p;
230 
231   while (q < end) {
232     q += ONIGENC_MBC_ENC_LEN(enc, q);
233     n++;
234   }
235   return n;
236 }
237 
238 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)239 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
240 {
241   int n = 0;
242   UChar* p = (UChar* )s;
243 
244   while (1) {
245     if (*p == '\0') {
246       UChar* q;
247       int len = ONIGENC_MBC_MINLEN(enc);
248 
249       if (len == 1) return n;
250       q = p + 1;
251       while (len > 1) {
252         if (*q != '\0') break;
253         q++;
254         len--;
255       }
256       if (len == 1) return n;
257     }
258     p += ONIGENC_MBC_ENC_LEN(enc, p);
259     n++;
260   }
261 }
262 
263 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)264 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
265 {
266   const UChar* start = s;
267   const UChar* p = s;
268 
269   while (1) {
270     if (*p == '\0') {
271       const UChar* q;
272       int len = ONIGENC_MBC_MINLEN(enc);
273 
274       if (len == 1) return (int )(p - start);
275       q = p + 1;
276       while (len > 1) {
277         if (*q != '\0') break;
278         q++;
279         len--;
280       }
281       if (len == 1) return (int )(p - start);
282     }
283     p += ONIGENC_MBC_ENC_LEN(enc, p);
284   }
285 }
286 
/*
 * ASCII lower-case table with 256 entries, one per byte value.
 * Entries 0101-0132 ('A'-'Z') hold 0141-0172 ('a'-'z'); every other entry
 * holds its own index.
 */
const UChar OnigEncAsciiToLowerCaseTable[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
321 
#ifdef USE_UPPER_CASE_TABLE
/*
 * ASCII upper-case table with 256 entries, one per byte value; compiled
 * only when USE_UPPER_CASE_TABLE is defined.
 * Entries 0141-0172 ('a'-'z') hold 0101-0132 ('A'-'Z'); every other entry
 * holds its own index.
 */
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
358 
/*
 * Character-type bit masks, 256 entries (presumably indexed by byte
 * value -- confirm against the ctype test macros in the headers).
 * The first 128 entries carry masks; the upper 128 entries are all zero.
 * The meaning of the individual bits is defined outside this file.
 */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
393 
/*
 * ISO-8859-1 lower-case table with 256 entries, one per byte value.
 * Entries 0101-0132 hold 0141-0172, and entries 0300-0336 hold 0340-0376,
 * except entry 0327, which holds itself.  Entry 0337 and all remaining
 * entries hold their own index.
 */
const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
428 
#ifdef USE_UPPER_CASE_TABLE
/*
 * ISO-8859-1 upper-case table with 256 entries, one per byte value;
 * compiled only when USE_UPPER_CASE_TABLE is defined.
 * Entries 0141-0172 hold 0101-0132, and entries 0340-0376 hold 0300-0336,
 * except entry 0367, which holds itself.  Entry 0377 and all remaining
 * entries hold their own index.
 */
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
465 
466 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)467 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
468 {
469   /* nothing */
470   /* obsoleted. */
471 }
472 
473 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)474 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
475 {
476   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
477 }
478 
/*
 * ASCII case pairs { upper, lower } for 'A'-'Z' (0x41-0x5a) and
 * 'a'-'z' (0x61-0x7a); iterated by onigenc_ascii_apply_all_case_fold().
 */
const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
  { 0x41, 0x61 },
  { 0x42, 0x62 },
  { 0x43, 0x63 },
  { 0x44, 0x64 },
  { 0x45, 0x65 },
  { 0x46, 0x66 },
  { 0x47, 0x67 },
  { 0x48, 0x68 },
  { 0x49, 0x69 },
  { 0x4a, 0x6a },
  { 0x4b, 0x6b },
  { 0x4c, 0x6c },
  { 0x4d, 0x6d },
  { 0x4e, 0x6e },
  { 0x4f, 0x6f },
  { 0x50, 0x70 },
  { 0x51, 0x71 },
  { 0x52, 0x72 },
  { 0x53, 0x73 },
  { 0x54, 0x74 },
  { 0x55, 0x75 },
  { 0x56, 0x76 },
  { 0x57, 0x77 },
  { 0x58, 0x78 },
  { 0x59, 0x79 },
  { 0x5a, 0x7a }
};
507 
508 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)509 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
510                                   OnigApplyAllCaseFoldFunc f, void* arg)
511 {
512   OnigCodePoint code;
513   int i, r;
514 
515   for (i = 0;
516        i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
517        i++) {
518     code = OnigAsciiLowerMap[i].to;
519     r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
520     if (r != 0) return r;
521 
522     code = OnigAsciiLowerMap[i].from;
523     r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
524     if (r != 0) return r;
525   }
526 
527   return 0;
528 }
529 
530 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])531 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
532     const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
533     OnigCaseFoldCodeItem items[])
534 {
535   if (0x41 <= *p && *p <= 0x5a) {
536     items[0].byte_len = 1;
537     items[0].code_len = 1;
538     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
539     return 1;
540   }
541   else if (0x61 <= *p && *p <= 0x7a) {
542     items[0].byte_len = 1;
543     items[0].code_len = 1;
544     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
545     return 1;
546   }
547   else
548     return 0;
549 }
550 
551 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)552 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
553                        OnigApplyAllCaseFoldFunc f, void* arg)
554 {
555   static OnigCodePoint ss[] = { SMALL_S, SMALL_S };
556 
557   return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
558 }
559 
560 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)561 onigenc_apply_all_case_fold_with_map(int map_size,
562     const OnigPairCaseFoldCodes map[],
563     int ess_tsett_flag, OnigCaseFoldType flag,
564     OnigApplyAllCaseFoldFunc f, void* arg)
565 {
566   OnigCodePoint code;
567   int i, r;
568 
569   r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
570   if (r != 0) return r;
571 
572   if (CASE_FOLD_IS_ASCII_ONLY(flag))
573     return 0;
574 
575   for (i = 0; i < map_size; i++) {
576     code = map[i].to;
577     r = (*f)(map[i].from, &code, 1, arg);
578     if (r != 0) return r;
579 
580     code = map[i].from;
581     r = (*f)(map[i].to, &code, 1, arg);
582     if (r != 0) return r;
583   }
584 
585   if (ess_tsett_flag != 0)
586     return ss_apply_all_case_fold(flag, f, arg);
587 
588   return 0;
589 }
590 
591 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])592 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
593     const OnigPairCaseFoldCodes map[],
594     int ess_tsett_flag, OnigCaseFoldType flag,
595     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
596 {
597   int i, j, n;
598   static OnigUChar sa[] = { LARGE_S, SMALL_S };
599 
600   if (0x41 <= *p && *p <= 0x5a) { /* A - Z */
601     if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1
602         && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) /* SS */
603         && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
604     ss_combination:
605       items[0].byte_len = 2;
606       items[0].code_len = 1;
607       items[0].code[0] = (OnigCodePoint )0xdf;
608 
609       n = 1;
610       for (i = 0; i < 2; i++) {
611         for (j = 0; j < 2; j++) {
612           if (sa[i] == *p && sa[j] == *(p+1))
613             continue;
614 
615           items[n].byte_len = 2;
616           items[n].code_len = 2;
617           items[n].code[0] = (OnigCodePoint )sa[i];
618           items[n].code[1] = (OnigCodePoint )sa[j];
619           n++;
620         }
621       }
622       return 4;
623     }
624 
625     items[0].byte_len = 1;
626     items[0].code_len = 1;
627     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
628     return 1;
629   }
630   else if (0x61 <= *p && *p <= 0x7a) { /* a - z */
631     if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1
632         && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)
633         && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
634       goto ss_combination;
635     }
636 
637     items[0].byte_len = 1;
638     items[0].code_len = 1;
639     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
640     return 1;
641   }
642   else if (*p == 0xdf && ess_tsett_flag != 0
643            && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
644     items[0].byte_len = 1;
645     items[0].code_len = 2;
646     items[0].code[0] = (OnigCodePoint )'s';
647     items[0].code[1] = (OnigCodePoint )'s';
648 
649     items[1].byte_len = 1;
650     items[1].code_len = 2;
651     items[1].code[0] = (OnigCodePoint )'S';
652     items[1].code[1] = (OnigCodePoint )'S';
653 
654     items[2].byte_len = 1;
655     items[2].code_len = 2;
656     items[2].code[0] = (OnigCodePoint )'s';
657     items[2].code[1] = (OnigCodePoint )'S';
658 
659     items[3].byte_len = 1;
660     items[3].code_len = 2;
661     items[3].code[0] = (OnigCodePoint )'S';
662     items[3].code[1] = (OnigCodePoint )'s';
663 
664     return 4;
665   }
666   else {
667     int i;
668 
669     if (CASE_FOLD_IS_ASCII_ONLY(flag))
670       return 0;
671 
672     for (i = 0; i < map_size; i++) {
673       if (*p == map[i].from) {
674         items[0].byte_len = 1;
675         items[0].code_len = 1;
676         items[0].code[0] = map[i].to;
677         return 1;
678       }
679       else if (*p == map[i].to) {
680         items[0].byte_len = 1;
681         items[0].code_len = 1;
682         items[0].code[0] = map[i].from;
683         return 1;
684       }
685     }
686   }
687 
688   return 0;
689 }
690 
691 
692 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)693 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
694                                          OnigCodePoint* sb_out ARG_UNUSED,
695                                          const OnigCodePoint* ranges[] ARG_UNUSED)
696 {
697   return ONIG_NO_SUPPORT_CONFIG;
698 }
699 
700 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)701 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
702 {
703   if (p < end) {
704     if (*p == NEWLINE_CODE) return 1;
705   }
706   return 0;
707 }
708 
709 /* for single byte encodings */
710 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)711 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
712                             const UChar*end ARG_UNUSED, UChar* lower)
713 {
714   *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
715 
716   (*p)++;
717   return 1; /* return byte length of converted char to lower */
718 }
719 
720 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)721 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
722 {
723   return 1;
724 }
725 
726 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)727 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
728 {
729   return (OnigCodePoint )(*p);
730 }
731 
732 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)733 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
734 {
735   return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
736 }
737 
738 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)739 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
740 {
741   *buf = (UChar )(code & 0xff);
742   return 1;
743 }
744 
745 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)746 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
747                                           const UChar* s)
748 {
749   return (UChar* )s;
750 }
751 
752 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)753 onigenc_always_true_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
754                                              const UChar* end ARG_UNUSED)
755 {
756   return TRUE;
757 }
758 
759 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)760 onigenc_always_false_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
761                                               const UChar* end ARG_UNUSED)
762 {
763   return FALSE;
764 }
765 
766 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)767 onigenc_always_true_is_valid_mbc_string(const UChar* s   ARG_UNUSED,
768                                         const UChar* end ARG_UNUSED)
769 {
770   return TRUE;
771 }
772 
773 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)774 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
775                                          const UChar* p, const UChar* end)
776 {
777   while (p < end) {
778     p += enclen(enc, p);
779   }
780 
781   if (p != end)
782     return FALSE;
783   else
784     return TRUE;
785 }
786 
787 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)788 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
789 {
790   return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
791 }
792 
793 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)794 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
795 {
796   int c, i, len;
797   OnigCodePoint n;
798 
799   len = enclen(enc, p);
800   n = (OnigCodePoint )(*p++);
801   if (len == 1) return n;
802 
803   for (i = 1; i < len; i++) {
804     if (p >= end) break;
805     c = *p++;
806     n <<= 8;  n += c;
807   }
808   return n;
809 }
810 
811 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)812 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
813                           const UChar** pp, const UChar* end ARG_UNUSED,
814                           UChar* lower)
815 {
816   int len;
817   const UChar *p = *pp;
818 
819   if (ONIGENC_IS_MBC_ASCII(p)) {
820     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
821     (*pp)++;
822     return 1;
823   }
824   else {
825     int i;
826 
827     len = enclen(enc, p);
828     for (i = 0; i < len; i++) {
829       *lower++ = *p++;
830     }
831     (*pp) += len;
832     return len; /* return byte length of converted to lower char */
833   }
834 }
835 
836 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)837 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
838 {
839   UChar *p = buf;
840 
841   if ((code & 0xff00) != 0) {
842     *p++ = (UChar )((code >>  8) & 0xff);
843   }
844   *p++ = (UChar )(code & 0xff);
845 
846 #if 1
847   if (enclen(enc, buf) != (p - buf))
848     return ONIGERR_INVALID_CODE_POINT_VALUE;
849 #endif
850   return (int )(p - buf);
851 }
852 
853 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)854 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
855 {
856   UChar *p = buf;
857 
858   if ((code & 0xff000000) != 0) {
859     *p++ = (UChar )((code >> 24) & 0xff);
860   }
861   if ((code & 0xff0000) != 0 || p != buf) {
862     *p++ = (UChar )((code >> 16) & 0xff);
863   }
864   if ((code & 0xff00) != 0 || p != buf) {
865     *p++ = (UChar )((code >> 8) & 0xff);
866   }
867   *p++ = (UChar )(code & 0xff);
868 
869 #if 1
870   if (enclen(enc, buf) != (p - buf))
871     return ONIGERR_INVALID_CODE_POINT_VALUE;
872 #endif
873   return (int )(p - buf);
874 }
875 
876 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)877 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
878 {
879   static PosixBracketEntryType PBS[] = {
880     { (UChar* )"Alnum",  ONIGENC_CTYPE_ALNUM,  5 },
881     { (UChar* )"Alpha",  ONIGENC_CTYPE_ALPHA,  5 },
882     { (UChar* )"Blank",  ONIGENC_CTYPE_BLANK,  5 },
883     { (UChar* )"Cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
884     { (UChar* )"Digit",  ONIGENC_CTYPE_DIGIT,  5 },
885     { (UChar* )"Graph",  ONIGENC_CTYPE_GRAPH,  5 },
886     { (UChar* )"Lower",  ONIGENC_CTYPE_LOWER,  5 },
887     { (UChar* )"Print",  ONIGENC_CTYPE_PRINT,  5 },
888     { (UChar* )"Punct",  ONIGENC_CTYPE_PUNCT,  5 },
889     { (UChar* )"Space",  ONIGENC_CTYPE_SPACE,  5 },
890     { (UChar* )"Upper",  ONIGENC_CTYPE_UPPER,  5 },
891     { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
892     { (UChar* )"ASCII",  ONIGENC_CTYPE_ASCII,  5 },
893     { (UChar* )"Word",   ONIGENC_CTYPE_WORD,   4 },
894     { (UChar* )NULL, -1, 0 }
895   };
896 
897   PosixBracketEntryType *pb;
898   int len;
899 
900   len = onigenc_strlen(enc, p, end);
901   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
902     if (len == pb->len &&
903         onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
904       return pb->ctype;
905   }
906 
907   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
908 }
909 
910 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)911 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
912 {
913   OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
914 
915   if (code > ASCII_LIMIT) return 0;
916 
917   return ONIGENC_IS_ASCII_CODE_WORD(code);
918 }
919 
920 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)921 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
922                           unsigned int ctype)
923 {
924   if (code < 128)
925     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
926   else {
927     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
928       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
929     }
930   }
931 
932   return FALSE;
933 }
934 
935 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)936 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
937                           unsigned int ctype)
938 {
939   if (code < 128)
940     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
941   else {
942     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
943       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
944     }
945   }
946 
947   return FALSE;
948 }
949 
950 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)951 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
952                            const UChar* sascii /* ascii */, int n)
953 {
954   int x, c;
955 
956   while (n-- > 0) {
957     if (p >= end) return (int )(*sascii);
958 
959     c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
960     x = *sascii - c;
961     if (x) return x;
962 
963     sascii++;
964     p += enclen(enc, p);
965   }
966   return 0;
967 }
968 
969 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)970 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
971 {
972   int i;
973 
974   for (i = 0; i < n; i++) {
975     if (a[i] != b[i])
976       return -1;
977   }
978 
979   return 0;
980 }
981 
982 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)983 onig_codes_byte_at(OnigCodePoint codes[], int at)
984 {
985   int index;
986   int b;
987   OnigCodePoint code;
988 
989   index = at / 3;
990   b     = at % 3;
991   code = codes[index];
992 
993   return ((code >> ((2 - b) * 8)) & 0xff);
994 }
995