1 /**********************************************************************
2   regenc.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2019  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/* ASCII code points of 'S' (0x53) and 's' (0x73); used by the 0xdf <-> "ss"
   case-fold handling further down in this file. */
#define LARGE_S   0x53
#define SMALL_S   0x73

/* Global default encoding; returned by onigenc_get_default_encoding() and
   replaced by onigenc_set_default_encoding(). */
OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;

/* Number of elements in InitedList. */
#define INITED_LIST_SIZE  20

/* Number of InitedList entries currently in use. */
static int InitedListNum;

/* Registry of encodings that have been recorded as initialized.
   enc    : the encoding this entry describes
   inited : non-zero once the encoding has been marked initialized */
static struct {
  OnigEncoding enc;
  int          inited;
} InitedList[INITED_LIST_SIZE];
45 
46 static int
enc_inited_entry(OnigEncoding enc)47 enc_inited_entry(OnigEncoding enc)
48 {
49   int i;
50 
51   for (i = 0; i < InitedListNum; i++) {
52     if (InitedList[i].enc == enc) {
53       InitedList[i].inited = 1;
54       return i;
55     }
56   }
57 
58   i = InitedListNum;
59   if (i < INITED_LIST_SIZE - 1) {
60     InitedList[i].enc    = enc;
61     InitedList[i].inited = 1;
62     InitedListNum++;
63     return i;
64   }
65 
66   return -1;
67 }
68 
69 static int
enc_is_inited(OnigEncoding enc)70 enc_is_inited(OnigEncoding enc)
71 {
72   int i;
73 
74   for (i = 0; i < InitedListNum; i++) {
75     if (InitedList[i].enc == enc) {
76       return InitedList[i].inited;
77     }
78   }
79 
80   return 0;
81 }
82 
/* Non-zero after onigenc_init() has run; reset to 0 by onigenc_end(). */
static int OnigEncInited;

/*
 * One-time initialization of the encoding layer.  Calling it again after
 * the first call changes nothing.  Always returns 0.
 */
extern int
onigenc_init(void)
{
  if (OnigEncInited == 0) {
    OnigEncInited = 1;
  }

  return 0;
}
93 
94 extern int
onigenc_end(void)95 onigenc_end(void)
96 {
97   int i;
98 
99   for (i = 0; i < InitedListNum; i++) {
100     InitedList[i].enc    = 0;
101     InitedList[i].inited = 0;
102   }
103   InitedListNum = 0;
104 
105   OnigEncInited = 0;
106   return ONIG_NORMAL;
107 }
108 
109 extern int
onig_initialize_encoding(OnigEncoding enc)110 onig_initialize_encoding(OnigEncoding enc)
111 {
112   int r;
113 
114   if (enc != ONIG_ENCODING_ASCII &&
115       ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
116     OnigEncoding ascii = ONIG_ENCODING_ASCII;
117     if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
118       r = ascii->init();
119       if (r != ONIG_NORMAL) return r;
120       enc_inited_entry(ascii);
121     }
122   }
123 
124   if (enc->init != 0 &&
125       enc_is_inited(enc) == 0) {
126     r = (enc->init)();
127     if (r == ONIG_NORMAL)
128       enc_inited_entry(enc);
129     return r;
130   }
131 
132   return 0;
133 }
134 
135 extern OnigEncoding
onigenc_get_default_encoding(void)136 onigenc_get_default_encoding(void)
137 {
138   return OnigEncDefaultCharEncoding;
139 }
140 
141 extern int
onigenc_set_default_encoding(OnigEncoding enc)142 onigenc_set_default_encoding(OnigEncoding enc)
143 {
144   OnigEncDefaultCharEncoding = enc;
145   return 0;
146 }
147 
148 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)149 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
150 {
151   int slen, term_len, i;
152   UChar *r;
153 
154   slen = (int )(end - s);
155   term_len = ONIGENC_MBC_MINLEN(enc);
156 
157   r = (UChar* )xmalloc(slen + term_len);
158   CHECK_NULL_RETURN(r);
159   xmemcpy(r, s, slen);
160 
161   for (i = 0; i < term_len; i++)
162     r[slen + i] = (UChar )0;
163 
164   return r;
165 }
166 
167 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)168 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
169 {
170   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
171   if (p < s) {
172     p += enclen(enc, p);
173   }
174   return p;
175 }
176 
177 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)178 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
179             const UChar* start, const UChar* s, const UChar** prev)
180 {
181   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
182 
183   if (p < s) {
184     if (prev) *prev = (const UChar* )p;
185     p += enclen(enc, p);
186   }
187   else {
188     if (prev)
189       *prev = onigenc_get_prev_char_head(enc, start, p);
190   }
191   return p;
192 }
193 
194 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)195 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
196 {
197   if (s <= start)
198     return (UChar* )NULL;
199 
200   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
201 }
202 
203 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)204 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
205 {
206   while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
207     if (s <= start)
208       return (UChar* )NULL;
209 
210     s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
211   }
212   return (UChar* )s;
213 }
214 
215 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)216 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
217 {
218   UChar* q = (UChar* )p;
219   while (n-- > 0) {
220     q += ONIGENC_MBC_ENC_LEN(enc, q);
221   }
222   return (q <= end ? q : NULL);
223 }
224 
225 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)226 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
227 {
228   int n = 0;
229   UChar* q = (UChar* )p;
230 
231   while (q < end) {
232     q += ONIGENC_MBC_ENC_LEN(enc, q);
233     n++;
234   }
235   return n;
236 }
237 
238 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)239 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
240 {
241   int n = 0;
242   UChar* p = (UChar* )s;
243 
244   while (1) {
245     if (*p == '\0') {
246       UChar* q;
247       int len = ONIGENC_MBC_MINLEN(enc);
248 
249       if (len == 1) return n;
250       q = p + 1;
251       while (len > 1) {
252         if (*q != '\0') break;
253         q++;
254         len--;
255       }
256       if (len == 1) return n;
257     }
258     p += ONIGENC_MBC_ENC_LEN(enc, p);
259     n++;
260   }
261 }
262 
263 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)264 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
265 {
266   UChar* start = (UChar* )s;
267   UChar* p = (UChar* )s;
268 
269   while (1) {
270     if (*p == '\0') {
271       UChar* q;
272       int len = ONIGENC_MBC_MINLEN(enc);
273 
274       if (len == 1) return (int )(p - start);
275       q = p + 1;
276       while (len > 1) {
277         if (*q != '\0') break;
278         q++;
279         len--;
280       }
281       if (len == 1) return (int )(p - start);
282     }
283     p += ONIGENC_MBC_ENC_LEN(enc, p);
284   }
285 }
286 
/* Byte -> lower-case byte table for ASCII (256 entries, indexed by byte
   value).  0x41-0x5a ('A'-'Z') map to 0x61-0x7a ('a'-'z'); every other
   value maps to itself. */
const UChar OnigEncAsciiToLowerCaseTable[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
321 
#ifdef USE_UPPER_CASE_TABLE
/* Byte -> upper-case byte table for ASCII (indexed by byte value).
   0x61-0x7a ('a'-'z') map to 0x41-0x5a ('A'-'Z'); every other value maps
   to itself.  Only compiled when USE_UPPER_CASE_TABLE is defined. */
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
358 
/* Per-byte character-class flags for ASCII (indexed by byte value).
   Entries 0x80-0xff are all 0x0000.
   NOTE(review): presumably each bit corresponds to one ONIGENC_CTYPE_*
   class; the bit layout is defined outside this file -- confirm there. */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
393 
/* Byte -> lower-case byte table for ISO-8859-1 (indexed by byte value).
   0x41-0x5a map to 0x61-0x7a and 0xc0-0xde map to 0xe0-0xfe, except 0xd7
   which maps to itself; 0xdf and all remaining values map to themselves. */
const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
428 
#ifdef USE_UPPER_CASE_TABLE
/* Byte -> upper-case byte table for ISO-8859-1 (indexed by byte value).
   0x61-0x7a map to 0x41-0x5a and 0xe0-0xfe map to 0xc0-0xde, except 0xf7
   which maps to itself; 0xff and all remaining values map to themselves.
   Only compiled when USE_UPPER_CASE_TABLE is defined. */
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
465 
466 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)467 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
468 {
469   /* nothing */
470   /* obsoleted. */
471 }
472 
473 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)474 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
475 {
476   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
477 }
478 
/* Case pairs for the 26 ASCII letters: each entry holds the upper-case
   code point 0x41-0x5a ('A'-'Z') followed by its lower-case counterpart
   0x61-0x7a ('a'-'z').  The code below reads the two members as
   `from` and `to`. */
const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
  { 0x41, 0x61 },
  { 0x42, 0x62 },
  { 0x43, 0x63 },
  { 0x44, 0x64 },
  { 0x45, 0x65 },
  { 0x46, 0x66 },
  { 0x47, 0x67 },
  { 0x48, 0x68 },
  { 0x49, 0x69 },
  { 0x4a, 0x6a },
  { 0x4b, 0x6b },
  { 0x4c, 0x6c },
  { 0x4d, 0x6d },
  { 0x4e, 0x6e },
  { 0x4f, 0x6f },
  { 0x50, 0x70 },
  { 0x51, 0x71 },
  { 0x52, 0x72 },
  { 0x53, 0x73 },
  { 0x54, 0x74 },
  { 0x55, 0x75 },
  { 0x56, 0x76 },
  { 0x57, 0x77 },
  { 0x58, 0x78 },
  { 0x59, 0x79 },
  { 0x5a, 0x7a }
};
507 
508 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)509 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
510                                   OnigApplyAllCaseFoldFunc f, void* arg)
511 {
512   OnigCodePoint code;
513   int i, r;
514 
515   for (i = 0;
516        i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
517        i++) {
518     code = OnigAsciiLowerMap[i].to;
519     r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
520     if (r != 0) return r;
521 
522     code = OnigAsciiLowerMap[i].from;
523     r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
524     if (r != 0) return r;
525   }
526 
527   return 0;
528 }
529 
530 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])531 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
532     const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
533     OnigCaseFoldCodeItem items[])
534 {
535   if (0x41 <= *p && *p <= 0x5a) {
536     items[0].byte_len = 1;
537     items[0].code_len = 1;
538     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
539     return 1;
540   }
541   else if (0x61 <= *p && *p <= 0x7a) {
542     items[0].byte_len = 1;
543     items[0].code_len = 1;
544     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
545     return 1;
546   }
547   else
548     return 0;
549 }
550 
551 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)552 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
553                        OnigApplyAllCaseFoldFunc f, void* arg)
554 {
555   static OnigCodePoint ss[] = { SMALL_S, SMALL_S };
556 
557   return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
558 }
559 
560 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)561 onigenc_apply_all_case_fold_with_map(int map_size,
562     const OnigPairCaseFoldCodes map[],
563     int ess_tsett_flag, OnigCaseFoldType flag,
564     OnigApplyAllCaseFoldFunc f, void* arg)
565 {
566   OnigCodePoint code;
567   int i, r;
568 
569   r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
570   if (r != 0) return r;
571 
572   for (i = 0; i < map_size; i++) {
573     code = map[i].to;
574     r = (*f)(map[i].from, &code, 1, arg);
575     if (r != 0) return r;
576 
577     code = map[i].from;
578     r = (*f)(map[i].to, &code, 1, arg);
579     if (r != 0) return r;
580   }
581 
582   if (ess_tsett_flag != 0)
583     return ss_apply_all_case_fold(flag, f, arg);
584 
585   return 0;
586 }
587 
588 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])589 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
590     const OnigPairCaseFoldCodes map[],
591     int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
592     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
593 {
594   int i, j, n;
595   static OnigUChar sa[] = { LARGE_S, SMALL_S };
596 
597   if (0x41 <= *p && *p <= 0x5a) { /* A - Z */
598     if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1
599         && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */
600     ss_combination:
601       items[0].byte_len = 2;
602       items[0].code_len = 1;
603       items[0].code[0] = (OnigCodePoint )0xdf;
604 
605       n = 1;
606       for (i = 0; i < 2; i++) {
607         for (j = 0; j < 2; j++) {
608           if (sa[i] == *p && sa[j] == *(p+1))
609             continue;
610 
611           items[n].byte_len = 2;
612           items[n].code_len = 2;
613           items[n].code[0] = (OnigCodePoint )sa[i];
614           items[n].code[1] = (OnigCodePoint )sa[j];
615           n++;
616         }
617       }
618       return 4;
619     }
620 
621     items[0].byte_len = 1;
622     items[0].code_len = 1;
623     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
624     return 1;
625   }
626   else if (0x61 <= *p && *p <= 0x7a) { /* a - z */
627     if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1
628         && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) {
629       goto ss_combination;
630     }
631 
632     items[0].byte_len = 1;
633     items[0].code_len = 1;
634     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
635     return 1;
636   }
637   else if (*p == 0xdf && ess_tsett_flag != 0) {
638     items[0].byte_len = 1;
639     items[0].code_len = 2;
640     items[0].code[0] = (OnigCodePoint )'s';
641     items[0].code[1] = (OnigCodePoint )'s';
642 
643     items[1].byte_len = 1;
644     items[1].code_len = 2;
645     items[1].code[0] = (OnigCodePoint )'S';
646     items[1].code[1] = (OnigCodePoint )'S';
647 
648     items[2].byte_len = 1;
649     items[2].code_len = 2;
650     items[2].code[0] = (OnigCodePoint )'s';
651     items[2].code[1] = (OnigCodePoint )'S';
652 
653     items[3].byte_len = 1;
654     items[3].code_len = 2;
655     items[3].code[0] = (OnigCodePoint )'S';
656     items[3].code[1] = (OnigCodePoint )'s';
657 
658     return 4;
659   }
660   else {
661     int i;
662 
663     for (i = 0; i < map_size; i++) {
664       if (*p == map[i].from) {
665         items[0].byte_len = 1;
666         items[0].code_len = 1;
667         items[0].code[0] = map[i].to;
668         return 1;
669       }
670       else if (*p == map[i].to) {
671         items[0].byte_len = 1;
672         items[0].code_len = 1;
673         items[0].code[0] = map[i].from;
674         return 1;
675       }
676     }
677   }
678 
679   return 0;
680 }
681 
682 
683 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)684 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
685                                          OnigCodePoint* sb_out ARG_UNUSED,
686                                          const OnigCodePoint* ranges[] ARG_UNUSED)
687 {
688   return ONIG_NO_SUPPORT_CONFIG;
689 }
690 
691 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)692 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
693 {
694   if (p < end) {
695     if (*p == NEWLINE_CODE) return 1;
696   }
697   return 0;
698 }
699 
700 /* for single byte encodings */
701 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)702 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
703                             const UChar*end ARG_UNUSED, UChar* lower)
704 {
705   *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
706 
707   (*p)++;
708   return 1; /* return byte length of converted char to lower */
709 }
710 
711 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)712 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
713 {
714   return 1;
715 }
716 
717 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)718 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
719 {
720   return (OnigCodePoint )(*p);
721 }
722 
723 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)724 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
725 {
726   return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
727 }
728 
729 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)730 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
731 {
732   *buf = (UChar )(code & 0xff);
733   return 1;
734 }
735 
736 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)737 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
738                                           const UChar* s)
739 {
740   return (UChar* )s;
741 }
742 
743 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)744 onigenc_always_true_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
745                                              const UChar* end ARG_UNUSED)
746 {
747   return TRUE;
748 }
749 
750 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)751 onigenc_always_false_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
752                                               const UChar* end ARG_UNUSED)
753 {
754   return FALSE;
755 }
756 
757 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)758 onigenc_always_true_is_valid_mbc_string(const UChar* s   ARG_UNUSED,
759                                         const UChar* end ARG_UNUSED)
760 {
761   return TRUE;
762 }
763 
764 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)765 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
766                                          const UChar* p, const UChar* end)
767 {
768   while (p < end) {
769     p += enclen(enc, p);
770   }
771 
772   if (p != end)
773     return FALSE;
774   else
775     return TRUE;
776 }
777 
778 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)779 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
780 {
781   return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
782 }
783 
784 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)785 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
786 {
787   int c, i, len;
788   OnigCodePoint n;
789 
790   len = enclen(enc, p);
791   n = (OnigCodePoint )(*p++);
792   if (len == 1) return n;
793 
794   for (i = 1; i < len; i++) {
795     if (p >= end) break;
796     c = *p++;
797     n <<= 8;  n += c;
798   }
799   return n;
800 }
801 
802 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)803 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
804                           const UChar** pp, const UChar* end ARG_UNUSED,
805                           UChar* lower)
806 {
807   int len;
808   const UChar *p = *pp;
809 
810   if (ONIGENC_IS_MBC_ASCII(p)) {
811     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
812     (*pp)++;
813     return 1;
814   }
815   else {
816     int i;
817 
818     len = enclen(enc, p);
819     for (i = 0; i < len; i++) {
820       *lower++ = *p++;
821     }
822     (*pp) += len;
823     return len; /* return byte length of converted to lower char */
824   }
825 }
826 
827 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)828 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
829 {
830   UChar *p = buf;
831 
832   if ((code & 0xff00) != 0) {
833     *p++ = (UChar )((code >>  8) & 0xff);
834   }
835   *p++ = (UChar )(code & 0xff);
836 
837 #if 1
838   if (enclen(enc, buf) != (p - buf))
839     return ONIGERR_INVALID_CODE_POINT_VALUE;
840 #endif
841   return (int )(p - buf);
842 }
843 
844 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)845 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
846 {
847   UChar *p = buf;
848 
849   if ((code & 0xff000000) != 0) {
850     *p++ = (UChar )((code >> 24) & 0xff);
851   }
852   if ((code & 0xff0000) != 0 || p != buf) {
853     *p++ = (UChar )((code >> 16) & 0xff);
854   }
855   if ((code & 0xff00) != 0 || p != buf) {
856     *p++ = (UChar )((code >> 8) & 0xff);
857   }
858   *p++ = (UChar )(code & 0xff);
859 
860 #if 1
861   if (enclen(enc, buf) != (p - buf))
862     return ONIGERR_INVALID_CODE_POINT_VALUE;
863 #endif
864   return (int )(p - buf);
865 }
866 
867 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)868 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
869 {
870   static PosixBracketEntryType PBS[] = {
871     { (UChar* )"Alnum",  ONIGENC_CTYPE_ALNUM,  5 },
872     { (UChar* )"Alpha",  ONIGENC_CTYPE_ALPHA,  5 },
873     { (UChar* )"Blank",  ONIGENC_CTYPE_BLANK,  5 },
874     { (UChar* )"Cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
875     { (UChar* )"Digit",  ONIGENC_CTYPE_DIGIT,  5 },
876     { (UChar* )"Graph",  ONIGENC_CTYPE_GRAPH,  5 },
877     { (UChar* )"Lower",  ONIGENC_CTYPE_LOWER,  5 },
878     { (UChar* )"Print",  ONIGENC_CTYPE_PRINT,  5 },
879     { (UChar* )"Punct",  ONIGENC_CTYPE_PUNCT,  5 },
880     { (UChar* )"Space",  ONIGENC_CTYPE_SPACE,  5 },
881     { (UChar* )"Upper",  ONIGENC_CTYPE_UPPER,  5 },
882     { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
883     { (UChar* )"ASCII",  ONIGENC_CTYPE_ASCII,  5 },
884     { (UChar* )"Word",   ONIGENC_CTYPE_WORD,   4 },
885     { (UChar* )NULL, -1, 0 }
886   };
887 
888   PosixBracketEntryType *pb;
889   int len;
890 
891   len = onigenc_strlen(enc, p, end);
892   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
893     if (len == pb->len &&
894         onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
895       return pb->ctype;
896   }
897 
898   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
899 }
900 
901 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)902 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
903 {
904   OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
905 
906   if (code > ASCII_LIMIT) return 0;
907 
908   return ONIGENC_IS_ASCII_CODE_WORD(code);
909 }
910 
911 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)912 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
913                           unsigned int ctype)
914 {
915   if (code < 128)
916     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
917   else {
918     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
919       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
920     }
921   }
922 
923   return FALSE;
924 }
925 
926 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)927 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
928                           unsigned int ctype)
929 {
930   if (code < 128)
931     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
932   else {
933     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
934       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
935     }
936   }
937 
938   return FALSE;
939 }
940 
941 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)942 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
943                            const UChar* sascii /* ascii */, int n)
944 {
945   int x, c;
946 
947   while (n-- > 0) {
948     if (p >= end) return (int )(*sascii);
949 
950     c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
951     x = *sascii - c;
952     if (x) return x;
953 
954     sascii++;
955     p += enclen(enc, p);
956   }
957   return 0;
958 }
959 
960 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)961 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
962 {
963   int i;
964 
965   for (i = 0; i < n; i++) {
966     if (a[i] != b[i])
967       return -1;
968   }
969 
970   return 0;
971 }
972 
973 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)974 onig_codes_byte_at(OnigCodePoint codes[], int at)
975 {
976   int index;
977   int b;
978   OnigCodePoint code;
979 
980   index = at / 3;
981   b     = at % 3;
982   code = codes[index];
983 
984   return ((code >> ((2 - b) * 8)) & 0xff);
985 }
986