1 /**********************************************************************
2   unicode.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2020  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/*
 * One (name, ctype) pair, both stored as short integers.
 * NOTE(review): `name` is presumably an offset into a property-name string
 * pool defined in the generated property data -- confirm against that file.
 */
struct PoolPropertyNameCtype {
  short name;
  short ctype;
};
36 
/*
 * Non-zero when the EncUNICODE_ISO_8859_1_CtypeTable entry for `code` has
 * the bit selected by CTYPE_TO_BIT(ctype) set.
 * `code` is used directly as the table index and the table has 256 entries,
 * so callers must pass a value in the range 0..255.
 */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
39 
/*
 * Ctype bit sets for code points 0x00..0xFF, one 16-bit entry per code point
 * (eight entries per row, so each row covers eight consecutive code points).
 * Entries are tested by ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(), which masks
 * them with CTYPE_TO_BIT(ctype).
 */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
  /* 0x00 - 0x1f */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x20 - 0x3f */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x40 - 0x5f */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  /* 0x60 - 0x7f */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  /* 0x80 - 0x9f */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0xa0 - 0xbf */
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  /* 0xc0 - 0xdf */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  /* 0xe0 - 0xff */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
74 
75 #include "st.h"
76 
77 #include "unicode_fold_data.c"
78 
79 extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * fold)80 onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag,
81     const UChar** pp, const UChar* end, UChar* fold)
82 {
83   const struct ByUnfoldKey* buk;
84 
85   OnigCodePoint code;
86   int i, len, rlen;
87   const UChar *p = *pp;
88 
89   code = ONIGENC_MBC_TO_CODE(enc, p, end);
90   len = enclen(enc, p);
91   *pp += len;
92 
93 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
94   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
95     if (code == 0x0130) {
96       return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
97     }
98 #if 0
99     if (code == 0x0049) {
100       return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
101     }
102 #endif
103   }
104 #endif
105 
106   if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) {
107     buk = onigenc_unicode_unfold_key(code);
108     if (buk != 0) {
109       if (buk->fold_len == 1) {
110         if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
111             ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index)))
112           return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
113       }
114       else {
115         OnigCodePoint* addr;
116 
117         FOLDS_FOLD_ADDR_BUK(buk, addr);
118         rlen = 0;
119         for (i = 0; i < buk->fold_len; i++) {
120           OnigCodePoint c = addr[i];
121           len = ONIGENC_CODE_TO_MBC(enc, c, fold);
122           fold += len;
123           rlen += len;
124         }
125         return rlen;
126       }
127     }
128   }
129 
130   for (i = 0; i < len; i++) {
131     *fold++ = *p++;
132   }
133   return len;
134 }
135 
136 static int
apply_case_fold1(OnigCaseFoldType flag,int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)137 apply_case_fold1(OnigCaseFoldType flag, int from, int to,
138                  OnigApplyAllCaseFoldFunc f, void* arg)
139 {
140   int i, j, k, n, r;
141 
142   for (i = from; i < to; ) {
143     OnigCodePoint fold = *FOLDS1_FOLD(i);
144     if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(fold)) break;
145 
146     n = FOLDS1_UNFOLDS_NUM(i);
147     for (j = 0; j < n; j++) {
148       OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
149 
150       if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold))
151         continue;
152 
153       r = (*f)(fold, &unfold, 1, arg);
154       if (r != 0) return r;
155       r = (*f)(unfold, &fold, 1, arg);
156       if (r != 0) return r;
157 
158       for (k = 0; k < j; k++) {
159         OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
160         if (CASE_FOLD_IS_ASCII_ONLY(flag) &&
161             ! ONIGENC_IS_ASCII_CODE(unfold2)) continue;
162 
163         r = (*f)(unfold, &unfold2, 1, arg);
164         if (r != 0) return r;
165         r = (*f)(unfold2, &unfold, 1, arg);
166         if (r != 0) return r;
167       }
168     }
169 
170     i = FOLDS1_NEXT_INDEX(i);
171   }
172 
173   return 0;
174 }
175 
176 static int
apply_case_fold2(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)177 apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
178 {
179   int i, j, k, n, r;
180 
181   for (i = from; i < to; ) {
182     OnigCodePoint* fold = FOLDS2_FOLD(i);
183     n = FOLDS2_UNFOLDS_NUM(i);
184     for (j = 0; j < n; j++) {
185       OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];
186 
187       r = (*f)(unfold, fold, 2, arg);
188       if (r != 0) return r;
189 
190       for (k = 0; k < j; k++) {
191         OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];
192         r = (*f)(unfold, &unfold2, 1, arg);
193         if (r != 0) return r;
194         r = (*f)(unfold2, &unfold, 1, arg);
195         if (r != 0) return r;
196       }
197     }
198 
199     i = FOLDS2_NEXT_INDEX(i);
200   }
201 
202   return 0;
203 }
204 
205 static int
apply_case_fold3(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)206 apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
207 {
208   int i, j, k, n, r;
209 
210   for (i = from; i < to; ) {
211     OnigCodePoint* fold = FOLDS3_FOLD(i);
212     n = FOLDS3_UNFOLDS_NUM(i);
213     for (j = 0; j < n; j++) {
214       OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];
215 
216       r = (*f)(unfold, fold, 3, arg);
217       if (r != 0) return r;
218 
219       for (k = 0; k < j; k++) {
220         OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];
221         r = (*f)(unfold, &unfold2, 1, arg);
222         if (r != 0) return r;
223         r = (*f)(unfold2, &unfold, 1, arg);
224         if (r != 0) return r;
225       }
226     }
227 
228     i = FOLDS3_NEXT_INDEX(i);
229   }
230 
231   return 0;
232 }
233 
234 extern int
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)235 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
236                                     OnigApplyAllCaseFoldFunc f, void* arg)
237 {
238   int r;
239 
240   r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg);
241   if (r != 0) return r;
242 
243 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
244   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
245     code = 0x0131;
246     r = (*f)(0x0049, &code, 1, arg);
247     if (r != 0) return r;
248     code = 0x0049;
249     r = (*f)(0x0131, &code, 1, arg);
250     if (r != 0) return r;
251 
252     code = 0x0130;
253     r = (*f)(0x0069, &code, 1, arg);
254     if (r != 0) return r;
255     code = 0x0069;
256     r = (*f)(0x0130, &code, 1, arg);
257     if (r != 0) return r;
258   }
259   else {
260 #endif
261     r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
262     if (r != 0) return r;
263 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
264   }
265 #endif
266 
267   if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
268     return 0;
269 
270   r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
271   if (r != 0) return r;
272 
273 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
274   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
275 #endif
276     r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
277     if (r != 0) return r;
278 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
279   }
280 #endif
281 
282   r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
283   if (r != 0) return r;
284 
285   return 0;
286 }
287 
288 extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])289 onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
290     OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
291     OnigCaseFoldCodeItem items[])
292 {
293   int n, m, i, j, k, len, lens[3];
294   int index;
295   int fn, ncs[3];
296   OnigCodePoint cs[3][4];
297   OnigCodePoint code, codes[3], orig_codes[3];
298   const struct ByUnfoldKey* buk1;
299 
300   n = 0;
301 
302   code = ONIGENC_MBC_TO_CODE(enc, p, end);
303   if (CASE_FOLD_IS_ASCII_ONLY(flag)) {
304     if (! ONIGENC_IS_ASCII_CODE(code)) return n;
305   }
306   len = enclen(enc, p);
307 
308 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
309   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
310     if (code == 0x0049) {
311       items[0].byte_len = len;
312       items[0].code_len = 1;
313       items[0].code[0]  = 0x0131;
314       return 1;
315     }
316     else if (code == 0x0130) {
317       items[0].byte_len = len;
318       items[0].code_len = 1;
319       items[0].code[0]  = 0x0069;
320       return 1;
321     }
322     else if (code == 0x0131) {
323       items[0].byte_len = len;
324       items[0].code_len = 1;
325       items[0].code[0]  = 0x0049;
326       return 1;
327     }
328     else if (code == 0x0069) {
329       items[0].byte_len = len;
330       items[0].code_len = 1;
331       items[0].code[0]  = 0x0130;
332       return 1;
333     }
334   }
335 #endif
336 
337   orig_codes[0] = code;
338   lens[0] = len;
339   p += len;
340 
341   buk1 = onigenc_unicode_unfold_key(orig_codes[0]);
342   if (buk1 != 0 && buk1->fold_len == 1) {
343     codes[0] = *FOLDS1_FOLD(buk1->index);
344   }
345   else
346     codes[0] = orig_codes[0];
347 
348   if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
349     goto fold1;
350 
351   if (p < end) {
352     const struct ByUnfoldKey* buk;
353 
354     code = ONIGENC_MBC_TO_CODE(enc, p, end);
355     orig_codes[1] = code;
356     len = enclen(enc, p);
357     lens[1] = lens[0] + len;
358     buk = onigenc_unicode_unfold_key(orig_codes[1]);
359     if (buk != 0 && buk->fold_len == 1) {
360       codes[1] = *FOLDS1_FOLD(buk->index);
361     }
362     else
363       codes[1] = orig_codes[1];
364 
365     p += len;
366     if (p < end) {
367       code = ONIGENC_MBC_TO_CODE(enc, p, end);
368       orig_codes[2] = code;
369       len = enclen(enc, p);
370       lens[2] = lens[1] + len;
371       buk = onigenc_unicode_unfold_key(orig_codes[2]);
372       if (buk != 0 && buk->fold_len == 1) {
373         codes[2] = *FOLDS1_FOLD(buk->index);
374       }
375       else
376         codes[2] = orig_codes[2];
377 
378       index = onigenc_unicode_fold3_key(codes);
379       if (index >= 0) {
380         m = FOLDS3_UNFOLDS_NUM(index);
381         for (i = 0; i < m; i++) {
382           items[n].byte_len = lens[2];
383           items[n].code_len = 1;
384           items[n].code[0]  = FOLDS3_UNFOLDS(index)[i];
385           n++;
386         }
387 
388         for (fn = 0; fn < 3; fn++) {
389           int sindex;
390           cs[fn][0] = FOLDS3_FOLD(index)[fn];
391           ncs[fn] = 1;
392           sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
393           if (sindex >= 0) {
394             int m = FOLDS1_UNFOLDS_NUM(sindex);
395             for (i = 0; i < m; i++) {
396               cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
397             }
398             ncs[fn] += m;
399           }
400         }
401 
402         for (i = 0; i < ncs[0]; i++) {
403           for (j = 0; j < ncs[1]; j++) {
404             for (k = 0; k < ncs[2]; k++) {
405               if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1] &&
406                   cs[2][k] == orig_codes[2])
407                 continue;
408 
409               items[n].byte_len = lens[2];
410               items[n].code_len = 3;
411               items[n].code[0]  = cs[0][i];
412               items[n].code[1]  = cs[1][j];
413               items[n].code[2]  = cs[2][k];
414               n++;
415             }
416           }
417         }
418 
419         return n;
420       }
421     }
422 
423     index = onigenc_unicode_fold2_key(codes);
424     if (index >= 0) {
425       m = FOLDS2_UNFOLDS_NUM(index);
426       for (i = 0; i < m; i++) {
427         items[n].byte_len = lens[1];
428         items[n].code_len = 1;
429         items[n].code[0]  = FOLDS2_UNFOLDS(index)[i];
430         n++;
431       }
432 
433       for (fn = 0; fn < 2; fn++) {
434         int sindex;
435         cs[fn][0] = FOLDS2_FOLD(index)[fn];
436         ncs[fn] = 1;
437         sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
438         if (sindex >= 0) {
439           int m = FOLDS1_UNFOLDS_NUM(sindex);
440           for (i = 0; i < m; i++) {
441             cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
442           }
443           ncs[fn] += m;
444         }
445       }
446 
447       for (i = 0; i < ncs[0]; i++) {
448         for (j = 0; j < ncs[1]; j++) {
449           if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1])
450             continue;
451           items[n].byte_len = lens[1];
452           items[n].code_len = 2;
453           items[n].code[0]  = cs[0][i];
454           items[n].code[1]  = cs[1][j];
455           n++;
456         }
457       }
458 
459       return n;
460     }
461   }
462 
463  fold1:
464   if (buk1 != 0) {
465     if (buk1->fold_len == 1) {
466       int un;
467 
468       if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
469           ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) {
470         items[0].byte_len = lens[0];
471         items[0].code_len = 1;
472         items[0].code[0]  = *FOLDS1_FOLD(buk1->index);
473         n++;
474       }
475 
476       un = FOLDS1_UNFOLDS_NUM(buk1->index);
477       for (i = 0; i < un; i++) {
478         OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i];
479         if (unfold != orig_codes[0]) {
480           if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
481               ONIGENC_IS_ASCII_CODE(unfold)) {
482             items[n].byte_len = lens[0];
483             items[n].code_len = 1;
484             items[n].code[0]  = unfold;
485             n++;
486           }
487         }
488       }
489     }
490     else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
491       if (buk1->fold_len == 2) {
492         m = FOLDS2_UNFOLDS_NUM(buk1->index);
493         for (i = 0; i < m; i++) {
494           OnigCodePoint unfold = FOLDS2_UNFOLDS(buk1->index)[i];
495           if (unfold == orig_codes[0]) continue;
496 
497           items[n].byte_len = lens[0];
498           items[n].code_len = 1;
499           items[n].code[0]  = unfold;
500           n++;
501         }
502 
503         for (fn = 0; fn < 2; fn++) {
504           int index;
505           cs[fn][0] = FOLDS2_FOLD(buk1->index)[fn];
506           ncs[fn] = 1;
507           index = onigenc_unicode_fold1_key(&cs[fn][0]);
508           if (index >= 0) {
509             int m = FOLDS1_UNFOLDS_NUM(index);
510             for (i = 0; i < m; i++) {
511               cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
512             }
513             ncs[fn] += m;
514           }
515         }
516 
517         for (i = 0; i < ncs[0]; i++) {
518           for (j = 0; j < ncs[1]; j++) {
519             items[n].byte_len = lens[0];
520             items[n].code_len = 2;
521             items[n].code[0]  = cs[0][i];
522             items[n].code[1]  = cs[1][j];
523             n++;
524           }
525         }
526       }
527       else { /* fold_len == 3 */
528         m = FOLDS3_UNFOLDS_NUM(buk1->index);
529         for (i = 0; i < m; i++) {
530           OnigCodePoint unfold = FOLDS3_UNFOLDS(buk1->index)[i];
531           if (unfold == orig_codes[0]) continue;
532 
533           items[n].byte_len = lens[0];
534           items[n].code_len = 1;
535           items[n].code[0]  = unfold;
536           n++;
537         }
538 
539         for (fn = 0; fn < 3; fn++) {
540           int index;
541           cs[fn][0] = FOLDS3_FOLD(buk1->index)[fn];
542           ncs[fn] = 1;
543           index = onigenc_unicode_fold1_key(&cs[fn][0]);
544           if (index >= 0) {
545             int m = FOLDS1_UNFOLDS_NUM(index);
546             for (i = 0; i < m; i++) {
547               cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
548             }
549             ncs[fn] += m;
550           }
551         }
552 
553         for (i = 0; i < ncs[0]; i++) {
554           for (j = 0; j < ncs[1]; j++) {
555             for (k = 0; k < ncs[2]; k++) {
556               items[n].byte_len = lens[0];
557               items[n].code_len = 3;
558               items[n].code[0]  = cs[0][i];
559               items[n].code[1]  = cs[1][j];
560               items[n].code[2]  = cs[2][k];
561               n++;
562             }
563           }
564         }
565       }
566     }
567   }
568   else {
569     int index = onigenc_unicode_fold1_key(orig_codes);
570     if (index >= 0) {
571       int m = FOLDS1_UNFOLDS_NUM(index);
572       for (i = 0; i < m; i++) {
573         code = FOLDS1_UNFOLDS(index)[i];
574         if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) {
575           items[n].byte_len = lens[0];
576           items[n].code_len = 1;
577           items[n].code[0]  = code;
578           n++;
579         }
580       }
581     }
582   }
583 
584   return n;
585 }
586 
587 #ifdef USE_UNICODE_PROPERTIES
588 #include "unicode_property_data.c"
589 #else
590 #include "unicode_property_data_posix.c"
591 #endif
592 
593 
594 #ifdef USE_UNICODE_WORD_BREAK
595 
/*
 * Word-break classes assigned to code points by wb_get_type().
 * WB_Any (0) is the value used for code points not covered by any range.
 */
enum WB_TYPE {
  WB_Any                = 0,
  WB_ALetter            = 1,
  WB_CR                 = 2,
  WB_Double_Quote       = 3,
  WB_Extend             = 4,
  WB_ExtendNumLet       = 5,
  WB_Format             = 6,
  WB_Hebrew_Letter      = 7,
  WB_Katakana           = 8,
  WB_LF                 = 9,
  WB_MidLetter          = 10,
  WB_MidNum             = 11,
  WB_MidNumLet          = 12,
  WB_Newline            = 13,
  WB_Numeric            = 14,
  WB_Regional_Indicator = 15,
  WB_Single_Quote       = 16,
  WB_WSegSpace          = 17,
  WB_ZWJ                = 18
};
617 
618 typedef struct {
619   OnigCodePoint start;
620   OnigCodePoint end;
621   enum WB_TYPE  type;
622 } WB_RANGE_TYPE;
623 
624 #include "unicode_wb_data.c"
625 
626 static enum WB_TYPE
wb_get_type(OnigCodePoint code)627 wb_get_type(OnigCodePoint code)
628 {
629   OnigCodePoint low, high, x;
630   enum WB_TYPE type;
631 
632   for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {
633     x = (low + high) >> 1;
634     if (code > WB_RANGES[x].end)
635       low = x + 1;
636     else
637       high = x;
638   }
639 
640   type = (low < (OnigCodePoint )WB_RANGE_NUM &&
641           code >= WB_RANGES[low].start) ?
642     WB_RANGES[low].type : WB_Any;
643 
644   return type;
645 }
646 
647 #define IS_WB_IGNORE_TAIL(t)  ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
648 #define IS_WB_AHLetter(t)     ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
649 #define IS_WB_MidNumLetQ(t)   ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
650 
651 static int
wb_get_next_main_code(OnigEncoding enc,UChar * p,const UChar * end,OnigCodePoint * rcode,enum WB_TYPE * rtype)652 wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
653                       OnigCodePoint* rcode, enum WB_TYPE* rtype)
654 {
655   OnigCodePoint code;
656   enum WB_TYPE type;
657 
658   while (TRUE) {
659     p += enclen(enc, p);
660     if (p >= end) break;
661 
662     code = ONIGENC_MBC_TO_CODE(enc, p, end);
663     type = wb_get_type(code);
664     if (! IS_WB_IGNORE_TAIL(type)) {
665       *rcode = code;
666       *rtype = type;
667       return 1;
668     }
669   }
670 
671   return 0;
672 }
673 
674 extern int
onigenc_wb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)675 onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
676                              const UChar* start, const UChar* end)
677 {
678   int r;
679   UChar* pp;
680   OnigCodePoint cfrom;
681   OnigCodePoint cfrom2;
682   OnigCodePoint cto;
683   OnigCodePoint cto2;
684   enum WB_TYPE from;
685   enum WB_TYPE from2;
686   enum WB_TYPE to;
687   enum WB_TYPE to2;
688 
689   /* WB1: sot / Any */
690   if (p == start) return TRUE;
691   /* WB2: Any / eot */
692   if (p == end)   return TRUE;
693 
694   if (IS_NULL(prev)) {
695     prev = onigenc_get_prev_char_head(enc, start, p);
696     if (IS_NULL(prev)) return TRUE;
697   }
698 
699   cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
700   cto   = ONIGENC_MBC_TO_CODE(enc, p, end);
701 
702   from = wb_get_type(cfrom);
703   to   = wb_get_type(cto);
704 
705   /* short cut */
706   if (from == 0 && to == 0) goto WB999;
707 
708   /* WB3: CR + LF */
709   if (from == WB_CR && to == WB_LF) return FALSE;
710 
711   /* WB3a: (Newline|CR|LF) /  */
712   if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
713   /* WB3b: / (Newline|CR|LF) */
714   if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;
715 
716   /* WB3c: ZWJ + {Extended_Pictographic} */
717   if (from == WB_ZWJ) {
718     if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
719       return FALSE;
720   }
721 
722   /* WB3d: WSegSpace + WSegSpace */
723   if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;
724 
725   /* WB4:  X (Extend|Format|ZWJ)* -> X */
726   if (IS_WB_IGNORE_TAIL(to)) return FALSE;
727   if (IS_WB_IGNORE_TAIL(from)) {
728     while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
729       prev = pp;
730       cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
731       from = wb_get_type(cfrom);
732       if (! IS_WB_IGNORE_TAIL(from))
733         break;
734     }
735   }
736 
737   if (IS_WB_AHLetter(from)) {
738     /* WB5: AHLetter + AHLetter */
739     if (IS_WB_AHLetter(to)) return FALSE;
740 
741     /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
742     if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
743       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
744       if (r == 1) {
745         if (IS_WB_AHLetter(to2)) return FALSE;
746       }
747     }
748   }
749 
750   /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
751   if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
752     if (IS_WB_AHLetter(to)) {
753       from2 = WB_Any;
754       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
755         prev = pp;
756         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
757         from2 = wb_get_type(cfrom2);
758         if (! IS_WB_IGNORE_TAIL(from2))
759           break;
760       }
761 
762       if (IS_WB_AHLetter(from2)) return FALSE;
763     }
764   }
765 
766   if (from == WB_Hebrew_Letter) {
767     /* WB7a: Hebrew_Letter + Single_Quote */
768     if (to == WB_Single_Quote) return FALSE;
769 
770     /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
771     if (to == WB_Double_Quote) {
772       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
773       if (r == 1) {
774         if (to2 == WB_Hebrew_Letter) return FALSE;
775       }
776     }
777   }
778 
779   /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
780   if (from == WB_Double_Quote) {
781     if (to == WB_Hebrew_Letter) {
782       from2 = WB_Any;
783       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
784         prev = pp;
785         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
786         from2 = wb_get_type(cfrom2);
787         if (! IS_WB_IGNORE_TAIL(from2))
788           break;
789       }
790 
791       if (from2 == WB_Hebrew_Letter) return FALSE;
792     }
793   }
794 
795   if (to == WB_Numeric) {
796     /* WB8: Numeric + Numeric */
797     if (from == WB_Numeric) return FALSE;
798 
799     /* WB9: AHLetter + Numeric */
800     if (IS_WB_AHLetter(from)) return FALSE;
801 
802     /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
803     if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
804       from2 = WB_Any;
805       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
806         prev = pp;
807         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
808         from2 = wb_get_type(cfrom2);
809         if (! IS_WB_IGNORE_TAIL(from2))
810           break;
811       }
812 
813       if (from2 == WB_Numeric) return FALSE;
814     }
815   }
816 
817   if (from == WB_Numeric) {
818     /* WB10: Numeric + AHLetter */
819     if (IS_WB_AHLetter(to)) return FALSE;
820 
821     /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
822     if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
823       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
824       if (r == 1) {
825         if (to2 == WB_Numeric) return FALSE;
826       }
827     }
828   }
829 
830   /* WB13: Katakana + Katakana */
831   if (from == WB_Katakana && to == WB_Katakana) return FALSE;
832 
833   /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
834   if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
835       || from == WB_ExtendNumLet) {
836     if (to == WB_ExtendNumLet) return FALSE;
837   }
838 
839   /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
840   if (from == WB_ExtendNumLet) {
841     if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
842       return FALSE;
843   }
844 
845 
846   /* WB15:   sot (RI RI)* RI + RI */
847   /* WB16: [^RI] (RI RI)* RI + RI */
848   if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
849     int n = 0;
850     while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
851       cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
852       from2  = wb_get_type(cfrom2);
853       if (from2 != WB_Regional_Indicator)
854         break;
855 
856       n++;
857     }
858     if ((n % 2) == 0) return FALSE;
859   }
860 
861  WB999:
862   /* WB999: Any / Any */
863   return TRUE;
864 }
865 
866 #endif /* USE_UNICODE_WORD_BREAK */
867 
868 
869 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
870 
/*
 * Outcome of the two-code-point grapheme cluster test.  The two UNDEF
 * values mean the pair alone is not enough and the caller has to inspect
 * the text before it.
 */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK         = 0,  /* no boundary between the pair */
  EGCB_BREAK             = 1,  /* boundary between the pair */
  EGCB_BREAK_UNDEF_GB11  = 2,  /* ZWJ + Extended_Pictographic */
  EGCB_BREAK_UNDEF_RI_RI = 3   /* Regional_Indicator pair */
};
877 
/*
 * Grapheme-cluster-break classes assigned to code points by egcb_get_type().
 *
 * The numeric order matters: IS_CONTROL_CR_LF() tests the range
 * EGCB_CR..EGCB_Control and IS_HANGUL() tests for values >= EGCB_L.
 *
 * Values 9..12 are intentionally unused.  They belonged to the obsoleted
 * classes E_Base (9), E_Base_GAZ (10), E_Modifier (11) and
 * Glue_After_Zwj (12).
 */
enum EGCB_TYPE {
  EGCB_Other              = 0,
  EGCB_CR                 = 1,
  EGCB_LF                 = 2,
  EGCB_Control            = 3,
  EGCB_Extend             = 4,
  EGCB_Prepend            = 5,
  EGCB_Regional_Indicator = 6,
  EGCB_SpacingMark        = 7,
  EGCB_ZWJ                = 8,
  EGCB_L                  = 13,
  EGCB_LV                 = 14,
  EGCB_LVT                = 15,
  EGCB_T                  = 16,
  EGCB_V                  = 17
};
901 
902 typedef struct {
903   OnigCodePoint  start;
904   OnigCodePoint  end;
905   enum EGCB_TYPE type;
906 } EGCB_RANGE_TYPE;
907 
908 #include "unicode_egcb_data.c"
909 
910 static enum EGCB_TYPE
egcb_get_type(OnigCodePoint code)911 egcb_get_type(OnigCodePoint code)
912 {
913   OnigCodePoint low, high, x;
914   enum EGCB_TYPE type;
915 
916   for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {
917     x = (low + high) >> 1;
918     if (code > EGCB_RANGES[x].end)
919       low = x + 1;
920     else
921       high = x;
922   }
923 
924   type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&
925           code >= EGCB_RANGES[low].start) ?
926     EGCB_RANGES[low].type : EGCB_Other;
927 
928   return type;
929 }
930 
931 #define IS_CONTROL_CR_LF(code)   ((code) <= EGCB_Control && (code) >= EGCB_CR)
932 #define IS_HANGUL(code)          ((code) >= EGCB_L)
933 
934 /* GB1 and GB2 are outside of this function. */
935 static enum EGCB_BREAK_TYPE
unicode_egcb_is_break_2code(OnigCodePoint from_code,OnigCodePoint to_code)936 unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
937 {
938   enum EGCB_TYPE from;
939   enum EGCB_TYPE to;
940 
941   from = egcb_get_type(from_code);
942   to   = egcb_get_type(to_code);
943 
944   /* short cut */
945   if (from == 0 && to == 0) goto GB999;
946 
947   /* GB3 */
948   if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;
949   /* GB4 */
950   if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;
951   /* GB5 */
952   if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;
953 
954   if (IS_HANGUL(from) && IS_HANGUL(to)) {
955     /* GB6 */
956     if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;
957     /* GB7 */
958     if ((from == EGCB_LV || from == EGCB_V)
959         && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;
960 
961     /* GB8 */
962     if ((to == EGCB_T) && (from == EGCB_LVT || from == EGCB_T))
963       return EGCB_NOT_BREAK;
964 
965     goto GB999;
966   }
967 
968   /* GB9 */
969   if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;
970 
971   /* GB9a */
972   if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;
973   /* GB9b */
974   if (from == EGCB_Prepend) return EGCB_NOT_BREAK;
975 
976   /* GB10 removed */
977 
978   /* GB11 */
979   if (from == EGCB_ZWJ) {
980     if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
981       return EGCB_BREAK_UNDEF_GB11;
982 
983     goto GB999;
984   }
985 
986   /* GB12, GB13 */
987   if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {
988     return EGCB_BREAK_UNDEF_RI_RI;
989   }
990 
991  GB999:
992   return EGCB_BREAK;
993 }
994 
995 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
996 
997 extern int
onigenc_egcb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)998 onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
999                                const UChar* start, const UChar* end)
1000 {
1001   OnigCodePoint from;
1002   OnigCodePoint to;
1003 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
1004   enum EGCB_BREAK_TYPE btype;
1005   enum EGCB_TYPE type;
1006 #endif
1007 
1008   /* GB1 and GB2 */
1009   if (p == start) return 1;
1010   if (p == end)   return 1;
1011 
1012   if (IS_NULL(prev)) {
1013     prev = onigenc_get_prev_char_head(enc, start, p);
1014     if (IS_NULL(prev)) return 1;
1015   }
1016 
1017   from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1018   to   = ONIGENC_MBC_TO_CODE(enc, p, end);
1019 
1020 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
1021   if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
1022     return from != 0x000d || to != NEWLINE_CODE;
1023   }
1024 
1025   btype = unicode_egcb_is_break_2code(from, to);
1026   switch (btype) {
1027   case EGCB_NOT_BREAK:
1028     return 0;
1029     break;
1030   case EGCB_BREAK:
1031     return 1;
1032     break;
1033 
1034   case EGCB_BREAK_UNDEF_GB11:
1035     while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
1036       from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1037       if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
1038         return 0;
1039 
1040       type = egcb_get_type(from);
1041       if (type != EGCB_Extend)
1042         break;
1043     }
1044     break;
1045 
1046   case EGCB_BREAK_UNDEF_RI_RI:
1047     {
1048       int n = 0;
1049       while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
1050         from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1051         type = egcb_get_type(from);
1052         if (type != EGCB_Regional_Indicator)
1053           break;
1054 
1055         n++;
1056       }
1057       if ((n % 2) == 0) return 0;
1058     }
1059     break;
1060   }
1061 
1062   return 1;
1063 
1064 #else
1065   return from != 0x000d || to != NEWLINE_CODE;
1066 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
1067 }
1068 
1069 
1070 #define USER_DEFINED_PROPERTY_MAX_NUM  20
1071 
1072 typedef struct {
1073   int ctype;
1074   OnigCodePoint* ranges;
1075 } UserDefinedPropertyValue;
1076 
1077 static int UserDefinedPropertyNum;
1078 static UserDefinedPropertyValue
1079 UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];
1080 static st_table* UserDefinedPropertyTable;
1081 
1082 extern int
onig_unicode_define_user_property(const char * name,OnigCodePoint * ranges)1083 onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
1084 {
1085   UserDefinedPropertyValue* e;
1086   int r;
1087   int i;
1088   int n;
1089   int len;
1090   int c;
1091   char* s;
1092   UChar* uname;
1093 
1094   if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
1095     return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;
1096 
1097   len = (int )strlen(name);
1098   if (len >= PROPERTY_NAME_MAX_SIZE)
1099     return ONIGERR_TOO_LONG_PROPERTY_NAME;
1100 
1101   s = (char* )xmalloc(len + 1);
1102   if (s == 0)
1103     return ONIGERR_MEMORY;
1104 
1105   uname = (UChar* )name;
1106   n = 0;
1107   for (i = 0; i < len; i++) {
1108     c = uname[i];
1109     if (c < 0x20 || c >= 0x80) {
1110       xfree(s);
1111       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1112     }
1113 
1114     if (c != ' ' && c != '-' && c != '_') {
1115       s[n] = c;
1116       n++;
1117     }
1118   }
1119   s[n] = '\0';
1120 
1121   if (UserDefinedPropertyTable == 0) {
1122     UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
1123     if (IS_NULL(UserDefinedPropertyTable)) {
1124       xfree(s);
1125       return ONIGERR_MEMORY;
1126     }
1127   }
1128 
1129   e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
1130   e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
1131   e->ranges = ranges;
1132   r = onig_st_insert_strend(UserDefinedPropertyTable,
1133                             (const UChar* )s, (const UChar* )s + n,
1134                             (hash_data_type )((void* )e));
1135   if (r < 0) return r;
1136 
1137   UserDefinedPropertyNum++;
1138   return 0;
1139 }
1140 
1141 extern int
onigenc_unicode_is_code_ctype(OnigCodePoint code,unsigned int ctype)1142 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
1143 {
1144   if (
1145 #ifdef USE_UNICODE_PROPERTIES
1146       ctype <= ONIGENC_MAX_STD_CTYPE &&
1147 #endif
1148       code < 256) {
1149     return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
1150   }
1151 
1152   if (ctype >= CODE_RANGES_NUM) {
1153     int index = ctype - CODE_RANGES_NUM;
1154     if (index < UserDefinedPropertyNum)
1155       return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);
1156     else
1157       return ONIGERR_TYPE_BUG;
1158   }
1159 
1160   return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
1161 }
1162 
1163 
1164 extern int
onigenc_unicode_ctype_code_range(OnigCtype ctype,const OnigCodePoint * ranges[])1165 onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
1166 {
1167   if (ctype >= CODE_RANGES_NUM) {
1168     int index = ctype - CODE_RANGES_NUM;
1169     if (index < UserDefinedPropertyNum) {
1170       *ranges = UserDefinedPropertyRanges[index].ranges;
1171       return 0;
1172     }
1173     else
1174       return ONIGERR_TYPE_BUG;
1175   }
1176 
1177   *ranges = CodeRanges[ctype];
1178   return 0;
1179 }
1180 
1181 extern int
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype,OnigCodePoint * sb_out,const OnigCodePoint * ranges[])1182 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
1183                                       const OnigCodePoint* ranges[])
1184 {
1185   *sb_out = 0x00;
1186   return onigenc_unicode_ctype_code_range(ctype, ranges);
1187 }
1188 
1189 extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc,UChar * name,UChar * end)1190 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
1191 {
1192   int len;
1193   UChar *p;
1194   OnigCodePoint code;
1195   const struct PoolPropertyNameCtype* pc;
1196   char buf[PROPERTY_NAME_MAX_SIZE];
1197 
1198   p = name;
1199   len = 0;
1200   while (p < end) {
1201     code = ONIGENC_MBC_TO_CODE(enc, p, end);
1202     if (code >= 0x80)
1203       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1204 
1205     if (code != ' ' && code != '-' && code != '_') {
1206       buf[len++] = (char )code;
1207       if (len >= PROPERTY_NAME_MAX_SIZE)
1208         return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1209     }
1210 
1211     p += enclen(enc, p);
1212   }
1213 
1214   buf[len] = 0;
1215 
1216   if (UserDefinedPropertyTable != 0) {
1217     UserDefinedPropertyValue* e;
1218     e = (UserDefinedPropertyValue* )NULL;
1219     onig_st_lookup_strend(UserDefinedPropertyTable,
1220                           (const UChar* )buf, (const UChar* )buf + len,
1221                           (hash_data_type* )((void* )(&e)));
1222     if (e != 0) {
1223       return e->ctype;
1224     }
1225   }
1226 
1227   pc = unicode_lookup_property_name(buf, len);
1228   if (pc != 0) {
1229     /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
1230 #ifndef USE_UNICODE_PROPERTIES
1231     if (pc->ctype > ONIGENC_MAX_STD_CTYPE)
1232       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1233 #endif
1234 
1235     return (int )pc->ctype;
1236   }
1237 
1238   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1239 }
1240