1 /**********************************************************************
2   unicode.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2019  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/*
 * Record pairing a property name with a ctype.
 * Both members are small signed integers; NOTE(review): `name` is presumably
 * an offset into a string pool defined in the included property data file
 * rather than a pointer -- confirm against that file.
 */
struct PoolPropertyNameCtype {
  short name;   /* property-name reference */
  short ctype;  /* ctype associated with that name */
};
36 
/*
 * Test whether `code` has character type `ctype` by looking it up in
 * EncUNICODE_ISO_8859_1_CtypeTable and masking with CTYPE_TO_BIT(ctype).
 * The table has 256 entries, so `code` must be in the range 0..255.
 */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)

/*
 * Per-code ctype bit masks for code values 0x00..0xff, eight entries per row.
 * Each entry is a set of bits that is tested with CTYPE_TO_BIT(ctype).
 */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
74 
75 #include "st.h"
76 
77 #include "unicode_fold_data.c"
78 
79 extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end,UChar * fold)80 onigenc_unicode_mbc_case_fold(OnigEncoding enc,
81     OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
82     UChar* fold)
83 {
84   const struct ByUnfoldKey* buk;
85 
86   OnigCodePoint code;
87   int i, len, rlen;
88   const UChar *p = *pp;
89 
90   code = ONIGENC_MBC_TO_CODE(enc, p, end);
91   len = enclen(enc, p);
92   *pp += len;
93 
94 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
95   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
96     if (code == 0x0130) {
97       return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
98     }
99 #if 0
100     if (code == 0x0049) {
101       return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
102     }
103 #endif
104   }
105 #endif
106 
107   buk = onigenc_unicode_unfold_key(code);
108   if (buk != 0) {
109     if (buk->fold_len == 1) {
110       return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
111     }
112     else {
113       OnigCodePoint* addr;
114 
115       FOLDS_FOLD_ADDR_BUK(buk, addr);
116       rlen = 0;
117       for (i = 0; i < buk->fold_len; i++) {
118         OnigCodePoint c = addr[i];
119         len = ONIGENC_CODE_TO_MBC(enc, c, fold);
120         fold += len;
121         rlen += len;
122       }
123       return rlen;
124     }
125   }
126 
127   for (i = 0; i < len; i++) {
128     *fold++ = *p++;
129   }
130   return len;
131 }
132 
133 static int
apply_case_fold1(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)134 apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
135 {
136   int i, j, k, n, r;
137 
138   for (i = from; i < to; ) {
139     OnigCodePoint fold = *FOLDS1_FOLD(i);
140     n = FOLDS1_UNFOLDS_NUM(i);
141     for (j = 0; j < n; j++) {
142       OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
143 
144       r = (*f)(fold, &unfold, 1, arg);
145       if (r != 0) return r;
146       r = (*f)(unfold, &fold, 1, arg);
147       if (r != 0) return r;
148 
149       for (k = 0; k < j; k++) {
150         OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
151         r = (*f)(unfold, &unfold2, 1, arg);
152         if (r != 0) return r;
153         r = (*f)(unfold2, &unfold, 1, arg);
154         if (r != 0) return r;
155       }
156     }
157 
158     i = FOLDS1_NEXT_INDEX(i);
159   }
160 
161   return 0;
162 }
163 
164 static int
apply_case_fold2(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)165 apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
166 {
167   int i, j, k, n, r;
168 
169   for (i = from; i < to; ) {
170     OnigCodePoint* fold = FOLDS2_FOLD(i);
171     n = FOLDS2_UNFOLDS_NUM(i);
172     for (j = 0; j < n; j++) {
173       OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];
174 
175       r = (*f)(unfold, fold, 2, arg);
176       if (r != 0) return r;
177 
178       for (k = 0; k < j; k++) {
179         OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];
180         r = (*f)(unfold, &unfold2, 1, arg);
181         if (r != 0) return r;
182         r = (*f)(unfold2, &unfold, 1, arg);
183         if (r != 0) return r;
184       }
185     }
186 
187     i = FOLDS2_NEXT_INDEX(i);
188   }
189 
190   return 0;
191 }
192 
193 static int
apply_case_fold3(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)194 apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
195 {
196   int i, j, k, n, r;
197 
198   for (i = from; i < to; ) {
199     OnigCodePoint* fold = FOLDS3_FOLD(i);
200     n = FOLDS3_UNFOLDS_NUM(i);
201     for (j = 0; j < n; j++) {
202       OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];
203 
204       r = (*f)(unfold, fold, 3, arg);
205       if (r != 0) return r;
206 
207       for (k = 0; k < j; k++) {
208         OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];
209         r = (*f)(unfold, &unfold2, 1, arg);
210         if (r != 0) return r;
211         r = (*f)(unfold2, &unfold, 1, arg);
212         if (r != 0) return r;
213       }
214     }
215 
216     i = FOLDS3_NEXT_INDEX(i);
217   }
218 
219   return 0;
220 }
221 
222 extern int
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)223 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
224                                     OnigApplyAllCaseFoldFunc f, void* arg)
225 {
226   int r;
227 
228   r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);
229   if (r != 0) return r;
230 
231 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
232   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
233     code = 0x0131;
234     r = (*f)(0x0049, &code, 1, arg);
235     if (r != 0) return r;
236     code = 0x0049;
237     r = (*f)(0x0131, &code, 1, arg);
238     if (r != 0) return r;
239 
240     code = 0x0130;
241     r = (*f)(0x0069, &code, 1, arg);
242     if (r != 0) return r;
243     code = 0x0069;
244     r = (*f)(0x0130, &code, 1, arg);
245     if (r != 0) return r;
246   }
247   else {
248 #endif
249     r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
250     if (r != 0) return r;
251 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
252   }
253 #endif
254 
255   if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
256     return 0;
257 
258   r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
259   if (r != 0) return r;
260 
261 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
262   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
263 #endif
264     r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
265     if (r != 0) return r;
266 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
267   }
268 #endif
269 
270   r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
271   if (r != 0) return r;
272 
273   return 0;
274 }
275 
276 extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])277 onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
278     OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
279     OnigCaseFoldCodeItem items[])
280 {
281   int n, m, i, j, k, len, lens[3];
282   int index;
283   int fn, ncs[3];
284   OnigCodePoint cs[3][4];
285   OnigCodePoint code, codes[3], orig_codes[3];
286   const struct ByUnfoldKey* buk1;
287 
288   n = 0;
289 
290   code = ONIGENC_MBC_TO_CODE(enc, p, end);
291   len = enclen(enc, p);
292 
293 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
294   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
295     if (code == 0x0049) {
296       items[0].byte_len = len;
297       items[0].code_len = 1;
298       items[0].code[0]  = 0x0131;
299       return 1;
300     }
301     else if (code == 0x0130) {
302       items[0].byte_len = len;
303       items[0].code_len = 1;
304       items[0].code[0]  = 0x0069;
305       return 1;
306     }
307     else if (code == 0x0131) {
308       items[0].byte_len = len;
309       items[0].code_len = 1;
310       items[0].code[0]  = 0x0049;
311       return 1;
312     }
313     else if (code == 0x0069) {
314       items[0].byte_len = len;
315       items[0].code_len = 1;
316       items[0].code[0]  = 0x0130;
317       return 1;
318     }
319   }
320 #endif
321 
322   orig_codes[0] = code;
323   lens[0] = len;
324   p += len;
325 
326   buk1 = onigenc_unicode_unfold_key(orig_codes[0]);
327   if (buk1 != 0 && buk1->fold_len == 1) {
328     codes[0] = *FOLDS1_FOLD(buk1->index);
329   }
330   else
331     codes[0] = orig_codes[0];
332 
333   if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
334     goto fold1;
335 
336   if (p < end) {
337     const struct ByUnfoldKey* buk;
338 
339     code = ONIGENC_MBC_TO_CODE(enc, p, end);
340     orig_codes[1] = code;
341     len = enclen(enc, p);
342     lens[1] = lens[0] + len;
343     buk = onigenc_unicode_unfold_key(orig_codes[1]);
344     if (buk != 0 && buk->fold_len == 1) {
345       codes[1] = *FOLDS1_FOLD(buk->index);
346     }
347     else
348       codes[1] = orig_codes[1];
349 
350     p += len;
351     if (p < end) {
352       code = ONIGENC_MBC_TO_CODE(enc, p, end);
353       orig_codes[2] = code;
354       len = enclen(enc, p);
355       lens[2] = lens[1] + len;
356       buk = onigenc_unicode_unfold_key(orig_codes[2]);
357       if (buk != 0 && buk->fold_len == 1) {
358         codes[2] = *FOLDS1_FOLD(buk->index);
359       }
360       else
361         codes[2] = orig_codes[2];
362 
363       index = onigenc_unicode_fold3_key(codes);
364       if (index >= 0) {
365         m = FOLDS3_UNFOLDS_NUM(index);
366         for (i = 0; i < m; i++) {
367           items[n].byte_len = lens[2];
368           items[n].code_len = 1;
369           items[n].code[0]  = FOLDS3_UNFOLDS(index)[i];
370           n++;
371         }
372 
373         for (fn = 0; fn < 3; fn++) {
374           int sindex;
375           cs[fn][0] = FOLDS3_FOLD(index)[fn];
376           ncs[fn] = 1;
377           sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
378           if (sindex >= 0) {
379             int m = FOLDS1_UNFOLDS_NUM(sindex);
380             for (i = 0; i < m; i++) {
381               cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
382             }
383             ncs[fn] += m;
384           }
385         }
386 
387         for (i = 0; i < ncs[0]; i++) {
388           for (j = 0; j < ncs[1]; j++) {
389             for (k = 0; k < ncs[2]; k++) {
390               items[n].byte_len = lens[2];
391               items[n].code_len = 3;
392               items[n].code[0]  = cs[0][i];
393               items[n].code[1]  = cs[1][j];
394               items[n].code[2]  = cs[2][k];
395               if (items[n].code[0] == orig_codes[0] &&
396                   items[n].code[1] == orig_codes[1] &&
397                   items[n].code[2] == orig_codes[2])
398                 continue;
399               n++;
400             }
401           }
402         }
403 
404         return n;
405       }
406     }
407 
408     index = onigenc_unicode_fold2_key(codes);
409     if (index >= 0) {
410       m = FOLDS2_UNFOLDS_NUM(index);
411       for (i = 0; i < m; i++) {
412         items[n].byte_len = lens[1];
413         items[n].code_len = 1;
414         items[n].code[0]  = FOLDS2_UNFOLDS(index)[i];
415         n++;
416       }
417 
418       for (fn = 0; fn < 2; fn++) {
419         int sindex;
420         cs[fn][0] = FOLDS2_FOLD(index)[fn];
421         ncs[fn] = 1;
422         sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
423         if (sindex >= 0) {
424           int m = FOLDS1_UNFOLDS_NUM(sindex);
425           for (i = 0; i < m; i++) {
426             cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
427           }
428           ncs[fn] += m;
429         }
430       }
431 
432       for (i = 0; i < ncs[0]; i++) {
433         for (j = 0; j < ncs[1]; j++) {
434           items[n].byte_len = lens[1];
435           items[n].code_len = 2;
436           items[n].code[0]  = cs[0][i];
437           items[n].code[1]  = cs[1][j];
438           if (items[n].code[0] == orig_codes[0] &&
439               items[n].code[1] == orig_codes[1])
440             continue;
441           n++;
442         }
443       }
444 
445       return n;
446     }
447   }
448 
449  fold1:
450   if (buk1 != 0) {
451     if (buk1->fold_len == 1) {
452       int un;
453       items[0].byte_len = lens[0];
454       items[0].code_len = 1;
455       items[0].code[0]  = *FOLDS1_FOLD(buk1->index);
456       n++;
457 
458       un = FOLDS1_UNFOLDS_NUM(buk1->index);
459       for (i = 0; i < un; i++) {
460         OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i];
461         if (unfold != orig_codes[0]) {
462           items[n].byte_len = lens[0];
463           items[n].code_len = 1;
464           items[n].code[0]  = unfold;
465           n++;
466         }
467       }
468     }
469     else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
470       if (buk1->fold_len == 2) {
471         m = FOLDS2_UNFOLDS_NUM(buk1->index);
472         for (i = 0; i < m; i++) {
473           OnigCodePoint unfold = FOLDS2_UNFOLDS(buk1->index)[i];
474           if (unfold == orig_codes[0]) continue;
475 
476           items[n].byte_len = lens[0];
477           items[n].code_len = 1;
478           items[n].code[0]  = unfold;
479           n++;
480         }
481 
482         for (fn = 0; fn < 2; fn++) {
483           int index;
484           cs[fn][0] = FOLDS2_FOLD(buk1->index)[fn];
485           ncs[fn] = 1;
486           index = onigenc_unicode_fold1_key(&cs[fn][0]);
487           if (index >= 0) {
488             int m = FOLDS1_UNFOLDS_NUM(index);
489             for (i = 0; i < m; i++) {
490               cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
491             }
492             ncs[fn] += m;
493           }
494         }
495 
496         for (i = 0; i < ncs[0]; i++) {
497           for (j = 0; j < ncs[1]; j++) {
498             items[n].byte_len = lens[0];
499             items[n].code_len = 2;
500             items[n].code[0]  = cs[0][i];
501             items[n].code[1]  = cs[1][j];
502             n++;
503           }
504         }
505       }
506       else { /* fold_len == 3 */
507         m = FOLDS3_UNFOLDS_NUM(buk1->index);
508         for (i = 0; i < m; i++) {
509           OnigCodePoint unfold = FOLDS3_UNFOLDS(buk1->index)[i];
510           if (unfold == orig_codes[0]) continue;
511 
512           items[n].byte_len = lens[0];
513           items[n].code_len = 1;
514           items[n].code[0]  = unfold;
515           n++;
516         }
517 
518         for (fn = 0; fn < 3; fn++) {
519           int index;
520           cs[fn][0] = FOLDS3_FOLD(buk1->index)[fn];
521           ncs[fn] = 1;
522           index = onigenc_unicode_fold1_key(&cs[fn][0]);
523           if (index >= 0) {
524             int m = FOLDS1_UNFOLDS_NUM(index);
525             for (i = 0; i < m; i++) {
526               cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
527             }
528             ncs[fn] += m;
529           }
530         }
531 
532         for (i = 0; i < ncs[0]; i++) {
533           for (j = 0; j < ncs[1]; j++) {
534             for (k = 0; k < ncs[2]; k++) {
535               items[n].byte_len = lens[0];
536               items[n].code_len = 3;
537               items[n].code[0]  = cs[0][i];
538               items[n].code[1]  = cs[1][j];
539               items[n].code[2]  = cs[2][k];
540               n++;
541             }
542           }
543         }
544       }
545     }
546   }
547   else {
548     int index = onigenc_unicode_fold1_key(orig_codes);
549     if (index >= 0) {
550       int m = FOLDS1_UNFOLDS_NUM(index);
551       for (i = 0; i < m; i++) {
552         items[n].byte_len = lens[0];
553         items[n].code_len = 1;
554         items[n].code[0]  = FOLDS1_UNFOLDS(index)[i];
555         n++;
556       }
557     }
558   }
559 
560   return n;
561 }
562 
563 #ifdef USE_UNICODE_PROPERTIES
564 #include "unicode_property_data.c"
565 #else
566 #include "unicode_property_data_posix.c"
567 #endif
568 
569 
570 #ifdef USE_UNICODE_WORD_BREAK
571 
/*
 * Word-break classes used by the word boundary code below.
 * WB_Any must stay 0: the break test compares types against 0 as a short
 * cut, and wb_get_type() returns WB_Any for codes outside every range.
 */
enum WB_TYPE {
  WB_Any                = 0,
  WB_ALetter            = 1,
  WB_CR                 = 2,
  WB_Double_Quote       = 3,
  WB_Extend             = 4,
  WB_ExtendNumLet       = 5,
  WB_Format             = 6,
  WB_Hebrew_Letter      = 7,
  WB_Katakana           = 8,
  WB_LF                 = 9,
  WB_MidLetter          = 10,
  WB_MidNum             = 11,
  WB_MidNumLet          = 12,
  WB_Newline            = 13,
  WB_Numeric            = 14,
  WB_Regional_Indicator = 15,
  WB_Single_Quote       = 16,
  WB_WSegSpace          = 17,
  WB_ZWJ                = 18
};
593 
/*
 * One entry of the word-break range table: code points from start to end
 * (both inclusive, as tested by wb_get_type()) have word-break class `type`.
 */
typedef struct {
  OnigCodePoint start;
  OnigCodePoint end;
  enum WB_TYPE  type;
} WB_RANGE_TYPE;
599 
600 #include "unicode_wb_data.c"
601 
602 static enum WB_TYPE
wb_get_type(OnigCodePoint code)603 wb_get_type(OnigCodePoint code)
604 {
605   OnigCodePoint low, high, x;
606   enum WB_TYPE type;
607 
608   for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {
609     x = (low + high) >> 1;
610     if (code > WB_RANGES[x].end)
611       low = x + 1;
612     else
613       high = x;
614   }
615 
616   type = (low < (OnigCodePoint )WB_RANGE_NUM &&
617           code >= WB_RANGES[low].start) ?
618     WB_RANGES[low].type : WB_Any;
619 
620   return type;
621 }
622 
/* Classes skipped over when looking for the neighbouring "main" character
   (Extend, Format, ZWJ). */
#define IS_WB_IGNORE_TAIL(t)  ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
/* AHLetter: ALetter or Hebrew_Letter. */
#define IS_WB_AHLetter(t)     ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
/* MidNumLetQ: MidNumLet or Single_Quote. */
#define IS_WB_MidNumLetQ(t)   ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
626 
627 static int
wb_get_next_main_code(OnigEncoding enc,UChar * p,const UChar * end,OnigCodePoint * rcode,enum WB_TYPE * rtype)628 wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
629                       OnigCodePoint* rcode, enum WB_TYPE* rtype)
630 {
631   OnigCodePoint code;
632   enum WB_TYPE type;
633 
634   while (TRUE) {
635     p += enclen(enc, p);
636     if (p >= end) break;
637 
638     code = ONIGENC_MBC_TO_CODE(enc, p, end);
639     type = wb_get_type(code);
640     if (! IS_WB_IGNORE_TAIL(type)) {
641       *rcode = code;
642       *rtype = type;
643       return 1;
644     }
645   }
646 
647   return 0;
648 }
649 
650 extern int
onigenc_wb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)651 onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
652                              const UChar* start, const UChar* end)
653 {
654   int r;
655   UChar* pp;
656   OnigCodePoint cfrom;
657   OnigCodePoint cfrom2;
658   OnigCodePoint cto;
659   OnigCodePoint cto2;
660   enum WB_TYPE from;
661   enum WB_TYPE from2;
662   enum WB_TYPE to;
663   enum WB_TYPE to2;
664 
665   /* WB1: sot / Any */
666   if (p == start) return TRUE;
667   /* WB2: Any / eot */
668   if (p == end)   return TRUE;
669 
670   if (IS_NULL(prev)) {
671     prev = onigenc_get_prev_char_head(enc, start, p);
672     if (IS_NULL(prev)) return TRUE;
673   }
674 
675   cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
676   cto   = ONIGENC_MBC_TO_CODE(enc, p, end);
677 
678   from = wb_get_type(cfrom);
679   to   = wb_get_type(cto);
680 
681   /* short cut */
682   if (from == 0 && to == 0) goto WB999;
683 
684   /* WB3: CR + LF */
685   if (from == WB_CR && to == WB_LF) return FALSE;
686 
687   /* WB3a: (Newline|CR|LF) /  */
688   if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
689   /* WB3b: / (Newline|CR|LF) */
690   if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;
691 
692   /* WB3c: ZWJ + {Extended_Pictographic} */
693   if (from == WB_ZWJ) {
694     if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
695       return FALSE;
696   }
697 
698   /* WB3d: WSegSpace + WSegSpace */
699   if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;
700 
701   /* WB4:  X (Extend|Format|ZWJ)* -> X */
702   if (IS_WB_IGNORE_TAIL(to)) return FALSE;
703   if (IS_WB_IGNORE_TAIL(from)) {
704     while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
705       prev = pp;
706       cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
707       from = wb_get_type(cfrom);
708       if (! IS_WB_IGNORE_TAIL(from))
709         break;
710     }
711   }
712 
713   if (IS_WB_AHLetter(from)) {
714     /* WB5: AHLetter + AHLetter */
715     if (IS_WB_AHLetter(to)) return FALSE;
716 
717     /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
718     if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
719       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
720       if (r == 1) {
721         if (IS_WB_AHLetter(to2)) return FALSE;
722       }
723     }
724   }
725 
726   /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
727   if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
728     if (IS_WB_AHLetter(to)) {
729       from2 = WB_Any;
730       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
731         prev = pp;
732         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
733         from2 = wb_get_type(cfrom2);
734         if (! IS_WB_IGNORE_TAIL(from2))
735           break;
736       }
737 
738       if (IS_WB_AHLetter(from2)) return FALSE;
739     }
740   }
741 
742   if (from == WB_Hebrew_Letter) {
743     /* WB7a: Hebrew_Letter + Single_Quote */
744     if (to == WB_Single_Quote) return FALSE;
745 
746     /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
747     if (to == WB_Double_Quote) {
748       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
749       if (r == 1) {
750         if (to2 == WB_Hebrew_Letter) return FALSE;
751       }
752     }
753   }
754 
755   /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
756   if (from == WB_Double_Quote) {
757     if (to == WB_Hebrew_Letter) {
758       from2 = WB_Any;
759       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
760         prev = pp;
761         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
762         from2 = wb_get_type(cfrom2);
763         if (! IS_WB_IGNORE_TAIL(from2))
764           break;
765       }
766 
767       if (from2 == WB_Hebrew_Letter) return FALSE;
768     }
769   }
770 
771   if (to == WB_Numeric) {
772     /* WB8: Numeric + Numeric */
773     if (from == WB_Numeric) return FALSE;
774 
775     /* WB9: AHLetter + Numeric */
776     if (IS_WB_AHLetter(from)) return FALSE;
777 
778     /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
779     if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
780       from2 = WB_Any;
781       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
782         prev = pp;
783         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
784         from2 = wb_get_type(cfrom2);
785         if (! IS_WB_IGNORE_TAIL(from2))
786           break;
787       }
788 
789       if (from2 == WB_Numeric) return FALSE;
790     }
791   }
792 
793   if (from == WB_Numeric) {
794     /* WB10: Numeric + AHLetter */
795     if (IS_WB_AHLetter(to)) return FALSE;
796 
797     /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
798     if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
799       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
800       if (r == 1) {
801         if (to2 == WB_Numeric) return FALSE;
802       }
803     }
804   }
805 
806   /* WB13: Katakana + Katakana */
807   if (from == WB_Katakana && to == WB_Katakana) return FALSE;
808 
809   /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
810   if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
811       || from == WB_ExtendNumLet) {
812     if (to == WB_ExtendNumLet) return FALSE;
813   }
814 
815   /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
816   if (from == WB_ExtendNumLet) {
817     if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
818       return FALSE;
819   }
820 
821 
822   /* WB15:   sot (RI RI)* RI + RI */
823   /* WB16: [^RI] (RI RI)* RI + RI */
824   if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
825     int n = 0;
826     while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
827       cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
828       from2  = wb_get_type(cfrom2);
829       if (from2 != WB_Regional_Indicator)
830         break;
831 
832       n++;
833     }
834     if ((n % 2) == 0) return FALSE;
835   }
836 
837  WB999:
838   /* WB999: Any / Any */
839   return TRUE;
840 }
841 
842 #endif /* USE_UNICODE_WORD_BREAK */
843 
844 
845 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
846 
/*
 * Result of the two-code-point grapheme cluster test.
 * The two UNDEF values mean the pair alone cannot decide: the caller
 * (onigenc_egcb_is_break_position) scans the preceding text to resolve them.
 */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK = 0,
  EGCB_BREAK     = 1,
  EGCB_BREAK_UNDEF_GB11  = 2,  /* ZWJ followed by Extended_Pictographic */
  EGCB_BREAK_UNDEF_RI_RI = 3   /* Regional_Indicator pair */
};
853 
/*
 * Grapheme cluster break classes.
 * The numeric order matters: IS_CONTROL_CR_LF() tests the range
 * EGCB_CR..EGCB_Control, and IS_HANGUL() tests for values >= EGCB_L, so the
 * Hangul classes must remain the highest values.
 */
enum EGCB_TYPE {
  EGCB_Other   = 0,
  EGCB_CR      = 1,
  EGCB_LF      = 2,
  EGCB_Control = 3,
  EGCB_Extend  = 4,
  EGCB_Prepend = 5,
  EGCB_Regional_Indicator = 6,
  EGCB_SpacingMark = 7,
  EGCB_ZWJ         = 8,
#if 0
  /* obsoleted */
  EGCB_E_Base         = 9,
  EGCB_E_Base_GAZ     = 10,
  EGCB_E_Modifier     = 11,
  EGCB_Glue_After_Zwj = 12,
#endif
  EGCB_L   = 13,
  EGCB_LV  = 14,
  EGCB_LVT = 15,
  EGCB_T   = 16,
  EGCB_V   = 17
};
877 
/*
 * One entry of the grapheme cluster break range table: code points from
 * start to end (both inclusive, as tested by egcb_get_type()) have class
 * `type`.
 */
typedef struct {
  OnigCodePoint  start;
  OnigCodePoint  end;
  enum EGCB_TYPE type;
} EGCB_RANGE_TYPE;
883 
884 #include "unicode_egcb_data.c"
885 
886 static enum EGCB_TYPE
egcb_get_type(OnigCodePoint code)887 egcb_get_type(OnigCodePoint code)
888 {
889   OnigCodePoint low, high, x;
890   enum EGCB_TYPE type;
891 
892   for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {
893     x = (low + high) >> 1;
894     if (code > EGCB_RANGES[x].end)
895       low = x + 1;
896     else
897       high = x;
898   }
899 
900   type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&
901           code >= EGCB_RANGES[low].start) ?
902     EGCB_RANGES[low].type : EGCB_Other;
903 
904   return type;
905 }
906 
/* Both macros take an EGCB_TYPE value (despite the parameter name `code`)
   and rely on the numeric order of that enum. */
/* True for EGCB_CR, EGCB_LF and EGCB_Control (values 1..3). */
#define IS_CONTROL_CR_LF(code)   ((code) <= EGCB_Control && (code) >= EGCB_CR)
/* True for the Hangul classes L, LV, LVT, T and V (values >= EGCB_L). */
#define IS_HANGUL(code)          ((code) >= EGCB_L)
909 
910 /* GB1 and GB2 are outside of this function. */
911 static enum EGCB_BREAK_TYPE
unicode_egcb_is_break_2code(OnigCodePoint from_code,OnigCodePoint to_code)912 unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
913 {
914   enum EGCB_TYPE from;
915   enum EGCB_TYPE to;
916 
917   from = egcb_get_type(from_code);
918   to   = egcb_get_type(to_code);
919 
920   /* short cut */
921   if (from == 0 && to == 0) goto GB999;
922 
923   /* GB3 */
924   if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;
925   /* GB4 */
926   if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;
927   /* GB5 */
928   if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;
929 
930   if (IS_HANGUL(from) && IS_HANGUL(to)) {
931     /* GB6 */
932     if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;
933     /* GB7 */
934     if ((from == EGCB_LV || from == EGCB_V)
935         && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;
936 
937     /* GB8 */
938     if ((to == EGCB_T) && (from == EGCB_LVT || from == EGCB_T))
939       return EGCB_NOT_BREAK;
940 
941     goto GB999;
942   }
943 
944   /* GB9 */
945   if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;
946 
947   /* GB9a */
948   if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;
949   /* GB9b */
950   if (from == EGCB_Prepend) return EGCB_NOT_BREAK;
951 
952   /* GB10 removed */
953 
954   /* GB11 */
955   if (from == EGCB_ZWJ) {
956     if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
957       return EGCB_BREAK_UNDEF_GB11;
958 
959     goto GB999;
960   }
961 
962   /* GB12, GB13 */
963   if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {
964     return EGCB_BREAK_UNDEF_RI_RI;
965   }
966 
967  GB999:
968   return EGCB_BREAK;
969 }
970 
971 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
972 
973 extern int
onigenc_egcb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)974 onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
975                                const UChar* start, const UChar* end)
976 {
977   OnigCodePoint from;
978   OnigCodePoint to;
979 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
980   enum EGCB_BREAK_TYPE btype;
981   enum EGCB_TYPE type;
982 #endif
983 
984   /* GB1 and GB2 */
985   if (p == start) return 1;
986   if (p == end)   return 1;
987 
988   if (IS_NULL(prev)) {
989     prev = onigenc_get_prev_char_head(enc, start, p);
990     if (IS_NULL(prev)) return 1;
991   }
992 
993   from = ONIGENC_MBC_TO_CODE(enc, prev, end);
994   to   = ONIGENC_MBC_TO_CODE(enc, p, end);
995 
996 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
997   if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
998     return from != 0x000d || to != NEWLINE_CODE;
999   }
1000 
1001   btype = unicode_egcb_is_break_2code(from, to);
1002   switch (btype) {
1003   case EGCB_NOT_BREAK:
1004     return 0;
1005     break;
1006   case EGCB_BREAK:
1007     return 1;
1008     break;
1009 
1010   case EGCB_BREAK_UNDEF_GB11:
1011     while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
1012       from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1013       if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
1014         return 0;
1015 
1016       type = egcb_get_type(from);
1017       if (type != EGCB_Extend)
1018         break;
1019     }
1020     break;
1021 
1022   case EGCB_BREAK_UNDEF_RI_RI:
1023     {
1024       int n = 0;
1025       while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
1026         from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1027         type = egcb_get_type(from);
1028         if (type != EGCB_Regional_Indicator)
1029           break;
1030 
1031         n++;
1032       }
1033       if ((n % 2) == 0) return 0;
1034     }
1035     break;
1036   }
1037 
1038   return 1;
1039 
1040 #else
1041   return from != 0x000d || to != NEWLINE_CODE;
1042 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
1043 }
1044 
1045 
/* Upper bound on the number of properties that can be registered through
   onig_unicode_define_user_property(). */
#define USER_DEFINED_PROPERTY_MAX_NUM  20

/* One registered user-defined property. */
typedef struct {
  int ctype;              /* CODE_RANGES_NUM + slot index of this entry */
  OnigCodePoint* ranges;  /* range data pointer exactly as passed in by the
                             caller (stored, not copied) */
} UserDefinedPropertyValue;

/* Number of slots of UserDefinedPropertyRanges currently in use. */
static int UserDefinedPropertyNum;
/* Storage for the registered properties, filled in registration order. */
static UserDefinedPropertyValue
UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];
/* Maps a normalized property name to a pointer into
   UserDefinedPropertyRanges; created on the first registration. */
static st_table* UserDefinedPropertyTable;
1057 
1058 extern int
onig_unicode_define_user_property(const char * name,OnigCodePoint * ranges)1059 onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
1060 {
1061   UserDefinedPropertyValue* e;
1062   int r;
1063   int i;
1064   int n;
1065   int len;
1066   int c;
1067   char* s;
1068   UChar* uname;
1069 
1070   if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
1071     return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;
1072 
1073   len = (int )strlen(name);
1074   if (len >= PROPERTY_NAME_MAX_SIZE)
1075     return ONIGERR_TOO_LONG_PROPERTY_NAME;
1076 
1077   s = (char* )xmalloc(len + 1);
1078   if (s == 0)
1079     return ONIGERR_MEMORY;
1080 
1081   uname = (UChar* )name;
1082   n = 0;
1083   for (i = 0; i < len; i++) {
1084     c = uname[i];
1085     if (c < 0x20 || c >= 0x80) {
1086       xfree(s);
1087       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1088     }
1089 
1090     if (c != ' ' && c != '-' && c != '_') {
1091       s[n] = c;
1092       n++;
1093     }
1094   }
1095   s[n] = '\0';
1096 
1097   if (UserDefinedPropertyTable == 0) {
1098     UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
1099     if (IS_NULL(UserDefinedPropertyTable)) {
1100       xfree(s);
1101       return ONIGERR_MEMORY;
1102     }
1103   }
1104 
1105   e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
1106   e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
1107   e->ranges = ranges;
1108   r = onig_st_insert_strend(UserDefinedPropertyTable,
1109                             (const UChar* )s, (const UChar* )s + n,
1110                             (hash_data_type )((void* )e));
1111   if (r < 0) return r;
1112 
1113   UserDefinedPropertyNum++;
1114   return 0;
1115 }
1116 
1117 extern int
onigenc_unicode_is_code_ctype(OnigCodePoint code,unsigned int ctype)1118 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
1119 {
1120   if (
1121 #ifdef USE_UNICODE_PROPERTIES
1122       ctype <= ONIGENC_MAX_STD_CTYPE &&
1123 #endif
1124       code < 256) {
1125     return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
1126   }
1127 
1128   if (ctype >= CODE_RANGES_NUM) {
1129     int index = ctype - CODE_RANGES_NUM;
1130     if (index < UserDefinedPropertyNum)
1131       return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);
1132     else
1133       return ONIGERR_TYPE_BUG;
1134   }
1135 
1136   return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
1137 }
1138 
1139 
1140 extern int
onigenc_unicode_ctype_code_range(OnigCtype ctype,const OnigCodePoint * ranges[])1141 onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
1142 {
1143   if (ctype >= CODE_RANGES_NUM) {
1144     int index = ctype - CODE_RANGES_NUM;
1145     if (index < UserDefinedPropertyNum) {
1146       *ranges = UserDefinedPropertyRanges[index].ranges;
1147       return 0;
1148     }
1149     else
1150       return ONIGERR_TYPE_BUG;
1151   }
1152 
1153   *ranges = CodeRanges[ctype];
1154   return 0;
1155 }
1156 
1157 extern int
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype,OnigCodePoint * sb_out,const OnigCodePoint * ranges[])1158 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
1159                                       const OnigCodePoint* ranges[])
1160 {
1161   *sb_out = 0x00;
1162   return onigenc_unicode_ctype_code_range(ctype, ranges);
1163 }
1164 
1165 extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc,UChar * name,UChar * end)1166 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
1167 {
1168   int len;
1169   UChar *p;
1170   OnigCodePoint code;
1171   const struct PoolPropertyNameCtype* pc;
1172   char buf[PROPERTY_NAME_MAX_SIZE];
1173 
1174   p = name;
1175   len = 0;
1176   while (p < end) {
1177     code = ONIGENC_MBC_TO_CODE(enc, p, end);
1178     if (code >= 0x80)
1179       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1180 
1181     if (code != ' ' && code != '-' && code != '_') {
1182       buf[len++] = (char )code;
1183       if (len >= PROPERTY_NAME_MAX_SIZE)
1184         return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1185     }
1186 
1187     p += enclen(enc, p);
1188   }
1189 
1190   buf[len] = 0;
1191 
1192   if (UserDefinedPropertyTable != 0) {
1193     UserDefinedPropertyValue* e;
1194     e = (UserDefinedPropertyValue* )NULL;
1195     onig_st_lookup_strend(UserDefinedPropertyTable,
1196                           (const UChar* )buf, (const UChar* )buf + len,
1197                           (hash_data_type* )((void* )(&e)));
1198     if (e != 0) {
1199       return e->ctype;
1200     }
1201   }
1202 
1203   pc = unicode_lookup_property_name(buf, len);
1204   if (pc != 0) {
1205     /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
1206 #ifndef USE_UNICODE_PROPERTIES
1207     if (pc->ctype > ONIGENC_MAX_STD_CTYPE)
1208       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1209 #endif
1210 
1211     return (int )pc->ctype;
1212   }
1213 
1214   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1215 }
1216