1 /**********************************************************************
2 unicode.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2020 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regint.h"
31
/* One record of the property-name lookup data: a name reference paired with
   the ctype it maps to.  Both members are plain short integers.
   NOTE(review): `name` is presumably an offset into a string pool of the
   generated property data -- confirm against that file. */
struct PoolPropertyNameCtype {
  short name;
  short ctype;
};
36
/* Non-zero when the table entry for `code` has the bit selected by
   CTYPE_TO_BIT(ctype) set.  `code` is used directly as an index into the
   256-entry EncUNICODE_ISO_8859_1_CtypeTable, so it must be below 256. */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
39
/* Ctype bit masks for code points 0x00-0xFF, eight entries per row and
   indexed by code point.  Individual bits are tested through
   ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE / CTYPE_TO_BIT. */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
74
75 #include "st.h"
76
77 #include "unicode_fold_data.c"
78
79 extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * fold)80 onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag,
81 const UChar** pp, const UChar* end, UChar* fold)
82 {
83 const struct ByUnfoldKey* buk;
84
85 OnigCodePoint code;
86 int i, len, rlen;
87 const UChar *p = *pp;
88
89 code = ONIGENC_MBC_TO_CODE(enc, p, end);
90 len = enclen(enc, p);
91 *pp += len;
92
93 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
94 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
95 if (code == 0x0130) {
96 return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
97 }
98 #if 0
99 if (code == 0x0049) {
100 return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
101 }
102 #endif
103 }
104 #endif
105
106 if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) {
107 buk = onigenc_unicode_unfold_key(code);
108 if (buk != 0) {
109 if (buk->fold_len == 1) {
110 if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
111 ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index)))
112 return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
113 }
114 else {
115 OnigCodePoint* addr;
116
117 FOLDS_FOLD_ADDR_BUK(buk, addr);
118 rlen = 0;
119 for (i = 0; i < buk->fold_len; i++) {
120 OnigCodePoint c = addr[i];
121 len = ONIGENC_CODE_TO_MBC(enc, c, fold);
122 fold += len;
123 rlen += len;
124 }
125 return rlen;
126 }
127 }
128 }
129
130 for (i = 0; i < len; i++) {
131 *fold++ = *p++;
132 }
133 return len;
134 }
135
136 static int
apply_case_fold1(OnigCaseFoldType flag,int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)137 apply_case_fold1(OnigCaseFoldType flag, int from, int to,
138 OnigApplyAllCaseFoldFunc f, void* arg)
139 {
140 int i, j, k, n, r;
141
142 for (i = from; i < to; ) {
143 OnigCodePoint fold = *FOLDS1_FOLD(i);
144 if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(fold)) break;
145
146 n = FOLDS1_UNFOLDS_NUM(i);
147 for (j = 0; j < n; j++) {
148 OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
149
150 if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold))
151 continue;
152
153 r = (*f)(fold, &unfold, 1, arg);
154 if (r != 0) return r;
155 r = (*f)(unfold, &fold, 1, arg);
156 if (r != 0) return r;
157
158 for (k = 0; k < j; k++) {
159 OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
160 if (CASE_FOLD_IS_ASCII_ONLY(flag) &&
161 ! ONIGENC_IS_ASCII_CODE(unfold2)) continue;
162
163 r = (*f)(unfold, &unfold2, 1, arg);
164 if (r != 0) return r;
165 r = (*f)(unfold2, &unfold, 1, arg);
166 if (r != 0) return r;
167 }
168 }
169
170 i = FOLDS1_NEXT_INDEX(i);
171 }
172
173 return 0;
174 }
175
176 static int
apply_case_fold2(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)177 apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
178 {
179 int i, j, k, n, r;
180
181 for (i = from; i < to; ) {
182 OnigCodePoint* fold = FOLDS2_FOLD(i);
183 n = FOLDS2_UNFOLDS_NUM(i);
184 for (j = 0; j < n; j++) {
185 OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];
186
187 r = (*f)(unfold, fold, 2, arg);
188 if (r != 0) return r;
189
190 for (k = 0; k < j; k++) {
191 OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];
192 r = (*f)(unfold, &unfold2, 1, arg);
193 if (r != 0) return r;
194 r = (*f)(unfold2, &unfold, 1, arg);
195 if (r != 0) return r;
196 }
197 }
198
199 i = FOLDS2_NEXT_INDEX(i);
200 }
201
202 return 0;
203 }
204
205 static int
apply_case_fold3(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)206 apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
207 {
208 int i, j, k, n, r;
209
210 for (i = from; i < to; ) {
211 OnigCodePoint* fold = FOLDS3_FOLD(i);
212 n = FOLDS3_UNFOLDS_NUM(i);
213 for (j = 0; j < n; j++) {
214 OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];
215
216 r = (*f)(unfold, fold, 3, arg);
217 if (r != 0) return r;
218
219 for (k = 0; k < j; k++) {
220 OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];
221 r = (*f)(unfold, &unfold2, 1, arg);
222 if (r != 0) return r;
223 r = (*f)(unfold2, &unfold, 1, arg);
224 if (r != 0) return r;
225 }
226 }
227
228 i = FOLDS3_NEXT_INDEX(i);
229 }
230
231 return 0;
232 }
233
234 extern int
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)235 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
236 OnigApplyAllCaseFoldFunc f, void* arg)
237 {
238 int r;
239
240 r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg);
241 if (r != 0) return r;
242
243 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
244 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
245 code = 0x0131;
246 r = (*f)(0x0049, &code, 1, arg);
247 if (r != 0) return r;
248 code = 0x0049;
249 r = (*f)(0x0131, &code, 1, arg);
250 if (r != 0) return r;
251
252 code = 0x0130;
253 r = (*f)(0x0069, &code, 1, arg);
254 if (r != 0) return r;
255 code = 0x0069;
256 r = (*f)(0x0130, &code, 1, arg);
257 if (r != 0) return r;
258 }
259 else {
260 #endif
261 r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
262 if (r != 0) return r;
263 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
264 }
265 #endif
266
267 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
268 return 0;
269
270 r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
271 if (r != 0) return r;
272
273 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
274 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
275 #endif
276 r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
277 if (r != 0) return r;
278 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
279 }
280 #endif
281
282 r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
283 if (r != 0) return r;
284
285 return 0;
286 }
287
/*
 * Collects the case-fold alternatives for the text starting at `p` and
 * stores them in items[].
 *
 *   enc    encoding used to decode the input
 *   flag   case-fold options (ASCII-only, Turkic mode, multi-char folds)
 *   p,end  input range; up to three characters are decoded when
 *          INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is set in `flag`
 *   items  output array; each item records the number of source bytes it
 *          covers (byte_len) and the alternative code point sequence
 *          (code_len, code[])
 *
 * Returns the number of items written.  In ASCII-only mode a non-ASCII
 * first character yields 0.
 *
 * NOTE(review): no bound on the number of items is enforced here; the caller
 * is presumably expected to supply a sufficiently large array -- confirm.
 */
extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
    OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
    OnigCaseFoldCodeItem items[])
{
  /* lens[k]: cumulative byte length of the first k+1 input characters. */
  int n, m, i, j, k, len, lens[3];
  int index;
  /* ncs[fn]: number of valid entries in cs[fn]. */
  int fn, ncs[3];
  /* cs[fn]: the fn-th code of a fold sequence followed by its unfolds.
     Assumes an entry never has more than 3 unfolds -- TODO confirm. */
  OnigCodePoint cs[3][4];
  /* codes[k]: single-code fold of orig_codes[k] when one exists, otherwise
     orig_codes[k] itself. */
  OnigCodePoint code, codes[3], orig_codes[3];
  const struct ByUnfoldKey* buk1;

  n = 0;

  code = ONIGENC_MBC_TO_CODE(enc, p, end);
  if (CASE_FOLD_IS_ASCII_ONLY(flag)) {
    if (! ONIGENC_IS_ASCII_CODE(code)) return n;
  }
  len = enclen(enc, p);

#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
  /* Turkic mode: I <-> dotless i and i <-> I-with-dot are answered directly,
     with exactly one alternative each. */
  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    if (code == 0x0049) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0131;
      return 1;
    }
    else if (code == 0x0130) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0069;
      return 1;
    }
    else if (code == 0x0131) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0049;
      return 1;
    }
    else if (code == 0x0069) {
      items[0].byte_len = len;
      items[0].code_len = 1;
      items[0].code[0] = 0x0130;
      return 1;
    }
  }
#endif

  orig_codes[0] = code;
  lens[0] = len;
  p += len;

  buk1 = onigenc_unicode_unfold_key(orig_codes[0]);
  if (buk1 != 0 && buk1->fold_len == 1) {
    codes[0] = *FOLDS1_FOLD(buk1->index);
  }
  else
    codes[0] = orig_codes[0];

  /* Without multi-char folding only the first character is considered. */
  if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
    goto fold1;

  if (p < end) {
    const struct ByUnfoldKey* buk;

    /* Decode and fold the second character. */
    code = ONIGENC_MBC_TO_CODE(enc, p, end);
    orig_codes[1] = code;
    len = enclen(enc, p);
    lens[1] = lens[0] + len;
    buk = onigenc_unicode_unfold_key(orig_codes[1]);
    if (buk != 0 && buk->fold_len == 1) {
      codes[1] = *FOLDS1_FOLD(buk->index);
    }
    else
      codes[1] = orig_codes[1];

    p += len;
    if (p < end) {
      /* Decode and fold the third character. */
      code = ONIGENC_MBC_TO_CODE(enc, p, end);
      orig_codes[2] = code;
      len = enclen(enc, p);
      lens[2] = lens[1] + len;
      buk = onigenc_unicode_unfold_key(orig_codes[2]);
      if (buk != 0 && buk->fold_len == 1) {
        codes[2] = *FOLDS1_FOLD(buk->index);
      }
      else
        codes[2] = orig_codes[2];

      /* Do the three folded codes form a known 3-code fold sequence? */
      index = onigenc_unicode_fold3_key(codes);
      if (index >= 0) {
        /* Single code points that fold to this 3-code sequence. */
        m = FOLDS3_UNFOLDS_NUM(index);
        for (i = 0; i < m; i++) {
          items[n].byte_len = lens[2];
          items[n].code_len = 1;
          items[n].code[0] = FOLDS3_UNFOLDS(index)[i];
          n++;
        }

        /* For each position, gather the fold code plus its unfolds. */
        for (fn = 0; fn < 3; fn++) {
          int sindex;
          cs[fn][0] = FOLDS3_FOLD(index)[fn];
          ncs[fn] = 1;
          sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
          if (sindex >= 0) {
            int m = FOLDS1_UNFOLDS_NUM(sindex);
            for (i = 0; i < m; i++) {
              cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
            }
            ncs[fn] += m;
          }
        }

        /* Emit every combination except the one equal to the input. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            for (k = 0; k < ncs[2]; k++) {
              if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1] &&
                  cs[2][k] == orig_codes[2])
                continue;

              items[n].byte_len = lens[2];
              items[n].code_len = 3;
              items[n].code[0] = cs[0][i];
              items[n].code[1] = cs[1][j];
              items[n].code[2] = cs[2][k];
              n++;
            }
          }
        }

        return n;
      }
    }

    /* Do the first two folded codes form a known 2-code fold sequence? */
    index = onigenc_unicode_fold2_key(codes);
    if (index >= 0) {
      /* Single code points that fold to this 2-code sequence. */
      m = FOLDS2_UNFOLDS_NUM(index);
      for (i = 0; i < m; i++) {
        items[n].byte_len = lens[1];
        items[n].code_len = 1;
        items[n].code[0] = FOLDS2_UNFOLDS(index)[i];
        n++;
      }

      for (fn = 0; fn < 2; fn++) {
        int sindex;
        cs[fn][0] = FOLDS2_FOLD(index)[fn];
        ncs[fn] = 1;
        sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
        if (sindex >= 0) {
          int m = FOLDS1_UNFOLDS_NUM(sindex);
          for (i = 0; i < m; i++) {
            cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
          }
          ncs[fn] += m;
        }
      }

      /* Emit every combination except the one equal to the input. */
      for (i = 0; i < ncs[0]; i++) {
        for (j = 0; j < ncs[1]; j++) {
          if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1])
            continue;
          items[n].byte_len = lens[1];
          items[n].code_len = 2;
          items[n].code[0] = cs[0][i];
          items[n].code[1] = cs[1][j];
          n++;
        }
      }

      return n;
    }
  }

 fold1:
  /* Only the first character is considered from here on. */
  if (buk1 != 0) {
    if (buk1->fold_len == 1) {
      int un;

      /* The fold itself (n is still 0 here, so this is items[0]) ... */
      if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
          ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) {
        items[0].byte_len = lens[0];
        items[0].code_len = 1;
        items[0].code[0] = *FOLDS1_FOLD(buk1->index);
        n++;
      }

      /* ... followed by the sibling unfolds other than the input itself. */
      un = FOLDS1_UNFOLDS_NUM(buk1->index);
      for (i = 0; i < un; i++) {
        OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i];
        if (unfold != orig_codes[0]) {
          if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
              ONIGENC_IS_ASCII_CODE(unfold)) {
            items[n].byte_len = lens[0];
            items[n].code_len = 1;
            items[n].code[0] = unfold;
            n++;
          }
        }
      }
    }
    else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      /* The input is a single code point whose fold is a 2- or 3-code
         sequence. */
      if (buk1->fold_len == 2) {
        /* Other single code points sharing the same 2-code fold. */
        m = FOLDS2_UNFOLDS_NUM(buk1->index);
        for (i = 0; i < m; i++) {
          OnigCodePoint unfold = FOLDS2_UNFOLDS(buk1->index)[i];
          if (unfold == orig_codes[0]) continue;

          items[n].byte_len = lens[0];
          items[n].code_len = 1;
          items[n].code[0] = unfold;
          n++;
        }

        for (fn = 0; fn < 2; fn++) {
          int index;
          cs[fn][0] = FOLDS2_FOLD(buk1->index)[fn];
          ncs[fn] = 1;
          index = onigenc_unicode_fold1_key(&cs[fn][0]);
          if (index >= 0) {
            int m = FOLDS1_UNFOLDS_NUM(index);
            for (i = 0; i < m; i++) {
              cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
            }
            ncs[fn] += m;
          }
        }

        /* All 2-code combinations of the fold sequence and its variants. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            items[n].byte_len = lens[0];
            items[n].code_len = 2;
            items[n].code[0] = cs[0][i];
            items[n].code[1] = cs[1][j];
            n++;
          }
        }
      }
      else { /* fold_len == 3 */
        /* Other single code points sharing the same 3-code fold. */
        m = FOLDS3_UNFOLDS_NUM(buk1->index);
        for (i = 0; i < m; i++) {
          OnigCodePoint unfold = FOLDS3_UNFOLDS(buk1->index)[i];
          if (unfold == orig_codes[0]) continue;

          items[n].byte_len = lens[0];
          items[n].code_len = 1;
          items[n].code[0] = unfold;
          n++;
        }

        for (fn = 0; fn < 3; fn++) {
          int index;
          cs[fn][0] = FOLDS3_FOLD(buk1->index)[fn];
          ncs[fn] = 1;
          index = onigenc_unicode_fold1_key(&cs[fn][0]);
          if (index >= 0) {
            int m = FOLDS1_UNFOLDS_NUM(index);
            for (i = 0; i < m; i++) {
              cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
            }
            ncs[fn] += m;
          }
        }

        /* All 3-code combinations of the fold sequence and its variants. */
        for (i = 0; i < ncs[0]; i++) {
          for (j = 0; j < ncs[1]; j++) {
            for (k = 0; k < ncs[2]; k++) {
              items[n].byte_len = lens[0];
              items[n].code_len = 3;
              items[n].code[0] = cs[0][i];
              items[n].code[1] = cs[1][j];
              items[n].code[2] = cs[2][k];
              n++;
            }
          }
        }
      }
    }
  }
  else {
    /* The input has no unfold-key entry; it may itself be a fold target, in
       which case its unfolds are the alternatives. */
    int index = onigenc_unicode_fold1_key(orig_codes);
    if (index >= 0) {
      int m = FOLDS1_UNFOLDS_NUM(index);
      for (i = 0; i < m; i++) {
        code = FOLDS1_UNFOLDS(index)[i];
        if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) {
          items[n].byte_len = lens[0];
          items[n].code_len = 1;
          items[n].code[0] = code;
          n++;
        }
      }
    }
  }

  return n;
}
586
587 #ifdef USE_UNICODE_PROPERTIES
588 #include "unicode_property_data.c"
589 #else
590 #include "unicode_property_data_posix.c"
591 #endif
592
593
594 #ifdef USE_UNICODE_WORD_BREAK
595
/* Word-break classes used by the word boundary rules.  WB_Any (0) doubles
   as the "no specific class" result of the range lookup. */
enum WB_TYPE {
  WB_Any                = 0,
  WB_ALetter            = 1,
  WB_CR                 = 2,
  WB_Double_Quote       = 3,
  WB_Extend             = 4,
  WB_ExtendNumLet       = 5,
  WB_Format             = 6,
  WB_Hebrew_Letter      = 7,
  WB_Katakana           = 8,
  WB_LF                 = 9,
  WB_MidLetter          = 10,
  WB_MidNum             = 11,
  WB_MidNumLet          = 12,
  WB_Newline            = 13,
  WB_Numeric            = 14,
  WB_Regional_Indicator = 15,
  WB_Single_Quote       = 16,
  WB_WSegSpace          = 17,
  WB_ZWJ                = 18
};
617
/* One row of the word-break range table: the inclusive code point range
   [start, end] and the word-break class assigned to it. */
typedef struct {
  OnigCodePoint start;
  OnigCodePoint end;
  enum WB_TYPE type;
} WB_RANGE_TYPE;
623
624 #include "unicode_wb_data.c"
625
626 static enum WB_TYPE
wb_get_type(OnigCodePoint code)627 wb_get_type(OnigCodePoint code)
628 {
629 OnigCodePoint low, high, x;
630 enum WB_TYPE type;
631
632 for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {
633 x = (low + high) >> 1;
634 if (code > WB_RANGES[x].end)
635 low = x + 1;
636 else
637 high = x;
638 }
639
640 type = (low < (OnigCodePoint )WB_RANGE_NUM &&
641 code >= WB_RANGES[low].start) ?
642 WB_RANGES[low].type : WB_Any;
643
644 return type;
645 }
646
/* Classes skipped when scanning for the neighbouring "main" character
   (Extend, Format, ZWJ -- see rule WB4). */
#define IS_WB_IGNORE_TAIL(t) ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
/* AHLetter = ALetter | Hebrew_Letter */
#define IS_WB_AHLetter(t) ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
/* MidNumLetQ = MidNumLet | Single_Quote */
#define IS_WB_MidNumLetQ(t) ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
650
651 static int
wb_get_next_main_code(OnigEncoding enc,UChar * p,const UChar * end,OnigCodePoint * rcode,enum WB_TYPE * rtype)652 wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
653 OnigCodePoint* rcode, enum WB_TYPE* rtype)
654 {
655 OnigCodePoint code;
656 enum WB_TYPE type;
657
658 while (TRUE) {
659 p += enclen(enc, p);
660 if (p >= end) break;
661
662 code = ONIGENC_MBC_TO_CODE(enc, p, end);
663 type = wb_get_type(code);
664 if (! IS_WB_IGNORE_TAIL(type)) {
665 *rcode = code;
666 *rtype = type;
667 return 1;
668 }
669 }
670
671 return 0;
672 }
673
/*
 * Decides whether the position `p` is a word boundary, applying the rules
 * WB1..WB999 named in the comments below.
 *
 *   enc    encoding of the text
 *   p      position under test (start of the character after the boundary)
 *   prev   head of the character before `p`, or NULL to have it computed
 *   start  start of the text
 *   end    end of the text
 *
 * Returns TRUE when a break is allowed at `p`, FALSE otherwise.
 *
 * The rules are evaluated in order and several of them move `prev` (and
 * update `from`) backwards in place, so the statement order matters.
 */
extern int
onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
                             const UChar* start, const UChar* end)
{
  int r;
  UChar* pp;
  OnigCodePoint cfrom;
  OnigCodePoint cfrom2;
  OnigCodePoint cto;
  OnigCodePoint cto2;
  enum WB_TYPE from;
  enum WB_TYPE from2;
  enum WB_TYPE to;
  enum WB_TYPE to2;

  /* WB1: sot / Any */
  if (p == start) return TRUE;
  /* WB2: Any / eot */
  if (p == end) return TRUE;

  if (IS_NULL(prev)) {
    prev = onigenc_get_prev_char_head(enc, start, p);
    if (IS_NULL(prev)) return TRUE;
  }

  /* `from` is the class left of the boundary, `to` the class right of it. */
  cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
  cto = ONIGENC_MBC_TO_CODE(enc, p, end);

  from = wb_get_type(cfrom);
  to = wb_get_type(cto);

  /* short cut */
  if (from == 0 && to == 0) goto WB999;

  /* WB3: CR + LF */
  if (from == WB_CR && to == WB_LF) return FALSE;

  /* WB3a: (Newline|CR|LF) / */
  if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
  /* WB3b: / (Newline|CR|LF) */
  if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;

  /* WB3c: ZWJ + {Extended_Pictographic} */
  if (from == WB_ZWJ) {
    if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
      return FALSE;
  }

  /* WB3d: WSegSpace + WSegSpace */
  if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;

  /* WB4: X (Extend|Format|ZWJ)* -> X */
  if (IS_WB_IGNORE_TAIL(to)) return FALSE;
  if (IS_WB_IGNORE_TAIL(from)) {
    /* Walk back over ignorable characters so that `from`/`prev` describe
       the character they are attached to. */
    while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
      prev = pp;
      cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
      from = wb_get_type(cfrom);
      if (! IS_WB_IGNORE_TAIL(from))
        break;
    }
  }

  if (IS_WB_AHLetter(from)) {
    /* WB5: AHLetter + AHLetter */
    if (IS_WB_AHLetter(to)) return FALSE;

    /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
    if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
      r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
      if (r == 1) {
        if (IS_WB_AHLetter(to2)) return FALSE;
      }
    }
  }

  /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
  if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
    if (IS_WB_AHLetter(to)) {
      /* Find the class of the non-ignorable character before `prev`. */
      from2 = WB_Any;
      while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
        prev = pp;
        cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
        from2 = wb_get_type(cfrom2);
        if (! IS_WB_IGNORE_TAIL(from2))
          break;
      }

      if (IS_WB_AHLetter(from2)) return FALSE;
    }
  }

  if (from == WB_Hebrew_Letter) {
    /* WB7a: Hebrew_Letter + Single_Quote */
    if (to == WB_Single_Quote) return FALSE;

    /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
    if (to == WB_Double_Quote) {
      r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
      if (r == 1) {
        if (to2 == WB_Hebrew_Letter) return FALSE;
      }
    }
  }

  /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
  if (from == WB_Double_Quote) {
    if (to == WB_Hebrew_Letter) {
      from2 = WB_Any;
      while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
        prev = pp;
        cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
        from2 = wb_get_type(cfrom2);
        if (! IS_WB_IGNORE_TAIL(from2))
          break;
      }

      if (from2 == WB_Hebrew_Letter) return FALSE;
    }
  }

  if (to == WB_Numeric) {
    /* WB8: Numeric + Numeric */
    if (from == WB_Numeric) return FALSE;

    /* WB9: AHLetter + Numeric */
    if (IS_WB_AHLetter(from)) return FALSE;

    /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
    if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
      from2 = WB_Any;
      while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
        prev = pp;
        cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
        from2 = wb_get_type(cfrom2);
        if (! IS_WB_IGNORE_TAIL(from2))
          break;
      }

      if (from2 == WB_Numeric) return FALSE;
    }
  }

  if (from == WB_Numeric) {
    /* WB10: Numeric + AHLetter */
    if (IS_WB_AHLetter(to)) return FALSE;

    /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
    if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
      r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
      if (r == 1) {
        if (to2 == WB_Numeric) return FALSE;
      }
    }
  }

  /* WB13: Katakana + Katakana */
  if (from == WB_Katakana && to == WB_Katakana) return FALSE;

  /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
  if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
      || from == WB_ExtendNumLet) {
    if (to == WB_ExtendNumLet) return FALSE;
  }

  /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
  if (from == WB_ExtendNumLet) {
    if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
      return FALSE;
  }


  /* WB15:   sot (RI RI)* RI + RI */
  /* WB16: [^RI] (RI RI)* RI + RI */
  if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
    /* n counts the Regional_Indicator characters preceding `prev`; an even
       count means `prev` and `p` form a pair, so no break between them. */
    int n = 0;
    while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
      cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
      from2 = wb_get_type(cfrom2);
      if (from2 != WB_Regional_Indicator)
        break;

      n++;
    }
    if ((n % 2) == 0) return FALSE;
  }

 WB999:
  /* WB999: Any / Any */
  return TRUE;
}
865
866 #endif /* USE_UNICODE_WORD_BREAK */
867
868
869 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
870
/* Result of the two-code-point grapheme cluster break test.  The two UNDEF
   values mean the answer depends on characters further back in the text. */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK         = 0,  /* never break between the two codes  */
  EGCB_BREAK             = 1,  /* always break between the two codes */
  EGCB_BREAK_UNDEF_GB11  = 2,  /* ZWJ + Extended_Pictographic (GB11) */
  EGCB_BREAK_UNDEF_RI_RI = 3   /* RI + RI (GB12, GB13)               */
};
877
/* Grapheme cluster break classes.  The numeric values are significant:
   CR..Control are contiguous (1..3) and all Hangul classes are >= EGCB_L,
   which the range-test macros in this file rely on. */
enum EGCB_TYPE {
  EGCB_Other              =  0,
  EGCB_CR                 =  1,
  EGCB_LF                 =  2,
  EGCB_Control            =  3,
  EGCB_Extend             =  4,
  EGCB_Prepend            =  5,
  EGCB_Regional_Indicator =  6,
  EGCB_SpacingMark        =  7,
  EGCB_ZWJ                =  8,
  /* 9..12 are unused: they belonged to the obsoleted classes E_Base,
     E_Base_GAZ, E_Modifier and Glue_After_Zwj. */
  EGCB_L                  = 13,
  EGCB_LV                 = 14,
  EGCB_LVT                = 15,
  EGCB_T                  = 16,
  EGCB_V                  = 17
};
901
/* One row of the grapheme-cluster-break range table: the inclusive code
   point range [start, end] and the class assigned to it. */
typedef struct {
  OnigCodePoint start;
  OnigCodePoint end;
  enum EGCB_TYPE type;
} EGCB_RANGE_TYPE;
907
908 #include "unicode_egcb_data.c"
909
910 static enum EGCB_TYPE
egcb_get_type(OnigCodePoint code)911 egcb_get_type(OnigCodePoint code)
912 {
913 OnigCodePoint low, high, x;
914 enum EGCB_TYPE type;
915
916 for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {
917 x = (low + high) >> 1;
918 if (code > EGCB_RANGES[x].end)
919 low = x + 1;
920 else
921 high = x;
922 }
923
924 type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&
925 code >= EGCB_RANGES[low].start) ?
926 EGCB_RANGES[low].type : EGCB_Other;
927
928 return type;
929 }
930
/* Both macros take an enum EGCB_TYPE value (the parameter is named `code`
   but it is a class, not a code point) and rely on the enum's numbering. */
/* True for EGCB_CR, EGCB_LF and EGCB_Control (values 1..3). */
#define IS_CONTROL_CR_LF(code) ((code) <= EGCB_Control && (code) >= EGCB_CR)
/* True for the Hangul classes EGCB_L, EGCB_LV, EGCB_LVT, EGCB_T, EGCB_V. */
#define IS_HANGUL(code) ((code) >= EGCB_L)
933
934 /* GB1 and GB2 are outside of this function. */
935 static enum EGCB_BREAK_TYPE
unicode_egcb_is_break_2code(OnigCodePoint from_code,OnigCodePoint to_code)936 unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
937 {
938 enum EGCB_TYPE from;
939 enum EGCB_TYPE to;
940
941 from = egcb_get_type(from_code);
942 to = egcb_get_type(to_code);
943
944 /* short cut */
945 if (from == 0 && to == 0) goto GB999;
946
947 /* GB3 */
948 if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;
949 /* GB4 */
950 if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;
951 /* GB5 */
952 if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;
953
954 if (IS_HANGUL(from) && IS_HANGUL(to)) {
955 /* GB6 */
956 if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;
957 /* GB7 */
958 if ((from == EGCB_LV || from == EGCB_V)
959 && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;
960
961 /* GB8 */
962 if ((to == EGCB_T) && (from == EGCB_LVT || from == EGCB_T))
963 return EGCB_NOT_BREAK;
964
965 goto GB999;
966 }
967
968 /* GB9 */
969 if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;
970
971 /* GB9a */
972 if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;
973 /* GB9b */
974 if (from == EGCB_Prepend) return EGCB_NOT_BREAK;
975
976 /* GB10 removed */
977
978 /* GB11 */
979 if (from == EGCB_ZWJ) {
980 if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
981 return EGCB_BREAK_UNDEF_GB11;
982
983 goto GB999;
984 }
985
986 /* GB12, GB13 */
987 if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {
988 return EGCB_BREAK_UNDEF_RI_RI;
989 }
990
991 GB999:
992 return EGCB_BREAK;
993 }
994
995 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
996
997 extern int
onigenc_egcb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)998 onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
999 const UChar* start, const UChar* end)
1000 {
1001 OnigCodePoint from;
1002 OnigCodePoint to;
1003 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
1004 enum EGCB_BREAK_TYPE btype;
1005 enum EGCB_TYPE type;
1006 #endif
1007
1008 /* GB1 and GB2 */
1009 if (p == start) return 1;
1010 if (p == end) return 1;
1011
1012 if (IS_NULL(prev)) {
1013 prev = onigenc_get_prev_char_head(enc, start, p);
1014 if (IS_NULL(prev)) return 1;
1015 }
1016
1017 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1018 to = ONIGENC_MBC_TO_CODE(enc, p, end);
1019
1020 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
1021 if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
1022 return from != 0x000d || to != NEWLINE_CODE;
1023 }
1024
1025 btype = unicode_egcb_is_break_2code(from, to);
1026 switch (btype) {
1027 case EGCB_NOT_BREAK:
1028 return 0;
1029 break;
1030 case EGCB_BREAK:
1031 return 1;
1032 break;
1033
1034 case EGCB_BREAK_UNDEF_GB11:
1035 while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
1036 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1037 if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
1038 return 0;
1039
1040 type = egcb_get_type(from);
1041 if (type != EGCB_Extend)
1042 break;
1043 }
1044 break;
1045
1046 case EGCB_BREAK_UNDEF_RI_RI:
1047 {
1048 int n = 0;
1049 while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
1050 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
1051 type = egcb_get_type(from);
1052 if (type != EGCB_Regional_Indicator)
1053 break;
1054
1055 n++;
1056 }
1057 if ((n % 2) == 0) return 0;
1058 }
1059 break;
1060 }
1061
1062 return 1;
1063
1064 #else
1065 return from != 0x000d || to != NEWLINE_CODE;
1066 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
1067 }
1068
1069
/* Maximum number of user-defined properties that can be registered. */
#define USER_DEFINED_PROPERTY_MAX_NUM 20

/* A registered user-defined property: the ctype number assigned to it and
   the caller-supplied code range data (stored by pointer, not copied). */
typedef struct {
  int ctype;
  OnigCodePoint* ranges;
} UserDefinedPropertyValue;

/* Number of slots of UserDefinedPropertyRanges currently in use. */
static int UserDefinedPropertyNum;
/* Fixed-size storage for the registered properties. */
static UserDefinedPropertyValue
UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];
/* Maps a normalized property name to its UserDefinedPropertyValue entry;
   created on the first registration. */
static st_table* UserDefinedPropertyTable;
1081
1082 extern int
onig_unicode_define_user_property(const char * name,OnigCodePoint * ranges)1083 onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
1084 {
1085 UserDefinedPropertyValue* e;
1086 int r;
1087 int i;
1088 int n;
1089 int len;
1090 int c;
1091 char* s;
1092 UChar* uname;
1093
1094 if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
1095 return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;
1096
1097 len = (int )strlen(name);
1098 if (len >= PROPERTY_NAME_MAX_SIZE)
1099 return ONIGERR_TOO_LONG_PROPERTY_NAME;
1100
1101 s = (char* )xmalloc(len + 1);
1102 if (s == 0)
1103 return ONIGERR_MEMORY;
1104
1105 uname = (UChar* )name;
1106 n = 0;
1107 for (i = 0; i < len; i++) {
1108 c = uname[i];
1109 if (c < 0x20 || c >= 0x80) {
1110 xfree(s);
1111 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1112 }
1113
1114 if (c != ' ' && c != '-' && c != '_') {
1115 s[n] = c;
1116 n++;
1117 }
1118 }
1119 s[n] = '\0';
1120
1121 if (UserDefinedPropertyTable == 0) {
1122 UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
1123 if (IS_NULL(UserDefinedPropertyTable)) {
1124 xfree(s);
1125 return ONIGERR_MEMORY;
1126 }
1127 }
1128
1129 e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
1130 e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
1131 e->ranges = ranges;
1132 r = onig_st_insert_strend(UserDefinedPropertyTable,
1133 (const UChar* )s, (const UChar* )s + n,
1134 (hash_data_type )((void* )e));
1135 if (r < 0) return r;
1136
1137 UserDefinedPropertyNum++;
1138 return 0;
1139 }
1140
1141 extern int
onigenc_unicode_is_code_ctype(OnigCodePoint code,unsigned int ctype)1142 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
1143 {
1144 if (
1145 #ifdef USE_UNICODE_PROPERTIES
1146 ctype <= ONIGENC_MAX_STD_CTYPE &&
1147 #endif
1148 code < 256) {
1149 return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
1150 }
1151
1152 if (ctype >= CODE_RANGES_NUM) {
1153 int index = ctype - CODE_RANGES_NUM;
1154 if (index < UserDefinedPropertyNum)
1155 return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);
1156 else
1157 return ONIGERR_TYPE_BUG;
1158 }
1159
1160 return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
1161 }
1162
1163
1164 extern int
onigenc_unicode_ctype_code_range(OnigCtype ctype,const OnigCodePoint * ranges[])1165 onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
1166 {
1167 if (ctype >= CODE_RANGES_NUM) {
1168 int index = ctype - CODE_RANGES_NUM;
1169 if (index < UserDefinedPropertyNum) {
1170 *ranges = UserDefinedPropertyRanges[index].ranges;
1171 return 0;
1172 }
1173 else
1174 return ONIGERR_TYPE_BUG;
1175 }
1176
1177 *ranges = CodeRanges[ctype];
1178 return 0;
1179 }
1180
1181 extern int
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype,OnigCodePoint * sb_out,const OnigCodePoint * ranges[])1182 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
1183 const OnigCodePoint* ranges[])
1184 {
1185 *sb_out = 0x00;
1186 return onigenc_unicode_ctype_code_range(ctype, ranges);
1187 }
1188
1189 extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc,UChar * name,UChar * end)1190 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
1191 {
1192 int len;
1193 UChar *p;
1194 OnigCodePoint code;
1195 const struct PoolPropertyNameCtype* pc;
1196 char buf[PROPERTY_NAME_MAX_SIZE];
1197
1198 p = name;
1199 len = 0;
1200 while (p < end) {
1201 code = ONIGENC_MBC_TO_CODE(enc, p, end);
1202 if (code >= 0x80)
1203 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1204
1205 if (code != ' ' && code != '-' && code != '_') {
1206 buf[len++] = (char )code;
1207 if (len >= PROPERTY_NAME_MAX_SIZE)
1208 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1209 }
1210
1211 p += enclen(enc, p);
1212 }
1213
1214 buf[len] = 0;
1215
1216 if (UserDefinedPropertyTable != 0) {
1217 UserDefinedPropertyValue* e;
1218 e = (UserDefinedPropertyValue* )NULL;
1219 onig_st_lookup_strend(UserDefinedPropertyTable,
1220 (const UChar* )buf, (const UChar* )buf + len,
1221 (hash_data_type* )((void* )(&e)));
1222 if (e != 0) {
1223 return e->ctype;
1224 }
1225 }
1226
1227 pc = unicode_lookup_property_name(buf, len);
1228 if (pc != 0) {
1229 /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
1230 #ifndef USE_UNICODE_PROPERTIES
1231 if (pc->ctype > ONIGENC_MAX_STD_CTYPE)
1232 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1233 #endif
1234
1235 return (int )pc->ctype;
1236 }
1237
1238 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1239 }
1240