/**********************************************************************
  gb18030.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2005-2020  KUBO Takehiro <kubo AT jiubao DOT org>
 *                          K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
30
31 #include "regenc.h"
32
/*
 * Tracing support.  Enable the define below to make DEBUG_OUT() print via
 * printf(); in this file it is used only by gb18030_left_adjust_char_head().
 */
/* #define DEBUG_GB18030 */

#ifdef DEBUG_GB18030

#ifndef NEED_TO_INCLUDE_STDIO
#define NEED_TO_INCLUDE_STDIO
#endif

/* for printf() */
#include "regint.h"

/* Call with doubled parentheses: DEBUG_OUT(("fmt %d\n", value)); */
#define DEBUG_OUT(arg) printf arg

#else

/* Tracing disabled: the whole call, arguments included, expands to nothing. */
#define DEBUG_OUT(arg)

#endif
51
/*
 * Byte classes stored in GB18030_MAP.  A class names every role a byte
 * value may play inside a GB18030 sequence.
 */
enum {
  C1 = 0, /* only ever a one-byte char */
  C2 = 1, /* one-byte char, or second byte of a two-byte char */
  C4 = 2, /* one-byte char, or second/fourth byte of a four-byte char */
  CM = 3  /* first byte of a two- or four-byte char, second byte of a
             two-byte char (is_valid_mbc_string also accepts this range
             as the third byte of a four-byte char) */
};
58
59 static const char GB18030_MAP[] = {
60 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
61 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
62 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
63 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
64 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
65 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
66 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
67 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
68 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
69 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
70 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
71 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
72 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
73 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
74 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
75 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
76 };
77
78 static int
gb18030_mbc_enc_len(const UChar * p)79 gb18030_mbc_enc_len(const UChar* p)
80 {
81 if (GB18030_MAP[*p] != CM)
82 return 1;
83
84 p++;
85 if (GB18030_MAP[*p] == C4)
86 return 4;
87
88 return 2;
89 }
90
91 static int
gb18030_code_to_mbclen(OnigCodePoint code)92 gb18030_code_to_mbclen(OnigCodePoint code)
93 {
94 if ((code & 0xff000000) != 0) {
95 if (GB18030_MAP[(int )(code >> 24) & 0xff] == CM)
96 if (GB18030_MAP[(int )(code >> 16) & 0xff] == C4)
97 return 4;
98 }
99 else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
100 else if ((code & 0xff00) != 0) {
101 if (GB18030_MAP[(int )(code >> 8) & 0xff] == CM) {
102 char c = GB18030_MAP[(int )code & 0xff];
103 if (c == CM || c == C2)
104 return 2;
105 }
106 }
107 else {
108 if (GB18030_MAP[(int )(code & 0xff)] != CM)
109 return 1;
110 }
111
112 return ONIGERR_INVALID_CODE_POINT_VALUE;
113 }
114
115 static int
is_valid_mbc_string(const UChar * p,const UChar * end)116 is_valid_mbc_string(const UChar* p, const UChar* end)
117 {
118 while (p < end) {
119 if (*p < 0x80) {
120 p++;
121 }
122 else if (*p == 0x80 || *p == 0xff) {
123 return FALSE;
124 }
125 else {
126 p++;
127 if (p >= end) return FALSE;
128 if (*p < 0x40) {
129 if (*p < 0x30 || *p > 0x39)
130 return FALSE;
131
132 p++;
133 if (p >= end) return FALSE;
134 if (*p < 0x81 || *p == 0xff) return FALSE;
135
136 p++;
137 if (p >= end) return FALSE;
138 if (*p < 0x30 || *p > 0x39)
139 return FALSE;
140
141 p++;
142 }
143 else if (*p == 0x7f || *p == 0xff) {
144 return FALSE;
145 }
146 else {
147 p++;
148 }
149 }
150 }
151
152 return TRUE;
153 }
154
155 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)156 gb18030_mbc_to_code(const UChar* p, const UChar* end)
157 {
158 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
159 }
160
161 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)162 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
163 {
164 return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
165 }
166
167 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)168 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
169 UChar* lower)
170 {
171 return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
172 pp, end, lower);
173 }
174
175 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)176 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
177 {
178 return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
179 }
180
/*
 * States of the backward scan in gb18030_left_adjust_char_head().
 *
 * A name lists the byte classes consumed so far in string order (the scan
 * itself runs right to left).  "odd"/"even" is the parity of a repeated
 * unit: a run of CM bytes, or a run of "CM C4" / "C4 CM" pairs.
 * The values are consecutive from 0 and are used as array indexes
 * (StateNames when DEBUG_GB18030 is defined).
 */
enum state {
  S_START             =  0,
  S_one_C2            =  1,
  S_one_C4            =  2,
  S_one_CM            =  3,

  S_odd_CM_one_CX     =  4,
  S_even_CM_one_CX    =  5,

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4          =  6,
  S_odd_CMC4          =  7,
  S_one_C4_odd_CMC4   =  8,
  S_even_CMC4         =  9,
  S_one_C4_even_CMC4  = 10,

  S_odd_CM_odd_CMC4   = 11,
  S_even_CM_odd_CMC4  = 12,

  S_odd_CM_even_CMC4  = 13,
  S_even_CM_even_CMC4 = 14,

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM          = 15,
  S_one_CM_odd_C4CM   = 16,
  S_even_C4CM         = 17,
  S_one_CM_even_C4CM  = 18,

  S_even_CM_odd_C4CM  = 19,
  S_odd_CM_odd_C4CM   = 20,
  S_even_CM_even_C4CM = 21,
  S_odd_CM_even_C4CM  = 22
};
214
#ifdef DEBUG_GB18030
/*
 * Printable name of each `enum state` value, indexed by the state itself;
 * the order must stay in step with the enum.  Used only in DEBUG_OUT()
 * trace messages.  The table and the strings it points at are read-only.
 */
static const char* const StateNames[] = {
  "S_START",
  "S_one_C2",
  "S_one_C4",
  "S_one_CM",
  "S_odd_CM_one_CX",
  "S_even_CM_one_CX",
  "S_one_CMC4",
  "S_odd_CMC4",
  "S_one_C4_odd_CMC4",
  "S_even_CMC4",
  "S_one_C4_even_CMC4",
  "S_odd_CM_odd_CMC4",
  "S_even_CM_odd_CMC4",
  "S_odd_CM_even_CMC4",
  "S_even_CM_even_CMC4",
  "S_odd_C4CM",
  "S_one_CM_odd_C4CM",
  "S_even_C4CM",
  "S_one_CM_even_C4CM",
  "S_even_CM_odd_C4CM",
  "S_odd_CM_odd_C4CM",
  "S_even_CM_even_C4CM",
  "S_odd_CM_even_C4CM"
};
#endif
242
243 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)244 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
245 {
246 const UChar *p;
247 enum state state = S_START;
248
249 DEBUG_OUT(("----------------\n"));
250 for (p = s; p >= start; p--) {
251 DEBUG_OUT(("%5d: state %-19s (0x%02x)->\n", (int )(p - start), StateNames[state], *p));
252 switch (state) {
253 case S_START:
254 switch (GB18030_MAP[*p]) {
255 case C1:
256 return (UChar *)s;
257 case C2:
258 state = S_one_C2; /* C2 */
259 break;
260 case C4:
261 state = S_one_C4; /* C4 */
262 break;
263 case CM:
264 state = S_one_CM; /* CM */
265 break;
266 }
267 break;
268 case S_one_C2: /* C2 */
269 switch (GB18030_MAP[*p]) {
270 case C1:
271 case C2:
272 case C4:
273 return (UChar *)s;
274 case CM:
275 state = S_odd_CM_one_CX; /* CM C2 */
276 break;
277 }
278 break;
279 case S_one_C4: /* C4 */
280 switch (GB18030_MAP[*p]) {
281 case C1:
282 case C2:
283 case C4:
284 return (UChar *)s;
285 case CM:
286 state = S_one_CMC4;
287 break;
288 }
289 break;
290 case S_one_CM: /* CM */
291 switch (GB18030_MAP[*p]) {
292 case C1:
293 case C2:
294 return (UChar *)s;
295 case C4:
296 state = S_odd_C4CM;
297 break;
298 case CM:
299 state = S_odd_CM_one_CX; /* CM CM */
300 break;
301 }
302 break;
303
304 case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
305 switch (GB18030_MAP[*p]) {
306 case C1:
307 case C2:
308 case C4:
309 return (UChar *)(s - 1);
310 case CM:
311 state = S_even_CM_one_CX;
312 break;
313 }
314 break;
315 case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
316 switch (GB18030_MAP[*p]) {
317 case C1:
318 case C2:
319 case C4:
320 return (UChar *)s;
321 case CM:
322 state = S_odd_CM_one_CX;
323 break;
324 }
325 break;
326
327 case S_one_CMC4: /* CM C4 */
328 switch (GB18030_MAP[*p]) {
329 case C1:
330 case C2:
331 return (UChar *)(s - 1);
332 case C4:
333 state = S_one_C4_odd_CMC4; /* C4 CM C4 */
334 break;
335 case CM:
336 state = S_even_CM_one_CX; /* CM CM C4 */
337 break;
338 }
339 break;
340 case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
341 switch (GB18030_MAP[*p]) {
342 case C1:
343 case C2:
344 return (UChar *)(s - 1);
345 case C4:
346 state = S_one_C4_odd_CMC4;
347 break;
348 case CM:
349 state = S_odd_CM_odd_CMC4;
350 break;
351 }
352 break;
353 case S_one_C4_odd_CMC4: /* C4 CM C4 */
354 switch (GB18030_MAP[*p]) {
355 case C1:
356 case C2:
357 case C4:
358 return (UChar *)(s - 1);
359 case CM:
360 state = S_even_CMC4; /* CM C4 CM C4 */
361 break;
362 }
363 break;
364 case S_even_CMC4: /* CM C4 CM C4 */
365 switch (GB18030_MAP[*p]) {
366 case C1:
367 case C2:
368 return (UChar *)(s - 3);
369 case C4:
370 state = S_one_C4_even_CMC4;
371 break;
372 case CM:
373 state = S_odd_CM_even_CMC4;
374 break;
375 }
376 break;
377 case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
378 switch (GB18030_MAP[*p]) {
379 case C1:
380 case C2:
381 case C4:
382 return (UChar *)(s - 3);
383 case CM:
384 state = S_odd_CMC4;
385 break;
386 }
387 break;
388
389 case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
390 switch (GB18030_MAP[*p]) {
391 case C1:
392 case C2:
393 case C4:
394 return (UChar *)(s - 3);
395 case CM:
396 state = S_even_CM_odd_CMC4;
397 break;
398 }
399 break;
400 case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
401 switch (GB18030_MAP[*p]) {
402 case C1:
403 case C2:
404 case C4:
405 return (UChar *)(s - 1);
406 case CM:
407 state = S_odd_CM_odd_CMC4;
408 break;
409 }
410 break;
411
412 case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
413 switch (GB18030_MAP[*p]) {
414 case C1:
415 case C2:
416 case C4:
417 return (UChar *)(s - 1);
418 case CM:
419 state = S_even_CM_even_CMC4;
420 break;
421 }
422 break;
423 case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
424 switch (GB18030_MAP[*p]) {
425 case C1:
426 case C2:
427 case C4:
428 return (UChar *)(s - 3);
429 case CM:
430 state = S_odd_CM_even_CMC4;
431 break;
432 }
433 break;
434
435 case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
436 switch (GB18030_MAP[*p]) {
437 case C1:
438 case C2:
439 case C4:
440 return (UChar *)s;
441 case CM:
442 state = S_one_CM_odd_C4CM; /* CM C4 CM */
443 break;
444 }
445 break;
446 case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
447 switch (GB18030_MAP[*p]) {
448 case C1:
449 case C2:
450 return (UChar *)(s - 2); /* |CM C4 CM */
451 case C4:
452 state = S_even_C4CM;
453 break;
454 case CM:
455 state = S_even_CM_odd_C4CM;
456 break;
457 }
458 break;
459 case S_even_C4CM: /* C4 CM C4 CM */
460 switch (GB18030_MAP[*p]) {
461 case C1:
462 case C2:
463 case C4:
464 return (UChar *)(s - 2); /* C4|CM C4 CM */
465 case CM:
466 state = S_one_CM_even_C4CM;
467 break;
468 }
469 break;
470 case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
471 switch (GB18030_MAP[*p]) {
472 case C1:
473 case C2:
474 return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
475 case C4:
476 state = S_odd_C4CM;
477 break;
478 case CM:
479 state = S_even_CM_even_C4CM;
480 break;
481 }
482 break;
483
484 case S_even_CM_odd_C4CM: /* CM CM C4 CM */
485 switch (GB18030_MAP[*p]) {
486 case C1:
487 case C2:
488 case C4:
489 return (UChar *)(s - 0); /* |CM CM|C4|CM */
490 case CM:
491 state = S_odd_CM_odd_C4CM;
492 break;
493 }
494 break;
495 case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
496 switch (GB18030_MAP[*p]) {
497 case C1:
498 case C2:
499 case C4:
500 return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
501 case CM:
502 state = S_even_CM_odd_C4CM;
503 break;
504 }
505 break;
506
507 case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
508 switch (GB18030_MAP[*p]) {
509 case C1:
510 case C2:
511 case C4:
512 return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
513 case CM:
514 state = S_odd_CM_even_C4CM;
515 break;
516 }
517 break;
518 case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
519 switch (GB18030_MAP[*p]) {
520 case C1:
521 case C2:
522 case C4:
523 return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
524 case CM:
525 state = S_even_CM_even_C4CM;
526 break;
527 }
528 break;
529 }
530 }
531
532 DEBUG_OUT(("state %-19s\n", StateNames[state]));
533 switch (state) {
534 case S_START: return (UChar *)(s - 0);
535 case S_one_C2: return (UChar *)(s - 0);
536 case S_one_C4: return (UChar *)(s - 0);
537 case S_one_CM: return (UChar *)(s - 0);
538
539 case S_odd_CM_one_CX: return (UChar *)(s - 1);
540 case S_even_CM_one_CX: return (UChar *)(s - 0);
541
542 case S_one_CMC4: return (UChar *)(s - 1);
543 case S_odd_CMC4: return (UChar *)(s - 1);
544 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
545 case S_even_CMC4: return (UChar *)(s - 3);
546 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
547
548 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
549 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
550
551 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
552 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
553
554 case S_odd_C4CM: return (UChar *)(s - 0);
555 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
556 case S_even_C4CM: return (UChar *)(s - 2);
557 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
558
559 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
560 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
561 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
562 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
563 }
564
565 return (UChar* )s; /* never come here. (escape warning) */
566 }
567
568 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)569 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
570 {
571 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
572 }
573
574 OnigEncodingType OnigEncodingGB18030 = {
575 gb18030_mbc_enc_len,
576 "GB18030", /* name */
577 4, /* max enc length */
578 1, /* min enc length */
579 onigenc_is_mbc_newline_0x0a,
580 gb18030_mbc_to_code,
581 gb18030_code_to_mbclen,
582 gb18030_code_to_mbc,
583 gb18030_mbc_case_fold,
584 onigenc_ascii_apply_all_case_fold,
585 onigenc_ascii_get_case_fold_codes_by_str,
586 onigenc_minimum_property_name_to_ctype,
587 gb18030_is_code_ctype,
588 onigenc_not_support_get_ctype_code_range,
589 gb18030_left_adjust_char_head,
590 gb18030_is_allowed_reverse_match,
591 NULL, /* init */
592 NULL, /* is_initialized */
593 is_valid_mbc_string,
594 ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,
595 0, 0
596 };
597