/**********************************************************************
  gb18030.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2005-2020  KUBO Takehiro <kubo AT jiubao DOT org>
 *                          K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

31 #include "regenc.h"
32
33 #if 1
34
35 #define DEBUG_GB18030(arg)
36
37 #else
38
39 #ifndef NEED_TO_INCLUDE_STDIO
40 #define NEED_TO_INCLUDE_STDIO
41 #endif
42
43 /* for printf() */
44 #include "regint.h"
45
46 #define DEBUG_GB18030(arg) printf arg
47
48 #endif
49
/*
 * Byte classes stored in GB18030_MAP.  Each class names the positions a
 * byte value may occupy inside a GB18030 character.
 */
enum {
  C1, /* one-byte char */
  C2, /* one-byte or second of two-byte char */
  C4, /* one-byte or second or fourth of four-byte char */
  CM  /* first of two- or four-byte char or second of two-byte char */
};

57 static const char GB18030_MAP[] = {
58 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
59 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
60 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
61 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
62 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
63 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
64 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
65 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
66 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
67 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
68 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
69 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
70 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
71 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
72 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
73 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
74 };
75
76 static int
gb18030_mbc_enc_len(const UChar * p)77 gb18030_mbc_enc_len(const UChar* p)
78 {
79 if (GB18030_MAP[*p] != CM)
80 return 1;
81
82 p++;
83 if (GB18030_MAP[*p] == C4)
84 return 4;
85
86 return 2;
87 }
88
89 static int
gb18030_code_to_mbclen(OnigCodePoint code)90 gb18030_code_to_mbclen(OnigCodePoint code)
91 {
92 if ((code & 0xff000000) != 0) return 4;
93 else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
94 else if ((code & 0xff00) != 0) return 2;
95 else {
96 if (GB18030_MAP[(int )(code & 0xff)] == CM)
97 return ONIGERR_INVALID_CODE_POINT_VALUE;
98
99 return 1;
100 }
101 }
102
103 static int
is_valid_mbc_string(const UChar * p,const UChar * end)104 is_valid_mbc_string(const UChar* p, const UChar* end)
105 {
106 while (p < end) {
107 if (*p < 0x80) {
108 p++;
109 }
110 else if (*p == 0x80 || *p == 0xff) {
111 return FALSE;
112 }
113 else {
114 p++;
115 if (p >= end) return FALSE;
116 if (*p < 0x40) {
117 if (*p < 0x30 || *p > 0x39)
118 return FALSE;
119
120 p++;
121 if (p >= end) return FALSE;
122 if (*p < 0x81 || *p == 0xff) return FALSE;
123
124 p++;
125 if (p >= end) return FALSE;
126 if (*p < 0x30 || *p > 0x39)
127 return FALSE;
128
129 p++;
130 }
131 else if (*p == 0x7f || *p == 0xff) {
132 return FALSE;
133 }
134 else {
135 p++;
136 }
137 }
138 }
139
140 return TRUE;
141 }
142
143 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)144 gb18030_mbc_to_code(const UChar* p, const UChar* end)
145 {
146 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
147 }
148
149 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)150 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
151 {
152 return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
153 }
154
155 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)156 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
157 UChar* lower)
158 {
159 return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
160 pp, end, lower);
161 }
162
163 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)164 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
165 {
166 return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
167 }
168
/*
 * States of the backward scan in gb18030_left_adjust_char_head().
 * A state name describes the run of byte classes (C2, C4, CM) seen so
 * far while walking from the starting position toward the buffer start;
 * "odd"/"even" give the parity of a repeated run.
 */
enum state {
  S_START,
  S_one_C2,
  S_one_C4,
  S_one_CM,

  S_odd_CM_one_CX,
  S_even_CM_one_CX,

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,
  S_odd_CMC4,
  S_one_C4_odd_CMC4,
  S_even_CMC4,
  S_one_C4_even_CMC4,

  S_odd_CM_odd_CMC4,
  S_even_CM_odd_CMC4,

  S_odd_CM_even_CMC4,
  S_even_CM_even_CMC4,

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,
  S_one_CM_odd_C4CM,
  S_even_C4CM,
  S_one_CM_even_C4CM,

  S_even_CM_odd_C4CM,
  S_odd_CM_odd_C4CM,
  S_even_CM_even_C4CM,
  S_odd_CM_even_C4CM,
};

203 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)204 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
205 {
206 const UChar *p;
207 enum state state = S_START;
208
209 DEBUG_GB18030(("----------------\n"));
210 for (p = s; p >= start; p--) {
211 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
212 switch (state) {
213 case S_START:
214 switch (GB18030_MAP[*p]) {
215 case C1:
216 return (UChar *)s;
217 case C2:
218 state = S_one_C2; /* C2 */
219 break;
220 case C4:
221 state = S_one_C4; /* C4 */
222 break;
223 case CM:
224 state = S_one_CM; /* CM */
225 break;
226 }
227 break;
228 case S_one_C2: /* C2 */
229 switch (GB18030_MAP[*p]) {
230 case C1:
231 case C2:
232 case C4:
233 return (UChar *)s;
234 case CM:
235 state = S_odd_CM_one_CX; /* CM C2 */
236 break;
237 }
238 break;
239 case S_one_C4: /* C4 */
240 switch (GB18030_MAP[*p]) {
241 case C1:
242 case C2:
243 case C4:
244 return (UChar *)s;
245 case CM:
246 state = S_one_CMC4;
247 break;
248 }
249 break;
250 case S_one_CM: /* CM */
251 switch (GB18030_MAP[*p]) {
252 case C1:
253 case C2:
254 return (UChar *)s;
255 case C4:
256 state = S_odd_C4CM;
257 break;
258 case CM:
259 state = S_odd_CM_one_CX; /* CM CM */
260 break;
261 }
262 break;
263
264 case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
265 switch (GB18030_MAP[*p]) {
266 case C1:
267 case C2:
268 case C4:
269 return (UChar *)(s - 1);
270 case CM:
271 state = S_even_CM_one_CX;
272 break;
273 }
274 break;
275 case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
276 switch (GB18030_MAP[*p]) {
277 case C1:
278 case C2:
279 case C4:
280 return (UChar *)s;
281 case CM:
282 state = S_odd_CM_one_CX;
283 break;
284 }
285 break;
286
287 case S_one_CMC4: /* CM C4 */
288 switch (GB18030_MAP[*p]) {
289 case C1:
290 case C2:
291 return (UChar *)(s - 1);
292 case C4:
293 state = S_one_C4_odd_CMC4; /* C4 CM C4 */
294 break;
295 case CM:
296 state = S_even_CM_one_CX; /* CM CM C4 */
297 break;
298 }
299 break;
300 case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
301 switch (GB18030_MAP[*p]) {
302 case C1:
303 case C2:
304 return (UChar *)(s - 1);
305 case C4:
306 state = S_one_C4_odd_CMC4;
307 break;
308 case CM:
309 state = S_odd_CM_odd_CMC4;
310 break;
311 }
312 break;
313 case S_one_C4_odd_CMC4: /* C4 CM C4 */
314 switch (GB18030_MAP[*p]) {
315 case C1:
316 case C2:
317 case C4:
318 return (UChar *)(s - 1);
319 case CM:
320 state = S_even_CMC4; /* CM C4 CM C4 */
321 break;
322 }
323 break;
324 case S_even_CMC4: /* CM C4 CM C4 */
325 switch (GB18030_MAP[*p]) {
326 case C1:
327 case C2:
328 return (UChar *)(s - 3);
329 case C4:
330 state = S_one_C4_even_CMC4;
331 break;
332 case CM:
333 state = S_odd_CM_even_CMC4;
334 break;
335 }
336 break;
337 case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
338 switch (GB18030_MAP[*p]) {
339 case C1:
340 case C2:
341 case C4:
342 return (UChar *)(s - 3);
343 case CM:
344 state = S_odd_CMC4;
345 break;
346 }
347 break;
348
349 case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
350 switch (GB18030_MAP[*p]) {
351 case C1:
352 case C2:
353 case C4:
354 return (UChar *)(s - 3);
355 case CM:
356 state = S_even_CM_odd_CMC4;
357 break;
358 }
359 break;
360 case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
361 switch (GB18030_MAP[*p]) {
362 case C1:
363 case C2:
364 case C4:
365 return (UChar *)(s - 1);
366 case CM:
367 state = S_odd_CM_odd_CMC4;
368 break;
369 }
370 break;
371
372 case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
373 switch (GB18030_MAP[*p]) {
374 case C1:
375 case C2:
376 case C4:
377 return (UChar *)(s - 1);
378 case CM:
379 state = S_even_CM_even_CMC4;
380 break;
381 }
382 break;
383 case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
384 switch (GB18030_MAP[*p]) {
385 case C1:
386 case C2:
387 case C4:
388 return (UChar *)(s - 3);
389 case CM:
390 state = S_odd_CM_even_CMC4;
391 break;
392 }
393 break;
394
395 case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
396 switch (GB18030_MAP[*p]) {
397 case C1:
398 case C2:
399 case C4:
400 return (UChar *)s;
401 case CM:
402 state = S_one_CM_odd_C4CM; /* CM C4 CM */
403 break;
404 }
405 break;
406 case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
407 switch (GB18030_MAP[*p]) {
408 case C1:
409 case C2:
410 return (UChar *)(s - 2); /* |CM C4 CM */
411 case C4:
412 state = S_even_C4CM;
413 break;
414 case CM:
415 state = S_even_CM_odd_C4CM;
416 break;
417 }
418 break;
419 case S_even_C4CM: /* C4 CM C4 CM */
420 switch (GB18030_MAP[*p]) {
421 case C1:
422 case C2:
423 case C4:
424 return (UChar *)(s - 2); /* C4|CM C4 CM */
425 case CM:
426 state = S_one_CM_even_C4CM;
427 break;
428 }
429 break;
430 case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
431 switch (GB18030_MAP[*p]) {
432 case C1:
433 case C2:
434 return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
435 case C4:
436 state = S_odd_C4CM;
437 break;
438 case CM:
439 state = S_even_CM_even_C4CM;
440 break;
441 }
442 break;
443
444 case S_even_CM_odd_C4CM: /* CM CM C4 CM */
445 switch (GB18030_MAP[*p]) {
446 case C1:
447 case C2:
448 case C4:
449 return (UChar *)(s - 0); /* |CM CM|C4|CM */
450 case CM:
451 state = S_odd_CM_odd_C4CM;
452 break;
453 }
454 break;
455 case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
456 switch (GB18030_MAP[*p]) {
457 case C1:
458 case C2:
459 case C4:
460 return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
461 case CM:
462 state = S_even_CM_odd_C4CM;
463 break;
464 }
465 break;
466
467 case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
468 switch (GB18030_MAP[*p]) {
469 case C1:
470 case C2:
471 case C4:
472 return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
473 case CM:
474 state = S_odd_CM_even_C4CM;
475 break;
476 }
477 break;
478 case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
479 switch (GB18030_MAP[*p]) {
480 case C1:
481 case C2:
482 case C4:
483 return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
484 case CM:
485 state = S_even_CM_even_C4CM;
486 break;
487 }
488 break;
489 }
490 }
491
492 DEBUG_GB18030(("state %d\n", state));
493 switch (state) {
494 case S_START: return (UChar *)(s - 0);
495 case S_one_C2: return (UChar *)(s - 0);
496 case S_one_C4: return (UChar *)(s - 0);
497 case S_one_CM: return (UChar *)(s - 0);
498
499 case S_odd_CM_one_CX: return (UChar *)(s - 1);
500 case S_even_CM_one_CX: return (UChar *)(s - 0);
501
502 case S_one_CMC4: return (UChar *)(s - 1);
503 case S_odd_CMC4: return (UChar *)(s - 1);
504 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
505 case S_even_CMC4: return (UChar *)(s - 3);
506 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
507
508 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
509 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
510
511 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
512 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
513
514 case S_odd_C4CM: return (UChar *)(s - 0);
515 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
516 case S_even_C4CM: return (UChar *)(s - 2);
517 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
518
519 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
520 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
521 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
522 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
523 }
524
525 return (UChar* )s; /* never come here. (escape warning) */
526 }
527
528 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)529 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
530 {
531 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
532 }
533
534 OnigEncodingType OnigEncodingGB18030 = {
535 gb18030_mbc_enc_len,
536 "GB18030", /* name */
537 4, /* max enc length */
538 1, /* min enc length */
539 onigenc_is_mbc_newline_0x0a,
540 gb18030_mbc_to_code,
541 gb18030_code_to_mbclen,
542 gb18030_code_to_mbc,
543 gb18030_mbc_case_fold,
544 onigenc_ascii_apply_all_case_fold,
545 onigenc_ascii_get_case_fold_codes_by_str,
546 onigenc_minimum_property_name_to_ctype,
547 gb18030_is_code_ctype,
548 onigenc_not_support_get_ctype_code_range,
549 gb18030_left_adjust_char_head,
550 gb18030_is_allowed_reverse_match,
551 NULL, /* init */
552 NULL, /* is_initialized */
553 is_valid_mbc_string,
554 ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,
555 0, 0
556 };
557