1 /**********************************************************************
2   gb18030.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2005-2020  KUBO Takehiro <kubo AT jiubao DOT org>
6  *                          K.Kosako
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include "regenc.h"
32 
/* Uncomment to trace the left_adjust_char_head state machine via printf(). */
/* #define DEBUG_GB18030 */

#ifdef DEBUG_GB18030

/* Tracing build: DEBUG_OUT((fmt, ...)) forwards its parenthesized
   argument list to printf(). */
#ifndef NEED_TO_INCLUDE_STDIO
#define NEED_TO_INCLUDE_STDIO
#endif

/* for printf() */
#include "regint.h"

#define DEBUG_OUT(arg) printf arg

#else

/* Normal build: DEBUG_OUT((...)) expands to nothing, so its arguments
   are never evaluated. */
#define DEBUG_OUT(arg)

#endif
51 
/* Byte classes stored in GB18030_MAP; each names the positions a byte
   value may occupy inside a GB18030 character. */
enum {
  C1 = 0, /* one-byte char */
  C2 = 1, /* one-byte or second of two-byte char */
  C4 = 2, /* one-byte or second or fourth of four-byte char */
  CM = 3  /* first of two- or four-byte char or second of two-byte char */
};
58 
59 static const char GB18030_MAP[] = {
60   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
61   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
62   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
63   C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
64   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
65   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
66   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
67   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
68   C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
69   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
70   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
71   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
72   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
73   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
74   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
75   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
76 };
77 
78 static int
gb18030_mbc_enc_len(const UChar * p)79 gb18030_mbc_enc_len(const UChar* p)
80 {
81   if (GB18030_MAP[*p] != CM)
82     return 1;
83 
84   p++;
85   if (GB18030_MAP[*p] == C4)
86     return 4;
87 
88   return 2;
89 }
90 
91 static int
gb18030_code_to_mbclen(OnigCodePoint code)92 gb18030_code_to_mbclen(OnigCodePoint code)
93 {
94   if ((code & 0xff000000) != 0) {
95     if (GB18030_MAP[(int )(code >> 24) & 0xff] == CM)
96       if (GB18030_MAP[(int )(code >> 16) & 0xff] == C4)
97         return 4;
98   }
99   else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
100   else if ((code & 0xff00) != 0) {
101     if (GB18030_MAP[(int )(code >> 8) & 0xff] == CM) {
102       char c = GB18030_MAP[(int )code & 0xff];
103       if (c == CM || c == C2)
104         return 2;
105     }
106   }
107   else {
108     if (GB18030_MAP[(int )(code & 0xff)] != CM)
109       return 1;
110   }
111 
112   return ONIGERR_INVALID_CODE_POINT_VALUE;
113 }
114 
115 static int
is_valid_mbc_string(const UChar * p,const UChar * end)116 is_valid_mbc_string(const UChar* p, const UChar* end)
117 {
118   while (p < end) {
119     if (*p < 0x80) {
120       p++;
121     }
122     else if (*p == 0x80 || *p == 0xff) {
123       return FALSE;
124     }
125     else {
126       p++;
127       if (p >= end) return FALSE;
128       if (*p < 0x40) {
129         if (*p < 0x30 || *p > 0x39)
130           return FALSE;
131 
132         p++;
133         if (p >= end) return FALSE;
134         if (*p < 0x81 || *p == 0xff) return FALSE;
135 
136         p++;
137         if (p >= end) return FALSE;
138         if (*p < 0x30 || *p > 0x39)
139           return FALSE;
140 
141         p++;
142       }
143       else if (*p == 0x7f || *p == 0xff) {
144         return FALSE;
145       }
146       else {
147         p++;
148       }
149     }
150   }
151 
152   return TRUE;
153 }
154 
155 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)156 gb18030_mbc_to_code(const UChar* p, const UChar* end)
157 {
158   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
159 }
160 
161 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)162 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
163 {
164   return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
165 }
166 
167 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)168 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
169                       UChar* lower)
170 {
171   return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
172                                    pp, end, lower);
173 }
174 
175 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)176 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
177 {
178   return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
179 }
180 
/*
 * States of the backward scan in gb18030_left_adjust_char_head().
 * A state name describes the byte classes seen so far, read left to right
 * in the buffer (the scan itself runs right to left).  The values are
 * consecutive from 0 and match the order of the StateNames debug table.
 */
enum state {
  S_START              = 0,
  S_one_C2             = 1,
  S_one_C4             = 2,
  S_one_CM             = 3,

  S_odd_CM_one_CX      = 4,
  S_even_CM_one_CX     = 5,

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4           = 6,
  S_odd_CMC4           = 7,
  S_one_C4_odd_CMC4    = 8,
  S_even_CMC4          = 9,
  S_one_C4_even_CMC4   = 10,

  S_odd_CM_odd_CMC4    = 11,
  S_even_CM_odd_CMC4   = 12,

  S_odd_CM_even_CMC4   = 13,
  S_even_CM_even_CMC4  = 14,

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM           = 15,
  S_one_CM_odd_C4CM    = 16,
  S_even_C4CM          = 17,
  S_one_CM_even_C4CM   = 18,

  S_even_CM_odd_C4CM   = 19,
  S_odd_CM_odd_C4CM    = 20,
  S_even_CM_even_C4CM  = 21,
  S_odd_CM_even_C4CM   = 22,
};
214 
#ifdef DEBUG_GB18030
/* Printable name of each scanner state, used only by DEBUG_OUT tracing.
   Each entry is keyed by its enum state value. */
static char* StateNames[] = {
  [S_START]             = "S_START",
  [S_one_C2]            = "S_one_C2",
  [S_one_C4]            = "S_one_C4",
  [S_one_CM]            = "S_one_CM",
  [S_odd_CM_one_CX]     = "S_odd_CM_one_CX",
  [S_even_CM_one_CX]    = "S_even_CM_one_CX",
  [S_one_CMC4]          = "S_one_CMC4",
  [S_odd_CMC4]          = "S_odd_CMC4",
  [S_one_C4_odd_CMC4]   = "S_one_C4_odd_CMC4",
  [S_even_CMC4]         = "S_even_CMC4",
  [S_one_C4_even_CMC4]  = "S_one_C4_even_CMC4",
  [S_odd_CM_odd_CMC4]   = "S_odd_CM_odd_CMC4",
  [S_even_CM_odd_CMC4]  = "S_even_CM_odd_CMC4",
  [S_odd_CM_even_CMC4]  = "S_odd_CM_even_CMC4",
  [S_even_CM_even_CMC4] = "S_even_CM_even_CMC4",
  [S_odd_C4CM]          = "S_odd_C4CM",
  [S_one_CM_odd_C4CM]   = "S_one_CM_odd_C4CM",
  [S_even_C4CM]         = "S_even_C4CM",
  [S_one_CM_even_C4CM]  = "S_one_CM_even_C4CM",
  [S_even_CM_odd_C4CM]  = "S_even_CM_odd_C4CM",
  [S_odd_CM_odd_C4CM]   = "S_odd_CM_odd_C4CM",
  [S_even_CM_even_C4CM] = "S_even_CM_even_C4CM",
  [S_odd_CM_even_C4CM]  = "S_odd_CM_even_C4CM"
};
#endif
242 
243 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)244 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
245 {
246   const UChar *p;
247   enum state state = S_START;
248 
249   DEBUG_OUT(("----------------\n"));
250   for (p = s; p >= start; p--) {
251     DEBUG_OUT(("%5d: state %-19s (0x%02x)->\n", (int )(p - start), StateNames[state], *p));
252     switch (state) {
253     case S_START:
254       switch (GB18030_MAP[*p]) {
255       case C1:
256         return (UChar *)s;
257       case C2:
258         state = S_one_C2; /* C2 */
259         break;
260       case C4:
261         state = S_one_C4; /* C4 */
262         break;
263       case CM:
264         state = S_one_CM; /* CM */
265         break;
266       }
267       break;
268     case S_one_C2: /* C2 */
269       switch (GB18030_MAP[*p]) {
270       case C1:
271       case C2:
272       case C4:
273         return (UChar *)s;
274       case CM:
275         state = S_odd_CM_one_CX; /* CM C2 */
276         break;
277       }
278       break;
279     case S_one_C4: /* C4 */
280       switch (GB18030_MAP[*p]) {
281       case C1:
282       case C2:
283       case C4:
284         return (UChar *)s;
285       case CM:
286         state = S_one_CMC4;
287         break;
288       }
289       break;
290     case S_one_CM: /* CM */
291       switch (GB18030_MAP[*p]) {
292       case C1:
293       case C2:
294         return (UChar *)s;
295       case C4:
296         state = S_odd_C4CM;
297         break;
298       case CM:
299         state = S_odd_CM_one_CX; /* CM CM */
300         break;
301       }
302       break;
303 
304     case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
305       switch (GB18030_MAP[*p]) {
306       case C1:
307       case C2:
308       case C4:
309         return (UChar *)(s - 1);
310       case CM:
311         state = S_even_CM_one_CX;
312         break;
313       }
314       break;
315     case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
316       switch (GB18030_MAP[*p]) {
317       case C1:
318       case C2:
319       case C4:
320         return (UChar *)s;
321       case CM:
322         state = S_odd_CM_one_CX;
323         break;
324       }
325       break;
326 
327     case S_one_CMC4: /* CM C4 */
328       switch (GB18030_MAP[*p]) {
329       case C1:
330       case C2:
331         return (UChar *)(s - 1);
332       case C4:
333         state = S_one_C4_odd_CMC4; /* C4 CM C4 */
334         break;
335       case CM:
336         state = S_even_CM_one_CX; /* CM CM C4 */
337         break;
338       }
339       break;
340     case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
341       switch (GB18030_MAP[*p]) {
342       case C1:
343       case C2:
344         return (UChar *)(s - 1);
345       case C4:
346         state = S_one_C4_odd_CMC4;
347         break;
348       case CM:
349         state = S_odd_CM_odd_CMC4;
350         break;
351       }
352       break;
353     case S_one_C4_odd_CMC4: /* C4 CM C4 */
354       switch (GB18030_MAP[*p]) {
355       case C1:
356       case C2:
357       case C4:
358         return (UChar *)(s - 1);
359       case CM:
360         state = S_even_CMC4; /* CM C4 CM C4 */
361         break;
362       }
363       break;
364     case S_even_CMC4: /* CM C4 CM C4 */
365       switch (GB18030_MAP[*p]) {
366       case C1:
367       case C2:
368         return (UChar *)(s - 3);
369       case C4:
370         state = S_one_C4_even_CMC4;
371         break;
372       case CM:
373         state = S_odd_CM_even_CMC4;
374         break;
375       }
376       break;
377     case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
378       switch (GB18030_MAP[*p]) {
379       case C1:
380       case C2:
381       case C4:
382         return (UChar *)(s - 3);
383       case CM:
384         state = S_odd_CMC4;
385         break;
386       }
387       break;
388 
389     case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
390       switch (GB18030_MAP[*p]) {
391       case C1:
392       case C2:
393       case C4:
394         return (UChar *)(s - 3);
395       case CM:
396         state = S_even_CM_odd_CMC4;
397         break;
398       }
399       break;
400     case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
401       switch (GB18030_MAP[*p]) {
402       case C1:
403       case C2:
404       case C4:
405         return (UChar *)(s - 1);
406       case CM:
407         state = S_odd_CM_odd_CMC4;
408         break;
409       }
410       break;
411 
412     case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
413       switch (GB18030_MAP[*p]) {
414       case C1:
415       case C2:
416       case C4:
417         return (UChar *)(s - 1);
418       case CM:
419         state = S_even_CM_even_CMC4;
420         break;
421       }
422       break;
423     case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
424       switch (GB18030_MAP[*p]) {
425       case C1:
426       case C2:
427       case C4:
428         return (UChar *)(s - 3);
429       case CM:
430         state = S_odd_CM_even_CMC4;
431         break;
432       }
433       break;
434 
435     case S_odd_C4CM: /* C4 CM */  /* C4 CM C4 CM C4 CM*/
436       switch (GB18030_MAP[*p]) {
437       case C1:
438       case C2:
439       case C4:
440         return (UChar *)s;
441       case CM:
442         state = S_one_CM_odd_C4CM; /* CM C4 CM */
443         break;
444       }
445       break;
446     case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
447       switch (GB18030_MAP[*p]) {
448       case C1:
449       case C2:
450         return (UChar *)(s - 2); /* |CM C4 CM */
451       case C4:
452         state = S_even_C4CM;
453         break;
454       case CM:
455         state = S_even_CM_odd_C4CM;
456         break;
457       }
458       break;
459     case S_even_C4CM: /* C4 CM C4 CM */
460       switch (GB18030_MAP[*p]) {
461       case C1:
462       case C2:
463       case C4:
464         return (UChar *)(s - 2);  /* C4|CM C4 CM */
465       case CM:
466         state = S_one_CM_even_C4CM;
467         break;
468       }
469       break;
470     case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
471       switch (GB18030_MAP[*p]) {
472       case C1:
473       case C2:
474         return (UChar *)(s - 0);  /*|CM C4 CM C4|CM */
475       case C4:
476         state = S_odd_C4CM;
477         break;
478       case CM:
479         state = S_even_CM_even_C4CM;
480         break;
481       }
482       break;
483 
484     case S_even_CM_odd_C4CM: /* CM CM C4 CM */
485       switch (GB18030_MAP[*p]) {
486       case C1:
487       case C2:
488       case C4:
489         return (UChar *)(s - 0); /* |CM CM|C4|CM */
490       case CM:
491         state = S_odd_CM_odd_C4CM;
492         break;
493       }
494       break;
495     case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
496       switch (GB18030_MAP[*p]) {
497       case C1:
498       case C2:
499       case C4:
500         return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
501       case CM:
502         state = S_even_CM_odd_C4CM;
503         break;
504       }
505       break;
506 
507     case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
508       switch (GB18030_MAP[*p]) {
509       case C1:
510       case C2:
511       case C4:
512         return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
513       case CM:
514         state = S_odd_CM_even_C4CM;
515         break;
516       }
517       break;
518     case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
519       switch (GB18030_MAP[*p]) {
520       case C1:
521       case C2:
522       case C4:
523         return (UChar *)(s - 0);  /* |CM CM|CM C4 CM C4|CM */
524       case CM:
525         state = S_even_CM_even_C4CM;
526         break;
527       }
528       break;
529     }
530   }
531 
532   DEBUG_OUT(("state %-19s\n", StateNames[state]));
533   switch (state) {
534   case S_START:             return (UChar *)(s - 0);
535   case S_one_C2:            return (UChar *)(s - 0);
536   case S_one_C4:            return (UChar *)(s - 0);
537   case S_one_CM:            return (UChar *)(s - 0);
538 
539   case S_odd_CM_one_CX:     return (UChar *)(s - 1);
540   case S_even_CM_one_CX:    return (UChar *)(s - 0);
541 
542   case S_one_CMC4:          return (UChar *)(s - 1);
543   case S_odd_CMC4:          return (UChar *)(s - 1);
544   case S_one_C4_odd_CMC4:   return (UChar *)(s - 1);
545   case S_even_CMC4:         return (UChar *)(s - 3);
546   case S_one_C4_even_CMC4:  return (UChar *)(s - 3);
547 
548   case S_odd_CM_odd_CMC4:   return (UChar *)(s - 3);
549   case S_even_CM_odd_CMC4:  return (UChar *)(s - 1);
550 
551   case S_odd_CM_even_CMC4:  return (UChar *)(s - 1);
552   case S_even_CM_even_CMC4: return (UChar *)(s - 3);
553 
554   case S_odd_C4CM:          return (UChar *)(s - 0);
555   case S_one_CM_odd_C4CM:   return (UChar *)(s - 2);
556   case S_even_C4CM:         return (UChar *)(s - 2);
557   case S_one_CM_even_C4CM:  return (UChar *)(s - 0);
558 
559   case S_even_CM_odd_C4CM:  return (UChar *)(s - 0);
560   case S_odd_CM_odd_C4CM:   return (UChar *)(s - 2);
561   case S_even_CM_even_C4CM: return (UChar *)(s - 2);
562   case S_odd_CM_even_C4CM:  return (UChar *)(s - 0);
563   }
564 
565   return (UChar* )s;  /* never come here. (escape warning) */
566 }
567 
568 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)569 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
570 {
571   return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
572 }
573 
574 OnigEncodingType OnigEncodingGB18030 = {
575   gb18030_mbc_enc_len,
576   "GB18030",   /* name */
577   4,          /* max enc length */
578   1,          /* min enc length */
579   onigenc_is_mbc_newline_0x0a,
580   gb18030_mbc_to_code,
581   gb18030_code_to_mbclen,
582   gb18030_code_to_mbc,
583   gb18030_mbc_case_fold,
584   onigenc_ascii_apply_all_case_fold,
585   onigenc_ascii_get_case_fold_codes_by_str,
586   onigenc_minimum_property_name_to_ctype,
587   gb18030_is_code_ctype,
588   onigenc_not_support_get_ctype_code_range,
589   gb18030_left_adjust_char_head,
590   gb18030_is_allowed_reverse_match,
591   NULL, /* init */
592   NULL, /* is_initialized */
593   is_valid_mbc_string,
594   ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,
595   0, 0
596 };
597