1 /**********************************************************************
2   utf8.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2019  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * U+0000 - U+10FFFF
 * While this macro is defined, the length table, the encoder and the
 * encoding descriptor in this file are built for sequences of at most
 * 4 bytes; otherwise the 5- and 6-byte forms are compiled in as well.
 */
#define USE_RFC3629_RANGE

/*
 * Optional scheme, disabled by default, that assigns virtual code point
 * values to the invalid encoding bytes 0xfe and 0xff.  To enable it,
 * define the macro USE_INVALID_CODE_SCHEME at this point.
 */

#if defined(USE_INVALID_CODE_SCHEME)
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
#define INVALID_CODE_FE   0xfffffffe
#define INVALID_CODE_FF   0xffffffff
#define VALID_CODE_LIMIT  0x7fffffff
#endif
43 
/*
 * Byte classification.  A trail (continuation) byte has the bit pattern
 * 10xxxxxx; every other byte value is treated as a lead byte.
 */
#define utf8_istail(c)     ((UChar )((c) & 0xc0) == 0x80)
#define utf8_islead(c)     (! utf8_istail(c))
46 
/*
 * Sequence length in bytes, indexed by the value of the first byte.
 * Bytes that cannot start a multi-byte sequence (0x80-0xbf, and the
 * unused lead values at the top of the range) are given length 1.
 */
static const int EncLen_UTF8[] = {
  /* 0x00 - 0x7f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
  /* 0x80 - 0xbf: trail bytes */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x80 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x90 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xa0 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xb0 */
  /* 0xc0 - 0xdf: 2-byte lead bytes */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xc0 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xd0 */
  /* 0xe0 - 0xef: 3-byte lead bytes */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xe0 */
  /* 0xf0 - 0xff: depends on the configured range */
#if defined(USE_RFC3629_RANGE)
  4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1    /* 0xf0 */
#else
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1    /* 0xf0 */
#endif
};
69 
70 static int
mbc_enc_len(const UChar * p)71 mbc_enc_len(const UChar* p)
72 {
73   return EncLen_UTF8[*p];
74 }
75 
76 static int
is_valid_mbc_string(const UChar * p,const UChar * end)77 is_valid_mbc_string(const UChar* p, const UChar* end)
78 {
79   int i, len;
80 
81   while (p < end) {
82     if (! utf8_islead(*p))
83       return FALSE;
84 
85     len = mbc_enc_len(p++);
86     if (len > 1) {
87       for (i = 1; i < len; i++) {
88         if (p == end)
89           return FALSE;
90 
91         if (! utf8_istail(*p++))
92           return FALSE;
93       }
94     }
95   }
96 
97   return TRUE;
98 }
99 
100 static OnigCodePoint
mbc_to_code(const UChar * p,const UChar * end)101 mbc_to_code(const UChar* p, const UChar* end)
102 {
103   int c, len;
104   OnigCodePoint n;
105 
106   len = mbc_enc_len(p);
107   if (len > (int )(end - p)) len = (int )(end - p);
108 
109   c = *p++;
110   if (len > 1) {
111     len--;
112     n = c & ((1 << (6 - len)) - 1);
113     while (len--) {
114       c = *p++;
115       n = (n << 6) | (c & ((1 << 6) - 1));
116     }
117     return n;
118   }
119   else {
120 #ifdef USE_INVALID_CODE_SCHEME
121     if (c > 0xfd) {
122       return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
123     }
124 #endif
125     return (OnigCodePoint )c;
126   }
127 }
128 
129 static int
code_to_mbclen(OnigCodePoint code)130 code_to_mbclen(OnigCodePoint code)
131 {
132   if      ((code & 0xffffff80) == 0) return 1;
133   else if ((code & 0xfffff800) == 0) return 2;
134   else if ((code & 0xffff0000) == 0) return 3;
135   else if ((code & 0xffe00000) == 0) return 4;
136 #ifndef USE_RFC3629_RANGE
137   else if ((code & 0xfc000000) == 0) return 5;
138   else if ((code & 0x80000000) == 0) return 6;
139 #endif
140 #ifdef USE_INVALID_CODE_SCHEME
141   else if (code == INVALID_CODE_FE) return 1;
142   else if (code == INVALID_CODE_FF) return 1;
143 #endif
144   else
145     return ONIGERR_INVALID_CODE_POINT_VALUE;
146 }
147 
148 static int
code_to_mbc(OnigCodePoint code,UChar * buf)149 code_to_mbc(OnigCodePoint code, UChar *buf)
150 {
151 #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
152 #define UTF8_TRAIL0(code)        (UChar )(((code) & 0x3f) | 0x80)
153 
154   if ((code & 0xffffff80) == 0) {
155     *buf = (UChar )code;
156     return 1;
157   }
158   else {
159     UChar *p = buf;
160 
161     if ((code & 0xfffff800) == 0) {
162       *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
163     }
164     else if ((code & 0xffff0000) == 0) {
165       *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
166       *p++ = UTF8_TRAILS(code, 6);
167     }
168     else if ((code & 0xffe00000) == 0) {
169       *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
170       *p++ = UTF8_TRAILS(code, 12);
171       *p++ = UTF8_TRAILS(code,  6);
172     }
173 #ifndef USE_RFC3629_RANGE
174     else if ((code & 0xfc000000) == 0) {
175       *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
176       *p++ = UTF8_TRAILS(code, 18);
177       *p++ = UTF8_TRAILS(code, 12);
178       *p++ = UTF8_TRAILS(code,  6);
179     }
180     else if ((code & 0x80000000) == 0) {
181       *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
182       *p++ = UTF8_TRAILS(code, 24);
183       *p++ = UTF8_TRAILS(code, 18);
184       *p++ = UTF8_TRAILS(code, 12);
185       *p++ = UTF8_TRAILS(code,  6);
186     }
187 #endif
188 #ifdef USE_INVALID_CODE_SCHEME
189     else if (code == INVALID_CODE_FE) {
190       *p = 0xfe;
191       return 1;
192     }
193     else if (code == INVALID_CODE_FF) {
194       *p = 0xff;
195       return 1;
196     }
197 #endif
198     else {
199       return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
200     }
201 
202     *p++ = UTF8_TRAIL0(code);
203     return (int )(p - buf);
204   }
205 }
206 
207 static int
mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * fold)208 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
209               const UChar* end, UChar* fold)
210 {
211   const UChar* p = *pp;
212 
213   if (ONIGENC_IS_MBC_ASCII(p)) {
214 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
215     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
216       if (*p == 0x49) {
217         *fold++ = 0xc4;
218         *fold   = 0xb1;
219         (*pp)++;
220         return 2;
221       }
222     }
223 #endif
224 
225     *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
226     (*pp)++;
227     return 1; /* return byte length of converted char to lower */
228   }
229   else {
230     return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF8, flag,
231                                          pp, end, fold);
232   }
233 }
234 
235 static int
get_ctype_code_range(OnigCtype ctype,OnigCodePoint * sb_out,const OnigCodePoint * ranges[])236 get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
237                      const OnigCodePoint* ranges[])
238 {
239   *sb_out = 0x80;
240   return onigenc_unicode_ctype_code_range(ctype, ranges);
241 }
242 
243 
244 static UChar*
left_adjust_char_head(const UChar * start,const UChar * s)245 left_adjust_char_head(const UChar* start, const UChar* s)
246 {
247   const UChar *p;
248 
249   if (s <= start) return (UChar* )s;
250   p = s;
251 
252   while (!utf8_islead(*p) && p > start) p--;
253   return (UChar* )p;
254 }
255 
256 static int
get_case_fold_codes_by_str(OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])257 get_case_fold_codes_by_str(OnigCaseFoldType flag,
258     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
259 {
260   return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF8,
261                                                     flag, p, end, items);
262 }
263 
264 OnigEncodingType OnigEncodingUTF8 = {
265   mbc_enc_len,
266   "UTF-8",     /* name */
267 #ifdef USE_RFC3629_RANGE
268   4,           /* max enc length */
269 #else
270   6,
271 #endif
272   1,           /* min enc length */
273   onigenc_is_mbc_newline_0x0a,
274   mbc_to_code,
275   code_to_mbclen,
276   code_to_mbc,
277   mbc_case_fold,
278   onigenc_unicode_apply_all_case_fold,
279   get_case_fold_codes_by_str,
280   onigenc_unicode_property_name_to_ctype,
281   onigenc_unicode_is_code_ctype,
282   get_ctype_code_range,
283   left_adjust_char_head,
284   onigenc_always_true_is_allowed_reverse_match,
285   NULL, /* init */
286   NULL, /* is_initialized */
287   is_valid_mbc_string,
288   ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_UNICODE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
289   0, 0
290 };
291