1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2018 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Wez Furlong (wez@thebrainroom.com)                           |
16    +----------------------------------------------------------------------+
17 
18 	Based on code from ucdata-2.5, which has the following Copyright:
19 
20 	Copyright 2001 Computing Research Labs, New Mexico State University
21 
22 	Permission is hereby granted, free of charge, to any person obtaining a
23 	copy of this software and associated documentation files (the "Software"),
24 	to deal in the Software without restriction, including without limitation
25 	the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 	and/or sell copies of the Software, and to permit persons to whom the
27 	Software is furnished to do so, subject to the following conditions:
28 
29 	The above copyright notice and this permission notice shall be included in
30 	all copies or substantial portions of the Software.
31 */
32 
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36 
37 #include "php.h"
38 #include "php_ini.h"
39 
40 #if HAVE_MBSTRING
41 
42 /* include case folding data generated from the official UnicodeData.txt file */
43 #include "mbstring.h"
44 #include "php_unicode.h"
45 #include "unicode_data.h"
46 #include "libmbfl/mbfl/mbfilter_wchar.h"
47 
ZEND_EXTERN_MODULE_GLOBALS(mbstring)48 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
49 
50 static int prop_lookup(unsigned long code, unsigned long n)
51 {
52 	long l, r, m;
53 
54 	/*
55 	 * There is an extra node on the end of the offsets to allow this routine
56 	 * to work right.  If the index is 0xffff, then there are no nodes for the
57 	 * property.
58 	 */
59 	if ((l = _ucprop_offsets[n]) == 0xffff)
60 		return 0;
61 
62 	/*
63 	 * Locate the next offset that is not 0xffff.  The sentinel at the end of
64 	 * the array is the max index value.
65 	 */
66 	for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
67 		;
68 
69 	r = _ucprop_offsets[n + m] - 1;
70 
71 	while (l <= r) {
72 		/*
73 		 * Determine a "mid" point and adjust to make sure the mid point is at
74 		 * the beginning of a range pair.
75 		 */
76 		m = (l + r) >> 1;
77 		m -= (m & 1);
78 		if (code > _ucprop_ranges[m + 1])
79 			l = m + 2;
80 		else if (code < _ucprop_ranges[m])
81 			r = m - 2;
82 		else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
83 			return 1;
84 	}
85 	return 0;
86 
87 }
88 
php_unicode_is_prop1(unsigned long code,int prop)89 MBSTRING_API int php_unicode_is_prop1(unsigned long code, int prop)
90 {
91 	return prop_lookup(code, prop);
92 }
93 
php_unicode_is_prop(unsigned long code,...)94 MBSTRING_API int php_unicode_is_prop(unsigned long code, ...)
95 {
96 	int result = 0;
97 	va_list va;
98 	va_start(va, code);
99 
100 	while (1) {
101 		int prop = va_arg(va, int);
102 		if (prop < 0) {
103 			break;
104 		}
105 
106 		if (prop_lookup(code, prop)) {
107 			result = 1;
108 			break;
109 		}
110 	}
111 
112 	va_end(va);
113 	return result;
114 }
115 
/*
 * Integer mixing function used for the case-mapping table lookups.
 *
 * `d` acts as a seed that is XORed into the key `x`; the upper bits are then
 * folded into the lower ones and the result is multiplied by a fixed odd
 * constant (with the usual unsigned wrap-around).
 */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	unsigned mixed = x ^ d;

	mixed ^= mixed >> 16;
	return mixed * 0x45d9f3b;
}
121 
/* Returned by mph_lookup() when the table holds no entry for a code. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Look up `code` in a two-level hashed table.
 *
 * g_table / g_table_size: first-level table, indexed by the unseeded hash of
 *     `code`. A non-positive entry is the negated slot number itself; a
 *     positive entry is the seed for a second hash that selects the slot.
 * table / table_size: second-level table of (key, value) pairs stored
 *     back to back, i.e. slot i occupies table[2*i] and table[2*i + 1].
 *     table_size is the modulus applied to the second hash.
 *
 * Returns the value stored for `code`, or CODE_NOT_FOUND when the key in the
 * selected slot differs from `code`.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	short displacement = g_table[mph_hash(0, code) % g_table_size];
	unsigned slot = (displacement > 0)
		? mph_hash(displacement, code) % table_size
		: (unsigned) -displacement;
	const unsigned *entry = table + 2 * slot;

	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
143 
/*
 * Run mph_lookup() for `code` against the generated tables of one mapping
 * kind. `type` is pasted into the table names, so CASE_LOOKUP(c, upper)
 * uses _uccase_upper_g, _uccase_upper_g_size, _uccase_upper_table and
 * _uccase_upper_table_size. Evaluates to the mapped value or CODE_NOT_FOUND.
 */
#define CASE_LOOKUP(code, type) \
	mph_lookup( \
			code, \
			_uccase_##type##_g, _uccase_##type##_g_size, \
			_uccase_##type##_table, _uccase_##type##_table_size)
147 
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)148 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
149 {
150 	if (code < 0x80) {
151 		/* Fast path for ASCII */
152 		if (code >= 0x61 && code <= 0x7A) {
153 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
154 				return 0x130;
155 			}
156 			return code - 0x20;
157 		}
158 		return code;
159 	} else {
160 		unsigned new_code = CASE_LOOKUP(code, upper);
161 		if (new_code != CODE_NOT_FOUND) {
162 			return new_code;
163 		}
164 		return code;
165 	}
166 }
167 
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)168 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
169 {
170 	if (code < 0x80) {
171 		/* Fast path for ASCII */
172 		if (code >= 0x41 && code <= 0x5A) {
173 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
174 				return 0x0131L;
175 			}
176 			return code + 0x20;
177 		}
178 		return code;
179 	} else {
180 		unsigned new_code = CASE_LOOKUP(code, lower);
181 		if (new_code != CODE_NOT_FOUND) {
182 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
183 				return 0x69;
184 			}
185 			return new_code;
186 		}
187 		return code;
188 	}
189 }
190 
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)191 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
192 {
193 	unsigned new_code = CASE_LOOKUP(code, title);
194 	if (new_code != CODE_NOT_FOUND) {
195 		return new_code;
196 	}
197 
198 	/* No dedicated title-case variant, use to-upper instead */
199 	return php_unicode_toupper_raw(code, enc);
200 }
201 
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)202 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
203 {
204 	if (code < 0x80) {
205 		/* Fast path for ASCII */
206 		if (code >= 0x41 && code <= 0x5A) {
207 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
208 				return 0x131;
209 			}
210 			return code + 0x20;
211 		}
212 		return code;
213 	} else {
214 		unsigned new_code = CASE_LOOKUP(code, fold);
215 		if (new_code != CODE_NOT_FOUND) {
216 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
217 				return 0x69;
218 			}
219 			return new_code;
220 		}
221 		return code;
222 	}
223 }
224 
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)225 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
226 	code = php_unicode_tolower_raw(code, enc);
227 	if (UNEXPECTED(code > 0xffffff)) {
228 		return _uccase_extra_table[code & 0xffffff];
229 	}
230 	return code;
231 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)232 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
233 	code = php_unicode_toupper_raw(code, enc);
234 	if (UNEXPECTED(code > 0xffffff)) {
235 		return _uccase_extra_table[code & 0xffffff];
236 	}
237 	return code;
238 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)239 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
240 	code = php_unicode_totitle_raw(code, enc);
241 	if (UNEXPECTED(code > 0xffffff)) {
242 		return _uccase_extra_table[code & 0xffffff];
243 	}
244 	return code;
245 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)246 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
247 	code = php_unicode_tofold_raw(code, enc);
248 	if (UNEXPECTED(code > 0xffffff)) {
249 		return _uccase_extra_table[code & 0xffffff];
250 	}
251 	return code;
252 }
253 
php_unicode_tolower_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)254 static inline unsigned php_unicode_tolower_full(
255 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
256 	code = php_unicode_tolower_raw(code, enc);
257 	if (UNEXPECTED(code > 0xffffff)) {
258 		unsigned len = code >> 24;
259 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
260 		memcpy(out, p + 1, len * sizeof(unsigned));
261 		return len;
262 	}
263 	*out = code;
264 	return 1;
265 }
php_unicode_toupper_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)266 static inline unsigned php_unicode_toupper_full(
267 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
268 	code = php_unicode_toupper_raw(code, enc);
269 	if (UNEXPECTED(code > 0xffffff)) {
270 		unsigned len = code >> 24;
271 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
272 		memcpy(out, p + 1, len * sizeof(unsigned));
273 		return len;
274 	}
275 	*out = code;
276 	return 1;
277 }
php_unicode_totitle_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)278 static inline unsigned php_unicode_totitle_full(
279 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
280 	code = php_unicode_totitle_raw(code, enc);
281 	if (UNEXPECTED(code > 0xffffff)) {
282 		unsigned len = code >> 24;
283 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
284 		memcpy(out, p + 1, len * sizeof(unsigned));
285 		return len;
286 	}
287 	*out = code;
288 	return 1;
289 }
php_unicode_tofold_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)290 static inline unsigned php_unicode_tofold_full(
291 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
292 	code = php_unicode_tofold_raw(code, enc);
293 	if (UNEXPECTED(code > 0xffffff)) {
294 		unsigned len = code >> 24;
295 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
296 		memcpy(out, p + 1, len * sizeof(unsigned));
297 		return len;
298 	}
299 	*out = code;
300 	return 1;
301 }
302 
303 struct convert_case_data {
304 	mbfl_convert_filter *next_filter;
305 	enum mbfl_no_encoding no_encoding;
306 	int case_mode;
307 	int title_mode;
308 };
309 
convert_case_filter(int c,void * void_data)310 static int convert_case_filter(int c, void *void_data)
311 {
312 	struct convert_case_data *data = (struct convert_case_data *) void_data;
313 	unsigned out[3];
314 	unsigned len, i;
315 
316 	/* Handle invalid characters early, as we assign special meaning to
317 	 * codepoints above 0xffffff. */
318 	if (UNEXPECTED((unsigned) c > 0xffffff)) {
319 		(*data->next_filter->filter_function)(c, data->next_filter);
320 		return 0;
321 	}
322 
323 	switch (data->case_mode) {
324 		case PHP_UNICODE_CASE_UPPER_SIMPLE:
325 			out[0] = php_unicode_toupper_simple(c, data->no_encoding);
326 			len = 1;
327 			break;
328 
329 		case PHP_UNICODE_CASE_UPPER:
330 			len = php_unicode_toupper_full(c, data->no_encoding, out);
331 			break;
332 
333 		case PHP_UNICODE_CASE_LOWER_SIMPLE:
334 			out[0] = php_unicode_tolower_simple(c, data->no_encoding);
335 			len = 1;
336 			break;
337 
338 		case PHP_UNICODE_CASE_LOWER:
339 			len = php_unicode_tolower_full(c, data->no_encoding, out);
340 			break;
341 
342 		case PHP_UNICODE_CASE_FOLD:
343 			len = php_unicode_tofold_full(c, data->no_encoding, out);
344 			break;
345 
346 		case PHP_UNICODE_CASE_FOLD_SIMPLE:
347 			out[0] = php_unicode_tofold_simple(c, data->no_encoding);
348 			len = 1;
349 			break;
350 
351 		case PHP_UNICODE_CASE_TITLE_SIMPLE:
352 		case PHP_UNICODE_CASE_TITLE:
353 		{
354 			if (data->title_mode) {
355 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
356 					out[0] = php_unicode_tolower_simple(c, data->no_encoding);
357 					len = 1;
358 				} else {
359 					len = php_unicode_tolower_full(c, data->no_encoding, out);
360 				}
361 			} else {
362 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
363 					out[0] = php_unicode_totitle_simple(c, data->no_encoding);
364 					len = 1;
365 				} else {
366 					len = php_unicode_totitle_full(c, data->no_encoding, out);
367 				}
368 			}
369 			if (!php_unicode_is_case_ignorable(c)) {
370 				data->title_mode = php_unicode_is_cased(c);
371 			}
372 			break;
373 		}
374 		default:
375 			assert(0);
376 			break;
377 	}
378 
379 	for (i = 0; i < len; i++) {
380 		(*data->next_filter->filter_function)(out[i], data->next_filter);
381 	}
382 	return 0;
383 }
384 
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)385 MBSTRING_API char *php_unicode_convert_case(
386 		int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
387 		const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
388 {
389 	struct convert_case_data data;
390 	mbfl_convert_filter *from_wchar, *to_wchar;
391 	mbfl_string result, *result_ptr;
392 
393 	mbfl_memory_device device;
394 	mbfl_memory_device_init(&device, srclen + 1, 0);
395 
396 	/* encoding -> wchar filter */
397 	to_wchar = mbfl_convert_filter_new(src_encoding,
398 			&mbfl_encoding_wchar, convert_case_filter, NULL, &data);
399 	if (to_wchar == NULL) {
400 		mbfl_memory_device_clear(&device);
401 		return NULL;
402 	}
403 
404 	/* wchar -> encoding filter */
405 	from_wchar = mbfl_convert_filter_new(
406 			&mbfl_encoding_wchar, src_encoding,
407 			mbfl_memory_device_output, NULL, &device);
408 	if (from_wchar == NULL) {
409 		mbfl_convert_filter_delete(to_wchar);
410 		mbfl_memory_device_clear(&device);
411 		return NULL;
412 	}
413 
414 	to_wchar->illegal_mode = illegal_mode;
415 	to_wchar->illegal_substchar = illegal_substchar;
416 	from_wchar->illegal_mode = illegal_mode;
417 	from_wchar->illegal_substchar = illegal_substchar;
418 
419 	data.next_filter = from_wchar;
420 	data.no_encoding = src_encoding->no_encoding;
421 	data.case_mode = case_mode;
422 	data.title_mode = 0;
423 
424 	{
425 		/* feed data */
426 		const unsigned char *p = (const unsigned char *) srcstr;
427 		size_t n = srclen;
428 		while (n > 0) {
429 			if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
430 				break;
431 			}
432 			n--;
433 		}
434 	}
435 
436 	mbfl_convert_filter_flush(to_wchar);
437 	mbfl_convert_filter_flush(from_wchar);
438 	result_ptr = mbfl_memory_device_result(&device, &result);
439 	mbfl_convert_filter_delete(to_wchar);
440 	mbfl_convert_filter_delete(from_wchar);
441 
442 	if (!result_ptr) {
443 		return NULL;
444 	}
445 
446 	*ret_len = result.len;
447 	return (char *) result.val;
448 }
449 
450 
451 #endif /* HAVE_MBSTRING */
452 
453 /*
454  * Local variables:
455  * tab-width: 4
456  * c-basic-offset: 4
457  * End:
458  * vim600: sw=4 ts=4 fdm=marker
459  * vim<600: sw=4 ts=4
460  */
461