1 /*
2  *  This program is free software; you can redistribute it and/or modify
3  *  it under the terms of the GNU General Public License as published by
4  *  the Free Software Foundation; either version 2 of the License, or
5  *  (at your option) any later version.
6  *
7  *  This program is distributed in the hope that it will be useful,
8  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *  GNU General Public License for more details.
11  *
12  *  You should have received a copy of the GNU General Public License
13  *  along with this program; if not, write to the Free Software
14  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
15  */
16 
17 #include "port.h"
18 
19 #define UNICODE_INTERNAL
20 #include "libs/unicode.h"
21 
22 #include <ctype.h>
23 #include <stddef.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include "libs/log.h"
27 #include "libs/misc.h"
28 
29 
// Resynchronise after a decoding error: advance *ptr past every
// continuation byte (bit pattern 10xxxxxx), so that it ends up on the
// first byte that can start a new character (or on the '\0' terminator,
// which is never a continuation byte).
static inline void
resyncUTF8(const unsigned char **ptr) {
	const unsigned char *cursor = *ptr;

	while ((*cursor & 0xc0) == 0x80)
		cursor++;

	*ptr = cursor;
}
36 
37 // Get one character from a UTF-8 encoded string.
38 // *ptr will point to the start of the next character.
39 // Returns 0 if the encoding is bad. This can be distinguished from the
40 // '\0' character by checking whether **ptr == '\0' before calling this
41 // function.
42 UniChar
getCharFromString(const unsigned char ** ptr)43 getCharFromString(const unsigned char **ptr) {
44 	UniChar result;
45 
46 	if (**ptr < 0x80) {
47 		// 0xxxxxxx, regular ASCII
48 		result = **ptr;
49 		(*ptr)++;
50 
51 		return result;
52 	}
53 
54 	if ((**ptr & 0xe0) == 0xc0) {
55 		// 110xxxxx; 10xxxxxx must follow
56 		// Value between 0x00000080 and 0x000007ff (inclusive)
57 		result = **ptr & 0x1f;
58 		(*ptr)++;
59 
60 		if ((**ptr & 0xc0) != 0x80)
61 			goto err;
62 		result = (result << 6) | ((**ptr) & 0x3f);
63 		(*ptr)++;
64 
65 		if (result < 0x00000080) {
66 			// invalid encoding - must reject
67 			goto err;
68 		}
69 		return result;
70 	}
71 
72 	if ((**ptr & 0xf0) == 0xe0) {
73 		// 1110xxxx; 10xxxxxx 10xxxxxx must follow
74 		// Value between 0x00000800 and 0x0000ffff (inclusive)
75 		result = **ptr & 0x0f;
76 		(*ptr)++;
77 
78 		if ((**ptr & 0xc0) != 0x80)
79 			goto err;
80 		result = (result << 6) | ((**ptr) & 0x3f);
81 		(*ptr)++;
82 
83 		if ((**ptr & 0xc0) != 0x80)
84 			goto err;
85 		result = (result << 6) | ((**ptr) & 0x3f);
86 		(*ptr)++;
87 
88 		if (result < 0x00000800) {
89 			// invalid encoding - must reject
90 			goto err;
91 		}
92 		return result;
93 	}
94 
95 	if ((**ptr & 0xf8) == 0xf0) {
96 		// 11110xxx; 10xxxxxx 10xxxxxx 10xxxxxx must follow
97 		// Value between 0x00010000 and 0x0010ffff (inclusive)
98 		result = **ptr & 0x07;
99 		(*ptr)++;
100 
101 		if ((**ptr & 0xc0) != 0x80)
102 			goto err;
103 		result = (result << 6) | ((**ptr) & 0x3f);
104 		(*ptr)++;
105 
106 		if ((**ptr & 0xc0) != 0x80)
107 			goto err;
108 		result = (result << 6) | ((**ptr) & 0x3f);
109 		(*ptr)++;
110 
111 		if ((**ptr & 0xc0) != 0x80)
112 			goto err;
113 		result = (result << 6) | ((**ptr) & 0x3f);
114 		(*ptr)++;
115 
116 		if (result < 0x00010000) {
117 			// invalid encoding - must reject
118 			goto err;
119 		}
120 		return result;
121 	}
122 
123 err:
124 	log_add(log_Warning, "Warning: Invalid UTF8 sequence.");
125 
126 	// Resynchronise (skip everything starting with 0x10xxxxxx):
127 	resyncUTF8(ptr);
128 
129 	return 0;
130 }
131 
132 UniChar
getCharFromStringN(const unsigned char ** ptr,const unsigned char * end)133 getCharFromStringN(const unsigned char **ptr, const unsigned char *end) {
134 	size_t numBytes;
135 
136 	if (*ptr == end)
137 		goto err;
138 
139 	if (**ptr < 0x80) {
140 		numBytes = 1;
141 	} else if ((**ptr & 0xe0) == 0xc0) {
142 		numBytes = 2;
143 	} else if ((**ptr & 0xf0) == 0xe0) {
144 		numBytes = 3;
145 	} else if ((**ptr & 0xf8) == 0xf0) {
146 		numBytes = 4;
147 	} else
148 		goto err;
149 
150 	if (*ptr + numBytes > end)
151 		goto err;
152 
153 	return getCharFromString(ptr);
154 
155 err:
156 	*ptr = end;
157 	return 0;
158 }
159 
160 // Get one line from a string.
161 // A line is terminated with either CRLF (DOS/Windows),
162 // LF (Unix, MacOS X), or CR (old MacOS).
163 // The end of the string is reached when **startNext == '\0'.
164 // NULL is returned if the string is not valid UTF8. In this case
165 // *end points to the first invalid character (or the character before if
166 // it was a LF), and *startNext to the start of the next (possibly invalid
167 // too) character.
168 unsigned char *
getLineFromString(const unsigned char * start,const unsigned char ** end,const unsigned char ** startNext)169 getLineFromString(const unsigned char *start, const unsigned char **end,
170 		const unsigned char **startNext) {
171 	const unsigned char *ptr = start;
172 	const unsigned char *lastPtr;
173 	UniChar ch;
174 
175 	// Search for the first newline.
176 	for (;;) {
177 		if (*ptr == '\0') {
178 			*end = ptr;
179 			*startNext = ptr;
180 			return (unsigned char *) unconst(start);
181 		}
182 		lastPtr = ptr;
183 		ch = getCharFromString(&ptr);
184 		if (ch == '\0') {
185 			// Bad string
186 			*end = lastPtr;
187 			*startNext = ptr;
188 			return NULL;
189 		}
190 		if (ch == '\n') {
191 			*end = lastPtr;
192 			if (*ptr == '\0'){
193 				// LF at the end of the string.
194 				*startNext = ptr;
195 				return (unsigned char *) unconst(start);
196 			}
197 			ch = getCharFromString(&ptr);
198 			if (ch == '\0') {
199 				// Bad string
200 				return NULL;
201 			}
202 			if (ch == '\r') {
203 				// LFCR
204 				*startNext = ptr;
205 			} else {
206 				// LF
207 				*startNext = *end;
208 			}
209 			return (unsigned char *) unconst(start);
210 		} else if (ch == '\r') {
211 			*end = lastPtr;
212 			*startNext = ptr;
213 			return (unsigned char *) unconst(start);
214 		} // else: a normal character
215 	}
216 }
217 
// Counts the characters in a 0-terminated UTF-8 string.
// Counting stops at the first character for which getCharFromString()
// returns 0, i.e. at the end of the string or at an invalid sequence.
size_t
utf8StringCount(const unsigned char *start) {
	size_t total = 0;

	while (getCharFromString(&start) != '\0')
		total++;

	return total;
}
230 
// Counts the characters in the UTF-8 buffer [start, end).
// Counting stops at the first character for which getCharFromStringN()
// returns 0: the end of the buffer, an embedded '\0', or an invalid
// sequence.
size_t
utf8StringCountN(const unsigned char *start, const unsigned char *end) {
	size_t total = 0;

	while (getCharFromStringN(&start, end) != '\0')
		total++;

	return total;
}
243 
244 // Locates a unicode character (ch) in a UTF-8 string (pStr)
245 // returns the char positions when found
246 //  -1 when not found
247 int
utf8StringPos(const unsigned char * pStr,UniChar ch)248 utf8StringPos (const unsigned char *pStr, UniChar ch)
249 {
250 	int pos;
251 
252 	for (pos = 0; *pStr != '\0'; ++pos)
253 	{
254 		if (getCharFromString (&pStr) == ch)
255 			return pos;
256 	}
257 
258 	if (ch == '\0' && *pStr == '\0')
259 		return pos;
260 
261 	return -1;
262 }
263 
// Safe version of strcpy(), somewhat analogous to strncpy()
// except it guarantees a 0-term when size > 0
// when size == 0, returns NULL
// If src does not fit, it is truncated at a character boundary: a
// multi-byte sequence that would only partially fit is dropped entirely,
// so dst never ends in a partial character because of the truncation.
// As with strncpy(), the unused remainder of dst is filled with 0 bytes.
unsigned char *
utf8StringCopy (unsigned char *dst, size_t size, const unsigned char *src)
{
	size_t end;   // index of the 0-term after truncation
	size_t lead;  // index of the first byte of the last character
	size_t need;  // number of bytes that character requires

	if (size == 0)
		return NULL;

	// strncpy() zero-pads when src is shorter than size, so a 0 in the
	// last slot means the whole string, including its 0-term, did fit.
	strncpy ((char *) dst, (const char *) src, size);
	if (dst[size - 1] == '\0')
		return dst;

	// src was truncated.
	end = size - 1;
	dst[end] = '\0';
	if (end == 0)
		return dst;

	// Walk back over at most 3 continuation bytes (10xxxxxx) to find
	// the byte that starts the last character in the buffer.
	lead = end - 1;
	while (lead > 0 && end - lead < 4 && (dst[lead] & 0xc0) == 0x80)
		lead--;

	if ((dst[lead] & 0xe0) == 0xc0)
		need = 2;
	else if ((dst[lead] & 0xf0) == 0xe0)
		need = 3;
	else if ((dst[lead] & 0xf8) == 0xf0)
		need = 4;
	else
		need = 1;  // ASCII, or not a valid start byte: leave as is

	// Drop the last character if the cut went through the middle of it.
	if (end - lead < need)
		memset (dst + lead, 0, end - lead);

	return dst;
}
280 
// Compares two 0-terminated UTF-8 strings byte by byte.
// Returns a value < 0, == 0 or > 0 when str1 sorts before, equal to or
// after str2 respectively, exactly as strcmp() does.
// TODO: this is not implemented with respect to collating order
int
utf8StringCompare (const unsigned char *str1, const unsigned char *str2)
{
	const char *lhs = (const char *) str1;
	const char *rhs = (const char *) str2;

	// this will do for now
	return strcmp (lhs, rhs);
}
326 
// Skips up to 'num' characters of a UTF-8 string.
// Returns a pointer to the character following the skipped ones. If
// getCharFromString() returns 0 first (end of the string, or an invalid
// sequence), a pointer to the start of that character is returned instead.
unsigned char *
skipUTF8Chars(const unsigned char *ptr, size_t num) {
	const unsigned char *cursor = ptr;
	size_t remaining;

	for (remaining = num; remaining > 0; remaining--) {
		const unsigned char *charStart = cursor;

		if (getCharFromString(&cursor) == '\0')
			return (unsigned char *) unconst(charStart);
	}

	return (unsigned char *) unconst(cursor);
}
340 
341 // Decodes a UTF-8 string (start) into a unicode character string (wstr)
342 // returns number of chars decoded and stored, not counting 0-term
343 // any chars that do not fit are truncated
344 // wide string term 0 is always appended, unless the destination
345 // buffer is 0 chars long
346 size_t
getUniCharFromStringN(UniChar * wstr,size_t maxcount,const unsigned char * start,const unsigned char * end)347 getUniCharFromStringN(UniChar *wstr, size_t maxcount,
348 		const unsigned char *start, const unsigned char *end)
349 {
350 	UniChar *next;
351 
352 	if (maxcount == 0)
353 		return 0;
354 
355 	// always leave room for 0-term
356 	--maxcount;
357 
358 	for (next = wstr; maxcount > 0; ++next, --maxcount)
359 	{
360 		*next = getCharFromStringN(&start, end);
361 		if (*next == 0)
362 			break;
363 	}
364 
365 	*next = 0; // term
366 
367 	return next - wstr;
368 }
369 
370 // See getStringFromWideN() for functionality
371 //  the only difference is that the source string (start) length is
372 //  calculated by searching for 0-term
373 size_t
getUniCharFromString(UniChar * wstr,size_t maxcount,const unsigned char * start)374 getUniCharFromString(UniChar *wstr, size_t maxcount,
375 		const unsigned char *start)
376 {
377 	UniChar *next;
378 
379 	if (maxcount == 0)
380 		return 0;
381 
382 	// always leave room for 0-term
383 	--maxcount;
384 
385 	for (next = wstr; maxcount > 0; ++next, --maxcount)
386 	{
387 		*next = getCharFromString(&start);
388 		if (*next == 0)
389 			break;
390 	}
391 
392 	*next = 0; // term
393 
394 	return next - wstr;
395 }
396 
397 // Encode one wide character into UTF-8
398 // returns number of bytes used in the buffer,
399 //  0  : invalid or unsupported char
400 //  <0 : negative of bytes needed if buffer too small
401 // string term '\0' is *not* appended or counted
402 int
getStringFromChar(unsigned char * ptr,size_t size,UniChar ch)403 getStringFromChar(unsigned char *ptr, size_t size, UniChar ch)
404 {
405 	int i;
406 	static const struct range_def
407 	{
408 		UniChar lim;
409 		int marker;
410 		int mask;
411 	}
412 	ranges[] =
413 	{
414 		{0x0000007f, 0x00, 0x7f},
415 		{0x000007ff, 0xc0, 0x1f},
416 		{0x0000ffff, 0xe0, 0x0f},
417 		{0x001fffff, 0xf0, 0x07},
418 		{0x03ffffff, 0xf8, 0x03},
419 		{0x7fffffff, 0xfc, 0x01},
420 		{0x00000000, 0x00, 0x00} // term
421 	};
422 	const struct range_def *def;
423 
424 	// lookup the range
425 	for (i = 0, def = ranges; ch > def->lim && def->mask != 0; ++i, ++def)
426 		;
427 	if (def->mask == 0)
428 	{	// invalid or unsupported char
429 		log_add(log_Warning, "Warning: Invalid or unsupported unicode "
430 				"char (%lu)", (unsigned long) ch);
431 		return 0;
432 	}
433 
434 	if ((size_t)i + 1 > size)
435 		return -(i + 1);
436 
437 	// unrolled for speed
438 	switch (i)
439 	{
440 		case 5: ptr[5] = (ch & 0x3f) | 0x80;
441 				ch >>= 6;
442 		case 4: ptr[4] = (ch & 0x3f) | 0x80;
443 				ch >>= 6;
444 		case 3: ptr[3] = (ch & 0x3f) | 0x80;
445 				ch >>= 6;
446 		case 2: ptr[2] = (ch & 0x3f) | 0x80;
447 				ch >>= 6;
448 		case 1: ptr[1] = (ch & 0x3f) | 0x80;
449 				ch >>= 6;
450 		case 0: ptr[0] = (ch & def->mask) | def->marker;
451 	}
452 
453 	return i + 1;
454 }
455 
456 // Encode a wide char string (wstr) into a UTF-8 string (ptr)
457 // returns number of bytes used in the buffer (includes 0-term)
458 // any chars that do not fit are truncated
459 // string term '\0' is always appended, unless the destination
460 // buffer is 0 bytes long
461 size_t
getStringFromWideN(unsigned char * ptr,size_t size,const UniChar * wstr,size_t count)462 getStringFromWideN(unsigned char *ptr, size_t size,
463 		const UniChar *wstr, size_t count)
464 {
465 	unsigned char *next;
466 	int used;
467 
468 	if (size == 0)
469 		return 0;
470 
471 	// always leave room for 0-term
472 	--size;
473 
474 	for (next = ptr; size > 0 && count > 0;
475 			size -= used, next += used, --count, ++wstr)
476 	{
477 		used = getStringFromChar(next, size, *wstr);
478 		if (used < 0)
479 			break; // not enough room
480 		if (used == 0)
481 		{	// bad char?
482 			*next = '?';
483 			used = 1;
484 		}
485 	}
486 
487 	*next = '\0'; // term
488 
489 	return next - ptr + 1;
490 }
491 
492 // See getStringFromWideN() for functionality
493 //  the only difference is that the source string (wstr) length is
494 //  calculated by searching for 0-term
495 size_t
getStringFromWide(unsigned char * ptr,size_t size,const UniChar * wstr)496 getStringFromWide(unsigned char *ptr, size_t size, const UniChar *wstr)
497 {
498 	const UniChar *end;
499 
500 	for (end = wstr; *end != 0; ++end)
501 		;
502 
503 	return getStringFromWideN(ptr, size, wstr, (end - wstr));
504 }
505 
506 int
UniChar_isGraph(UniChar ch)507 UniChar_isGraph(UniChar ch)
508 {	// this is not technically sufficient, but close enough for us
509 	// we'll consider all non-control (CO and C1) chars in 'graph' class
510 	// except for the "Private Use Area" (0xE000 - 0xF8FF)
511 
512 	// TODO: The private use area is really only glommed by OS X,
513 	// and even there, not all of it.  (Delete and Backspace both
514 	// end up producing characters there -- see bug #942 for the
515 	// gory details.)
516 	return (ch > 0xa0 && (ch < 0xE000 || ch > 0xF8FF)) ||
517 			(ch > 0x20 && ch < 0x7f);
518 }
519 
520 int
UniChar_isPrint(UniChar ch)521 UniChar_isPrint(UniChar ch)
522 {	// this is not technically sufficient, but close enough for us
523 	// chars in 'print' class are 'graph' + 'space' classes
524 	// the only space we currently have defined is 0x20
525 	return (ch == 0x20) || UniChar_isGraph(ch);
526 }
527 
528 UniChar
UniChar_toUpper(UniChar ch)529 UniChar_toUpper(UniChar ch)
530 {	// this is a very basic Latin-1 implementation
531 	// just to get things going
532 	return (ch < 0x100) ? (UniChar) toupper((int) ch) : ch;
533 }
534 
535 UniChar
UniChar_toLower(UniChar ch)536 UniChar_toLower(UniChar ch)
537 {	// this is a very basic Latin-1 implementation
538 	// just to get things going
539 	return (ch < 0x100) ? (UniChar) tolower((int) ch) : ch;
540 }
541 
542