1 /*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
15 */
16
17 #include "port.h"
18
19 #define UNICODE_INTERNAL
20 #include "libs/unicode.h"
21
22 #include <ctype.h>
23 #include <stddef.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include "libs/log.h"
27 #include "libs/misc.h"
28
29
// Resynchronise after a decoding problem: move *ptr forward past every
// UTF-8 continuation byte (bit pattern 10xxxxxx) it currently points at.
// *ptr is left on the first byte that is not a continuation byte; a '\0'
// byte is not a continuation byte, so the scan stops there at the latest.
static inline void
resyncUTF8(const unsigned char **ptr) {
	const unsigned char *cursor = *ptr;

	while ((*cursor & 0xc0) == 0x80)
		cursor++;

	*ptr = cursor;
}
36
37 // Get one character from a UTF-8 encoded string.
38 // *ptr will point to the start of the next character.
39 // Returns 0 if the encoding is bad. This can be distinguished from the
40 // '\0' character by checking whether **ptr == '\0' before calling this
41 // function.
42 UniChar
getCharFromString(const unsigned char ** ptr)43 getCharFromString(const unsigned char **ptr) {
44 UniChar result;
45
46 if (**ptr < 0x80) {
47 // 0xxxxxxx, regular ASCII
48 result = **ptr;
49 (*ptr)++;
50
51 return result;
52 }
53
54 if ((**ptr & 0xe0) == 0xc0) {
55 // 110xxxxx; 10xxxxxx must follow
56 // Value between 0x00000080 and 0x000007ff (inclusive)
57 result = **ptr & 0x1f;
58 (*ptr)++;
59
60 if ((**ptr & 0xc0) != 0x80)
61 goto err;
62 result = (result << 6) | ((**ptr) & 0x3f);
63 (*ptr)++;
64
65 if (result < 0x00000080) {
66 // invalid encoding - must reject
67 goto err;
68 }
69 return result;
70 }
71
72 if ((**ptr & 0xf0) == 0xe0) {
73 // 1110xxxx; 10xxxxxx 10xxxxxx must follow
74 // Value between 0x00000800 and 0x0000ffff (inclusive)
75 result = **ptr & 0x0f;
76 (*ptr)++;
77
78 if ((**ptr & 0xc0) != 0x80)
79 goto err;
80 result = (result << 6) | ((**ptr) & 0x3f);
81 (*ptr)++;
82
83 if ((**ptr & 0xc0) != 0x80)
84 goto err;
85 result = (result << 6) | ((**ptr) & 0x3f);
86 (*ptr)++;
87
88 if (result < 0x00000800) {
89 // invalid encoding - must reject
90 goto err;
91 }
92 return result;
93 }
94
95 if ((**ptr & 0xf8) == 0xf0) {
96 // 11110xxx; 10xxxxxx 10xxxxxx 10xxxxxx must follow
97 // Value between 0x00010000 and 0x0010ffff (inclusive)
98 result = **ptr & 0x07;
99 (*ptr)++;
100
101 if ((**ptr & 0xc0) != 0x80)
102 goto err;
103 result = (result << 6) | ((**ptr) & 0x3f);
104 (*ptr)++;
105
106 if ((**ptr & 0xc0) != 0x80)
107 goto err;
108 result = (result << 6) | ((**ptr) & 0x3f);
109 (*ptr)++;
110
111 if ((**ptr & 0xc0) != 0x80)
112 goto err;
113 result = (result << 6) | ((**ptr) & 0x3f);
114 (*ptr)++;
115
116 if (result < 0x00010000) {
117 // invalid encoding - must reject
118 goto err;
119 }
120 return result;
121 }
122
123 err:
124 log_add(log_Warning, "Warning: Invalid UTF8 sequence.");
125
126 // Resynchronise (skip everything starting with 0x10xxxxxx):
127 resyncUTF8(ptr);
128
129 return 0;
130 }
131
132 UniChar
getCharFromStringN(const unsigned char ** ptr,const unsigned char * end)133 getCharFromStringN(const unsigned char **ptr, const unsigned char *end) {
134 size_t numBytes;
135
136 if (*ptr == end)
137 goto err;
138
139 if (**ptr < 0x80) {
140 numBytes = 1;
141 } else if ((**ptr & 0xe0) == 0xc0) {
142 numBytes = 2;
143 } else if ((**ptr & 0xf0) == 0xe0) {
144 numBytes = 3;
145 } else if ((**ptr & 0xf8) == 0xf0) {
146 numBytes = 4;
147 } else
148 goto err;
149
150 if (*ptr + numBytes > end)
151 goto err;
152
153 return getCharFromString(ptr);
154
155 err:
156 *ptr = end;
157 return 0;
158 }
159
160 // Get one line from a string.
161 // A line is terminated with either CRLF (DOS/Windows),
162 // LF (Unix, MacOS X), or CR (old MacOS).
163 // The end of the string is reached when **startNext == '\0'.
164 // NULL is returned if the string is not valid UTF8. In this case
165 // *end points to the first invalid character (or the character before if
166 // it was a LF), and *startNext to the start of the next (possibly invalid
167 // too) character.
168 unsigned char *
getLineFromString(const unsigned char * start,const unsigned char ** end,const unsigned char ** startNext)169 getLineFromString(const unsigned char *start, const unsigned char **end,
170 const unsigned char **startNext) {
171 const unsigned char *ptr = start;
172 const unsigned char *lastPtr;
173 UniChar ch;
174
175 // Search for the first newline.
176 for (;;) {
177 if (*ptr == '\0') {
178 *end = ptr;
179 *startNext = ptr;
180 return (unsigned char *) unconst(start);
181 }
182 lastPtr = ptr;
183 ch = getCharFromString(&ptr);
184 if (ch == '\0') {
185 // Bad string
186 *end = lastPtr;
187 *startNext = ptr;
188 return NULL;
189 }
190 if (ch == '\n') {
191 *end = lastPtr;
192 if (*ptr == '\0'){
193 // LF at the end of the string.
194 *startNext = ptr;
195 return (unsigned char *) unconst(start);
196 }
197 ch = getCharFromString(&ptr);
198 if (ch == '\0') {
199 // Bad string
200 return NULL;
201 }
202 if (ch == '\r') {
203 // LFCR
204 *startNext = ptr;
205 } else {
206 // LF
207 *startNext = *end;
208 }
209 return (unsigned char *) unconst(start);
210 } else if (ch == '\r') {
211 *end = lastPtr;
212 *startNext = ptr;
213 return (unsigned char *) unconst(start);
214 } // else: a normal character
215 }
216 }
217
// Counts the characters in a '\0'-terminated UTF-8 string.
// Counting stops at the first character for which getCharFromString()
// returns 0, which is either the terminator or a bad sequence.
size_t
utf8StringCount(const unsigned char *start) {
	size_t numChars = 0;

	while (getCharFromString(&start) != '\0')
		numChars++;

	return numChars;
}
230
// Counts the characters in the UTF-8 buffer [start, end).
// Counting stops at the first character for which getCharFromStringN()
// returns 0 (end of buffer, a '\0' character, or a bad sequence).
size_t
utf8StringCountN(const unsigned char *start, const unsigned char *end) {
	size_t numChars = 0;

	while (getCharFromStringN(&start, end) != '\0')
		numChars++;

	return numChars;
}
243
244 // Locates a unicode character (ch) in a UTF-8 string (pStr)
245 // returns the char positions when found
246 // -1 when not found
247 int
utf8StringPos(const unsigned char * pStr,UniChar ch)248 utf8StringPos (const unsigned char *pStr, UniChar ch)
249 {
250 int pos;
251
252 for (pos = 0; *pStr != '\0'; ++pos)
253 {
254 if (getCharFromString (&pStr) == ch)
255 return pos;
256 }
257
258 if (ch == '\0' && *pStr == '\0')
259 return pos;
260
261 return -1;
262 }
263
// Safe version of strcpy(), somewhat analogous to strncpy()
// except it guarantees a 0-term when size > 0
// when size == 0, returns NULL; otherwise returns dst
// At most size - 1 bytes of src are copied. When src does not fit, the
// copy is cut at a character boundary, so that the buffer never ends in
// a partial multi-byte character.
// The part of the buffer behind the copied bytes is filled with '\0',
// just like strncpy() would do.
unsigned char *
utf8StringCopy (unsigned char *dst, size_t size, const unsigned char *src)
{
	size_t len;

	if (size == 0)
		return NULL;

	// Number of bytes that fit, leaving room for the 0-term.
	for (len = 0; len < size - 1 && src[len] != '\0'; ++len)
		;

	if (src[len] != '\0')
	{
		// Truncated. If the first byte that did not fit is a
		// continuation byte (10xxxxxx), the last character is only
		// partially inside [0, len); back up to its lead byte and
		// leave the whole character out.
		while (len > 0 && (src[len] & 0xc0) == 0x80)
			--len;
	}

	memcpy (dst, src, len);
	memset (dst + len, 0, size - len);

	return dst;
}
280
// Compares two '\0'-terminated UTF-8 strings.
// Returns a value less than, equal to, or greater than 0, as strcmp()
// does, since the comparison is a plain byte-wise strcmp().
// TODO: this is not implemented with respect to collating order.
//       A character-wise variant would decode both strings with
//       getCharFromString() and compare the characters one by one.
int
utf8StringCompare (const unsigned char *str1, const unsigned char *str2)
{
	const char *lhs = (const char *) str1;
	const char *rhs = (const char *) str2;

	// this will do for now
	return strcmp (lhs, rhs);
}
326
// Skips up to 'num' characters of a UTF-8 string.
// Returns a pointer to the character following the skipped ones. If
// getCharFromString() returns 0 for a character (terminator or bad
// sequence) before 'num' characters were skipped, a pointer to the start
// of that character is returned instead.
unsigned char *
skipUTF8Chars(const unsigned char *ptr, size_t num) {
	size_t remaining;

	for (remaining = num; remaining > 0; --remaining) {
		const unsigned char *charStart = ptr;

		if (getCharFromString(&ptr) == '\0')
			return (unsigned char *) unconst(charStart);
	}

	return (unsigned char *) unconst(ptr);
}
340
341 // Decodes a UTF-8 string (start) into a unicode character string (wstr)
342 // returns number of chars decoded and stored, not counting 0-term
343 // any chars that do not fit are truncated
344 // wide string term 0 is always appended, unless the destination
345 // buffer is 0 chars long
346 size_t
getUniCharFromStringN(UniChar * wstr,size_t maxcount,const unsigned char * start,const unsigned char * end)347 getUniCharFromStringN(UniChar *wstr, size_t maxcount,
348 const unsigned char *start, const unsigned char *end)
349 {
350 UniChar *next;
351
352 if (maxcount == 0)
353 return 0;
354
355 // always leave room for 0-term
356 --maxcount;
357
358 for (next = wstr; maxcount > 0; ++next, --maxcount)
359 {
360 *next = getCharFromStringN(&start, end);
361 if (*next == 0)
362 break;
363 }
364
365 *next = 0; // term
366
367 return next - wstr;
368 }
369
370 // See getStringFromWideN() for functionality
371 // the only difference is that the source string (start) length is
372 // calculated by searching for 0-term
373 size_t
getUniCharFromString(UniChar * wstr,size_t maxcount,const unsigned char * start)374 getUniCharFromString(UniChar *wstr, size_t maxcount,
375 const unsigned char *start)
376 {
377 UniChar *next;
378
379 if (maxcount == 0)
380 return 0;
381
382 // always leave room for 0-term
383 --maxcount;
384
385 for (next = wstr; maxcount > 0; ++next, --maxcount)
386 {
387 *next = getCharFromString(&start);
388 if (*next == 0)
389 break;
390 }
391
392 *next = 0; // term
393
394 return next - wstr;
395 }
396
397 // Encode one wide character into UTF-8
398 // returns number of bytes used in the buffer,
399 // 0 : invalid or unsupported char
400 // <0 : negative of bytes needed if buffer too small
401 // string term '\0' is *not* appended or counted
402 int
getStringFromChar(unsigned char * ptr,size_t size,UniChar ch)403 getStringFromChar(unsigned char *ptr, size_t size, UniChar ch)
404 {
405 int i;
406 static const struct range_def
407 {
408 UniChar lim;
409 int marker;
410 int mask;
411 }
412 ranges[] =
413 {
414 {0x0000007f, 0x00, 0x7f},
415 {0x000007ff, 0xc0, 0x1f},
416 {0x0000ffff, 0xe0, 0x0f},
417 {0x001fffff, 0xf0, 0x07},
418 {0x03ffffff, 0xf8, 0x03},
419 {0x7fffffff, 0xfc, 0x01},
420 {0x00000000, 0x00, 0x00} // term
421 };
422 const struct range_def *def;
423
424 // lookup the range
425 for (i = 0, def = ranges; ch > def->lim && def->mask != 0; ++i, ++def)
426 ;
427 if (def->mask == 0)
428 { // invalid or unsupported char
429 log_add(log_Warning, "Warning: Invalid or unsupported unicode "
430 "char (%lu)", (unsigned long) ch);
431 return 0;
432 }
433
434 if ((size_t)i + 1 > size)
435 return -(i + 1);
436
437 // unrolled for speed
438 switch (i)
439 {
440 case 5: ptr[5] = (ch & 0x3f) | 0x80;
441 ch >>= 6;
442 case 4: ptr[4] = (ch & 0x3f) | 0x80;
443 ch >>= 6;
444 case 3: ptr[3] = (ch & 0x3f) | 0x80;
445 ch >>= 6;
446 case 2: ptr[2] = (ch & 0x3f) | 0x80;
447 ch >>= 6;
448 case 1: ptr[1] = (ch & 0x3f) | 0x80;
449 ch >>= 6;
450 case 0: ptr[0] = (ch & def->mask) | def->marker;
451 }
452
453 return i + 1;
454 }
455
456 // Encode a wide char string (wstr) into a UTF-8 string (ptr)
457 // returns number of bytes used in the buffer (includes 0-term)
458 // any chars that do not fit are truncated
459 // string term '\0' is always appended, unless the destination
460 // buffer is 0 bytes long
461 size_t
getStringFromWideN(unsigned char * ptr,size_t size,const UniChar * wstr,size_t count)462 getStringFromWideN(unsigned char *ptr, size_t size,
463 const UniChar *wstr, size_t count)
464 {
465 unsigned char *next;
466 int used;
467
468 if (size == 0)
469 return 0;
470
471 // always leave room for 0-term
472 --size;
473
474 for (next = ptr; size > 0 && count > 0;
475 size -= used, next += used, --count, ++wstr)
476 {
477 used = getStringFromChar(next, size, *wstr);
478 if (used < 0)
479 break; // not enough room
480 if (used == 0)
481 { // bad char?
482 *next = '?';
483 used = 1;
484 }
485 }
486
487 *next = '\0'; // term
488
489 return next - ptr + 1;
490 }
491
492 // See getStringFromWideN() for functionality
493 // the only difference is that the source string (wstr) length is
494 // calculated by searching for 0-term
495 size_t
getStringFromWide(unsigned char * ptr,size_t size,const UniChar * wstr)496 getStringFromWide(unsigned char *ptr, size_t size, const UniChar *wstr)
497 {
498 const UniChar *end;
499
500 for (end = wstr; *end != 0; ++end)
501 ;
502
503 return getStringFromWideN(ptr, size, wstr, (end - wstr));
504 }
505
506 int
UniChar_isGraph(UniChar ch)507 UniChar_isGraph(UniChar ch)
508 { // this is not technically sufficient, but close enough for us
509 // we'll consider all non-control (CO and C1) chars in 'graph' class
510 // except for the "Private Use Area" (0xE000 - 0xF8FF)
511
512 // TODO: The private use area is really only glommed by OS X,
513 // and even there, not all of it. (Delete and Backspace both
514 // end up producing characters there -- see bug #942 for the
515 // gory details.)
516 return (ch > 0xa0 && (ch < 0xE000 || ch > 0xF8FF)) ||
517 (ch > 0x20 && ch < 0x7f);
518 }
519
520 int
UniChar_isPrint(UniChar ch)521 UniChar_isPrint(UniChar ch)
522 { // this is not technically sufficient, but close enough for us
523 // chars in 'print' class are 'graph' + 'space' classes
524 // the only space we currently have defined is 0x20
525 return (ch == 0x20) || UniChar_isGraph(ch);
526 }
527
528 UniChar
UniChar_toUpper(UniChar ch)529 UniChar_toUpper(UniChar ch)
530 { // this is a very basic Latin-1 implementation
531 // just to get things going
532 return (ch < 0x100) ? (UniChar) toupper((int) ch) : ch;
533 }
534
535 UniChar
UniChar_toLower(UniChar ch)536 UniChar_toLower(UniChar ch)
537 { // this is a very basic Latin-1 implementation
538 // just to get things going
539 return (ch < 0x100) ? (UniChar) tolower((int) ch) : ch;
540 }
541
542