1 /*
2 Copyright (C) 2004-2017,2018 John E. Davis
3
4 This file is part of the S-Lang Library.
5
6 The S-Lang Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
10
11 The S-Lang Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this library; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19 USA.
20 */
21 #include "slinclud.h"
22 #include <string.h>
23
24 #include "slang.h"
25 #include "_slang.h"
26
/* Maps the first byte of a sequence to the number of bytes that the
 * sequence is expected to occupy:
 *
 *    0x00         0  (treated by the callers like a single byte)
 *    0x01 - 0x7F  1  (plain 7-bit characters)
 *    0x80 - 0xBF  0  (continuation bytes; they cannot start a sequence)
 *    0xC0 - 0xDF  2
 *    0xE0 - 0xEF  3
 *    0xF0 - 0xF7  4
 *    0xF8 - 0xFB  5
 *    0xFC - 0xFD  6
 *    0xFE - 0xFF  1  (never a lead byte of a multi-byte sequence)
 *
 * The table is only ever read, hence it is declared const.
 */
static const unsigned char Len_Map[256] =
{
   0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 31 */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 63 */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 95 */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 127 */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 159 */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 191 */
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* - 223 */
   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1   /* - 255 */
};
38
/*
 * The code positions U+D800 to U+DFFF (UTF-16 surrogates) as well as
 * U+FFFE and U+FFFF must not occur in normal UTF-8 or UCS-4 data.  UTF-8
 * decoders should treat them like malformed or overlong sequences for
 * safety reasons.
 *
 * The macro is non-zero when w is one of those values.  Every use of the
 * argument is parenthesized so that an expression such as (a | b) is
 * compared as a whole.  The argument is evaluated several times, so it
 * must not have side effects.
 */
#define IS_ILLEGAL_UNICODE(w) \
   ((((w) >= 0xD800) && ((w) <= 0xDFFF)) || ((w) == 0xFFFE) || ((w) == 0xFFFF))
47
48 _INLINE_
is_invalid_or_overlong_utf8(SLuchar_Type * u,unsigned int len)49 static int is_invalid_or_overlong_utf8 (SLuchar_Type *u, unsigned int len)
50 {
51 unsigned int i;
52 unsigned char ch, ch1;
53
54 /* Check for invalid sequences */
55 for (i = 1; i < len; i++)
56 {
57 if ((u[i] & 0xC0) != 0x80)
58 return 1;
59 }
60
61 /* Illegal (overlong) sequences */
62 /* 1100000x (10xxxxxx) */
63 /* 11100000 100xxxxx (10xxxxxx) */
64 /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
65 /* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
66 /* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */
67 ch = *u;
68 if ((ch == 0xC0) || (ch == 0xC1))
69 return 1;
70
71 ch1 = u[1];
72 if (((ch1 & ch) == 0x80)
73 && ((ch == 0xE0)
74 || (ch == 0xF0)
75 || (ch == 0xF8)
76 || (ch == 0xFC)))
77 return 1;
78
79 if (len == 3)
80 {
81 /* D800 is encoded as 0xED 0xA0 0x80 and DFFF as 0xED 0xBF 0xBF */
82 if ((ch == 0xED)
83 && ((ch1 >= 0xA0) && (ch1 <= 0xBF))
84 && (u[2] >= 0x80) && (u[2] <= 0xBF))
85 return 1;
86 /* Now FFFE and FFFF */
87 if ((ch == 0xEF)
88 && (ch1 == 0xBF)
89 && ((u[2] == 0xBE) || (u[2] == 0xBF)))
90 return 1;
91 }
92 return 0;
93 }
94
95 /* This function assumes that the necessary checks have been made to ensure
96 * a valid UTF-8 encoded character is present.
97 */
98 _INLINE_
fast_utf8_decode(SLuchar_Type * u,unsigned int len)99 static SLwchar_Type fast_utf8_decode (SLuchar_Type *u, unsigned int len)
100 {
101 static unsigned char masks[7] =
102 {
103 0, 0, 0x1F, 0xF, 0x7, 0x3, 0x1
104 };
105 SLuchar_Type *umax;
106 SLwchar_Type w;
107
108 w = (*u & masks[len]);
109 umax = u + len;
110 u++;
111 while (u < umax)
112 {
113 w = (w << 6)| (u[0] & 0x3F);
114 u++;
115 }
116 return w;
117 }
118
SLutf8_skip_char(unsigned char * s,unsigned char * smax)119 unsigned char *SLutf8_skip_char (unsigned char *s, unsigned char *smax)
120 {
121 unsigned int len;
122
123 if (s >= smax)
124 return s;
125
126 len = Len_Map[*s];
127 if (len <= 1)
128 return s+1;
129
130 if (s + len > smax)
131 return s+1;
132
133 if (is_invalid_or_overlong_utf8 (s, len))
134 return s + 1;
135
136 return s + len;
137 }
138
/* Skip forward over at most num characters starting at s, never moving
 * beyond smax.  The position reached is returned, and if dnum is non-NULL,
 * *dnum is set to the number of characters that were counted.
 *
 * A byte that does not start a complete, valid multi-byte sequence is
 * counted as one character.  If ignore_combining is non-zero, characters
 * for which SLwchar_wcwidth returns 0 are skipped without being counted,
 * and any such characters that follow the last counted character are
 * skipped as well.
 */
SLuchar_Type *SLutf8_skip_chars (SLuchar_Type *s, SLuchar_Type *smax,
                                 SLstrlen_Type num, SLstrlen_Type *dnum,
                                 int ignore_combining)
{
   SLstrlen_Type n = 0;

   while ((n < num) && (s < smax))
     {
        unsigned int len = Len_Map[*s];

        /* The available space is tested via the byte count (smax - s):
         * computing s + len could form a pointer beyond the end of the
         * buffer, which is undefined behavior.
         */
        if ((len <= 1)
            || ((size_t) (smax - s) < len)
            || is_invalid_or_overlong_utf8 (s, len))
          {
             s++;
             n++;
             continue;
          }

        /* The character is only decoded when its width matters */
        if ((ignore_combining == 0)
            || (0 != SLwchar_wcwidth (fast_utf8_decode (s, len))))
          n++;

        s += len;
     }

   if (ignore_combining)
     {
        /* Absorb the zero-width characters attached to the last one */
        while (s < smax)
          {
             SLwchar_Type w;
             SLstrlen_Type nconsumed;

             if (NULL == SLutf8_decode (s, smax, &w, &nconsumed))
               break;

             if (0 != SLwchar_wcwidth (w))
               break;

             s += nconsumed;
          }
     }

   if (dnum != NULL)
     *dnum = n;

   return s;
}
204
/* Skip backward from s over at most num characters, never moving before
 * smin.  The position reached is returned, and if dnum is non-NULL, *dnum
 * is set to the number of characters that were counted.
 *
 * Bytes that are not part of a valid sequence ending exactly where the
 * previously accepted character begins are counted one byte at a time.
 * If ignore_combining is non-zero, validly encoded characters for which
 * SLwchar_wcwidth returns 0 are skipped without being counted.
 */
SLuchar_Type *SLutf8_bskip_chars (SLuchar_Type *smin, SLuchar_Type *s,
                                  SLstrlen_Type num, SLstrlen_Type *dnum,
                                  int ignore_combining)
{
   SLuchar_Type *end = s;	       /* where the character being examined ends */
   SLstrlen_Type count = 0;

   while ((count < num) && (s > smin))
     {
        SLwchar_Type w;
        unsigned char ch;
        unsigned int nback;
        int is_valid;

        ch = *--s;
        if (ch < 0x80)
          {
             count++;
             end = s;
             continue;
          }

        /* Back up over continuation bytes looking for a lead byte */
        for (nback = 0;
             (s != smin) && (Len_Map[ch] == 0) && (nback < SLUTF8_MAX_MBLEN);
             nback++)
          ch = *--s;

        /* Only a byte above 0xBF can start a multi-byte sequence, and the
         * sequence must end exactly at end to be accepted.
         */
        is_valid = 0;
        if (ch > 0xBF)
          is_valid = (end == SLutf8_decode (s, end, &w, NULL));

        if (is_valid == 0)
          {
             /* Invalid: count just the last byte and restart from there */
             count++;
             end--;
             s = end;
             continue;
          }

        if ((ignore_combining == 0)
            || (0 != SLwchar_wcwidth (w)))
          count++;

        end = s;
     }

   if (dnum != NULL)
     *dnum = count;

   return s;
}
274
/* Move back one character from s, but not before smin.  A preceding
 * 7-bit byte is handled directly; anything else is handed to
 * SLutf8_bskip_chars with combining characters counted normally.
 */
SLuchar_Type *SLutf8_bskip_char (SLuchar_Type *smin, SLuchar_Type *s)
{
   SLstrlen_Type nskipped;

   if (s <= smin)
     return s;

   if (s[-1] < 0x80)
     return s - 1;

   return SLutf8_bskip_chars (smin, s, 1, &nskipped, 0);
}
287
288 /* This function counts the number of wide characters in a UTF-8 encoded
289 * string. Each byte in an invalid sequence is counted as a single
290 * character. If the string contains illegal values, the illegal byte
291 * is counted as 1 character.
292 */
SLutf8_strlen(SLuchar_Type * s,int ignore_combining)293 SLstrlen_Type SLutf8_strlen (SLuchar_Type *s, int ignore_combining)
294 {
295 SLstrlen_Type count, len;
296
297 if (s == NULL)
298 return 0;
299
300 len = strlen ((char *)s);
301 (void) SLutf8_skip_chars (s, s + len, len, &count, ignore_combining);
302 return count;
303 }
304
305 /*
306 * This function returns NULL if the input does not correspond to a valid
307 * UTF-8 sequence, otherwise, it returns the position of the next character
308 * in the sequence.
309 */
SLutf8_decode(unsigned char * u,unsigned char * umax,SLwchar_Type * wp,SLstrlen_Type * nconsumedp)310 unsigned char *SLutf8_decode (unsigned char *u, unsigned char *umax,
311 SLwchar_Type *wp, SLstrlen_Type *nconsumedp)
312 {
313 unsigned int len;
314 unsigned char ch;
315 SLwchar_Type w;
316
317 if (u >= umax)
318 {
319 *wp = 0;
320 if (nconsumedp != NULL)
321 *nconsumedp = 0;
322 return NULL;
323 }
324
325 *wp = ch = *u;
326 if (ch < 0x80)
327 {
328 if (nconsumedp != NULL) *nconsumedp = 1;
329 return u+1;
330 }
331
332 len = Len_Map[ch];
333 if (len < 2)
334 {
335 /* should not happen--- code here for completeness */
336 if (nconsumedp != NULL) *nconsumedp = 1;
337 return NULL;
338 }
339 if (u + len > umax)
340 {
341 if (nconsumedp != NULL) *nconsumedp = 1; /* (unsigned int) (umax - u); */
342 return NULL;
343 }
344
345 if (is_invalid_or_overlong_utf8 (u, len))
346 {
347 if (nconsumedp != NULL)
348 *nconsumedp = 1;
349
350 return NULL;
351 }
352
353 if (nconsumedp != NULL)
354 *nconsumedp = len;
355
356 *wp = w = fast_utf8_decode (u, len);
357
358 if (IS_ILLEGAL_UNICODE(w))
359 return NULL;
360
361 return u + len;
362 }
363
364 /* Encode the wide character returning a pointer to the end of the
365 * utf8 of the encoded multi-byte character. This function will also encode
366 * illegal unicode values. It returns NULL if buflen is too small.
367 * Otherwise, it returns a pointer at the end of the last encoded byte.
368 * It does not null terminate the encoded string.
369 */
SLutf8_encode(SLwchar_Type w,SLuchar_Type * u,SLstrlen_Type ulen)370 SLuchar_Type *SLutf8_encode (SLwchar_Type w, SLuchar_Type *u, SLstrlen_Type ulen)
371 {
372 SLuchar_Type *umax = u + ulen;
373
374 /* U-00000000 - U-0000007F: 0xxxxxxx */
375 if (w <= 0x7F)
376 {
377 if (u >= umax)
378 return NULL;
379
380 *u++ = (unsigned char) w;
381 return u;
382 }
383
384 /* U-00000080 - U-000007FF: 110xxxxx 10xxxxxx */
385 if (w <= 0x7FF)
386 {
387 if ((u + 1) >= umax)
388 return NULL;
389
390 *u++ = (w >> 6) | 0xC0;
391 *u++ = (w & 0x3F) | 0x80;
392 return u;
393 }
394
395 /* First bad character starts at 0xD800 */
396
397 /* Allow illegal values to be encoded */
398
399 /*
400 *if (IS_ILLEGAL_UNICODE(w))
401 * return NULL;
402 */
403
404 /* U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
405 if (w <= 0xFFFF)
406 {
407 if (u+2 >= umax)
408 return NULL;
409 *u++ = (w >> 12 ) | 0xE0;
410 goto finish_2;
411 }
412
413 /* U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
414 if (w <= 0x1FFFFF)
415 {
416 if (u+3 >= umax)
417 return NULL;
418 *u++ = (w >> 18) | 0xF0;
419 goto finish_3;
420 }
421
422 /* U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
423 if (w <= 0x3FFFFFF)
424 {
425 if (u+4 >= umax)
426 return NULL;
427 *u++ = (w >> 24) | 0xF8;
428 goto finish_4;
429 }
430
431 /* U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
432 if (w <= 0x7FFFFFFF)
433 {
434 if (u+5 >= umax)
435 return NULL;
436 *u++ = (w >> 30) | 0xFC;
437 goto finish_5;
438 }
439
440 /* unreached?? */
441 return NULL;
442
443 finish_5: *u++ = ((w >> 24) & 0x3F)|0x80;
444 finish_4: *u++ = ((w >> 18) & 0x3F)|0x80;
445 finish_3: *u++ = ((w >> 12) & 0x3F)|0x80;
446 finish_2: *u++ = ((w >> 6) & 0x3F)|0x80;
447 *u++ = (w & 0x3F)|0x80;
448
449 return u;
450 }
451
452 /* Like SLutf8_encode, but null terminates the result.
453 * At least SLUTF8_MAX_MBLEN+1 bytes assumed.
454 */
SLutf8_encode_null_terminate(SLwchar_Type w,SLuchar_Type * u)455 SLuchar_Type *SLutf8_encode_null_terminate (SLwchar_Type w, SLuchar_Type *u)
456 {
457 SLuchar_Type *p;
458
459 p = SLutf8_encode (w, u, SLUTF8_MAX_MBLEN);
460 if (p != NULL)
461 *p = 0;
462 return p;
463 }
464
#if 0
/* NOTE: everything up to the matching #endif is disabled code. */

/* Decode the UTF-8 encoded characters between u and umax into the byte
 * buffer b, which is null terminated.  *np is set to the number of bytes
 * stored.  Returns 0 upon success, or -1 if an invalid sequence is found.
 */
int SLutf8_decode_bytes (SLuchar_Type *u, SLuchar_Type *umax,
                         unsigned char *b, unsigned int *np)
{
   unsigned char *out = b;

   while (u < umax)
     {
        SLwchar_Type w;

        if (*u < 0x80)
          {
             *out++ = *u++;
             continue;
          }

        u = SLutf8_decode (u, umax, &w, NULL);
        if (u == NULL)
          return -1;		       /* FIXME: HANDLE ERROR */

        if (w > 0xFF)
          {
#if 0
             sprintf (out, "<U+%04X>", w);
             out += strlen (out);
             continue;
#endif
             /* FIXME: HANDLE ERROR */
             w = w & 0xFF;
          }

        *out++ = w;
     }

   *np = out - b;
   *out = 0;
   return 0;
}

/* UTF-8 Encode the bytes between b and bmax storing the results in the
 * buffer defined by u and ulen, returning the position following the
 * last encoded character.  Upon return, *np is set to the number of bytes
 * successfully encoded.  The result is null terminated only if there is
 * room for the terminator.
 */
SLuchar_Type *SLutf8_encode_bytes (unsigned char *b, unsigned char *bmax,
                                   SLuchar_Type *u, unsigned int ulen,
                                   unsigned int *np)
{
   unsigned char *b0 = b;
   SLuchar_Type *umax = u + ulen;

   for (; b < bmax; b++)
     {
        SLuchar_Type *unext;

        if (*b < 0x80)
          {
             if (u >= umax)
               break;

             *u++ = *b;
             continue;
          }

        unext = SLutf8_encode (*b, u, umax - u);
        if (unext == NULL)
          break;

        u = unext;
     }

   *np = b - b0;
   if (u < umax)
     *u = 0;

   return u;
}
#endif
541
/* Apply fun to every validly encoded character between u and umax and
 * return the re-encoded result as an SLstring.  Bytes belonging to an
 * invalid sequence are copied to the result unchanged.  NULL is returned
 * if umax < u, if memory cannot be allocated, or if a transformed
 * character cannot be encoded.
 */
static SLuchar_Type *xform_utf8 (SLuchar_Type *u, SLuchar_Type *umax,
                                 SLwchar_Type (*fun)(SLwchar_Type))
{
   SLuchar_Type *buf = NULL;	       /* work buffer */
   SLuchar_Type *p = NULL;	       /* buf + len: where output goes next */
   size_t malloced_len = 0;
   size_t len = 0;		       /* number of bytes written to buf */

   if (umax < u)
     return NULL;

   while (1)
     {
        SLwchar_Type w;
        SLuchar_Type *u1;
        SLstrlen_Type nconsumed;

        /* Ensure that there is always room for one more encoded character
         * plus a terminating null byte.
         */
        if (malloced_len <= len + SLUTF8_MAX_MBLEN)
          {
             SLuchar_Type *newbuf;

             malloced_len += 1 + (umax - u) + SLUTF8_MAX_MBLEN;

             newbuf = (SLuchar_Type *)SLrealloc ((char *)buf, malloced_len);
             if (newbuf == NULL)
               {
                  SLfree ((char *)buf);
                  return NULL;
               }
             buf = newbuf;
             p = buf + len;
          }

        if (u >= umax)
          {
             /* All input consumed: convert the buffer to an SLstring */
             *p = 0;
             p = (SLuchar_Type *) SLang_create_nslstring ((char *)buf, len);
             SLfree ((char *)buf);
             return p;
          }

        u1 = SLutf8_decode (u, umax, &w, &nconsumed);
        if (u1 == NULL)
          {
             /* Invalid sequence: pass the offending bytes through as-is */
             memcpy ((char *) p, u, nconsumed);
             p += nconsumed;
             len += nconsumed;
             u1 = u + nconsumed;
          }
        else
          {
             SLuchar_Type *p1;

             /* The space available at p is what remains of the buffer,
              * not the size of the whole allocation.
              */
             p1 = SLutf8_encode ((*fun)(w), p, malloced_len - len);
             if (p1 == NULL)
               {
                  SLfree ((char *)buf);
                  _pSLang_verror (SL_INTERNAL_ERROR, "SLutf8_encode returned NULL");
                  return NULL;
               }
             len += p1 - p;
             p = p1;
          }

        u = u1;
     }
}
610
611 /* Returned an uppercased version of an UTF-8 encoded string. Illegal or
612 * invalid sequences will be returned as-is. This function returns
613 * an SLstring.
614 */
SLutf8_strup(SLuchar_Type * u,SLuchar_Type * umax)615 SLuchar_Type *SLutf8_strup (SLuchar_Type *u, SLuchar_Type *umax)
616 {
617 return xform_utf8 (u, umax, SLwchar_toupper);
618 }
619
620 /* Returned an lowercased version of an UTF-8 encoded string. Illegal or
621 * invalid sequences will be returned as-is. This function returns
622 * an SLstring.
623 */
SLutf8_strlo(SLuchar_Type * u,SLuchar_Type * umax)624 SLuchar_Type *SLutf8_strlo (SLuchar_Type *u, SLuchar_Type *umax)
625 {
626 return xform_utf8 (u, umax, SLwchar_tolower);
627 }
628
/* Compare at most nchars characters of the UTF-8 encoded strings [a,amax)
 * and [b,bmax).  The result is 0 if they compare equal, 1 if the first
 * string compares greater, and -1 if it compares less.  If cs is 0, the
 * characters are compared after conversion via SLwchar_toupper.
 *
 * A validly encoded character compares greater than an invalid sequence.
 * When both positions are invalid, the values that SLutf8_decode stored
 * are compared without case conversion.  When one string runs out before
 * nchars characters have been compared, the shorter string compares less.
 */
int SLutf8_compare (SLuchar_Type *a, SLuchar_Type *amax,
                    SLuchar_Type *b, SLuchar_Type *bmax,
                    SLstrlen_Type nchars,
                    int cs)
{
   while ((nchars != 0) && (a < amax) && (b < bmax))
     {
        SLwchar_Type wa, wb;
        SLstrlen_Type dn;
        int a_valid = 1, b_valid = 1;

        /* 7-bit bytes are handled without calling the decoder */
        if (*a >= 0x80)
          {
             a_valid = (NULL != SLutf8_decode (a, amax, &wa, &dn));
             a += dn;
          }
        else
          {
             wa = (SLwchar_Type) *a;
             a++;
          }

        if (*b >= 0x80)
          {
             b_valid = (NULL != SLutf8_decode (b, bmax, &wb, &dn));
             b += dn;
          }
        else
          {
             wb = (SLwchar_Type) *b;
             b++;
          }

        nchars--;

        /* Exactly one of them is invalid: the valid one is the greater */
        if (a_valid != b_valid)
          return a_valid ? 1 : -1;

        if (a_valid && (cs == 0))
          {
             wa = SLwchar_toupper (wa);
             wb = SLwchar_toupper (wb);
          }

        if (wa != wb)
          return (wa > wb) ? 1 : -1;
     }

   if (nchars == 0)
     return 0;

   /* At least one string has been exhausted */
   if (b < bmax)
     return -1;

   return (a >= amax) ? 0 : 1;
}
697
698 /* Returns an SLstring */
SLutf8_subst_wchar(SLuchar_Type * u,SLuchar_Type * umax,SLwchar_Type wch,SLstrlen_Type pos,int ignore_combining)699 SLstr_Type *SLutf8_subst_wchar (SLuchar_Type *u, SLuchar_Type *umax,
700 SLwchar_Type wch, SLstrlen_Type pos,
701 int ignore_combining)
702 {
703 SLuchar_Type *a, *a1, *b;
704 SLstrlen_Type dpos;
705 SLuchar_Type buf[SLUTF8_MAX_MBLEN+1];
706 SLstr_Type *c;
707 SLstrlen_Type n1, n2, n3, len;
708
709 a = SLutf8_skip_chars (u, umax, pos, &dpos, ignore_combining);
710
711 if ((dpos != pos) || (a == umax))
712 {
713 _pSLang_verror (SL_INDEX_ERROR, "Specified character position is invalid for string");
714 return NULL;
715 }
716
717 a1 = SLutf8_skip_chars (a, umax, 1, NULL, ignore_combining);
718
719 b = SLutf8_encode (wch, buf, SLUTF8_MAX_MBLEN);
720 if (b == NULL)
721 {
722 _pSLang_verror (SL_UNICODE_ERROR, "Unable to encode wchar 0x%lX", (unsigned long)wch);
723 return NULL;
724 }
725
726 n1 = (a-u);
727 n2 = (b-buf);
728 n3 = (umax-a1);
729 len = n1 + n2 + n3;
730 c = _pSLallocate_slstring (len);
731 if (c == NULL)
732 return NULL;
733
734 memcpy (c, (char *)u, n1);
735 memcpy (c+n1, (char *)buf, n2);
736 memcpy (c+n1+n2, (char *)a1, n3);
737 c[len] = 0;
738
739 /* No need to worry about this failing-- it frees its argument */
740 return _pSLcreate_via_alloced_slstring (c, len);
741 }
742
743 /* utf8 buffer assumed to be at least SLUTF8_MAX_MBLEN+1 bytes. Result will be
744 * null terminated. Returns position of NEXT character.
745 * Analogous to: *p++
746 */
SLutf8_extract_utf8_char(SLuchar_Type * u,SLuchar_Type * umax,SLuchar_Type * utf8)747 SLuchar_Type *SLutf8_extract_utf8_char (SLuchar_Type *u,
748 SLuchar_Type *umax,
749 SLuchar_Type *utf8)
750 {
751 SLuchar_Type *u1;
752
753 u1 = SLutf8_skip_char (u, umax);
754 memcpy ((char *)utf8, u, u1-u);
755 utf8[u1-u] = 0;
756
757 return u1;
758 }
759
760 /* These routines depend upon the value of the _pSLinterp_UTF8_Mode variable.
761 * They also generate slang errors upon error.
762 */
_pSLinterp_decode_wchar(SLuchar_Type * u,SLuchar_Type * umax,SLwchar_Type * chp)763 SLuchar_Type *_pSLinterp_decode_wchar (SLuchar_Type *u,
764 SLuchar_Type *umax,
765 SLwchar_Type *chp)
766 {
767 if (_pSLinterp_UTF8_Mode == 0)
768 {
769 if (u < umax)
770 *chp = (SLwchar_Type) *u++;
771 return u;
772 }
773
774 if (NULL == (u = SLutf8_decode (u, umax, chp, NULL)))
775 _pSLang_verror (SL_INVALID_UTF8, "Invalid UTF-8 encoded string");
776
777 return u;
778 }
779
780 /* At least SLUTF8_MAX_MBLEN+1 bytes assumed-- null terminates result.
781 * Upon success, it returns a pointer to the _end_ of the encoded character
782 */
_pSLinterp_encode_wchar(SLwchar_Type wch,SLuchar_Type * u,unsigned int * encoded_len)783 SLuchar_Type *_pSLinterp_encode_wchar (SLwchar_Type wch, SLuchar_Type *u, unsigned int *encoded_len)
784 {
785 SLuchar_Type *u1;
786
787 if (_pSLinterp_UTF8_Mode == 0)
788 {
789 *encoded_len = 1;
790 *u++ = (SLuchar_Type) wch;
791 *u++ = 0;
792 return u;
793 }
794
795 if (NULL == (u1 = SLutf8_encode_null_terminate (wch, u)))
796 {
797 _pSLang_verror (SL_UNICODE_ERROR, "Unable to encode character 0x%lX", (unsigned long)wch);
798 return NULL;
799 }
800
801 *encoded_len = (unsigned int) (u1 - u);
802 return u1;
803 }
804
805 #ifdef REGRESSION
main(int argc,char ** argv)806 int main (int argc, char **argv)
807 {
808 unsigned char *s, *smax;
809 char **t;
810 char *ok_tests [] =
811 {
812 "",
813 "",
814 "�",
815 "",
816 "����",
817 NULL
818 };
819 char *long_tests [] =
820 {
821 "��",
822 "���",
823 "����",
824 "�����",
825 "������",
826 NULL
827 };
828
829 t = long_tests;
830 while ((s = (unsigned char *) *t++) != NULL)
831 {
832 smax = s + strlen ((char *)s);
833
834 while (s < smax)
835 {
836 SLwchar_Type w;
837
838 if (NULL == (s = SLutf8_to_wc (s, smax, &w)))
839 {
840 fprintf (stderr, "SLutf8_to_wc failed\n");
841 break;
842 }
843 if (w == 0)
844 break;
845 fprintf (stdout, " 0x%X", w);
846 }
847
848 fprintf (stdout, "\n");
849 }
850 return 0;
851 }
852 #endif
853
854