1 /*
2 Copyright (C) 2004-2017,2018 John E. Davis
3 
4 This file is part of the S-Lang Library.
5 
6 The S-Lang Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
10 
11 The S-Lang Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with this library; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19 USA.
20 */
21 #include "slinclud.h"
22 #include <string.h>
23 
24 #include "slang.h"
25 #include "_slang.h"
26 
/* Expected total length, in bytes, of a UTF-8 sequence indexed by its
 * first byte:
 *   0      the byte cannot start a sequence (the NUL byte and the
 *          continuation bytes 0x80-0xBF)
 *   1      the byte stands alone (0x01-0x7F, and also 0xFE/0xFF)
 *   2..6   lead byte of a multi-byte sequence of that many bytes
 * The code in this file treats any length <= 1 as "advance by one byte".
 * The table is only ever read, hence const.
 */
static const unsigned char Len_Map[256] =
{
  0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 31 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 63 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 95 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 127 */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 159 */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 191 */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* - 223 */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1   /* - 255 */
};
38 
/*
 * The code positions U+D800 to U+DFFF (UTF-16 surrogates) as well as
 * U+FFFE and U+FFFF must not occur in normal UTF-8 or UCS-4 data.
 * UTF-8 decoders should treat them like malformed or overlong
 * sequences for safety reasons.
 *
 * The argument is parenthesized at every use so that an expression such
 * as (a | b) is tested as a whole.  It is evaluated more than once, so
 * it must not have side effects.
 */
#define IS_ILLEGAL_UNICODE(w) \
   ((((w) >= 0xD800) && ((w) <= 0xDFFF)) || ((w) == 0xFFFE) || ((w) == 0xFFFF))
47 
48 _INLINE_
is_invalid_or_overlong_utf8(SLuchar_Type * u,unsigned int len)49 static int is_invalid_or_overlong_utf8 (SLuchar_Type *u, unsigned int len)
50 {
51    unsigned int i;
52    unsigned char ch, ch1;
53 
54    /* Check for invalid sequences */
55    for (i = 1; i < len; i++)
56      {
57 	if ((u[i] & 0xC0) != 0x80)
58 	  return 1;
59      }
60 
61    /* Illegal (overlong) sequences */
62    /*           1100000x (10xxxxxx) */
63    /*           11100000 100xxxxx (10xxxxxx) */
64    /*           11110000 1000xxxx (10xxxxxx 10xxxxxx) */
65    /*           11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
66    /*           11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */
67    ch = *u;
68    if ((ch == 0xC0) || (ch == 0xC1))
69      return 1;
70 
71    ch1 = u[1];
72    if (((ch1 & ch) == 0x80)
73        && ((ch == 0xE0)
74 	   || (ch == 0xF0)
75 	   || (ch == 0xF8)
76 	   || (ch == 0xFC)))
77      return 1;
78 
79    if (len == 3)
80      {
81 	/* D800 is encoded as 0xED 0xA0 0x80 and DFFF as 0xED 0xBF 0xBF */
82 	if ((ch == 0xED)
83 	    && ((ch1 >= 0xA0) && (ch1 <= 0xBF))
84 	    && (u[2] >= 0x80) && (u[2] <= 0xBF))
85 	  return 1;
86 	/* Now FFFE and FFFF */
87 	if ((ch == 0xEF)
88 	    && (ch1 == 0xBF)
89 	    && ((u[2] == 0xBE) || (u[2] == 0xBF)))
90 	  return 1;
91      }
92    return 0;
93 }
94 
95 /* This function assumes that the necessary checks have been made to ensure
96  * a valid UTF-8 encoded character is present.
97  */
98 _INLINE_
fast_utf8_decode(SLuchar_Type * u,unsigned int len)99 static SLwchar_Type fast_utf8_decode (SLuchar_Type *u, unsigned int len)
100 {
101    static unsigned char masks[7] =
102      {
103 	0, 0, 0x1F, 0xF, 0x7, 0x3, 0x1
104      };
105    SLuchar_Type *umax;
106    SLwchar_Type w;
107 
108    w = (*u & masks[len]);
109    umax = u + len;
110    u++;
111    while (u < umax)
112      {
113 	w = (w << 6)| (u[0] & 0x3F);
114 	u++;
115      }
116    return w;
117 }
118 
SLutf8_skip_char(unsigned char * s,unsigned char * smax)119 unsigned char *SLutf8_skip_char (unsigned char *s, unsigned char *smax)
120 {
121    unsigned int len;
122 
123    if (s >= smax)
124      return s;
125 
126    len = Len_Map[*s];
127    if (len <= 1)
128      return s+1;
129 
130    if (s + len > smax)
131      return s+1;
132 
133    if (is_invalid_or_overlong_utf8 (s, len))
134      return s + 1;
135 
136    return s + len;
137 }
138 
/* Skip forward over at most num characters of the buffer [s, smax).
 *
 *   s, smax           start and end of the buffer
 *   num               maximum number of characters to skip
 *   dnum              if non-NULL, receives the number of characters
 *                     that were counted
 *   ignore_combining  if non-zero, a valid multi-byte character for which
 *                     SLwchar_wcwidth returns 0 is skipped without being
 *                     counted, and any such characters that immediately
 *                     follow the last counted one are skipped as well
 *
 * A byte that does not begin a valid sequence is counted as one
 * character.  Returns the position reached.
 */
SLuchar_Type *SLutf8_skip_chars (SLuchar_Type *s, SLuchar_Type *smax,
                                 SLstrlen_Type num, SLstrlen_Type *dnum,
                                 int ignore_combining)
{
   SLstrlen_Type count = 0;

   while ((count < num) && (s < smax))
     {
        unsigned int nbytes = Len_Map[*s];

        if ((nbytes <= 1)
            || (s + nbytes > smax)
            || is_invalid_or_overlong_utf8 (s, nbytes))
          {
             /* single byte, truncated or malformed: one byte, one character */
             s++;
             count++;
             continue;
          }

        if ((ignore_combining == 0)
            || (0 != SLwchar_wcwidth (fast_utf8_decode (s, nbytes))))
          count++;

        s += nbytes;
     }

   if (ignore_combining)
     {
        /* Absorb the zero-width characters attached to the last one */
        while (s < smax)
          {
             SLwchar_Type wc;
             SLstrlen_Type nused;

             if (NULL == SLutf8_decode (s, smax, &wc, &nused))
               break;

             if (0 != SLwchar_wcwidth (wc))
               break;

             s += nused;
          }
     }

   if (dnum != NULL)
     *dnum = count;

   return s;
}
204 
/* Skip backward from s over at most num characters, never moving before
 * smin.
 *
 *   smin              start of the buffer
 *   s                 position to start from; the characters examined are
 *                     the ones that end before s
 *   num               maximum number of characters to skip
 *   dnum              if non-NULL, receives the number of characters
 *                     that were counted
 *   ignore_combining  if non-zero, a validly encoded character for which
 *                     SLwchar_wcwidth returns 0 is skipped without being
 *                     counted
 *
 * Bytes that are not part of a valid sequence ending exactly at the
 * current position are counted one byte at a time.  Returns the position
 * reached.
 */
SLuchar_Type *SLutf8_bskip_chars (SLuchar_Type *smin, SLuchar_Type *s,
                                  SLstrlen_Type num, SLstrlen_Type *dnum,
                                  int ignore_combining)
{
   SLstrlen_Type count = 0;
   SLuchar_Type *end = s;              /* end of the character being examined */

   while ((count < num) && (s > smin))
     {
        unsigned char byte;
        unsigned int nback;

        byte = *--s;
        if (byte < 0x80)
          {
             count++;
             end = s;
             continue;
          }

        /* Walk back over bytes that cannot start a sequence, looking for
         * a lead byte, but no further than the longest possible encoding.
         */
        for (nback = 0;
             (s != smin) && (Len_Map[byte] == 0) && (nback < SLUTF8_MAX_MBLEN);
             nback++)
          byte = *--s;

        if (byte > 0xBF)
          {
             SLwchar_Type wc;
             SLuchar_Type *next = SLutf8_decode (s, end, &wc, NULL);

             /* Accept only a sequence that decodes and ends exactly at end */
             if ((next != NULL) && (next == end))
               {
                  if ((ignore_combining == 0)
                      || (0 != SLwchar_wcwidth (wc)))
                    count++;

                  end = s;
                  continue;
               }
          }

        /* No lead byte was found, or we backed up over an invalid
         * sequence: give up on it and count just the final byte.
         */
        count++;
        end--;
        s = end;
     }

   if (dnum != NULL)
     *dnum = count;

   return s;
}
274 
/* Move back by one character from s, without going before smin, and
 * return the new position.  An ASCII byte is handled directly; anything
 * else is delegated to SLutf8_bskip_chars with a count of 1 and
 * combining characters not ignored.
 */
SLuchar_Type *SLutf8_bskip_char (SLuchar_Type *smin, SLuchar_Type *s)
{
   SLstrlen_Type nskipped;

   if (s <= smin)
     return s;

   if (s[-1] < 0x80)
     return s - 1;

   return SLutf8_bskip_chars (smin, s, 1, &nskipped, 0);
}
287 
288 /* This function counts the number of wide characters in a UTF-8 encoded
289  * string.  Each byte in an invalid sequence is counted as a single
290  * character. If the string contains illegal values, the illegal byte
291  * is counted as 1 character.
292  */
SLutf8_strlen(SLuchar_Type * s,int ignore_combining)293 SLstrlen_Type SLutf8_strlen (SLuchar_Type *s, int ignore_combining)
294 {
295    SLstrlen_Type count, len;
296 
297    if (s == NULL)
298      return 0;
299 
300    len = strlen ((char *)s);
301    (void) SLutf8_skip_chars (s, s + len, len, &count, ignore_combining);
302    return count;
303 }
304 
305 /*
306  * This function returns NULL if the input does not correspond to a valid
307  * UTF-8 sequence, otherwise, it returns the position of the next character
308  * in the sequence.
309  */
SLutf8_decode(unsigned char * u,unsigned char * umax,SLwchar_Type * wp,SLstrlen_Type * nconsumedp)310 unsigned char *SLutf8_decode (unsigned char *u, unsigned char *umax,
311 			      SLwchar_Type *wp, SLstrlen_Type *nconsumedp)
312 {
313    unsigned int len;
314    unsigned char ch;
315    SLwchar_Type w;
316 
317    if (u >= umax)
318      {
319 	*wp = 0;
320 	if (nconsumedp != NULL)
321 	  *nconsumedp = 0;
322 	return NULL;
323      }
324 
325    *wp = ch = *u;
326    if (ch < 0x80)
327      {
328 	if (nconsumedp != NULL) *nconsumedp = 1;
329 	return u+1;
330      }
331 
332    len = Len_Map[ch];
333    if (len < 2)
334      {
335 	/* should not happen--- code here for completeness */
336 	if (nconsumedp != NULL) *nconsumedp = 1;
337 	return NULL;
338      }
339    if (u + len > umax)
340      {
341 	if (nconsumedp != NULL) *nconsumedp = 1; /* (unsigned int) (umax - u); */
342 	return NULL;
343      }
344 
345    if (is_invalid_or_overlong_utf8 (u, len))
346      {
347 	if (nconsumedp != NULL)
348 	  *nconsumedp = 1;
349 
350 	return NULL;
351      }
352 
353    if (nconsumedp != NULL)
354      *nconsumedp = len;
355 
356    *wp = w = fast_utf8_decode (u, len);
357 
358    if (IS_ILLEGAL_UNICODE(w))
359      return NULL;
360 
361    return u + len;
362 }
363 
364 /* Encode the wide character returning a pointer to the end of the
365  * utf8 of the encoded multi-byte character.  This function will also encode
366  * illegal unicode values.  It returns NULL if buflen is too small.
367  * Otherwise, it returns a pointer at the end of the last encoded byte.
368  * It does not null terminate the encoded string.
369  */
SLutf8_encode(SLwchar_Type w,SLuchar_Type * u,SLstrlen_Type ulen)370 SLuchar_Type *SLutf8_encode (SLwchar_Type w, SLuchar_Type *u, SLstrlen_Type ulen)
371 {
372    SLuchar_Type *umax = u + ulen;
373 
374    /*   U-00000000 - U-0000007F: 0xxxxxxx */
375    if (w <= 0x7F)
376      {
377 	if (u >= umax)
378 	  return NULL;
379 
380 	*u++ = (unsigned char) w;
381 	return u;
382      }
383 
384    /*   U-00000080 - U-000007FF: 110xxxxx 10xxxxxx */
385    if (w <= 0x7FF)
386      {
387 	if ((u + 1) >= umax)
388 	  return NULL;
389 
390 	*u++ = (w >> 6) | 0xC0;
391 	*u++ = (w & 0x3F) | 0x80;
392 	return u;
393      }
394 
395    /* First bad character starts at 0xD800 */
396 
397    /* Allow illegal values to be encoded */
398 
399    /*
400     *if (IS_ILLEGAL_UNICODE(w))
401     * return NULL;
402     */
403 
404    /*   U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
405    if (w <= 0xFFFF)
406      {
407 	if (u+2 >= umax)
408 	  return NULL;
409 	*u++ = (w >> 12 ) | 0xE0;
410 	goto finish_2;
411      }
412 
413    /*   U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
414    if (w <= 0x1FFFFF)
415      {
416 	if (u+3 >= umax)
417 	  return NULL;
418 	*u++ = (w >> 18) | 0xF0;
419 	goto finish_3;
420      }
421 
422    /*   U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
423    if (w <= 0x3FFFFFF)
424      {
425 	if (u+4 >= umax)
426 	  return NULL;
427 	*u++ = (w >> 24) | 0xF8;
428 	goto finish_4;
429      }
430 
431    /*   U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
432    if (w <= 0x7FFFFFFF)
433      {
434 	if (u+5 >= umax)
435 	  return NULL;
436 	*u++ = (w >> 30) | 0xFC;
437 	goto finish_5;
438      }
439 
440    /* unreached?? */
441    return NULL;
442 
443    finish_5: *u++ = ((w >> 24) & 0x3F)|0x80;
444    finish_4: *u++ = ((w >> 18) & 0x3F)|0x80;
445    finish_3: *u++ = ((w >> 12) & 0x3F)|0x80;
446    finish_2: *u++ = ((w >> 6) & 0x3F)|0x80;
447              *u++ = (w & 0x3F)|0x80;
448 
449    return u;
450 }
451 
452 /* Like SLutf8_encode, but null terminates the result.
453  * At least SLUTF8_MAX_MBLEN+1 bytes assumed.
454  */
SLutf8_encode_null_terminate(SLwchar_Type w,SLuchar_Type * u)455 SLuchar_Type *SLutf8_encode_null_terminate (SLwchar_Type w, SLuchar_Type *u)
456 {
457    SLuchar_Type *p;
458 
459    p = SLutf8_encode (w, u, SLUTF8_MAX_MBLEN);
460    if (p != NULL)
461      *p = 0;
462    return p;
463 }
464 
465 #if 0
466 int SLutf8_decode_bytes (SLuchar_Type *u, SLuchar_Type *umax,
467 			 unsigned char *b, unsigned int *np)
468 {
469    unsigned char *bmax;
470 
471    bmax = b;
472    while (u < umax)
473      {
474 	SLwchar_Type w;
475 
476 	if (0 == (*u & 0x80))
477 	  {
478 	     *bmax++ = *u++;
479 	     continue;
480 	  }
481 
482 	if (NULL == (u = SLutf8_decode (u, umax, &w, NULL)))
483 	  return -1;		       /* FIXME: HANDLE ERROR */
484 
485 	if (w > 0xFF)
486 	  {
487 #if 0
488 	     sprintf (bmax, "<U+%04X>", w);
489 	     bmax += strlen (bmax);
490 	     continue;
491 #endif
492 	     /* FIXME: HANDLE ERROR */
493 	     w = w & 0xFF;
494 	  }
495 
496 	*bmax++ = w;
497      }
498    *np = bmax - b;
499    *bmax = 0;
500    return 0;
501 }
502 
503 /* UTF-8 Encode the bytes between b and bmax storing the results in the
504  * buffer defined by u and umax, returning the position following the
505  * last encoded character.  Upon return, *np is set to the number of bytes
506  * sucessfully encoded.
507  */
508 SLuchar_Type *SLutf8_encode_bytes (unsigned char *b, unsigned char *bmax,
509 				   SLuchar_Type *u, unsigned int ulen,
510 				   unsigned int *np)
511 {
512    unsigned char *bstart = b;
513    SLuchar_Type *umax = u + ulen;
514 
515    while (b < bmax)
516      {
517 	SLuchar_Type *u1;
518 
519 	if (0 == (*b & 0x80))
520 	  {
521 	     if (u >= umax)
522 	       break;
523 
524 	     *u++ = *b++;
525 	     continue;
526 	  }
527 
528 	if (NULL == (u1 = SLutf8_encode (*b, u, umax - u)))
529 	  break;
530 	u = u1;
531 	b++;
532      }
533 
534    *np = b - bstart;
535    if (u < umax)
536      *u = 0;
537 
538    return u;
539 }
540 #endif
541 
/* Build a new string from the UTF-8 data in [u, umax) by passing every
 * validly encoded character through fun and re-encoding the result.
 * Bytes belonging to an invalid sequence are copied as-is.
 *
 * Returns the value of SLang_create_nslstring for the transformed data,
 * or NULL if umax < u, if memory could not be allocated, or if
 * SLutf8_encode failed (an SL_INTERNAL_ERROR is raised for the latter).
 */
static SLuchar_Type *xform_utf8 (SLuchar_Type *u, SLuchar_Type *umax,
                                 SLwchar_Type (*fun)(SLwchar_Type))
{
   SLuchar_Type *buf, *p;
   size_t malloced_len, len;

   if (umax < u)
     return NULL;

   len = 0;                            /* number of bytes written to buf */
   p = buf = NULL;                     /* p == buf + len once buf exists */
   malloced_len = 0;

   while (1)
     {
        SLwchar_Type w;
        SLuchar_Type *u1;
        SLstrlen_Type nconsumed;

        /* Keep strictly more than SLUTF8_MAX_MBLEN free bytes after p:
         * enough for one encoded character, or for the terminating 0.
         * A transformed character may need more bytes than the original,
         * so the buffer can need to grow more than once.
         */
        if (malloced_len <= len + SLUTF8_MAX_MBLEN)
          {
             SLuchar_Type *newbuf;
             malloced_len += 1 + (umax - u) + SLUTF8_MAX_MBLEN;

             newbuf = (SLuchar_Type *)SLrealloc ((char *)buf, malloced_len);
             if (newbuf == NULL)
               {
                  SLfree ((char *)buf);
                  return NULL;
               }
             buf = newbuf;
             p = buf + len;
          }

        if (u >= umax)
          {
             *p = 0;
             p = (SLuchar_Type *) SLang_create_nslstring ((char *)buf, len);
             SLfree ((char *)buf);
             return p;
          }

        if (NULL == (u1 = SLutf8_decode (u, umax, &w, &nconsumed)))
          {
             /* Invalid sequence: pass the offending bytes through */
             memcpy ((char *) p, u, nconsumed);
             p += nconsumed;
             len += nconsumed;
             u1 = u + nconsumed;
          }
        else
          {
             SLuchar_Type *p1;

             /* Tell the encoder how much room is left after p, not the
              * size of the whole buffer, so that its bounds check
              * actually protects the allocation.
              */
             p1 = SLutf8_encode ((*fun)(w), p,
                                 (SLstrlen_Type) (malloced_len - len));
             if (p1 == NULL)
               {
                  SLfree ((char *)buf);
                  _pSLang_verror (SL_INTERNAL_ERROR, "SLutf8_encode returned NULL");
                  return NULL;
               }
             len += p1 - p;
             p = p1;
          }

        u = u1;
     }
}
610 
611 /* Returned an uppercased version of an UTF-8 encoded string.  Illegal or
612  * invalid sequences will be returned as-is.  This function returns
613  * an SLstring.
614  */
SLutf8_strup(SLuchar_Type * u,SLuchar_Type * umax)615 SLuchar_Type *SLutf8_strup (SLuchar_Type *u, SLuchar_Type *umax)
616 {
617    return xform_utf8 (u, umax, SLwchar_toupper);
618 }
619 
620 /* Returned an lowercased version of an UTF-8 encoded string.  Illegal or
621  * invalid sequences will be returned as-is.  This function returns
622  * an SLstring.
623  */
SLutf8_strlo(SLuchar_Type * u,SLuchar_Type * umax)624 SLuchar_Type *SLutf8_strlo (SLuchar_Type *u, SLuchar_Type *umax)
625 {
626    return xform_utf8 (u, umax, SLwchar_tolower);
627 }
628 
/* Compare at most nchars characters of the UTF-8 buffers [a, amax) and
 * [b, bmax).  Returns 0 if they compare equal, 1 if a sorts after b and
 * -1 if a sorts before b.
 *
 *   cs   if 0, both characters are passed through SLwchar_toupper before
 *        they are compared, otherwise they are compared as decoded
 *
 * A validly encoded character sorts after an invalid sequence.  When
 * both positions hold invalid sequences, the values SLutf8_decode left
 * in the output character are compared.  If one buffer runs out before
 * nchars characters have been compared, the shorter one sorts first.
 */
int SLutf8_compare (SLuchar_Type *a, SLuchar_Type *amax,
                    SLuchar_Type *b, SLuchar_Type *bmax,
                    SLstrlen_Type nchars,
                    int cs)
{
   while (nchars && (a < amax) && (b < bmax))
     {
        SLwchar_Type wa, wb;
        SLstrlen_Type nused;
        int a_valid = 1, b_valid = 1;

        if (*a < 0x80)
          wa = (SLwchar_Type) *a++;
        else
          {
             a_valid = (NULL != SLutf8_decode (a, amax, &wa, &nused));
             a += nused;
          }

        if (*b < 0x80)
          wb = (SLwchar_Type) *b++;
        else
          {
             b_valid = (NULL != SLutf8_decode (b, bmax, &wb, &nused));
             b += nused;
          }

        nchars--;

        /* exactly one of the two is valid: the valid one is the greater */
        if (a_valid != b_valid)
          return a_valid ? 1 : -1;

        if (a_valid && (cs == 0))
          {
             wa = SLwchar_toupper (wa);
             wb = SLwchar_toupper (wb);
          }

        if (wa != wb)
          return (wa > wb) ? 1 : -1;
     }

   if (nchars == 0)
     return 0;

   /* At least one of the buffers is exhausted */
   if (b < bmax)
     return -1;

   return (a >= amax) ? 0 : 1;
}
697 
698 /* Returns an SLstring */
SLutf8_subst_wchar(SLuchar_Type * u,SLuchar_Type * umax,SLwchar_Type wch,SLstrlen_Type pos,int ignore_combining)699 SLstr_Type *SLutf8_subst_wchar (SLuchar_Type *u, SLuchar_Type *umax,
700 				SLwchar_Type wch, SLstrlen_Type pos,
701 				int ignore_combining)
702 {
703    SLuchar_Type *a, *a1, *b;
704    SLstrlen_Type dpos;
705    SLuchar_Type buf[SLUTF8_MAX_MBLEN+1];
706    SLstr_Type *c;
707    SLstrlen_Type n1, n2, n3, len;
708 
709    a = SLutf8_skip_chars (u, umax, pos, &dpos, ignore_combining);
710 
711    if ((dpos != pos) || (a == umax))
712      {
713 	_pSLang_verror (SL_INDEX_ERROR, "Specified character position is invalid for string");
714 	return NULL;
715      }
716 
717    a1 = SLutf8_skip_chars (a, umax, 1, NULL, ignore_combining);
718 
719    b = SLutf8_encode (wch, buf, SLUTF8_MAX_MBLEN);
720    if (b == NULL)
721      {
722 	_pSLang_verror (SL_UNICODE_ERROR, "Unable to encode wchar 0x%lX", (unsigned long)wch);
723 	return NULL;
724      }
725 
726    n1 = (a-u);
727    n2 = (b-buf);
728    n3 = (umax-a1);
729    len = n1 + n2 + n3;
730    c = _pSLallocate_slstring (len);
731    if (c == NULL)
732      return NULL;
733 
734    memcpy (c, (char *)u, n1);
735    memcpy (c+n1, (char *)buf, n2);
736    memcpy (c+n1+n2, (char *)a1, n3);
737    c[len] = 0;
738 
739    /* No need to worry about this failing-- it frees its argument */
740    return _pSLcreate_via_alloced_slstring (c, len);
741 }
742 
743 /* utf8 buffer assumed to be at least SLUTF8_MAX_MBLEN+1 bytes.  Result will be
744  * null terminated.   Returns position of NEXT character.
745  * Analogous to: *p++
746  */
SLutf8_extract_utf8_char(SLuchar_Type * u,SLuchar_Type * umax,SLuchar_Type * utf8)747 SLuchar_Type *SLutf8_extract_utf8_char (SLuchar_Type *u,
748 					SLuchar_Type *umax,
749 					SLuchar_Type *utf8)
750 {
751    SLuchar_Type *u1;
752 
753    u1 = SLutf8_skip_char (u, umax);
754    memcpy ((char *)utf8, u, u1-u);
755    utf8[u1-u] = 0;
756 
757    return u1;
758 }
759 
760 /* These routines depend upon the value of the _pSLinterp_UTF8_Mode variable.
761  * They also generate slang errors upon error.
762  */
_pSLinterp_decode_wchar(SLuchar_Type * u,SLuchar_Type * umax,SLwchar_Type * chp)763 SLuchar_Type *_pSLinterp_decode_wchar (SLuchar_Type *u,
764 				      SLuchar_Type *umax,
765 				      SLwchar_Type *chp)
766 {
767    if (_pSLinterp_UTF8_Mode == 0)
768      {
769 	if (u < umax)
770 	  *chp = (SLwchar_Type) *u++;
771 	return u;
772      }
773 
774    if (NULL == (u = SLutf8_decode (u, umax, chp, NULL)))
775      _pSLang_verror (SL_INVALID_UTF8, "Invalid UTF-8 encoded string");
776 
777    return u;
778 }
779 
780 /* At least SLUTF8_MAX_MBLEN+1 bytes assumed-- null terminates result.
781  * Upon success, it returns a pointer to the _end_ of the encoded character
782  */
_pSLinterp_encode_wchar(SLwchar_Type wch,SLuchar_Type * u,unsigned int * encoded_len)783 SLuchar_Type *_pSLinterp_encode_wchar (SLwchar_Type wch, SLuchar_Type *u, unsigned int *encoded_len)
784 {
785    SLuchar_Type *u1;
786 
787    if (_pSLinterp_UTF8_Mode == 0)
788      {
789 	*encoded_len = 1;
790 	*u++ = (SLuchar_Type) wch;
791 	*u++ = 0;
792 	return u;
793      }
794 
795    if (NULL == (u1 = SLutf8_encode_null_terminate (wch, u)))
796      {
797 	_pSLang_verror (SL_UNICODE_ERROR, "Unable to encode character 0x%lX", (unsigned long)wch);
798 	return NULL;
799      }
800 
801    *encoded_len = (unsigned int) (u1 - u);
802    return u1;
803 }
804 
805 #ifdef REGRESSION
main(int argc,char ** argv)806 int main (int argc, char **argv)
807 {
808    unsigned char *s, *smax;
809    char **t;
810    char *ok_tests [] =
811      {
812 	"퟿",
813 	  "",
814 	  "�",
815 	  "��",
816 	  "����",
817 	  NULL
818      };
819    char *long_tests [] =
820      {
821 	"��",
822 	  "���",
823 	  "����",
824 	  "�����",
825 	  "������",
826 	  NULL
827      };
828 
829    t = long_tests;
830    while ((s = (unsigned char *) *t++) != NULL)
831      {
832 	smax = s + strlen ((char *)s);
833 
834 	while (s < smax)
835 	  {
836 	     SLwchar_Type w;
837 
838 	     if (NULL == (s = SLutf8_to_wc (s, smax, &w)))
839 	       {
840 		  fprintf (stderr, "SLutf8_to_wc failed\n");
841 		  break;
842 	       }
843 	     if (w == 0)
844 	       break;
845 	     fprintf (stdout, " 0x%X", w);
846 	  }
847 
848 	fprintf (stdout, "\n");
849      }
850    return 0;
851 }
852 #endif
853 
854