1 /*
2   This file is part of Deadbeef Player source code
3   http://deadbeef.sourceforge.net
4 
5   utf8 string manipulation
6 
7   Copyright (C) 2009-2013 Alexey Yakovenko
8 
9   This software is provided 'as-is', without any express or implied
10   warranty.  In no event will the authors be held liable for any damages
11   arising from the use of this software.
12 
13   Permission is granted to anyone to use this software for any purpose,
14   including commercial applications, and to alter it and redistribute it
15   freely, subject to the following restrictions:
16 
17   1. The origin of this software must not be misrepresented; you must not
18      claim that you wrote the original software. If you use this software
19      in a product, an acknowledgment in the product documentation would be
20      appreciated but is not required.
21   2. Altered source versions must be plainly marked as such, and must not be
22      misrepresented as being the original software.
23   3. This notice may not be removed or altered from any source distribution.
24 
25   Alexey Yakovenko waker@users.sourceforge.net
26 */
27 
28 /*
29     based on Basic UTF-8 manipulation routines
30     by Jeff Bezanson
31     placed in the public domain Fall 2005
32 */
33 #ifdef HAVE_CONFIG_H
34 #  include "config.h"
35 #endif
36 #ifdef HAVE_ALLOCA_H
37 #  include <alloca.h>
38 #endif
39 #include <stdlib.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <stdarg.h>
43 //#include <alloca.h>
44 #include "ctype.h"
45 #include "utf8.h"
46 #include "u8_lc_map.h"
47 #include "u8_uc_map.h"
48 
/* Correction constants used by the decoders in this file. The decoders add up
   the raw bytes of a sequence, shifting the running total left by 6 bits
   between bytes; subtracting entry [n] (n = number of trailing bytes) removes
   the accumulated lead-byte and continuation-byte marker bits,
   e.g. [1] == (0xC0 << 6) + 0x80. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};
53 
/* Indexed by the first byte of a sequence: number of bytes that follow it.
   0x00-0xBF -> 0 (ASCII and stray continuation bytes), 0xC0-0xDF -> 1,
   0xE0-0xEF -> 2, 0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5. */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
64 
65 /* conversions without error checking
66    only works for valid UTF-8, i.e. no 5- or 6-byte sequences
67    srcsz = source size in bytes, or -1 if 0-terminated
68    sz = dest size in # of wide characters
69 
70    returns # characters converted
71    dest will always be L'\0'-terminated, even if there isn't enough room
72    for all the characters.
73    if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
74 */
u8_toucs(uint32_t * dest,int32_t sz,const char * src,int32_t srcsz)75 int u8_toucs(uint32_t *dest, int32_t sz, const char *src, int32_t srcsz)
76 {
77     uint32_t ch;
78     const char *src_end = src + srcsz;
79     int32_t nb;
80     int32_t i=0;
81 
82     while (i < sz-1) {
83         nb = trailingBytesForUTF8[(unsigned char)*src];
84         if (srcsz == -1) {
85             if (*src == 0)
86                 goto done_toucs;
87         }
88         else {
89             if (src + nb >= src_end)
90                 goto done_toucs;
91         }
92         ch = 0;
93         switch (nb) {
94             /* these fall through deliberately */
95         case 3: ch += (unsigned char)*src++; ch <<= 6;
96         case 2: ch += (unsigned char)*src++; ch <<= 6;
97         case 1: ch += (unsigned char)*src++; ch <<= 6;
98         case 0: ch += (unsigned char)*src++;
99         }
100         ch -= offsetsFromUTF8[nb];
101         dest[i++] = ch;
102     }
103  done_toucs:
104     dest[i] = 0;
105     return i;
106 }
107 
/* Encode an array of 32-bit code points as UTF-8.

   dest  = output buffer
   sz    = size of dest in bytes
   src   = code points
   srcsz = number of source characters, or -1 if src is 0-terminated

   Returns the number of source characters consumed. Conversion stops at the
   first character that does not fit completely into the remaining space, so
   a multi-byte sequence is never split.

   dest is only '\0'-terminated if all the characters fit and there is space
   for the NUL as well (if 2 bytes are left but the next character needs 3 we
   could terminate, but in general we can't, so the rule is kept uniform).

   Code points >= 0x200000 produce no output but are still counted in the
   return value.
*/
int u8_toutf8(char *dest, int32_t sz, uint32_t *src, int32_t srcsz)
{
    int32_t used = 0;   /* bytes written to dest so far */
    int32_t i = 0;

    while (srcsz<0 ? src[i]!=0 : i < srcsz) {
        uint32_t ch = src[i];
        int32_t need;

        if (ch < 0x80)
            need = 1;
        else if (ch < 0x800)
            need = 2;
        else if (ch < 0x10000)
            need = 3;
        else if (ch < 0x200000)
            need = 4;
        else
            need = 0;   /* not encodable: skipped */

        /* Compare byte counts instead of pointers: "dest_end - n" is not a
           valid pointer when the buffer is shorter than n bytes. */
        if (need > 0 && need > sz - used)
            return i;

        switch (need) {
        case 1:
            dest[used++] = (char)ch;
            break;
        case 2:
            dest[used++] = (char)((ch>>6) | 0xC0);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
            break;
        case 3:
            dest[used++] = (char)((ch>>12) | 0xE0);
            dest[used++] = (char)(((ch>>6) & 0x3F) | 0x80);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
            break;
        case 4:
            dest[used++] = (char)((ch>>18) | 0xF0);
            dest[used++] = (char)(((ch>>12) & 0x3F) | 0x80);
            dest[used++] = (char)(((ch>>6) & 0x3F) | 0x80);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
            break;
        default:
            break;
        }
        i++;
    }
    if (used < sz)
        dest[used] = '\0';
    return i;
}
160 
/* Encode a single code point as UTF-8 into dest (no terminator is written).
   dest must have room for up to 4 bytes.
   Returns the number of bytes written, or 0 if ch >= 0x200000. */
int u8_wc_toutf8(char *dest, uint32_t ch)
{
    /* lead-byte marker bits, indexed by sequence length */
    static const unsigned char lead_bits[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
    int len;
    int k;

    if (ch < 0x80)
        len = 1;
    else if (ch < 0x800)
        len = 2;
    else if (ch < 0x10000)
        len = 3;
    else if (ch < 0x200000)
        len = 4;
    else
        return 0;

    /* fill continuation bytes from the tail, 6 payload bits each */
    for (k = len - 1; k > 0; k--) {
        dest[k] = (char)((ch & 0x3F) | 0x80);
        ch >>= 6;
    }
    /* what is left of ch are the payload bits of the lead byte */
    dest[0] = (char)(ch | lead_bits[len]);
    return len;
}
187 
/* charnum => byte offset.
   Walks charnum characters forward from the start of str and returns the
   byte offset reached; stops early at the terminating NUL. */
int u8_offset(char *str, int32_t charnum)
{
    int32_t pos = 0;

    for (; charnum > 0 && str[pos]; charnum--) {
        /* step over the lead byte, then over at most 3 continuation bytes */
        ++pos;
        for (int k = 0; k < 3 && !isutf(str[pos]); k++)
            ++pos;
    }
    return pos;
}
200 
/* byte offset => charnum.
   Counts the characters that start before byte position `offset` in s;
   stops early at the terminating NUL. */
int u8_charnum(char *s, int32_t offset)
{
    int32_t pos = 0;
    int32_t n = 0;

    for (; pos < offset && s[pos]; n++) {
        /* step over the lead byte, then over at most 3 continuation bytes */
        ++pos;
        for (int k = 0; k < 3 && !isutf(s[pos]); k++)
            ++pos;
    }
    return n;
}
213 
/* Number of characters in the 0-terminated string s.
   Counting also stops at a sequence that decodes to code point 0. */
int u8_strlen(char *s)
{
    int32_t count = 0;
    int32_t i = 0;

    /* Test the terminator here rather than letting u8_nextchar consume it:
       u8_nextchar looks at the byte after the one it consumed, which for the
       terminator lies outside the string. */
    while (s[i] != 0 && u8_nextchar(s, &i) != 0)
        count++;

    return count;
}
225 
226 /* reads the next utf-8 sequence out of a string, updating an index */
u8_nextchar(const char * s,int32_t * i)227 uint32_t u8_nextchar(const char *s, int32_t *i)
228 {
229     uint32_t ch = 0;
230     int32_t sz = 0;
231 
232     do {
233         ch <<= 6;
234         ch += (unsigned char)s[(*i)++];
235         sz++;
236     } while (s[*i] && !isutf(s[*i]));
237     ch -= offsetsFromUTF8[sz-1];
238 
239     return ch;
240 }
241 
/* Copies at most num_chars characters (not bytes) from src to dest and
   0-terminates dest. Returns the number of bytes copied, terminator
   excluded. dest must have room for that many bytes plus one. */
int u8_strncpy (char *dest, const char* src, int num_chars)
{
    int32_t nbytes = 0;

    /* measure the byte length of the first num_chars characters */
    for (; num_chars != 0 && src[nbytes] != 0; num_chars--) {
        int32_t step = 0;
        u8_nextchar (src + nbytes, &step);
        nbytes += step;
    }
    memcpy (dest, src, nbytes);
    dest[nbytes] = 0;
    return nbytes;
}
258 
/* Copies whole characters from src to dest while they fit into num_bytes
   bytes, so a multi-byte sequence is never cut in half. dest is NOT
   0-terminated by this function. Returns the number of bytes copied. */
int u8_strnbcpy (char *dest, const char* src, int num_bytes) {
    int32_t done = 0;   /* bytes copied so far == read position in src */

    while (src[done] && done < num_bytes) {
        int32_t next = done;
        u8_inc (src, &next);
        int32_t len = next - done;
        if (len > num_bytes - done) {
            break;      /* next character would not fit completely */
        }
        memcpy (dest + done, src + done, len);
        done = next;
    }
    return done;
}
276 
/* Copies the first character of src to dest if it fits into num_bytes
   bytes. No terminator is written. Returns the number of bytes copied,
   or 0 when the character does not fit. */
int u8_charcpy (char *dest, const char *src, int num_bytes) {
    int32_t len = 0;
    u8_inc (src, &len);     /* len = byte length of the first character */
    if (len <= num_bytes) {
        memcpy (dest, src, len);
        return len;
    }
    return 0;
}
286 
/* Advances byte index *i to the start of the next character in s:
   one byte forward, then over at most 3 continuation bytes. */
void u8_inc(const char *s, int32_t *i)
{
    int32_t pos = *i + 1;

    for (int k = 0; k < 3 && !isutf(s[pos]); k++)
        pos++;
    *i = pos;
}
292 
/* Moves byte index *i back to the start of the previous character in s:
   one byte backward, then over at most 3 continuation bytes. The caller
   must make sure there is a character before *i. */
void u8_dec(const char *s, int32_t *i)
{
    int32_t pos = *i - 1;

    for (int k = 0; k < 3 && !isutf(s[pos]); k++)
        pos--;
    *i = pos;
}
298 
/* Returns 1 if c is one of '0'..'7', otherwise 0. */
int octal_digit(char c)
{
    if (c < '0' || c > '7')
        return 0;
    return 1;
}
303 
/* Returns 1 if c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0. */
int hex_digit(char c)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'a' && c <= 'f')
        return 1;
    if (c >= 'A' && c <= 'F')
        return 1;
    return 0;
}
310 
/* Decodes one escape sequence. Assumes that str points to the character
   after a backslash.

   Recognised forms: n t r b f v a, 1-3 octal digits, x + up to 2 hex digits,
   u + up to 4 hex digits, U + up to 8 hex digits. Anything else (including
   x/u/U not followed by a hex digit) yields the character itself.

   The decoded value is stored in *dest.
   Returns the number of input bytes processed (always >= 1, so a caller must
   not invoke this on the terminating NUL). */
int u8_read_escape_sequence(const char *str, uint32_t *dest)
{
    char digs[9] = { 0 };       /* up to 8 digits, always 0-terminated */
    int32_t dno = 0;
    int32_t i = 1;
    int32_t max_digits = 0;     /* 0: not a numeric escape */
    int base = 16;
    /* take the literal character by default; go through unsigned char so a
       byte >= 0x80 is not sign-extended into 0xFFFFFFxx */
    uint32_t ch = (unsigned char)str[0];

    switch (str[0]) {
    case 'n': ch = '\n'; break;
    case 't': ch = '\t'; break;
    case 'r': ch = '\r'; break;
    case 'b': ch = '\b'; break;
    case 'f': ch = '\f'; break;
    case 'v': ch = '\v'; break;
    case 'a': ch = '\a'; break;
    case 'x': max_digits = 2; break;
    case 'u': max_digits = 4; break;
    case 'U': max_digits = 8; break;
    default:
        if (octal_digit(str[0])) {
            /* the digits start at str[0] itself */
            i = 0;
            base = 8;
            max_digits = 3;
        }
        break;
    }

    if (max_digits > 0) {
        while (dno < max_digits &&
               (base == 8 ? octal_digit(str[i]) : hex_digit(str[i]))) {
            digs[dno++] = str[i++];
        }
        /* strtoul: 8 hex digits may exceed LONG_MAX where long is 32 bits */
        if (dno > 0)
            ch = (uint32_t)strtoul(digs, NULL, base);
    }
    *dest = ch;

    return i;
}
366 
// convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
// example: u8_unescape(mybuf, 256, "hello\\u220e")
// note the double backslash is needed if called on a C string literal
//
// buf = output buffer, sz = its size in bytes, src = 0-terminated input
// returns the number of bytes written; buf is only 0-terminated if there is
// room left for the terminator
//
// - text that is not part of an escape sequence is copied through unchanged,
//   one whole character at a time
// - a backslash followed by a non-ASCII character yields that character
// - a backslash at the very end of src ends the conversion
// - an escape that decodes to a value >= 0x200000 produces no output
int u8_unescape(char *buf, int32_t sz, const char *src)
{
    int32_t c = 0;

    while (*src && c < sz) {
        char temp[4];
        const char *out;
        int32_t amt;

        if (*src == '\\' && src[1] == '\0')
            break;      /* dangling backslash: don't step over the terminator */

        if (*src == '\\' && (unsigned char)src[1] < 0x80) {
            uint32_t ch;
            src++;
            src += u8_read_escape_sequence(src, &ch);
            amt = u8_wc_toutf8(temp, ch);
            out = temp;
        }
        else {
            if (*src == '\\')
                src++;  /* backslash before a non-ASCII character: drop it */
            /* copy the raw bytes of one character (1..4, never past the NUL) */
            amt = 0;
            u8_inc(src, &amt);
            out = src;
            src += amt;
        }

        if (amt > sz-c)
            break;
        memcpy(&buf[c], out, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
396 
u8_escape_wchar(char * buf,int32_t sz,uint32_t ch)397 int u8_escape_wchar(char *buf, int32_t sz, uint32_t ch)
398 {
399     if (ch == L'\n')
400         return snprintf(buf, sz, "\\n");
401     else if (ch == L'\t')
402         return snprintf(buf, sz, "\\t");
403     else if (ch == L'\r')
404         return snprintf(buf, sz, "\\r");
405     else if (ch == L'\b')
406         return snprintf(buf, sz, "\\b");
407     else if (ch == L'\f')
408         return snprintf(buf, sz, "\\f");
409     else if (ch == L'\v')
410         return snprintf(buf, sz, "\\v");
411     else if (ch == L'\a')
412         return snprintf(buf, sz, "\\a");
413     else if (ch == L'\\')
414         return snprintf(buf, sz, "\\\\");
415     else if (ch < 32 || ch == 0x7f)
416         return snprintf(buf, sz, "\\x%hhX", (unsigned char)ch);
417     else if (ch > 0xFFFF)
418         return snprintf(buf, sz, "\\U%.8X", (uint32_t)ch);
419     else if (ch >= 0x80 && ch <= 0xFFFF)
420         return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch);
421 
422     return snprintf(buf, sz, "%c", (char)ch);
423 }
424 
/* Writes an escaped copy of the 0-terminated UTF-8 string src into buf.

   buf           = output buffer
   sz            = size of buf in bytes
   escape_quotes = if non-zero, '"' is written as \"

   Each character is converted with u8_escape_wchar. Conversion stops before
   the first escape that does not fit completely together with the
   terminator, so the output never ends in a half-written escape and is
   always 0-terminated when sz > 0.

   Returns the number of bytes written, terminator excluded (always < sz
   when sz > 0). */
int u8_escape(char *buf, int32_t sz, const char *src, int32_t escape_quotes)
{
    int32_t c=0, i=0;

    while (src[i] && c < sz) {
        char temp[16];  /* longest escape is \UHHHHHHHH: 10 bytes + NUL */
        int32_t amt;

        if (escape_quotes && src[i] == '"') {
            memcpy(temp, "\\\"", 2);
            amt = 2;
            i++;
        }
        else {
            amt = u8_escape_wchar(temp, sizeof(temp), u8_nextchar(src, &i));
        }
        /* snprintf reports the would-be length, so check it against the
           space that is left (keeping one byte for the terminator) before
           copying anything */
        if (amt < 0 || amt >= (int32_t)sizeof(temp) || amt >= sz - c)
            break;
        memcpy(buf + c, temp, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
444 
/* Finds the first occurrence of code point ch in the 0-terminated string s.
   Returns a pointer to the start of the matching character, or NULL.
   *charn receives the number of characters that were skipped before the
   match (or the total number of characters if there is no match). */
char *u8_strchr(char *s, uint32_t ch, int32_t *charn)
{
    int32_t pos = 0;

    *charn = 0;
    while (s[pos]) {
        int32_t start = pos;    /* byte offset of the character being decoded */
        if (u8_nextchar(s, &pos) == ch) {
            return &s[start];
        }
        (*charn)++;
    }
    return NULL;
}
461 
u8_memchr(char * s,uint32_t ch,size_t sz,int32_t * charn)462 char *u8_memchr(char *s, uint32_t ch, size_t sz, int32_t *charn)
463 {
464     int32_t i = 0, lasti=0;
465     uint32_t c;
466     int32_t csz;
467 
468     *charn = 0;
469     while (i < sz) {
470         c = csz = 0;
471         do {
472             c <<= 6;
473             c += (unsigned char)s[i++];
474             csz++;
475         } while (i < sz && !isutf(s[i]));
476         c -= offsetsFromUTF8[csz-1];
477 
478         if (c == ch) {
479             return &s[lasti];
480         }
481         lasti = i;
482         (*charn)++;
483     }
484     return NULL;
485 }
486 
/* Returns 1 if the locale name specifies the encoding "UTF-8" or "utf8"
   (exact spelling), otherwise 0.
   The encoding is the text after the first '.', up to the next '@', '+',
   ',' or the end of the string; a '.' that only appears after one of those
   separators is ignored. */
int u8_is_locale_utf8(char *locale)
{
    /* this code based on libutf8 */
    const char *dot = locale + strcspn(locale, ".@+,");
    const char *encoding;
    size_t len;

    if (*dot != '.')
        return 0;   /* no encoding part before a modifier / end of string */

    encoding = dot + 1;
    len = strcspn(encoding, "@+,");

    if (len == 5 && strncmp(encoding, "UTF-8", 5) == 0)
        return 1; /* it's UTF-8 */
    if (len == 4 && strncmp(encoding, "utf8", 4) == 0)
        return 1; /* it's UTF-8 */
    return 0;
}
505 
/* Formats fmt/ap with vsnprintf into UTF-8, converts the result to wide
   characters with u8_toucs and prints it to stdout with "%ls".

   Returns the number of characters handed to printf, or -1 if formatting or
   a memory allocation failed (nothing is printed in that case).

   ap is not consumed by this function; the caller remains responsible for
   va_end.

   NOTE(review): on platforms where wchar_t is narrower than 32 bits, code
   points that do not fit are truncated by the conversion below. */
int u8_vprintf(char *fmt, va_list ap)
{
    char stackbuf[512];
    char *buf = stackbuf;
    char *heapbuf = NULL;
    uint32_t *wcs = NULL;
    wchar_t *wide = NULL;
    va_list ap_copy;
    int32_t cnt;
    int32_t nchars = -1;
    int32_t k;

    /* every pass over the arguments needs its own copy of the va_list */
    va_copy(ap_copy, ap);
    cnt = vsnprintf(buf, sizeof(stackbuf), fmt, ap_copy);
    va_end(ap_copy);
    if (cnt < 0)
        return -1;

    if ((size_t)cnt >= sizeof(stackbuf)) {
        /* output was truncated: retry with a buffer of the exact size */
        heapbuf = malloc((size_t)cnt + 1);
        if (!heapbuf)
            return -1;
        buf = heapbuf;
        va_copy(ap_copy, ap);
        cnt = vsnprintf(buf, (size_t)cnt + 1, fmt, ap_copy);
        va_end(ap_copy);
        if (cnt < 0)
            goto out;
    }

    /* cnt bytes of UTF-8 never decode to more than cnt characters */
    wcs = malloc(((size_t)cnt + 1) * sizeof(*wcs));
    wide = malloc(((size_t)cnt + 1) * sizeof(*wide));
    if (!wcs || !wide)
        goto out;

    nchars = u8_toucs(wcs, cnt+1, buf, cnt);
    /* copy element by element instead of casting the array: wchar_t need
       not have the same size as uint32_t */
    for (k = 0; k <= nchars; k++)
        wide[k] = (wchar_t)wcs[k];
    printf("%ls", wide);

 out:
    free(wide);
    free(wcs);
    free(heapbuf);
    return nchars;
}
526 
/* printf-style front end for u8_vprintf; returns whatever u8_vprintf
   returns. */
int u8_printf(char *fmt, ...)
{
    va_list ap;
    int32_t result;

    va_start(ap, fmt);
    result = u8_vprintf(fmt, ap);
    va_end(ap);

    return result;
}
539 
// adaptation of g_utf8_validate

/* Classify the lead byte Char: Len is set to the length of the sequence it
   announces (1..6) and Mask to the payload bits of the lead byte. If Char
   cannot start a sequence, Len is set to -1 and Mask is left untouched. */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
  do {                                                                        \
    if ((Char) < 128)                { (Len) = 1; (Mask) = 0x7f; }            \
    else if (((Char) & 0xe0) == 0xc0) { (Len) = 2; (Mask) = 0x1f; }           \
    else if (((Char) & 0xf0) == 0xe0) { (Len) = 3; (Mask) = 0x0f; }           \
    else if (((Char) & 0xf8) == 0xf0) { (Len) = 4; (Mask) = 0x07; }           \
    else if (((Char) & 0xfc) == 0xf8) { (Len) = 5; (Mask) = 0x03; }           \
    else if (((Char) & 0xfe) == 0xfc) { (Len) = 6; (Mask) = 0x01; }           \
    else                             { (Len) = -1; }                          \
  } while (0)

/* Number of bytes the shortest encoding of Char takes. */
#define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* Decode the Len-byte sequence at Chars into Result, using Count as the
   loop counter. Result becomes -1 as soon as one of the trailing bytes is
   not a continuation byte (10xxxxxx). */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  do {                                                                        \
    (Result) = (Chars)[0] & (Mask);                                           \
    for ((Count) = 1; (Count) < (Len); ++(Count))                             \
      {                                                                       \
        if (((Chars)[(Count)] & 0xc0) != 0x80)                                \
          {                                                                   \
            (Result) = -1;                                                    \
            break;                                                            \
          }                                                                   \
        (Result) <<= 6;                                                       \
        (Result) |= ((Chars)[(Count)] & 0x3f);                                \
      }                                                                       \
  } while (0)

/* True for code points below 0x110000 that are neither surrogates
   (0xD800-0xDFFF), nor in 0xFDD0-0xFDEF, nor ending in 0xFFFE/0xFFFF. */
#define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&                     \
     (((Char) & 0xFFFFF800) != 0xD800) &&     \
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     ((Char) & 0xFFFE) != 0xFFFE)


/* Validates UTF-8 text.

   str     = text to check; NULL yields 0
   max_len = number of bytes to check, or negative if str is 0-terminated
   end     = if not NULL, receives a pointer to the first byte that was not
             accepted (the end of the valid data)

   Rejected: bytes that cannot start a sequence, sequences cut short by
   max_len, bad continuation bytes, overlong encodings and code points for
   which UNICODE_VALID is false.

   Returns 1 if everything up to max_len bytes - or up to a NUL byte,
   whichever comes first - was accepted, otherwise 0. */
int u8_valid (const char  *str,
        int max_len,
        const char **end)
{
    const char *p = str;

    if (!str) {
        return 0;
    }

    if (end)
        *end = str;

    for (;;)
    {
        int i, mask = 0, len;
        int32_t result;
        unsigned char lead;

        /* stop at the length limit (if any) or at a NUL byte */
        if (max_len >= 0 && (p - str) >= max_len)
            break;
        if (*p == '\0')
            break;

        lead = (unsigned char) *p;
        UTF8_COMPUTE (lead, mask, len);
        if (len == -1)
            break;

        /* check that the expected number of bytes exists in str */
        if (max_len >= 0 && (max_len - (p - str)) < len)
            break;

        UTF8_GET (result, p, i, mask, len);

        if (UTF8_LENGTH (result) != len     /* overlong UTF-8 */
                || result == (int32_t)-1    /* bad continuation byte */
                || !UNICODE_VALID (result))
            break;

        p += len;
    }

    if (end)
        *end = p;

    /* See that we covered the entire length if a length was
     * passed in, or that we ended on a nul if not
     */
    if (max_len >= 0)
        return (p == str + max_len || *p == '\0') ? 1 : 0;
    return (*p == '\0') ? 1 : 0;
}
664 
665 #if 0
666 static const char lowerchars[] = "áéíñóúüäöåæøàçèêабвгдеёжзийклмнорпстуфхцчшщъыьэюя";
667 static const char upperchars[] = "ÁÉÍÑÓÚÜÄÖÅÆØÀÇÈÊАБВГДЕЁЖЗИЙКЛМНОРПСТУФХЦЧШЩЪЫЬЭЮЯ";
668 #endif
669 
670 int
u8_tolower_slow(const char * input,int len,char * out)671 u8_tolower_slow (const char *input, int len, char *out) {
672     struct u8_case_map_t *lc = u8_lc_in_word_set (input, len);
673     if (lc) {
674         int ll = strlen (lc->lower);
675         memcpy (out, lc->lower, ll);
676         out[ll] = 0;
677         return ll;
678     }
679     return 0;
680 }
681 
/* Writes the lowercase form of the character at c (l bytes long) to out as
   a 0-terminated string and returns its length in bytes.

   Bytes 1..127 are handled directly (65..90, i.e. ASCII 'A'..'Z', get 0x20
   added). Everything else goes through u8_tolower_slow; if that finds no
   mapping the l input bytes are copied unchanged. out must have room for the
   result plus the terminator (at least l+1 bytes). */
int
u8_tolower (const signed char *c, int l, char *out) {
    const signed char first = *c;

    if (first > 0) {
        /* single-byte character */
        *out = (first >= 65 && first <= 90) ? first + 0x20 : first;
        out[1] = 0;
        return 1;
    }

    int mapped = u8_tolower_slow ((const char *)c, l, out);
    if (mapped) {
        return mapped;
    }
    /* no mapping known: pass the character through */
    memcpy (out, c, l);
    out[l] = 0;
    return l;
}
704 
705 int
u8_toupper_slow(const char * input,int len,char * out)706 u8_toupper_slow (const char *input, int len, char *out) {
707     struct u8_uppercase_map_t *uc = u8_uc_in_word_set (input, len);
708     if (uc) {
709         int ll = strlen (uc->upper);
710         memcpy (out, uc->upper, ll);
711         out[ll] = 0;
712         return ll;
713     }
714     return 0;
715 }
716 
/* Writes the uppercase form of the character at c (l bytes long) to out as
   a 0-terminated string and returns its length in bytes.

   Bytes 1..127 are handled directly (97..122, i.e. ASCII 'a'..'z', get 0x20
   subtracted). Everything else goes through u8_toupper_slow; if that finds
   no mapping the l input bytes are copied unchanged. out must have room for
   the result plus the terminator (at least l+1 bytes). */
int
u8_toupper (const signed char *c, int l, char *out) {
    const signed char first = *c;

    if (first > 0) {
        /* single-byte character */
        *out = (first >= 97 && first <= 122) ? first - 0x20 : first;
        out[1] = 0;
        return 1;
    }

    int mapped = u8_toupper_slow ((const char *)c, l, out);
    if (mapped) {
        return mapped;
    }
    /* no mapping known: pass the character through */
    memcpy (out, c, l);
    out[l] = 0;
    return l;
}
739 
/* Case-insensitive substring search: looks for s2 inside s1, lowercasing
   both sides character by character with u8_tolower.

   Returns NULL if s2 does not occur in s1 (or s1 is empty). On success the
   returned pointer is the position in s1 just AFTER the matched text, not
   its start - so it is mainly useful as a found / not-found indicator. */
const char *
utfcasestr (const char *s1, const char *s2) {
    while (*s1) {
        const char *p1 = s1;
        const char *p2 = s2;
        while (*p2 && *p1) {
            int32_t i1 = 0;
            int32_t i2 = 0;
            char lw1[10];
            char lw2[10];
            int differ;
            u8_nextchar (p1, &i1);
            u8_nextchar (p2, &i2);
            if (i1 >= (int32_t)sizeof (lw1) || i2 >= (int32_t)sizeof (lw2)) {
                // malformed sequence that would not fit into the fold
                // buffers (u8_tolower writes up to l+1 bytes): compare the
                // raw bytes instead of overflowing lw1/lw2
                differ = (i1 != i2) || memcmp (p1, p2, i1) != 0;
            }
            else {
                u8_tolower ((const signed char *)p1, i1, lw1);
                u8_tolower ((const signed char *)p2, i2, lw2);
                differ = strcmp (lw1, lw2) != 0;
            }
            if (differ) {
                break;
            }
            p1 += i1;
            p2 += i2;
        }
        if (*p2 == 0) {
            // all of s2 was matched
            return p1;
        }
        // no match at this position: retry one character further into s1
        int32_t i = 0;
        u8_nextchar (s1, &i);
        s1 += i;
    }
    return NULL;
}
786 
#define min(x,y) ((x)<(y)?(x):(y))
/* Like utfcasestr, but only s1 is lowercased: s2 must be lowercase already.

   Returns NULL if s2 does not occur in s1 (or s1 is empty). On success the
   returned pointer is the position in s1 just AFTER the matched text, not
   its start. */
const char *
utfcasestr_fast (const char *s1, const char *s2) {
    while (*s1) {
        const char *p1 = s1;
        const char *p2 = s2;
        while (*p2 && *p1) {
            int32_t i1 = 0;
            int32_t i2 = 0;
            char lw1[10];
            const char *folded = lw1;
            int l1;
            u8_nextchar (p1, &i1);
            u8_nextchar (p2, &i2);
            if (i1 >= (int32_t)sizeof (lw1)) {
                // malformed sequence that would not fit into lw1
                // (u8_tolower writes up to l+1 bytes): compare it unfolded
                folded = p1;
                l1 = i1;
            }
            else {
                l1 = u8_tolower ((const signed char *)p1, i1, lw1);
            }
            if (memcmp (folded, p2, min(i2,l1))) {
                break;
            }
            p1 += i1;
            p2 += i2;
        }
        if (*p2 == 0) {
            // all of s2 was matched
            return p1;
        }
        // no match at this position: retry one character further into s1
        int32_t i = 0;
        u8_nextchar (s1, &i);
        s1 += i;
    }
    return NULL;
}
817 
/* Case-insensitive comparison of two 0-terminated UTF-8 strings.

   The strings are compared character by character after lowercasing with
   u8_tolower. For the first pair that differs, the result is the difference
   of the lowercase byte lengths if those differ, otherwise the memcmp result
   of the lowercase bytes. If one string is a prefix of the other, the
   longer one is greater (1 / -1). Returns 0 if the strings are equal. */
int
u8_strcasecmp (const char *a, const char *b) {
    const char *p1 = a, *p2 = b;
    while (*p1 && *p2) {
        int32_t i1 = 0;
        int32_t i2 = 0;
        char s1[10], s2[10];
        const char *c1 = p1, *c2 = p2;  // bytes that get compared
        int l1, l2;
        u8_nextchar (p1, &i1);
        u8_nextchar (p2, &i2);
        if (i1 < (int32_t)sizeof (s1) && i2 < (int32_t)sizeof (s2)) {
            l1 = u8_tolower ((const signed char *)p1, i1, s1);
            l2 = u8_tolower ((const signed char *)p2, i2, s2);
            c1 = s1;
            c2 = s2;
        }
        else {
            // malformed sequence that would not fit into the fold buffers
            // (u8_tolower writes up to l+1 bytes): compare the raw bytes
            l1 = i1;
            l2 = i2;
        }
        int res = (l1 != l2) ? l1 - l2 : memcmp (c1, c2, l1);
        if (res) {
            return res;
        }
        p1 += i1;
        p2 += i2;
    }

    if (*p1) {
        return 1;
    }
    else if (*p2) {
        return -1;
    }

    return 0;
}
853 
854 void
u8_lc_map_test(void)855 u8_lc_map_test (void) {
856     struct u8_case_map_t *lc;
857     lc = u8_lc_in_word_set ("Á", 2);
858     printf ("%s -> %s\n", lc->name, lc->lower);
859     lc = u8_lc_in_word_set ("É", 2);
860     printf ("%s -> %s\n", lc->name, lc->lower);
861     lc = u8_lc_in_word_set ("Í", 2);
862     printf ("%s -> %s\n", lc->name, lc->lower);
863     lc = u8_lc_in_word_set ("Ñ", 2);
864     printf ("%s -> %s\n", lc->name, lc->lower);
865     lc = u8_lc_in_word_set ("П", 2);
866     printf ("%s -> %s\n", lc->name, lc->lower);
867     lc = u8_lc_in_word_set ("Л", 2);
868     printf ("%s -> %s\n", lc->name, lc->lower);
869     lc = u8_lc_in_word_set ("А", 2);
870     printf ("%s -> %s\n", lc->name, lc->lower);
871 }
872