1 /*
2 This file is part of Deadbeef Player source code
3 http://deadbeef.sourceforge.net
4
5 utf8 string manipulation
6
7 Copyright (C) 2009-2013 Alexey Yakovenko
8
9 This software is provided 'as-is', without any express or implied
10 warranty. In no event will the authors be held liable for any damages
11 arising from the use of this software.
12
13 Permission is granted to anyone to use this software for any purpose,
14 including commercial applications, and to alter it and redistribute it
15 freely, subject to the following restrictions:
16
17 1. The origin of this software must not be misrepresented; you must not
18 claim that you wrote the original software. If you use this software
19 in a product, an acknowledgment in the product documentation would be
20 appreciated but is not required.
21 2. Altered source versions must be plainly marked as such, and must not be
22 misrepresented as being the original software.
23 3. This notice may not be removed or altered from any source distribution.
24
25 Alexey Yakovenko waker@users.sourceforge.net
26 */
27
28 /*
29 based on Basic UTF-8 manipulation routines
30 by Jeff Bezanson
31 placed in the public domain Fall 2005
32 */
33 #ifdef HAVE_CONFIG_H
34 # include "config.h"
35 #endif
36 #ifdef HAVE_ALLOCA_H
37 # include <alloca.h>
38 #endif
39 #include <stdlib.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <stdarg.h>
43 //#include <alloca.h>
44 #include "ctype.h"
45 #include "utf8.h"
46 #include "u8_lc_map.h"
47 #include "u8_uc_map.h"
48
/* Value subtracted from the raw sum of the bytes of one UTF-8 sequence
   (each byte added after shifting the accumulator left by 6) to cancel the
   lead/continuation marker bits. Indexed by the number of trailing bytes. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000u, /* 0 trailing bytes */
    0x00003080u, /* 1 trailing byte  */
    0x000E2080u, /* 2 trailing bytes */
    0x03C82080u, /* 3 trailing bytes */
    0xFA082080u, /* 4 trailing bytes */
    0x82082080u  /* 5 trailing bytes */
};
53
/* Number of bytes that follow a given first byte of a UTF-8 sequence.
   One row per high nibble of the byte value. */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
64
65 /* conversions without error checking
66 only works for valid UTF-8, i.e. no 5- or 6-byte sequences
67 srcsz = source size in bytes, or -1 if 0-terminated
68 sz = dest size in # of wide characters
69
70 returns # characters converted
71 dest will always be L'\0'-terminated, even if there isn't enough room
72 for all the characters.
73 if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
74 */
u8_toucs(uint32_t * dest,int32_t sz,const char * src,int32_t srcsz)75 int u8_toucs(uint32_t *dest, int32_t sz, const char *src, int32_t srcsz)
76 {
77 uint32_t ch;
78 const char *src_end = src + srcsz;
79 int32_t nb;
80 int32_t i=0;
81
82 while (i < sz-1) {
83 nb = trailingBytesForUTF8[(unsigned char)*src];
84 if (srcsz == -1) {
85 if (*src == 0)
86 goto done_toucs;
87 }
88 else {
89 if (src + nb >= src_end)
90 goto done_toucs;
91 }
92 ch = 0;
93 switch (nb) {
94 /* these fall through deliberately */
95 case 3: ch += (unsigned char)*src++; ch <<= 6;
96 case 2: ch += (unsigned char)*src++; ch <<= 6;
97 case 1: ch += (unsigned char)*src++; ch <<= 6;
98 case 0: ch += (unsigned char)*src++;
99 }
100 ch -= offsetsFromUTF8[nb];
101 dest[i++] = ch;
102 }
103 done_toucs:
104 dest[i] = 0;
105 return i;
106 }
107
/* encodes an array of 32-bit code points as UTF-8

   dest  = output buffer
   sz    = size of dest buffer in bytes
   src   = input code points
   srcsz = number of source characters, or -1 if 0-terminated

   returns # characters consumed from src
   conversion stops in front of the first character that does not fit
   completely. dest is only '\0'-terminated when every character fit and
   there is still a byte left for the terminator.
   values of 0x200000 and above produce no output but are still counted.
*/
int u8_toutf8(char *dest, int32_t sz, uint32_t *src, int32_t srcsz)
{
    int32_t room = sz; /* bytes still free in dest */
    int32_t n;

    for (n = 0; srcsz < 0 ? src[n] != 0 : n < srcsz; n++) {
        const uint32_t cp = src[n];

        if (cp < 0x80) {
            if (room < 1) {
                return n;
            }
            dest[0] = (char)cp;
            dest += 1;
            room -= 1;
        }
        else if (cp < 0x800) {
            if (room < 2) {
                return n;
            }
            dest[0] = (cp>>6) | 0xC0;
            dest[1] = (cp & 0x3F) | 0x80;
            dest += 2;
            room -= 2;
        }
        else if (cp < 0x10000) {
            if (room < 3) {
                return n;
            }
            dest[0] = (cp>>12) | 0xE0;
            dest[1] = ((cp>>6) & 0x3F) | 0x80;
            dest[2] = (cp & 0x3F) | 0x80;
            dest += 3;
            room -= 3;
        }
        else if (cp < 0x200000) {
            if (room < 4) {
                return n;
            }
            dest[0] = (cp>>18) | 0xF0;
            dest[1] = ((cp>>12) & 0x3F) | 0x80;
            dest[2] = ((cp>>6) & 0x3F) | 0x80;
            dest[3] = (cp & 0x3F) | 0x80;
            dest += 4;
            room -= 4;
        }
    }

    if (room > 0) {
        *dest = '\0';
    }
    return n;
}
160
/* encodes the single code point ch as UTF-8 into dest (no terminator is
   written). returns the number of bytes stored (1-4), or 0 when ch is
   0x200000 or larger, in which case dest is left untouched. */
int u8_wc_toutf8(char *dest, uint32_t ch)
{
    if (ch >= 0x200000) {
        return 0;
    }
    if (ch >= 0x10000) {
        dest[0] = (ch>>18) | 0xF0;
        dest[1] = ((ch>>12) & 0x3F) | 0x80;
        dest[2] = ((ch>>6) & 0x3F) | 0x80;
        dest[3] = (ch & 0x3F) | 0x80;
        return 4;
    }
    if (ch >= 0x800) {
        dest[0] = (ch>>12) | 0xE0;
        dest[1] = ((ch>>6) & 0x3F) | 0x80;
        dest[2] = (ch & 0x3F) | 0x80;
        return 3;
    }
    if (ch >= 0x80) {
        dest[0] = (ch>>6) | 0xC0;
        dest[1] = (ch & 0x3F) | 0x80;
        return 2;
    }
    dest[0] = (char)ch;
    return 1;
}
187
/* charnum => byte offset
   walks charnum characters into str (or to the terminator, whichever comes
   first) and returns the byte offset reached. a character is one byte plus
   up to three following bytes for which isutf() is false. */
int u8_offset(char *str, int32_t charnum)
{
    int32_t pos = 0;

    for (; charnum > 0 && str[pos]; charnum--) {
        int32_t taken = 0;
        /* the 4th byte is skipped without being inspected */
        do {
            pos++;
            taken++;
        } while (taken < 4 && !isutf(str[pos]));
    }
    return pos;
}
200
/* byte offset => charnum
   counts the characters of s that start before byte position offset,
   stopping early at the terminator. a character is one byte plus up to
   three following bytes for which isutf() is false. */
int u8_charnum(char *s, int32_t offset)
{
    int32_t count = 0;
    int32_t pos = 0;

    for (; pos < offset && s[pos]; count++) {
        int32_t taken = 0;
        /* the 4th byte is skipped without being inspected */
        do {
            pos++;
            taken++;
        } while (taken < 4 && !isutf(s[pos]));
    }
    return count;
}
213
/* number of characters in the 0-terminated UTF-8 string s.
   the terminator is tested here, before each u8_nextchar() call, so the
   decoder is never asked to step over the end of the string. */
int u8_strlen(char *s)
{
    int32_t count = 0;
    int32_t i = 0;

    while (s[i] != 0) {
        u8_nextchar(s, &i);
        count++;
    }

    return count;
}
225
226 /* reads the next utf-8 sequence out of a string, updating an index */
u8_nextchar(const char * s,int32_t * i)227 uint32_t u8_nextchar(const char *s, int32_t *i)
228 {
229 uint32_t ch = 0;
230 int32_t sz = 0;
231
232 do {
233 ch <<= 6;
234 ch += (unsigned char)s[(*i)++];
235 sz++;
236 } while (s[*i] && !isutf(s[*i]));
237 ch -= offsetsFromUTF8[sz-1];
238
239 return ch;
240 }
241
/* copies num_chars characters from src to dest, return bytes written
   copying stops early at the end of src. dest is always 0-terminated, so it
   needs room for the copied bytes plus one; the terminator is not counted
   in the return value. */
int u8_strncpy (char *dest, const char* src, int num_chars)
{
    int32_t nbytes = 0;

    /* measure how many bytes the requested characters occupy */
    for (; num_chars != 0 && src[nbytes] != 0; num_chars--) {
        int32_t step = 0;
        u8_nextchar (src + nbytes, &step);
        nbytes += step;
    }

    /* none of the measured bytes is a terminator, so a plain copy suffices */
    memcpy (dest, src, nbytes);
    dest[nbytes] = 0;
    return nbytes;
}
258
/* copies as many whole characters from src to dest as fit into num_bytes
   bytes, never splitting a character. returns the number of bytes copied.
   dest is NOT 0-terminated by this function. */
int u8_strnbcpy (char *dest, const char* src, int num_bytes)
{
    int32_t copied = 0; /* bytes copied so far, also the read offset in src */

    while (src[copied] && num_bytes - copied > 0) {
        int32_t after = copied;
        int32_t len;

        u8_inc (src, &after);
        len = after - copied;
        if (len > num_bytes - copied) {
            /* the next character would not fit entirely */
            break;
        }
        memcpy (dest + copied, src + copied, len);
        copied = after;
    }
    return copied;
}
276
/* copies the first character of src to dest if it occupies no more than
   num_bytes bytes. returns the number of bytes copied, or 0 if the character
   did not fit (dest is then untouched). no terminator is written. */
int u8_charcpy (char *dest, const char *src, int num_bytes)
{
    int32_t len = 0;

    u8_inc (src, &len); /* len = byte length of the first character */
    if (len <= num_bytes) {
        memcpy (dest, src, len);
        return len;
    }
    return 0;
}
286
/* moves the byte index *i forward to the next character of s: one byte plus
   up to three following bytes for which isutf() is false. the byte reached
   after four steps is not inspected. */
void u8_inc(const char *s, int32_t *i)
{
    int32_t steps = 0;

    do {
        (*i)++;
        steps++;
    } while (steps < 4 && !isutf(s[*i]));
}
292
/* moves the byte index *i backward to the previous character of s: steps
   back until a byte for which isutf() is true is found, at most four bytes.
   the byte reached after four steps is not inspected. the caller must make
   sure there is a character in front of *i. */
void u8_dec(const char *s, int32_t *i)
{
    int32_t steps = 0;

    do {
        (*i)--;
        steps++;
    } while (steps < 4 && !isutf(s[*i]));
}
298
/* returns 1 if c is one of '0'..'7', otherwise 0 */
int octal_digit(char c)
{
    if (c < '0' || c > '7') {
        return 0;
    }
    return 1;
}
303
/* returns 1 if c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0 */
int hex_digit(char c)
{
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c >= 'a' && c <= 'f') {
        return 1;
    }
    if (c >= 'A' && c <= 'F') {
        return 1;
    }
    return 0;
}
310
/* decodes one C-style escape sequence

   str  = points to the character after a backslash
   dest = receives the decoded code point

   returns number of input characters processed

   handled forms: \n \t \r \b \f \v \a, up to 3 octal digits, \x + up to 2
   hex digits, \u + up to 4 hex digits, \U + up to 8 hex digits.
   anything else (including x/u/U without digits) yields the character
   itself. an empty str yields 0 and consumes nothing, so a caller that adds
   the return value to its read position cannot step over the terminator. */
int u8_read_escape_sequence(const char *str, uint32_t *dest)
{
    char digs[9] = {0}; /* at most 8 digits plus the terminator */
    int32_t dno = 0, i = 1;
    int32_t maxdigs = 0; /* 0 = not a numeric escape */
    int base = 16;
    uint32_t ch;

    if (str[0] == '\0') {
        *dest = 0;
        return 0;
    }

    /* take literal character; go through unsigned char so that bytes >= 0x80
       are not sign-extended */
    ch = (unsigned char)str[0];

    switch (str[0]) {
    case 'n': ch = '\n'; break;
    case 't': ch = '\t'; break;
    case 'r': ch = '\r'; break;
    case 'b': ch = '\b'; break;
    case 'f': ch = '\f'; break;
    case 'v': ch = '\v'; break;
    case 'a': ch = '\a'; break;
    case 'x': maxdigs = 2; break;
    case 'u': maxdigs = 4; break;
    case 'U': maxdigs = 8; break;
    default:
        if (octal_digit(str[0])) {
            /* octal digits start right at str[0], there is no prefix letter */
            i = 0;
            maxdigs = 3;
            base = 8;
        }
        break;
    }

    if (maxdigs > 0) {
        while (dno < maxdigs
               && (base == 8 ? octal_digit(str[i]) : hex_digit(str[i]))) {
            digs[dno++] = str[i++];
        }
        if (dno > 0) {
            /* strtoul: 8 hex digits do not fit into a 32-bit signed long */
            ch = (uint32_t)strtoul(digs, NULL, base);
        }
    }

    *dest = ch;
    return i;
}
366
// convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
// example: u8_unescape(mybuf, 256, "hello\\u220e")
// note the double backslash is needed if called on a C string literal
//
// buf = output buffer, sz = its size in bytes, src = 0-terminated input
// returns the number of bytes written; buf is only 0-terminated if there is
// room left after the last byte.
// bytes that are not part of an escape sequence (including UTF-8 that is
// already in src) are copied through unchanged; a backslash that is the very
// last character of src is copied literally.
int u8_unescape(char *buf, int32_t sz, const char *src)
{
    int32_t c = 0;

    while (*src && c < sz) {
        char temp[4];
        uint32_t ch;
        int32_t amt;

        if (*src != '\\' || src[1] == '\0') {
            /* ordinary byte or dangling backslash: copy verbatim */
            buf[c++] = *src++;
            continue;
        }

        src++; /* skip the backslash */
        src += u8_read_escape_sequence(src, &ch);

        amt = u8_wc_toutf8(temp, ch);
        if (amt > sz-c) {
            /* the encoded character does not fit any more */
            break;
        }
        memcpy(&buf[c], temp, amt);
        c += amt;
    }

    if (c < sz) {
        buf[c] = '\0';
    }
    return c;
}
396
/* writes the escaped text form of the code point ch into buf

   buf = output buffer, sz = its size in bytes

   returns the snprintf() result, i.e. the length the text has (or would
   have had, if buf is too small).

   \n \t \r \b \f \v \a and the backslash get their two-character escapes;
   other control characters and 0x7f become \xHH (always two digits, so the
   escape cannot swallow a following hex-digit character when read back);
   0x80..0xFFFF become \uHHHH, larger values \UHHHHHHHH; everything else is
   written as is. */
int u8_escape_wchar(char *buf, int32_t sz, uint32_t ch)
{
    switch (ch) {
    case '\n': return snprintf(buf, sz, "\\n");
    case '\t': return snprintf(buf, sz, "\\t");
    case '\r': return snprintf(buf, sz, "\\r");
    case '\b': return snprintf(buf, sz, "\\b");
    case '\f': return snprintf(buf, sz, "\\f");
    case '\v': return snprintf(buf, sz, "\\v");
    case '\a': return snprintf(buf, sz, "\\a");
    case '\\': return snprintf(buf, sz, "\\\\");
    default:
        break;
    }

    if (ch < 32 || ch == 0x7f) {
        return snprintf(buf, sz, "\\x%.2hhX", (unsigned char)ch);
    }
    if (ch > 0xFFFF) {
        return snprintf(buf, sz, "\\U%.8X", (unsigned int)ch);
    }
    if (ch >= 0x80) {
        return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch);
    }

    return snprintf(buf, sz, "%c", (char)ch);
}
424
/* writes an escaped, ASCII-only copy of the UTF-8 string src into buf

   buf           = output buffer
   sz            = size of buf in bytes
   src           = 0-terminated UTF-8 input
   escape_quotes = if non-zero, '"' is written as \"

   returns the number of bytes written, not counting the terminator.
   an escape is only emitted if it fits completely together with the
   terminator; otherwise conversion stops in front of it. the result is
   therefore always 0-terminated when sz > 0 and the return value is always
   smaller than sz. */
int u8_escape(char *buf, int32_t sz, const char *src, int32_t escape_quotes)
{
    int32_t c = 0, i = 0;

    while (src[i] && c < sz) {
        const int32_t room = sz - c;
        int32_t next = i;
        int32_t amt;

        if (escape_quotes && src[i] == '"') {
            amt = snprintf(buf + c, room, "\\\"");
            next = i + 1;
        }
        else {
            amt = u8_escape_wchar(buf + c, room, u8_nextchar(src, &next));
        }

        if (amt < 0 || amt >= room) {
            /* truncated: wipe the partial escape and stop */
            break;
        }
        c += amt;
        i = next;
    }

    if (c < sz) {
        buf[c] = '\0';
    }
    return c;
}
444
/* searches the 0-terminated UTF-8 string s for the code point ch

   charn = receives the character index of the match; when nothing is found
           it holds the number of characters examined

   returns a pointer to the first byte of the match, or NULL. */
char *u8_strchr(char *s, uint32_t ch, int32_t *charn)
{
    int32_t pos = 0;

    for (*charn = 0; s[pos]; (*charn)++) {
        const int32_t start = pos;
        if (u8_nextchar(s, &pos) == ch) {
            return s + start;
        }
    }
    return NULL;
}
461
u8_memchr(char * s,uint32_t ch,size_t sz,int32_t * charn)462 char *u8_memchr(char *s, uint32_t ch, size_t sz, int32_t *charn)
463 {
464 int32_t i = 0, lasti=0;
465 uint32_t c;
466 int32_t csz;
467
468 *charn = 0;
469 while (i < sz) {
470 c = csz = 0;
471 do {
472 c <<= 6;
473 c += (unsigned char)s[i++];
474 csz++;
475 } while (i < sz && !isutf(s[i]));
476 c -= offsetsFromUTF8[csz-1];
477
478 if (c == ch) {
479 return &s[lasti];
480 }
481 lasti = i;
482 (*charn)++;
483 }
484 return NULL;
485 }
486
/* returns 1 if the locale name selects UTF-8, otherwise 0.
   the encoding is the text between the first '.' and the next '@', '+', ','
   or the end of the name; it has to be exactly "UTF-8" or "utf8".
   (this code based on libutf8) */
int u8_is_locale_utf8(char *locale)
{
    const char *encoding;
    size_t prefix;
    size_t enclen;

    /* a modifier separator in front of any '.' means there is no encoding */
    prefix = strcspn(locale, ".@+,");
    if (locale[prefix] != '.') {
        return 0;
    }

    encoding = locale + prefix + 1;
    enclen = strcspn(encoding, "@+,");

    if (enclen == 5) {
        return strncmp(encoding, "UTF-8", 5) == 0;
    }
    if (enclen == 4) {
        return strncmp(encoding, "utf8", 4) == 0;
    }
    return 0;
}
505
/* formats like vprintf() into a UTF-8 string, converts the result to wide
   characters with u8_toucs() and prints it to stdout with "%ls"

   returns the number of characters converted, or -1 if formatting or memory
   allocation failed.

   the argument list is formatted from a va_copy first, because a va_list
   must not be reused after vsnprintf() has consumed it; the original ap is
   only used for the second pass into a heap buffer of the exact size. */
int u8_vprintf(char *fmt, va_list ap)
{
    char fixed[512]; /* covers the common case without allocating */
    char *buf = fixed;
    uint32_t *wcs;
    va_list probe;
    int32_t cnt;

    va_copy(probe, ap);
    cnt = vsnprintf(fixed, sizeof fixed, fmt, probe);
    va_end(probe);
    if (cnt < 0) {
        return -1;
    }

    if ((size_t)cnt >= sizeof fixed) {
        /* output was truncated: format again into a buffer that fits */
        buf = malloc((size_t)cnt + 1);
        if (!buf) {
            return -1;
        }
        vsnprintf(buf, (size_t)cnt + 1, fmt, ap);
    }

    wcs = malloc(((size_t)cnt + 1) * sizeof *wcs);
    if (wcs) {
        cnt = u8_toucs(wcs, cnt+1, buf, cnt);
        /* NOTE(review): the cast is only correct where wchar_t is 32 bits
           wide - confirm for every target platform */
        printf("%ls", (wchar_t*)wcs);
        free(wcs);
    }
    else {
        cnt = -1;
    }

    if (buf != fixed) {
        free(buf);
    }
    return cnt;
}
526
/* printf-style front end of u8_vprintf(): collects the variable arguments
   and returns whatever u8_vprintf() returns */
int u8_printf(char *fmt, ...)
{
    va_list ap;
    int32_t result;

    va_start(ap, fmt);
    result = u8_vprintf(fmt, ap);
    va_end(ap);

    return result;
}
539
// adaptation of g_utf8_validate

/* classifies the lead byte Char: Len becomes the length of the sequence it
   starts (1-6) and Mask the payload bits of the lead byte; Len is -1 if Char
   cannot start a sequence */
#define UTF8_COMPUTE(Char, Mask, Len) \
    if (Char < 128) \
    { \
        Len = 1; \
        Mask = 0x7f; \
    } \
    else if ((Char & 0xe0) == 0xc0) \
    { \
        Len = 2; \
        Mask = 0x1f; \
    } \
    else if ((Char & 0xf0) == 0xe0) \
    { \
        Len = 3; \
        Mask = 0x0f; \
    } \
    else if ((Char & 0xf8) == 0xf0) \
    { \
        Len = 4; \
        Mask = 0x07; \
    } \
    else if ((Char & 0xfc) == 0xf8) \
    { \
        Len = 5; \
        Mask = 0x03; \
    } \
    else if ((Char & 0xfe) == 0xfc) \
    { \
        Len = 6; \
        Mask = 0x01; \
    } \
    else \
        Len = -1;

/* number of bytes the shortest encoding of Char needs */
#define UTF8_LENGTH(Char) \
    ((Char) < 0x80 ? 1 : \
     ((Char) < 0x800 ? 2 : \
      ((Char) < 0x10000 ? 3 : \
       ((Char) < 0x200000 ? 4 : \
        ((Char) < 0x4000000 ? 5 : 6)))))


/* decodes Len bytes at Chars into Result, using Count as loop variable;
   Result becomes -1 as soon as a byte is not a continuation byte */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
    (Result) = (Chars)[0] & (Mask); \
    for ((Count) = 1; (Count) < (Len); ++(Count)) \
    { \
        if (((Chars)[(Count)] & 0xc0) != 0x80) \
        { \
            (Result) = -1; \
            break; \
        } \
        (Result) <<= 6; \
        (Result) |= ((Chars)[(Count)] & 0x3f); \
    }

/* rejects values of 0x110000 and above, 0xD800..0xDFFF, 0xFDD0..0xFDEF and
   everything whose low 16 bits are 0xFFFE or 0xFFFF */
#define UNICODE_VALID(Char) \
    ((Char) < 0x110000 && \
     (((Char) & 0xFFFFF800) != 0xD800) && \
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
     ((Char) & 0xFFFE) != 0xFFFE)


/* checks whether str is valid UTF-8

   str     = text to check; NULL is reported as invalid
   max_len = number of bytes to check, or negative if str is 0-terminated
             (a 0 byte ends the check in either case)
   end     = if not NULL, receives the position where checking stopped: the
             end of the checked text, or the first invalid byte

   returns 1 if everything that was checked is valid, otherwise 0. */
int u8_valid (const char *str,
        int max_len,
        const char **end)
{
    const char *p = str;

    if (!str) {
        return 0;
    }

    if (end) {
        *end = str;
    }

    for (;;) {
        int i, mask = 0, len;
        int32_t result;
        unsigned char c;

        if (max_len >= 0 && (p - str) >= max_len) {
            break;
        }
        if (*p == 0) {
            break;
        }

        c = (unsigned char) *p;
        UTF8_COMPUTE (c, mask, len);
        if (len == -1) {
            break;
        }

        /* check that the expected number of bytes exists in str */
        if (max_len >= 0 && (max_len - (p - str)) < len) {
            break;
        }

        UTF8_GET (result, p, i, mask, len);

        if (result == (int32_t)-1      /* broken continuation byte */
            || UTF8_LENGTH (result) != len /* overlong UTF-8 */
            || !UNICODE_VALID (result)) {
            break;
        }

        p += len;
    }

    if (end) {
        *end = p;
    }

    /* See that we covered the entire length if a length was
     * passed in, or that we ended on a nul if not
     */
    if (max_len >= 0) {
        return (p == (str + max_len) || *p == 0) ? 1 : 0;
    }
    return (*p == '\0') ? 1 : 0;
}
664
665 #if 0
666 static const char lowerchars[] = "áéíñóúüäöåæøàçèêабвгдеёжзийклмнорпстуфхцчшщъыьэюя";
667 static const char upperchars[] = "ÁÉÍÑÓÚÜÄÖÅÆØÀÇÈÊАБВГДЕЁЖЗИЙКЛМНОРПСТУФХЦЧШЩЪЫЬЭЮЯ";
668 #endif
669
670 int
u8_tolower_slow(const char * input,int len,char * out)671 u8_tolower_slow (const char *input, int len, char *out) {
672 struct u8_case_map_t *lc = u8_lc_in_word_set (input, len);
673 if (lc) {
674 int ll = strlen (lc->lower);
675 memcpy (out, lc->lower, ll);
676 out[ll] = 0;
677 return ll;
678 }
679 return 0;
680 }
681
/* lowercases one UTF-8 character

   c   = first byte of the character
   l   = its length in bytes
   out = receives the 0-terminated result (needs room for the result + 1)

   returns the length of the result in bytes. positive single bytes are
   handled inline (65..90, i.e. 'A'..'Z', are shifted by 0x20); everything
   else goes through u8_tolower_slow() and is copied unchanged when that
   finds no mapping. */
int
u8_tolower (const signed char *c, int l, char *out) {
    const signed char first = *c;
    int ll;

    if (first > 0) {
        *out = (first >= 65 && first <= 90) ? first + 0x20 : first;
        out[1] = 0;
        return 1;
    }

    ll = u8_tolower_slow ((const char *)c, l, out);
    if (ll) {
        return ll;
    }

    /* no mapping known: pass the character through */
    memcpy (out, c, l);
    out[l] = 0;
    return l;
}
704
705 int
u8_toupper_slow(const char * input,int len,char * out)706 u8_toupper_slow (const char *input, int len, char *out) {
707 struct u8_uppercase_map_t *uc = u8_uc_in_word_set (input, len);
708 if (uc) {
709 int ll = strlen (uc->upper);
710 memcpy (out, uc->upper, ll);
711 out[ll] = 0;
712 return ll;
713 }
714 return 0;
715 }
716
/* uppercases one UTF-8 character

   c   = first byte of the character
   l   = its length in bytes
   out = receives the 0-terminated result (needs room for the result + 1)

   returns the length of the result in bytes. positive single bytes are
   handled inline (97..122, i.e. 'a'..'z', are shifted by 0x20); everything
   else goes through u8_toupper_slow() and is copied unchanged when that
   finds no mapping. */
int
u8_toupper (const signed char *c, int l, char *out) {
    const signed char first = *c;
    int ll;

    if (first > 0) {
        *out = (first >= 97 && first <= 122) ? first - 0x20 : first;
        out[1] = 0;
        return 1;
    }

    ll = u8_toupper_slow ((const char *)c, l, out);
    if (ll) {
        return ll;
    }

    /* no mapping known: pass the character through */
    memcpy (out, c, l);
    out[l] = 0;
    return l;
}
739
/* case-insensitive search for s2 inside s1 (both 0-terminated UTF-8);
   both sides are lowercased character by character with u8_tolower()

   returns NULL if s2 does not occur in s1 (also when s1 is empty).
   NOTE: on success the result points just past the matched text in s1, not
   at its beginning. */
const char *
utfcasestr (const char *s1, const char *s2) {
    while (*s1) {
        const char *p1 = s1;
        const char *p2 = s2;
        while (*p2 && *p1) {
            char lw1[10];
            char lw2[10];
            int32_t i1 = 0;
            int32_t i2 = 0;
            u8_nextchar (p1, &i1);
            u8_nextchar (p2, &i2);
            /* u8_tolower may store the character's bytes plus a terminator;
               a malformed run of continuation bytes can be longer than the
               buffers, so treat it as a mismatch instead of overflowing */
            if (i1 >= (int32_t)sizeof (lw1) || i2 >= (int32_t)sizeof (lw2)) {
                break;
            }
            u8_tolower ((const signed char *)p1, i1, lw1);
            u8_tolower ((const signed char *)p2, i2, lw2);
            if (strcmp (lw1, lw2)) {
                break;
            }
            p1 += i1;
            p2 += i2;
        }
        if (*p2 == 0) {
            /* all of s2 was matched */
            return p1;
        }
        /* retry one character further into s1 */
        int32_t i = 0;
        u8_nextchar (s1, &i);
        s1 += i;
    }
    return NULL;
}
786
#define min(x,y) ((x)<(y)?(x):(y))
/* case-insensitive search for s2 inside s1 (both 0-terminated UTF-8)
   s2 must be lowercase: only the characters of s1 are lowercased

   returns NULL if s2 does not occur in s1 (also when s1 is empty).
   NOTE: on success the result points just past the matched text in s1, not
   at its beginning. */
const char *
utfcasestr_fast (const char *s1, const char *s2) {
    while (*s1) {
        const char *p1 = s1;
        const char *p2 = s2;
        while (*p2 && *p1) {
            char lw1[10];
            int32_t i1 = 0;
            int32_t i2 = 0;
            int l1;
            u8_nextchar (p1, &i1);
            u8_nextchar (p2, &i2);
            /* u8_tolower may store the character's bytes plus a terminator;
               a malformed run of continuation bytes can be longer than the
               buffer, so treat it as a mismatch instead of overflowing */
            if (i1 >= (int32_t)sizeof (lw1)) {
                break;
            }
            l1 = u8_tolower ((const signed char *)p1, i1, lw1);
            if (memcmp (lw1, p2, min(i2,l1))) {
                break;
            }
            p1 += i1;
            p2 += i2;
        }
        if (*p2 == 0) {
            /* all of s2 was matched */
            return p1;
        }
        /* retry one character further into s1 */
        int32_t i = 0;
        u8_nextchar (s1, &i);
        s1 += i;
    }
    return NULL;
}
817
/* case-insensitive comparison of two 0-terminated UTF-8 strings

   the strings are compared character by character after lowercasing with
   u8_tolower(). characters whose lowercase forms differ in byte length are
   ordered by that length, otherwise by memcmp().

   returns 0 if the strings are equal, a negative value if a sorts first and
   a positive value if b sorts first. if one string is a prefix of the other
   the result is -1 or 1. */
int
u8_strcasecmp (const char *a, const char *b) {
    const char *p1 = a, *p2 = b;
    while (*p1 && *p2) {
        char s1[10], s2[10];
        int32_t i1 = 0;
        int32_t i2 = 0;
        int res;
        u8_nextchar (p1, &i1);
        u8_nextchar (p2, &i2);
        if (i1 >= (int32_t)sizeof (s1) || i2 >= (int32_t)sizeof (s2)) {
            /* malformed, over-long sequence: it would overflow the fold
               buffers, so compare the raw bytes instead */
            res = (i1 != i2) ? i1 - i2 : memcmp (p1, p2, i1);
        }
        else {
            int l1 = u8_tolower ((const signed char *)p1, i1, s1);
            int l2 = u8_tolower ((const signed char *)p2, i2, s2);
            res = (l1 != l2) ? l1 - l2 : memcmp (s1, s2, l1);
        }
        if (res) {
            return res;
        }
        p1 += i1;
        p2 += i2;
    }

    if (*p1) {
        return 1;
    }
    else if (*p2) {
        return -1;
    }

    return 0;
}
853
854 void
u8_lc_map_test(void)855 u8_lc_map_test (void) {
856 struct u8_case_map_t *lc;
857 lc = u8_lc_in_word_set ("Á", 2);
858 printf ("%s -> %s\n", lc->name, lc->lower);
859 lc = u8_lc_in_word_set ("É", 2);
860 printf ("%s -> %s\n", lc->name, lc->lower);
861 lc = u8_lc_in_word_set ("Í", 2);
862 printf ("%s -> %s\n", lc->name, lc->lower);
863 lc = u8_lc_in_word_set ("Ñ", 2);
864 printf ("%s -> %s\n", lc->name, lc->lower);
865 lc = u8_lc_in_word_set ("П", 2);
866 printf ("%s -> %s\n", lc->name, lc->lower);
867 lc = u8_lc_in_word_set ("Л", 2);
868 printf ("%s -> %s\n", lc->name, lc->lower);
869 lc = u8_lc_in_word_set ("А", 2);
870 printf ("%s -> %s\n", lc->name, lc->lower);
871 }
872