1 /** @file string.c  UTF-8 string with copy-on-write semantics.
2 
3 @authors Copyright (c) 2017 Jaakko Keränen <jaakko.keranen@iki.fi>
4 
5 @par License
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10 1. Redistributions of source code must retain the above copyright notice, this
11    list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright notice,
13    this list of conditions and the following disclaimer in the documentation
14    and/or other materials provided with the distribution.
15 
16 <small>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</small>
26 */
27 
28 #include "the_Foundation/string.h"
29 #include "the_Foundation/stringlist.h"
30 #include "the_Foundation/range.h"
31 #include "the_Foundation/stdthreads.h"
32 
33 #include <stdlib.h>
34 #include <stdarg.h>
35 #include <strings.h>
36 #include <unicase.h>
37 #include <unictype.h>
38 #include <uniconv.h>
39 #include <uninorm.h>
40 #include <unistr.h>
41 #include <ctype.h>
42 
43 #if !defined (iHaveStrnstr)
44 #   include "platform/strnstr.h"
45 #endif
46 
/* Name of the character encoding passed to the libunistring conversion functions for
   "local" strings; written by setLocaleCharSet_String(). */
static char localeCharSet_[64];
48 
currentLocaleLanguage_(void)49 iLocalDef const char *currentLocaleLanguage_(void) {
50     static iString *loc = NULL;
51     if (!loc) {
52         loc = newCStr_String(uc_locale_language());
53     }
54     return cstr_String(loc);
55 }
56 
/** Returns the result of uc_toupper() for the code point @a d. */
iChar upper_Char(iChar d) {
    const iChar upper = uc_toupper(d);
    return upper;
}
60 
/** Returns the result of uc_tolower() for the code point @a d. */
iChar lower_Char(iChar d) {
    const iChar lower = uc_tolower(d);
    return lower;
}
64 
isSpace_Char(iChar d)65 iBool isSpace_Char(iChar d) {
66     return uc_is_space(d) ? iTrue : iFalse;
67 }
68 
isAlpha_Char(iChar d)69 iBool isAlpha_Char(iChar d) {
70     return uc_is_alpha(d) ? iTrue : iFalse;
71 }
72 
isNumeric_Char(iChar d)73 iBool isNumeric_Char(iChar d) {
74     return uc_is_digit(d) ? iTrue : iFalse;
75 }
76 
isAlphaNumeric_Char(iChar d)77 iBool isAlphaNumeric_Char(iChar d) {
78     return uc_is_alnum(d) ? iTrue : iFalse;
79 }
80 
isPunct_Char(iChar d)81 iBool isPunct_Char(iChar d) {
82     return uc_is_punct(d) ? iTrue : iFalse;
83 }
84 
setLocaleCharSet_String(const char * charSet)85 void setLocaleCharSet_String(const char *charSet) {
86     const size_t n = sizeof(localeCharSet_);
87     strncpy(localeCharSet_, charSet, n);
88     localeCharSet_[n - 1] = 0;
89 }
90 
new_String(void)91 iString *new_String(void) {
92     iString *d = iMalloc(String);
93     init_String(d);
94     return d;
95 }
96 
newCStr_String(const char * cstr)97 iString *newCStr_String(const char *cstr) {
98     return newCStrN_String(cstr, strlen(cstr));
99 }
100 
newCStrN_String(const char * cstr,size_t n)101 iString *newCStrN_String(const char *cstr, size_t n) {
102     iString *d = iMalloc(String);
103     initData_Block(&d->chars, cstr, n);
104     return d;
105 }
106 
newUtf16_String(const uint16_t * utf16Str)107 iString *newUtf16_String(const uint16_t *utf16Str) {
108     iString *d = iMalloc(String);
109     initUtf16_String(d, utf16Str);
110     return d;
111 }
112 
newUtf16N_String(const uint16_t * utf16Str,size_t n)113 iString *newUtf16N_String(const uint16_t *utf16Str, size_t n) {
114     iString *d = iMalloc(String);
115     initUtf16N_String(d, utf16Str, n);
116     return d;
117 }
118 
newLocalCStr_String(const char * localCStr)119 iString *newLocalCStr_String(const char *localCStr) {
120     return newLocalCStrN_String(localCStr, strlen(localCStr));
121 }
122 
newLocalCStrN_String(const char * localCStr,size_t n)123 iString *newLocalCStrN_String(const char *localCStr, size_t n) {
124     iString *d = iMalloc(String);
125     initLocalCStrN_String(d, localCStr, n);
126     return d;
127 }
128 
newUnicode_String(const iChar * ucs)129 iString *newUnicode_String(const iChar *ucs) {
130     return newUnicodeN_String(ucs, u32_strlen(ucs));
131 }
132 
newUnicodeN_String(const iChar * ucs,size_t n)133 iString *newUnicodeN_String(const iChar *ucs, size_t n) {
134     iString *d = iMalloc(String);
135     initUnicodeN_String(d, ucs, n);
136     return d;
137 }
138 
newFormat_String(const char * format,...)139 iString *newFormat_String(const char *format, ...) {
140     iString *d = new_String();
141     va_list args;
142     va_start(args, format);
143     vprintf_Block(&d->chars, format, args);
144     va_end(args);
145     return d;
146 }
147 
collectNewFormat_String(const char * format,...)148 iString *collectNewFormat_String(const char *format, ...) {
149     iString *d = collectNew_String();
150     va_list args;
151     va_start(args, format);
152     vprintf_Block(&d->chars, format, args);
153     va_end(args);
154     return d;
155 }
156 
format_CStr(const char * format,...)157 const char *format_CStr(const char *format, ...) {
158     iString *d = collectNew_String();
159     va_list args;
160     va_start(args, format);
161     vprintf_Block(&d->chars, format, args);
162     va_end(args);
163     return cstr_String(d);
164 }
165 
newBlock_String(const iBlock * data)166 iString *newBlock_String(const iBlock *data) {
167     iString *d = iMalloc(String);
168     initCopy_Block(&d->chars, data);
169     return d;
170 }
171 
copy_String(const iString * d)172 iString *copy_String(const iString *d) {
173     iString *copy = iMalloc(String);
174     initCopy_Block(&copy->chars, &d->chars);
175     return copy;
176 }
177 
/** Deinitializes and frees @a d. Passing NULL is allowed and does nothing. */
void delete_String(iString *d) {
    if (d == NULL) {
        return;
    }
    deinit_String(d);
    free(d);
}
184 
/** Initializes @a d with a character block of size zero. */
void init_String(iString *d) {
    iBlock *chars = &d->chars;
    init_Block(chars, 0);
}
188 
/** Initializes @a d so that its character block is a copy of @a chars. */
void initBlock_String(iString *d, const iBlock *chars) {
    iBlock *dest = &d->chars;
    initCopy_Block(dest, chars);
}
192 
/** Initializes @a d from the null-terminated C string @a cstr. */
void initCStr_String(iString *d, const char *cstr) {
    const size_t len = strlen(cstr);
    initCStrN_String(d, cstr, len);
}
196 
/** Initializes @a d's character block from @a size bytes starting at @a cstr. */
void initCStrN_String(iString *d, const char *cstr, size_t size) {
    iBlock *dest = &d->chars;
    initData_Block(dest, cstr, size);
}
200 
/** Initializes @a d from a zero-terminated UTF-16 string. */
void initUtf16_String(iString *d, const uint16_t *utf16Str) {
    const size_t unitCount = u16_strlen(utf16Str);
    initUtf16N_String(d, utf16Str, unitCount);
}
204 
/**
 * Initializes @a d from @a n UTF-16 code units.
 *
 * The input is converted with u16_to_u8(); the converted buffer is grown by one
 * byte for a terminating zero and handed to initPrealloc_Block().
 */
void initUtf16N_String(iString *d, const uint16_t *utf16Str, size_t n) {
    size_t utf8Size = 0;
    uint8_t *utf8 = u16_to_u8(utf16Str, n, NULL, &utf8Size);
    const size_t allocSize = utf8Size + 1;
    utf8 = realloc(utf8, allocSize);
    utf8[utf8Size] = 0; /* terminator */
    initPrealloc_Block(&d->chars, utf8, utf8Size, allocSize);
}
212 
/** Initializes @a d from a null-terminated string in the local character set. */
void initLocalCStr_String(iString *d, const char *localCStr) {
    const size_t len = strlen(localCStr);
    initLocalCStrN_String(d, localCStr, len);
}
216 
/**
 * Initializes @a d from @a size bytes of text in the local character set.
 *
 * The bytes are wrapped in a temporary block literal and converted by
 * initBlockEncoding_String() using the encoding named in localeCharSet_.
 */
void initLocalCStrN_String(iString *d, const char *localCStr, size_t size) {
    initBlockEncoding_String(d,
                             &iBlockLiteral(localCStr, size, size),
                             localeCharSet_);
}
220 
/**
 * Initializes @a d by converting the contents of @a chars from @a encoding to UTF-8.
 *
 * Conversion is done by u8_conv_from_encoding() with the iconveh_question_mark
 * handler. The converted buffer is grown by one byte for a terminating zero and
 * handed to initPrealloc_Block().
 */
void initBlockEncoding_String(iString *d, const iBlock *chars, const char *encoding) {
    size_t utf8Size = 0;
    uint8_t *utf8 = u8_conv_from_encoding(
        encoding, iconveh_question_mark, constData_Block(chars), size_Block(chars),
        NULL, NULL, &utf8Size);
    const size_t allocSize = utf8Size + 1;
    utf8 = realloc(utf8, allocSize);
    utf8[utf8Size] = 0;
    initPrealloc_Block(&d->chars, utf8, utf8Size, allocSize);
}
234 
/** Initializes @a d from a zero-terminated array of code points. */
void initUnicode_String(iString *d, const iChar *ucs) {
    const size_t count = u32_strlen(ucs);
    initUnicodeN_String(d, ucs, count);
}
238 
/**
 * Initializes @a d from @a n code points.
 *
 * The code points are converted with u32_to_u8(), zero-terminated, wrapped in a
 * temporary block, and that block is then used to initialize the string.
 */
void initUnicodeN_String(iString *d, const iChar *ucs, size_t n) {
    size_t utf8Size = 0;
    uint8_t *utf8 = u32_to_u8(ucs, n, NULL, &utf8Size);
    const size_t allocSize = utf8Size + 1;
    utf8 = realloc(utf8, allocSize);
    utf8[utf8Size] = 0;
    iBlock converted;
    initPrealloc_Block(&converted, utf8, utf8Size, allocSize);
    initBlock_String(d, &converted);
    deinit_Block(&converted);
}
249 
/** Initializes @a d so that its character block is a copy of @a other's block. */
void initCopy_String(iString *d, const iString *other) {
    const iBlock *src = &other->chars;
    initCopy_Block(&d->chars, src);
}
253 
/** Deinitializes the character block of @a d; the iString object itself is not freed. */
void deinit_String(iString *d) {
    iBlock *chars = &d->chars;
    deinit_Block(chars);
}
257 
/** Serializes the character block of @a d to the stream @a outs. */
void serialize_String(const iString *d, iStream *outs) {
    const iBlock *chars = &d->chars;
    serialize_Block(chars, outs);
}
261 
/** Deserializes the character block of @a d from the stream @a ins. */
void deserialize_String(iString *d, iStream *ins) {
    iBlock *chars = &d->chars;
    deserialize_Block(chars, ins);
}
265 
/** Clears the character block of @a d. */
void clear_String(iString *d) {
    iBlock *chars = &d->chars;
    clear_Block(chars);
}
269 
/**
 * Shortens @a d to at most @a charCount characters (not bytes).
 *
 * The string is walked character by character to find the byte offset at which
 * the first @a charCount characters end; the block is truncated to that offset.
 */
void truncate_String(iString *d, size_t charCount) {
    const char *const begin = constData_Block(&d->chars);
    const char *cut = begin;
    size_t remaining = charCount;
    iConstForEach(String, i, d) {
        if (remaining == 0) {
            break;
        }
        remaining--;
        cut = i.next;
    }
    truncate_Block(&d->chars, (size_t) (cut - begin));
}
279 
/**
 * Removes @a charCount characters from the end of @a d. If the string has that
 * many characters or fewer, it is cleared.
 */
void removeEnd_String(iString *d, size_t charCount) {
    if (charCount == 0) {
        return;
    }
    const size_t total = length_String(d);
    if (charCount >= total) {
        clear_String(d);
        return;
    }
    truncate_String(d, total - charCount);
}
291 
/** Removes from the beginning of @a d the bytes that trimStart_Rangecc() skips. */
void trimStart_String(iString *d) {
    if (isEmpty_String(d)) {
        return;
    }
    iRangecc remaining = range_String(d);
    const char *const origStart = remaining.start;
    trimStart_Rangecc(&remaining);
    remove_Block(&d->chars, 0, (size_t) (remaining.start - origStart));
}
300 
/**
 * Advances the start of the range past leading whitespace.
 *
 * Variation selectors normally follow a main code point, so any that appear at
 * the very beginning are skipped together with the whitespace.
 */
void trimStart_Rangecc(iRangecc *d) {
    const uint8_t *const end = (const uint8_t *) d->end;
    for (const uint8_t *cursor = (const uint8_t *) d->start; cursor != end;) {
        iChar ch;
        cursor = u8_next(&ch, cursor);
        if (!(isSpace_Char(ch) || isVariationSelector_Char(ch))) {
            break;
        }
        d->start = (const char *) cursor;
    }
}
312 
/** Truncates @a d to the length that remains after trimEnd_Rangecc() trims its range. */
void trimEnd_String(iString *d) {
    if (isEmpty_String(d)) {
        return;
    }
    iRangecc remaining = range_String(d);
    trimEnd_Rangecc(&remaining);
    truncate_Block(&d->chars, (size_t) (remaining.end - remaining.start));
}
320 
/**
 * Moves the end of the range backwards past trailing whitespace.
 *
 * Trailing zero bytes and byte sequences that u8_prev() cannot decode are also
 * trimmed, one byte at a time.
 */
void trimEnd_Rangecc(iRangecc *d) {
    while (d->end != d->start) {
        iAssert(d->end > d->start);
        /* Extra NULL characters at the end are dropped. */
        if (d->end[-1] == 0) {
            d->end--;
            continue;
        }
        iChar ch = 0;
        const uint8_t *prev =
            u8_prev(&ch, (const uint8_t *) d->end, (const uint8_t *) d->start);
        if (prev == NULL) {
            /* The loop condition already rules out being at the start of the range,
               so NULL here means an invalid code point. Those are trimmed, too. */
            d->end--;
            continue;
        }
        if (!isSpace_Char(ch)) {
            break;
        }
        d->end = (const char *) prev;
    }
}
344 
/** Trims the range at both ends: first the start, then the end. */
void trim_Rangecc(iRangecc *d) {
    trimStart_Rangecc(d);
    trimEnd_Rangecc(d);
}
349 
/** Trims @a d in place at both ends: first the start, then the end. */
void trim_String(iString *d) {
    trimStart_String(d);
    trimEnd_String(d);
}
354 
/** Returns a newly allocated copy of @a d that has been trimmed with trim_String(). */
iString *trimmed_String(const iString *d) {
    iString *result = copy_String(d);
    trim_String(result);
    return result;
}
360 
/**
 * Replaces every occurrence of @a src in @a d with @a dst.
 *
 * The search resumes after each inserted replacement, so text introduced by
 * @a dst is never matched again.
 *
 * @param d    String to modify in place.
 * @param src  Null-terminated text to look for. An empty @a src leaves @a d
 *             unchanged: it would match at every position and the loop would
 *             never finish.
 * @param dst  Null-terminated replacement text; may be empty.
 */
void replace_String(iString *d, const char *src, const char *dst) {
    const size_t srcLen = strlen(src);
    if (srcLen == 0) {
        return;
    }
    const size_t dstLen = strlen(dst);
    for (size_t pos = indexOfCStr_String(d, src); pos != iInvalidPos;
         pos = indexOfCStrFrom_String(d, src, pos)) {
        remove_Block(&d->chars, pos, srcLen);
        insertData_Block(&d->chars, pos, dst, dstLen);
        /* Continue searching after the text that was just inserted. */
        pos += dstLen;
    }
}
371 
/**
 * Replaces the contents of @a d with the result of u8_normalize() using UNINORM_NFC.
 */
void normalize_String(iString *d) {
    size_t nfcSize = 0;
    uint8_t *nfc = u8_normalize(
        UNINORM_NFC, constData_Block(&d->chars), size_Block(&d->chars), NULL, &nfcSize);
    /* Make room for, and write, the terminating zero. */
    const size_t allocSize = nfcSize + 1;
    nfc = realloc(nfc, allocSize);
    nfc[nfcSize] = 0;
    iBlock normalized;
    initPrealloc_Block(&normalized, nfc, nfcSize, allocSize);
    set_Block(&d->chars, &normalized);
    deinit_Block(&normalized);
}
384 
cstr_String(const iString * d)385 const char *cstr_String(const iString *d) {
386     return constData_Block(&d->chars);
387 }
388 
/** Returns the number of characters in @a d as counted by u8_mbsnlen(). */
size_t length_String(const iString *d) {
    const uint8_t *bytes = (const uint8_t *) cstr_String(d);
    const size_t byteCount = size_String(d);
    return u8_mbsnlen(bytes, byteCount);
}
392 
isUtf8_Rangecc(iRangecc d)393 iBool isUtf8_Rangecc(iRangecc d) {
394     return u8_check((const uint8_t *) d.start, size_Range(&d)) == NULL;
395 }
396 
length_Rangecc(const iRangecc d)397 size_t length_Rangecc(const iRangecc d) {
398     /*
399     size_t n = 0;
400     for (const char *i = d.start; i < d.end; ) {
401         iChar ch;
402         const int chLen = decodeBytes_MultibyteChar(i, d.end, &ch);
403         if (chLen <= 0) break;
404         i += chLen;
405         n++;
406     }
407     return n;*/
408     return u8_mbsnlen((const uint8_t *) d.start, size_Range(&d));
409 }
410 
size_String(const iString * d)411 size_t size_String(const iString *d) {
412     return d ? size_Block(&d->chars) : 0;
413 }
414 
mid_String(const iString * d,size_t charStartPos,size_t charCount)415 iString *mid_String(const iString *d, size_t charStartPos, size_t charCount) {
416     if (charCount == 0) return new_String();
417     const char *chars = constData_Block(&d->chars);
418     iRanges range = { 0, size_Block(&d->chars) };
419     size_t pos = 0;
420     iConstForEach(String, i, d) {
421         if (pos > charStartPos && pos == charStartPos + charCount) {
422             range.end = i.pos - chars;
423             break;
424         }
425         else if (pos == charStartPos) {
426             range.start = i.pos - chars;
427             if (charCount == iInvalidSize) break;
428         }
429         pos++;
430     }
431     iBlock *midChars = midRange_Block(&d->chars, range);
432     iString *mid = newBlock_String(midChars);
433     delete_Block(midChars);
434     return mid;
435 }
436 
upper_String(const iString * d)437 iString *upper_String(const iString *d) {
438     size_t len = 0;
439     uint8_t *str = u8_toupper((const uint8_t *) cstr_String(d),
440                               size_String(d),
441                               currentLocaleLanguage_(),
442                               NULL,
443                               NULL,
444                               &len);
445     str = realloc(str, len + 1);
446     str[len] = 0;
447     iBlock data;
448     initPrealloc_Block(&data, str, len, len + 1);
449     if (cmp_Block(&data, &d->chars) == 0) {
450         /* Memory optimization: nothing changed so just use a reference. */
451         deinit_Block(&data);
452         return copy_String(d);
453     }
454     iString *up = newBlock_String(&data);
455     deinit_Block(&data);
456     return up;
457 }
458 
lower_String(const iString * d)459 iString *lower_String(const iString *d) {
460     size_t len = 0;
461     uint8_t *str = u8_tolower((const uint8_t *) cstr_String(d),
462                               size_String(d),
463                               currentLocaleLanguage_(),
464                               NULL,
465                               NULL,
466                               &len);
467     str = realloc(str, len + 1);
468     str[len] = 0;
469     iBlock data;
470     initPrealloc_Block(&data, str, len, len + 1);
471     if (cmp_Block(&data, &d->chars) == 0) {
472         /* Memory optimization: nothing changed so just use a reference. */
473         deinit_Block(&data);
474         return copy_String(d);
475     }
476     iString *lwr = newBlock_String(&data);
477     deinit_Block(&data);
478     return lwr;
479 }
480 
split_String(const iString * d,const char * separator)481 iStringList *split_String(const iString *d, const char *separator) {
482     const iRangecc range = range_String(d);
483     return split_Rangecc(range, separator);
484 }
485 
/**
 * Returns a newly allocated URL-encoded version of @a d. Falls back to a copy of
 * @a d when maybeUrlEncodeExclude_String() returns NULL.
 */
iString *urlEncodeExclude_String(const iString *d, const char *excluded) {
    iString *encoded = maybeUrlEncodeExclude_String(d, excluded);
    if (encoded) {
        return encoded;
    }
    return copy_String(d);
}
490 
/** URL-encodes @a d with an empty exclusion set. */
iString *urlEncode_String(const iString *d) {
    static const char noExclusions[] = "";
    return urlEncodeExclude_String(d, noExclusions);
}
494 
maybeUrlEncodeExclude_String(const iString * d,const char * excluded)495 iString *maybeUrlEncodeExclude_String(const iString *d, const char *excluded) {
496     /* TODO: Return NULL if nothing to encode. */
497     iString *encoded = new_String();
498     /* Note: Any UTF-8 code points are encoded as multiple %NN sequences. */
499     for (const char *i = constBegin_String(d), *end = constEnd_String(d); i != end; ++i) {
500         char ch = *i;
501         if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9') ||
502             ch == '-' || ch == '_' || ch == '.' || ch == '~' || strchr(excluded, ch)) {
503             appendData_Block(&encoded->chars, i, 1);
504         }
505         else {
506             static const char hex[16] = {
507                 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
508             char escaped[3] = {'%', hex[(ch >> 4) & 0xf], hex[ch & 0xf]};
509             appendCStrN_String(encoded, escaped, 3);
510         }
511     }
512     return encoded;
513 }
514 
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @return 0..15 for `0-9`, `A-F`, `a-f`; -1 for any other character.
 */
static int fromHex_(char ch) {
    int value = -1;
    if (ch >= '0' && ch <= '9') {
        value = ch - '0';
    }
    else if (ch >= 'A' && ch <= 'F') {
        value = ch - 'A' + 10;
    }
    else if (ch >= 'a' && ch <= 'f') {
        value = ch - 'a' + 10;
    }
    return value;
}
521 
/** URL-decodes @a d with an empty exclusion set. */
iString *urlDecode_String(const iString *d) {
    static const char noExclusions[] = "";
    return urlDecodeExclude_String(d, noExclusions);
}
525 
maybeUrlDecodeExclude_String(const iString * d,const char * excluded)526 iString *maybeUrlDecodeExclude_String(const iString *d, const char *excluded) {
527     if (indexOf_String(d, '%') == iInvalidPos) {
528         return NULL;
529     }
530     iString *decoded = new_String();
531     for (const char *i = constBegin_String(d), *end = constEnd_String(d); i != end; ++i) {
532         if (*i == '%' && i + 3 <= end) {
533             const int values[2] = { fromHex_(i[1]), fromHex_(i[2]) };
534             if (values[0] >= 0 && values[1] >= 0) {
535                 const char ch = (char) ((values[0] << 4) | values[1]);
536                 if (!strchr(excluded, ch)) {
537                     appendData_Block(&decoded->chars, &ch, 1);
538                     i += 2;
539                     continue;
540                 }
541             }
542         }
543         appendData_Block(&decoded->chars, i, 1);
544     }
545     return decoded;
546 }
547 
/**
 * Returns a newly allocated URL-decoded version of @a d. Falls back to a copy of
 * @a d when maybeUrlDecodeExclude_String() returns NULL.
 */
iString *urlDecodeExclude_String(const iString *d, const char *excluded) {
    iString *decoded = maybeUrlDecodeExclude_String(d, excluded);
    if (decoded) {
        return decoded;
    }
    return copy_String(d);
}
552 
first_String(const iString * d)553 iChar first_String(const iString *d) {
554     iStringConstIterator iter;
555     init_StringConstIterator(&iter, d);
556     return iter.value;
557 }
558 
last_String(const iString * d)559 iChar last_String(const iString *d) {
560     iStringReverseConstIterator iter;
561     init_StringReverseConstIterator(&iter, d);
562     return iter.value;
563 }
564 
toLocal_String(const iString * d)565 iBlock *toLocal_String(const iString *d) {
566     size_t len = 0;
567     char * str = u8_conv_to_encoding(localeCharSet_,
568                                      iconveh_question_mark,
569                                      (const uint8_t *) cstr_String(d),
570                                      size_String(d),
571                                      NULL,
572                                      NULL,
573                                      &len);
574     str = realloc(str, len + 1);
575     str[len] = 0;
576     return newPrealloc_Block(str, len, len + 1);
577 }
578 
toUtf16_String(const iString * d)579 iBlock *toUtf16_String(const iString *d) {
580     size_t len = 0;
581     uint16_t *u16 = u8_to_u16((const uint8_t *) cstr_String(d),
582                               size_String(d),
583                               NULL,
584                               &len);
585     /* Make it null-terminated. */
586     const size_t bytes = 2 * len;
587     u16 = realloc(u16, bytes + 2);
588     u16[len] = 0;
589     return newPrealloc_Block(u16, bytes, bytes + 2);
590 }
591 
toUnicode_String(const iString * d)592 iBlock *toUnicode_String(const iString *d) {
593     size_t len = 0;
594     uint32_t *u32 = u8_to_u32((const uint8_t *) cstr_String(d),
595                               size_String(d),
596                               NULL,
597                               &len);
598     /* Make it null-terminated. */
599     const size_t bytes = 4 * len;
600     u32 = realloc(u32, bytes + 4);
601     u32[len] = 0;
602     return newPrealloc_Block(u32, bytes, bytes + 4);
603 }
604 
cmpSc_String(const iString * d,const char * cstr,const iStringComparison * sc)605 int cmpSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
606     return sc->cmp(constData_Block(&d->chars), cstr);
607 }
608 
cmpNSc_String(const iString * d,const char * cstr,size_t n,const iStringComparison * sc)609 int cmpNSc_String(const iString *d, const char *cstr, size_t n, const iStringComparison *sc) {
610     return sc->cmpN(constData_Block(&d->chars), cstr, n);
611 }
612 
startsWithSc_String(const iString * d,const char * cstr,const iStringComparison * sc)613 iBool startsWithSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
614     const iRangecc rc = range_String(d);
615     return startsWithSc_Rangecc(rc, cstr, sc);
616 }
617 
startsWithSc_Rangecc(const iRangecc d,const char * cstr,const iStringComparison * sc)618 iBool startsWithSc_Rangecc(const iRangecc d, const char *cstr, const iStringComparison *sc) {
619     const size_t len = strlen(cstr);
620     if (size_Range(&d) < len) return iFalse;
621     return !sc->cmpN(d.start, cstr, len);
622 }
623 
endsWithSc_Rangecc(const iRangecc d,const char * cstr,const iStringComparison * sc)624 iBool endsWithSc_Rangecc(const iRangecc d, const char *cstr, const iStringComparison *sc) {
625     const size_t len = strlen(cstr);
626     if (size_Range(&d) < len) return iFalse;
627     return !sc->cmpN(d.end - len, cstr, len);
628 }
629 
endsWithSc_String(const iString * d,const char * cstr,const iStringComparison * sc)630 iBool endsWithSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
631     const size_t len = strlen(cstr);
632     if (size_String(d) < len) return iFalse;
633     return !sc->cmp(constEnd_Block(&d->chars) - len, cstr);
634 }
635 
/** Sets the character block of @a d from the character block of @a other. */
void set_String(iString *d, const iString *other) {
    const iBlock *src = &other->chars;
    set_Block(&d->chars, src);
}
639 
/** Sets the contents of @a d from the C string @a cstr via setCStr_Block(). */
void setCStr_String(iString *d, const char *cstr) {
    iBlock *chars = &d->chars;
    setCStr_Block(chars, cstr);
}
643 
/** Sets the contents of @a d from @a n bytes at @a cstr via setData_Block(). */
void setCStrN_String(iString *d, const char *cstr, size_t n) {
    iBlock *chars = &d->chars;
    setData_Block(chars, cstr, n);
}
647 
/** Sets the character block of @a d from @a block. */
void setBlock_String(iString *d, const iBlock *block) {
    iBlock *chars = &d->chars;
    set_Block(chars, block);
}
651 
/** Passes the printf-style @a format and arguments to vprintf_Block() on @a d's block. */
void format_String(iString *d, const char *format, ...) {
    va_list ap;
    va_start(ap, format);
    vprintf_Block(&d->chars, format, ap);
    va_end(ap);
}
658 
/**
 * Formats text into a temporary block with vprintf_Block() and appends that
 * block to @a d.
 */
void appendFormat_String(iString *d, const char *format, ...) {
    iBlock formatted;
    init_Block(&formatted, 0);
    va_list ap;
    va_start(ap, format);
    vprintf_Block(&formatted, format, ap);
    va_end(ap);
    append_Block(&d->chars, &formatted);
    deinit_Block(&formatted);
}
670 
/**
 * Finds the first occurrence of the character @a ch in @a d by searching for
 * its multibyte encoding with indexOfCStr_String().
 */
size_t indexOf_String(const iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    return indexOfCStr_String(d, encoded.bytes);
}
676 
indexOfCStr_String(const iString * d,const char * cstr)677 size_t indexOfCStr_String(const iString *d, const char *cstr) {
678     return indexOfCStrFromSc_String(d, cstr, 0, &iCaseSensitive);
679 }
680 
indexOfCStrFrom_String(const iString * d,const char * cstr,size_t from)681 size_t indexOfCStrFrom_String(const iString *d, const char *cstr, size_t from) {
682     return indexOfCStrFromSc_String(d, cstr, from, &iCaseSensitive);
683 }
684 
/** Searches for @a cstr in @a d from byte offset zero using the comparison @a sc. */
size_t indexOfCStrSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
    const size_t from = 0;
    return indexOfCStrFromSc_String(d, cstr, from, sc);
}
688 
indexOfCStrFromSc_String(const iString * d,const char * cstr,size_t from,const iStringComparison * sc)689 size_t indexOfCStrFromSc_String(const iString *d, const char *cstr, size_t from,
690                                 const iStringComparison *sc) {
691     if (from >= size_String(d)) return iInvalidPos;
692     const char *chars = cstr_String(d) + from;
693     const char *found = sc->locate(chars, cstr);
694     if (found) {
695         return found - chars + from;
696     }
697     return iInvalidPos;
698 }
699 
/**
 * Finds the last occurrence of the character @a ch in @a d by searching for its
 * multibyte encoding with lastIndexOfCStr_String().
 */
size_t lastIndexOf_String(const iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    return lastIndexOfCStr_String(d, encoded.bytes);
}
705 
lastIndexOfCStr_Rangecc(const iRangecc d,const char * cstr)706 size_t lastIndexOfCStr_Rangecc(const iRangecc d, const char *cstr) {
707     const size_t len = strlen(cstr);
708     if (len > size_Range(&d)) return iInvalidPos;
709     for (const char *i = d.end - len; i >= d.start; --i) {
710         if (iCmpStrN(i, cstr, len) == 0) {
711             return i - d.start;
712         }
713     }
714     return iInvalidPos;
715 }
716 
lastIndexOfCStr_String(const iString * d,const char * cstr)717 size_t lastIndexOfCStr_String(const iString *d, const char *cstr) {
718     return lastIndexOfCStr_Rangecc((iRangecc){ constBegin_String(d), constEnd_String(d) }, cstr);
719 }
720 
/** Appends the character block of @a other to the character block of @a d. */
void append_String(iString *d, const iString *other) {
    const iBlock *src = &other->chars;
    append_Block(&d->chars, src);
}
724 
/** Appends the C string @a cstr to @a d via appendCStr_Block(). */
void appendCStr_String(iString *d, const char *cstr) {
    iBlock *chars = &d->chars;
    appendCStr_Block(chars, cstr);
}
728 
/** Appends @a size bytes starting at @a cstr to @a d via appendData_Block(). */
void appendCStrN_String(iString *d, const char *cstr, size_t size) {
    iBlock *chars = &d->chars;
    appendData_Block(chars, cstr, size);
}
732 
/** Appends the multibyte encoding of the character @a ch to @a d. */
void appendChar_String(iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    appendCStr_String(d, encoded.bytes);
}
738 
/** Appends the bytes of @a range to @a d. */
void appendRange_String(iString *d, const iRangecc range) {
    const size_t byteCount = size_Range(&range);
    appendData_Block(&d->chars, range.start, byteCount);
}
742 
/**
 * Inserts @a other in front of @a d. A temporary string is initialized from
 * @a other, @a d is appended to it, and the result is assigned back to @a d.
 */
void prepend_String(iString *d, const iString *other) {
    iString combined;
    initCopy_String(&combined, other);
    append_String(&combined, d);
    set_String(d, &combined);
    deinit_String(&combined);
}
750 
/** Inserts the multibyte encoding of the character @a ch at the beginning of @a d. */
void prependChar_String(iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    const size_t encodedLen = strlen(encoded.bytes);
    insertData_Block(&d->chars, 0, encoded.bytes, encodedLen);
}
756 
/**
 * Inserts the C string @a cstr in front of @a d. A temporary string is
 * initialized from @a cstr, @a d is appended to it, and the result is assigned
 * back to @a d.
 */
void prependCStr_String(iString *d, const char *cstr) {
    iString combined;
    initCStr_String(&combined, cstr);
    append_String(&combined, d);
    set_String(d, &combined);
    deinit_String(&combined);
}
764 
nextSplit_Rangecc(const iRangecc str,const char * separator,iRangecc * range)765 iBool nextSplit_Rangecc(const iRangecc str, const char *separator, iRangecc *range) {
766     iAssert(range->start == NULL || contains_Range(&str, range->start));
767     const size_t separatorSize = strlen(separator);
768     iAssert(separatorSize > 0);
769     if (range->start == NULL) {
770         if (separatorSize > size_Range(&str)) {
771             /* Doesn't fit in the string. */
772             return iFalse;
773         }
774         if (!cmpCStrSc_Rangecc(str, separator, &iCaseSensitive)) {
775             return iFalse;
776         }
777         range->start = range->end = str.start;
778         if (!iCmpStrN(range->start, separator, separatorSize)) {
779             /* Skip the first separator. */
780             range->start += separatorSize;
781         }
782     }
783     else if (range->start == str.end) {
784         return iFalse;
785     }
786     else {
787         range->start = range->end + separatorSize;
788         if (range->start >= str.end) {
789             return iFalse;
790         }
791     }
792     const char *found = strstr(range->start, separator);
793     range->end = (found && found < str.end ? found : str.end);
794     iAssert(range->start <= range->end);
795     return iTrue;
796 }
797 
/**
 * Returns a null-terminated copy of the bytes in @a range.
 *
 * The copy is allocated with malloc() and passed to iCollectMem(), whose return
 * value is returned.
 *
 * An empty range (including a null range, whose start pointer is NULL) yields
 * an empty string; memcpy() is skipped in that case because calling it with a
 * NULL source is not defined even for a zero length.
 */
const char *cstr_Rangecc(iRangecc range) {
    const size_t len  = size_Range(&range);
    char *       copy = malloc(len + 1);
    if (len > 0) {
        memcpy(copy, range.start, len);
    }
    copy[len] = 0;
    return iCollectMem(copy);
}
805 
string_Rangecc(iRangecc range)806 const iString *string_Rangecc(iRangecc range) {
807     return collect_String(newRange_String(range));
808 }
809 
cmpNullRange_(const char * cstr)810 iLocalDef int cmpNullRange_(const char *cstr) {
811     return (cstr == NULL || *cstr == 0 ? 0 : -1);
812 }
813 
/**
 * Compares a character range against a null-terminated C string.
 *
 * @param d     Range to compare. A null range is equal to a NULL or empty @a cstr.
 * @param cstr  C string to compare against. NULL is treated as an empty string.
 * @param sc    Comparison functions to use.
 *
 * @return Zero if equal, negative if @a d sorts first, positive if @a cstr sorts first.
 */
int cmpCStrSc_Rangecc(const iRangecc d, const char *cstr, const iStringComparison *sc) {
    if (isNull_Rangecc(d)) {
        return cmpNullRange_(cstr);
    }
    if (cstr == NULL) {
        /* The null-range path above accepts NULL; accept it here as well instead of
           handing NULL to strlen(). */
        cstr = "";
    }
    return cmpCStrNSc_Rangecc(d, cstr, strlen(cstr), sc);
}
820 
cmpCStrNSc_Rangecc(const iRangecc d,const char * cstr,size_t n,const iStringComparison * sc)821 int cmpCStrNSc_Rangecc(const iRangecc d, const char *cstr, size_t n, const iStringComparison *sc) {
822     if (isNull_Rangecc(d)) {
823         return cmpNullRange_(cstr);
824     }
825     const size_t size = size_Range(&d);
826     int cmp = sc->cmpN(d.start, cstr, iMin(n, size));
827     if (cmp == 0) {
828         if (n == size) {
829             return 0;
830         }
831         return size < n ? -1 : 1;
832     }
833     return cmp;
834 }
835 
split_Rangecc(const iRangecc d,const char * separator)836 iStringList *split_Rangecc(const iRangecc d, const char *separator) {
837     iStringList *parts = new_StringList();
838     iRangecc range = iNullRange;
839     while (nextSplit_Rangecc(d, separator, &range)) {
840         pushBackRange_StringList(parts, range);
841     }
842     return parts;
843 }
844 
toInt_String(const iString * d)845 int toInt_String(const iString *d) {
846     if (startsWith_String(d, "0x") || startsWith_String(d, "0X")) {
847         return strtol(cstr_String(d), NULL, 16);
848     }
849     return atoi(cstr_String(d));
850 }
851 
/**
 * Converts the beginning of a string to a float using strtof().
 *
 * @param d  String to parse.
 *
 * @return Value returned by strtof() for the string's contents.
 */
float toFloat_String(const iString *d) {
    const char *text = cstr_String(d);
    return strtof(text, NULL);
}
855 
/**
 * Converts the beginning of a string to a double using strtod().
 *
 * @param d  String to parse.
 *
 * @return Value returned by strtod() for the string's contents.
 */
double toDouble_String(const iString *d) {
    const char *text = cstr_String(d);
    return strtod(text, NULL);
}
859 
/**
 * Creates an escaped copy of a string.
 *
 * Double quotes, backslashes, newlines, carriage returns, and tabs are written as
 * backslash escapes. Surrounding quotes are not added.
 *
 * @param d               String to escape.
 * @param numericUnicode  If true, characters at or above U+0080 are written as
 *                        `\uXXXX` escapes. Characters at or above U+10000 are written
 *                        as two such escapes forming a UTF-16 surrogate pair.
 *
 * @return Escaped string created with new_String().
 */
iString *quote_String(const iString *d, iBool numericUnicode) {
    iString *quot = new_String();
    iConstForEach(String, i, d) {
        const iChar ch = i.value;
        if (ch == '"') {
            appendCStr_String(quot, "\\\"");
        }
        else if (ch == '\\') {
            appendCStr_String(quot, "\\\\");
        }
        else if (ch == '\n') {
            appendCStr_String(quot, "\\n");
        }
        else if (ch == '\r') {
            appendCStr_String(quot, "\\r");
        }
        else if (ch == '\t') {
            appendCStr_String(quot, "\\t");
        }
        else if (numericUnicode && ch >= 0x80) {
            if (ch >= 0x10000) {
                /* Outside the Basic Multilingual Plane: UTF-16 surrogate pair.
                   Computed directly so that no temporary string or block needs to be
                   allocated (and released) for each character. */
                const iChar offset = ch - 0x10000;
                appendFormat_String(quot,
                                    "\\u%04x\\u%04x",
                                    (unsigned int) (0xD800 + (offset >> 10)),
                                    (unsigned int) (0xDC00 + (offset & 0x3FF)));
            }
            else {
                appendFormat_String(quot, "\\u%04x", (unsigned int) ch);
            }
        }
        else {
            appendChar_String(quot, ch);
        }
    }
    return quot;
}
899 
unquote_String(const iString * d)900 iString *unquote_String(const iString *d) {
901     iString *unquot = new_String();
902     iConstForEach(String, i, d) {
903         const iChar ch = i.value;
904         if (ch == '\\') {
905             next_StringConstIterator(&i);
906             const iChar esc = i.value;
907             if (esc == '\\') {
908                 appendChar_String(unquot, esc);
909             }
910             else if (esc == 'n') {
911                 appendChar_String(unquot, '\n');
912             }
913             else if (esc == 'r') {
914                 appendChar_String(unquot, '\r');
915             }
916             else if (esc == 't') {
917                 appendChar_String(unquot, '\t');
918             }
919             else if (esc == '"') {
920                 appendChar_String(unquot, '"');
921             }
922             else if (esc == 'u') {
923                 char digits[5];
924                 iZap(digits);
925                 for (size_t j = 0; j < 4; j++) {
926                     next_StringConstIterator(&i);
927                     digits[j] = *i.pos;
928                 }
929                 uint16_t ch16[2] = { strtoul(digits, NULL, 16), 0 };
930                 if (ch16[0] < 0xD800 || ch16[0] >= 0xE000) {
931                     appendChar_String(unquot, ch16[0]);
932                 }
933                 else {
934                     /* UTF-16 surrogate pair */
935                     next_StringConstIterator(&i);
936                     next_StringConstIterator(&i);
937                     iZap(digits);
938                     for (size_t j = 0; j < 4; j++) {
939                         next_StringConstIterator(&i);
940                         digits[j] = *i.pos;
941                     }
942                     ch16[1] = strtoul(digits, NULL, 16);
943                     iString *u16 = newUtf16N_String(ch16, 2);
944                     append_String(unquot, u16);
945                     delete_String(u16);
946                 }
947             }
948             else {
949                 iAssert(0);
950             }
951         }
952         else {
953             appendChar_String(unquot, ch);
954         }
955     }
956     return unquot;
957 }
958 
/**
 * Skips leading whitespace in a C string.
 *
 * @param cstr  Null-terminated string. Must not be NULL.
 *
 * @return Pointer to the first character for which isspace() is false; this is the
 *         null terminator if the string contains only whitespace.
 */
const char *skipSpace_CStr(const char *cstr) {
    /* isspace() requires a value representable as unsigned char (or EOF). Plain char
       may be signed, so the bytes of multibyte UTF-8 characters must be cast first. */
    while (*cstr && isspace((unsigned char) *cstr)) {
        cstr++;
    }
    return cstr;
}
965 
findAscii_Rangecc(const iRangecc str,char ch)966 const char *findAscii_Rangecc(const iRangecc str, char ch) {
967     const char *pos = strchr(str.start, ch);
968     if (!pos || pos >= str.end) return NULL;
969     return pos;
970 }
971 
split_CStr(const char * cstr,const char * separator)972 iStringList *split_CStr(const char *cstr, const char *separator) {
973     return split_Rangecc((iRangecc){ cstr, cstr + strlen(cstr) }, separator);
974 }
975 
976 /*-------------------------------------------------------------------------------------*/
977 
/* Decodes the character at `d->next` into `d->value` and moves `d->next` past it.
   At the end of the string, `d->value` is left as zero and `d->next` becomes NULL. */
static void decodeNextMultibyte_StringConstIterator_(iStringConstIterator *d) {
    const uint8_t *cur = (const uint8_t *) d->next;
    d->value = 0;
    /* u8_next() returns NULL when end is reached. */
    d->next = (const char *) u8_next(&d->value, cur);
}
983 
/* Decodes the character that precedes `d->next` into `d->value` and moves `d->next`
   back to the start of that character. The beginning of the string's data is the lower
   bound for u8_prev(); `d->value` is zeroed before the call. */
static void decodePrecedingMultibyte_StringConstIterator_(iStringConstIterator *d) {
    const uint8_t *first = constData_Block(&d->str->chars);
    const uint8_t *cur   = (const uint8_t *) d->next;
    d->value = 0;
    d->next  = (const char *) u8_prev(&d->value, cur, first);
}
989 
/**
 * Begins forward iteration over the characters of a string.
 *
 * After initialization, `value` holds the first character (zero if there is none) and
 * `pos` points to its first byte.
 *
 * @param d    Iterator to initialize.
 * @param str  String to iterate. If NULL, the iterator starts out with a zero `value`
 *             and NULL positions.
 */
void init_StringConstIterator(iStringConstIterator *d, const iString *str) {
    d->str   = str;
    d->value = 0;
    if (!str) {
        d->pos = d->next = NULL;
        return;
    }
    d->pos = d->next = constData_Block(&str->chars);
    /* Decode the first character. */
    decodeNextMultibyte_StringConstIterator_(d);
}
1002 
/**
 * Advances a forward iterator to the next character.
 *
 * Afterwards `pos` points to the character that was decoded previously as the upcoming
 * one, and `value` holds the newly decoded character (zero at the end of the string).
 *
 * Advancing an iterator that has already reached the end (or that was initialized with
 * a NULL string) does nothing: `value` stays zero and `pos` is left unchanged.
 *
 * @param d  Iterator.
 */
void next_StringConstIterator(iStringConstIterator *d) {
    if (d->next == NULL) {
        /* Nothing left to decode; u8_next() must not be called with NULL. */
        d->value = 0;
        return;
    }
    d->pos = d->next;
    decodeNextMultibyte_StringConstIterator_(d);
}
1007 
/**
 * Begins reverse iteration over the characters of a string.
 *
 * After initialization, `value` holds the last character of the string (zero if there
 * is none).
 *
 * @param d    Iterator to initialize.
 * @param str  String to iterate. If NULL, the iterator starts out with a zero `value`
 *             and NULL positions, like init_StringConstIterator() does.
 */
void init_StringReverseConstIterator(iStringConstIterator *d, const iString *str) {
    d->str   = str;
    d->value = 0;
    if (!str) {
        d->pos = d->next = NULL;
        return;
    }
    d->pos = d->next = constEnd_Block(&str->chars);
    /* Decode the first (last) character. */
    decodePrecedingMultibyte_StringConstIterator_(d);
}
1015 
/**
 * Moves a reverse iterator to the preceding character.
 *
 * `value` holds the newly decoded character, or zero when there are no more characters.
 *
 * Calling this on an iterator whose `next` position is already NULL (iteration has
 * finished, or the iterator was initialized with a NULL string) does nothing.
 *
 * @param d  Iterator.
 */
void next_StringReverseConstIterator(iStringConstIterator *d) {
    if (d->next == NULL) {
        /* Nothing left to decode; u8_prev() must not be called with NULL. */
        d->value = 0;
        return;
    }
    d->pos = d->next;
    decodePrecedingMultibyte_StringConstIterator_(d);
}
1020 
1021 /*-------------------------------------------------------------------------------------*/
1022 
/**
 * Encodes a Unicode character as a null-terminated UTF-8 sequence.
 *
 * If u8_uctomb() reports an error (non-positive result), the result is an empty string.
 * NOTE(review): the terminator is written at index `encodedLen`; this assumes
 * `d->bytes` is larger than the longest encoded sequence -- confirm against the
 * declaration of iMultibyteChar.
 *
 * @param d   Multibyte character to initialize.
 * @param ch  Unicode character to encode.
 */
void init_MultibyteChar(iMultibyteChar *d, iChar ch) {
    const int encodedLen = u8_uctomb((uint8_t *) d->bytes, ch, sizeof(d->bytes));
    const int endPos     = (encodedLen > 0 ? encodedLen : 0);
    d->bytes[endPos] = 0;
}
1027 
/**
 * Decodes one UTF-8 character from a byte range.
 *
 * @param bytes   Start of the encoded character.
 * @param end     End of the available bytes.
 * @param ch_out  Receives the decoded character.
 *
 * @return Value returned by u8_mbtouc(), or -1 if the decoded character is U+FFFD,
 *         which is treated as a decoding failure.
 */
int decodeBytes_MultibyteChar(const char *bytes, const char *end, iChar *ch_out) {
    const int consumed = u8_mbtouc(ch_out, (const uint8_t *) bytes, end - bytes);
    /* A replacement character means that decoding failed. */
    return (*ch_out == 0xfffd ? -1 : consumed);
}
1035 
/**
 * Decodes the UTF-8 character that ends just before a given position.
 *
 * @param bytes   Position immediately after the character to decode.
 * @param start   Beginning of the buffer; passed to u8_prev() as the lower bound.
 * @param ch_out  Receives the decoded character; set to zero if nothing was decoded.
 *
 * @return Distance in bytes from the start of the decoded character to @a bytes, or
 *         zero if u8_prev() returned NULL.
 */
int decodePrecedingBytes_MultibyteChar(const char *bytes, const char *start, iChar *ch_out) {
    *ch_out = 0;
    const uint8_t *prev = u8_prev(ch_out, (const uint8_t *) bytes, (const uint8_t *) start);
    return prev ? (int) (bytes - (const char *) prev) : 0;
}
1045 
threadLocalCharBuffer_(void)1046 static char *threadLocalCharBuffer_(void) {
1047     static tss_t bufKey = 0;
1048     if (!bufKey) {
1049         tss_create(&bufKey, free);
1050     }
1051     char *buf = tss_get(bufKey);
1052     if (!buf) {
1053         tss_set(bufKey, buf = malloc(iMultibyteCharMaxSize + 1));
1054     }
1055     return buf;
1056 }
1057 
/**
 * Converts a Unicode character to the character set in `localeCharSet_`.
 *
 * Conversion uses the `iconveh_question_mark` error handler. The returned string lives
 * in a per-thread buffer that is overwritten by the next call on the same thread.
 *
 * @param ch  Unicode character.
 *
 * @return Null-terminated converted character. An empty string is returned if the
 *         conversion fails or if the result does not fit in the buffer.
 */
const char *cstrLocal_Char(iChar ch) {
    char *chBuf = threadLocalCharBuffer_();
    if (!chBuf) {
        return "";
    }
    const iChar ucs[2] = { ch, 0 };
    size_t len = iMultibyteCharMaxSize;
    char *converted =
        u32_conv_to_encoding(localeCharSet_, iconveh_question_mark, ucs, 1, NULL, chBuf, &len);
    if (converted != chBuf) {
        /* Either the conversion failed (NULL), or the result did not fit in chBuf and
           was returned in a newly allocated buffer. In both cases `len` does not
           describe the contents of chBuf. */
        free(converted);
        len = 0;
    }
    chBuf[len] = 0;
    return chBuf;
}
1066 
iCmpStrRange(const iRangecc range,const char * cstr)1067 int iCmpStrRange(const iRangecc range, const char *cstr) {
1068     const size_t clen = strlen(cstr);
1069     const int cmp = iCmpStrN(range.start, cstr, size_Range(&range));
1070     if (clen == size_Range(&range)) {
1071         return cmp;
1072     }
1073     if (cmp == 0) return (size_Range(&range) < clen? -1 : 1);
1074     return cmp;
1075 }
1076 
/**
 * Compares two null-terminated UTF-8 strings with u8_casecmp(), using the language
 * from currentLocaleLanguage_().
 *
 * @param a  First string.
 * @param b  Second string.
 *
 * @return Comparison result stored by u8_casecmp(); zero if it did not store one.
 */
int iCmpStrCase(const char *a, const char *b) {
    const uint8_t *ua = (const uint8_t *) a;
    const uint8_t *ub = (const uint8_t *) b;
    int result = 0;
    u8_casecmp(ua, strlen(a), ub, strlen(b), currentLocaleLanguage_(), NULL, &result);
    return result;
}
1088 
/**
 * Compares at most @a len bytes of two UTF-8 strings with u8_casecmp(), using the
 * language from currentLocaleLanguage_().
 *
 * @param a    First string.
 * @param b    Second string.
 * @param len  Maximum number of bytes of each string to consider.
 *
 * @return Comparison result stored by u8_casecmp(); zero if it did not store one.
 */
int iCmpStrNCase(const char *a, const char *b, size_t len) {
    const uint8_t *ua = (const uint8_t *) a;
    const uint8_t *ub = (const uint8_t *) b;
    int result = 0;
    u8_casecmp(ua, strnlen(a, len), ub, strnlen(b, len), currentLocaleLanguage_(), NULL, &result);
    return result;
}
1100 
strcasestr_(const char * haystack,const char * needle)1101 static char *strcasestr_(const char *haystack, const char *needle) {
1102     const iString hay = iStringLiteral(haystack);
1103     const iString ndl = iStringLiteral(needle);
1104     const iChar ndlFirstChar = lower_Char(first_String(&ndl));
1105     if (size_String(&ndl) > size_String(&hay)) {
1106         /* Too long to be able to find it. */
1107         return NULL;
1108     }
1109     iConstForEach(String, i, &hay) {
1110         if (lower_Char(i.value) == ndlFirstChar) {
1111             /* Check if the full needle matches. */
1112             iStringConstIterator hayStart;
1113             memcpy(&hayStart, &i, sizeof(i));
1114             iStringConstIterator j;
1115             init_StringConstIterator(&j, &ndl);
1116             for (;;) {
1117                 next_StringConstIterator(&j);
1118                 next_StringConstIterator(&i);
1119                 if (!j.value) return iConstCast(char *, hayStart.pos); // Matched full needle.
1120                 if (!i.value) return NULL; // Not long enough for needle.
1121                 if (lower_Char(i.value) != lower_Char(j.value)) {
1122                     /* Must match all need characters. */
1123                     break;
1124                 }
1125             }
1126             memcpy(&i, &hayStart, sizeof(i));
1127         }
1128     }
1129     return NULL;
1130 }
1131 
/**
 * Compares two null-terminated UTF-8 strings with u8_strcmp().
 *
 * @param a  First string.
 * @param b  Second string.
 *
 * @return Result of u8_strcmp().
 */
int iCmpStr(const char *a, const char *b) {
    const uint8_t *ua = (const uint8_t *) a;
    const uint8_t *ub = (const uint8_t *) b;
    return u8_strcmp(ua, ub);
}
1135 
/**
 * Compares at most @a n bytes of two UTF-8 strings with u8_cmp2().
 *
 * The length used for each string is limited by both @a n and the string's null
 * terminator.
 *
 * @param a  First string.
 * @param b  Second string.
 * @param n  Maximum number of bytes of each string to compare.
 *
 * @return Result of u8_cmp2().
 */
int iCmpStrN(const char *a, const char *b, size_t n) {
    return u8_cmp2((const uint8_t *) a, strnlen(a, n), (const uint8_t *) b, strnlen(b, n));
}
1141 
1142 iStringComparison iCaseSensitive = {
1143     .cmp    = iCmpStr,
1144     .cmpN   = iCmpStrN,
1145     .locate = strstr,
1146 };
1147 
1148 iStringComparison iCaseInsensitive = {
1149     .cmp    = iCmpStrCase,
1150     .cmpN   = iCmpStrNCase,
1151     .locate = strcasestr_,
1152 };
1153 
/**
 * Duplicates a null-terminated string with strdup().
 *
 * @param a  String to duplicate.
 *
 * @return Value returned by strdup().
 */
char *iDupStr(const char *a) {
    char *copy = strdup(a);
    return copy;
}
1157 
/**
 * Locates a substring using strnstr().
 *
 * @param a  String to search in.
 * @param b  Null-terminated string to look for.
 * @param n  Passed to strnstr() as the search length limit.
 *
 * @return Value returned by strnstr().
 */
char *iStrStrN(const char *a, const char *b, size_t n) {
    char *found = strnstr(a, b, n);
    return found;
}
1161