1 /** @file string.c UTF-8 string with copy-on-write semantics.
2
3 @authors Copyright (c) 2017 Jaakko Keränen <jaakko.keranen@iki.fi>
4
5 @par License
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice, this
11 list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright notice,
13 this list of conditions and the following disclaimer in the documentation
14 and/or other materials provided with the distribution.
15
16 <small>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</small>
26 */
27
28 #include "the_Foundation/string.h"
29 #include "the_Foundation/stringlist.h"
30 #include "the_Foundation/range.h"
31 #include "the_Foundation/stdthreads.h"
32
33 #include <stdlib.h>
34 #include <stdarg.h>
35 #include <strings.h>
36 #include <unicase.h>
37 #include <unictype.h>
38 #include <uniconv.h>
39 #include <uninorm.h>
40 #include <unistr.h>
41 #include <ctype.h>
42
43 #if !defined (iHaveStrnstr)
44 # include "platform/strnstr.h"
45 #endif
46
/* Name of the character set used for locale-encoded text. Written by
   setLocaleCharSet_String() and passed as the encoding name when converting
   strings from and to the local encoding. */
static char localeCharSet_[64];
48
currentLocaleLanguage_(void)49 iLocalDef const char *currentLocaleLanguage_(void) {
50 static iString *loc = NULL;
51 if (!loc) {
52 loc = newCStr_String(uc_locale_language());
53 }
54 return cstr_String(loc);
55 }
56
/* Maps a code point to its uppercase counterpart via uc_toupper(). */
iChar upper_Char(iChar d) {
    const iChar mapped = uc_toupper(d);
    return mapped;
}
60
/* Maps a code point to its lowercase counterpart via uc_tolower(). */
iChar lower_Char(iChar d) {
    const iChar mapped = uc_tolower(d);
    return mapped;
}
64
isSpace_Char(iChar d)65 iBool isSpace_Char(iChar d) {
66 return uc_is_space(d) ? iTrue : iFalse;
67 }
68
isAlpha_Char(iChar d)69 iBool isAlpha_Char(iChar d) {
70 return uc_is_alpha(d) ? iTrue : iFalse;
71 }
72
isNumeric_Char(iChar d)73 iBool isNumeric_Char(iChar d) {
74 return uc_is_digit(d) ? iTrue : iFalse;
75 }
76
isAlphaNumeric_Char(iChar d)77 iBool isAlphaNumeric_Char(iChar d) {
78 return uc_is_alnum(d) ? iTrue : iFalse;
79 }
80
isPunct_Char(iChar d)81 iBool isPunct_Char(iChar d) {
82 return uc_is_punct(d) ? iTrue : iFalse;
83 }
84
setLocaleCharSet_String(const char * charSet)85 void setLocaleCharSet_String(const char *charSet) {
86 const size_t n = sizeof(localeCharSet_);
87 strncpy(localeCharSet_, charSet, n);
88 localeCharSet_[n - 1] = 0;
89 }
90
new_String(void)91 iString *new_String(void) {
92 iString *d = iMalloc(String);
93 init_String(d);
94 return d;
95 }
96
newCStr_String(const char * cstr)97 iString *newCStr_String(const char *cstr) {
98 return newCStrN_String(cstr, strlen(cstr));
99 }
100
newCStrN_String(const char * cstr,size_t n)101 iString *newCStrN_String(const char *cstr, size_t n) {
102 iString *d = iMalloc(String);
103 initData_Block(&d->chars, cstr, n);
104 return d;
105 }
106
newUtf16_String(const uint16_t * utf16Str)107 iString *newUtf16_String(const uint16_t *utf16Str) {
108 iString *d = iMalloc(String);
109 initUtf16_String(d, utf16Str);
110 return d;
111 }
112
newUtf16N_String(const uint16_t * utf16Str,size_t n)113 iString *newUtf16N_String(const uint16_t *utf16Str, size_t n) {
114 iString *d = iMalloc(String);
115 initUtf16N_String(d, utf16Str, n);
116 return d;
117 }
118
newLocalCStr_String(const char * localCStr)119 iString *newLocalCStr_String(const char *localCStr) {
120 return newLocalCStrN_String(localCStr, strlen(localCStr));
121 }
122
newLocalCStrN_String(const char * localCStr,size_t n)123 iString *newLocalCStrN_String(const char *localCStr, size_t n) {
124 iString *d = iMalloc(String);
125 initLocalCStrN_String(d, localCStr, n);
126 return d;
127 }
128
newUnicode_String(const iChar * ucs)129 iString *newUnicode_String(const iChar *ucs) {
130 return newUnicodeN_String(ucs, u32_strlen(ucs));
131 }
132
newUnicodeN_String(const iChar * ucs,size_t n)133 iString *newUnicodeN_String(const iChar *ucs, size_t n) {
134 iString *d = iMalloc(String);
135 initUnicodeN_String(d, ucs, n);
136 return d;
137 }
138
newFormat_String(const char * format,...)139 iString *newFormat_String(const char *format, ...) {
140 iString *d = new_String();
141 va_list args;
142 va_start(args, format);
143 vprintf_Block(&d->chars, format, args);
144 va_end(args);
145 return d;
146 }
147
collectNewFormat_String(const char * format,...)148 iString *collectNewFormat_String(const char *format, ...) {
149 iString *d = collectNew_String();
150 va_list args;
151 va_start(args, format);
152 vprintf_Block(&d->chars, format, args);
153 va_end(args);
154 return d;
155 }
156
format_CStr(const char * format,...)157 const char *format_CStr(const char *format, ...) {
158 iString *d = collectNew_String();
159 va_list args;
160 va_start(args, format);
161 vprintf_Block(&d->chars, format, args);
162 va_end(args);
163 return cstr_String(d);
164 }
165
newBlock_String(const iBlock * data)166 iString *newBlock_String(const iBlock *data) {
167 iString *d = iMalloc(String);
168 initCopy_Block(&d->chars, data);
169 return d;
170 }
171
copy_String(const iString * d)172 iString *copy_String(const iString *d) {
173 iString *copy = iMalloc(String);
174 initCopy_Block(©->chars, &d->chars);
175 return copy;
176 }
177
/* Deinitializes and frees a string. A NULL pointer is ignored. */
void delete_String(iString *d) {
    if (d == NULL) {
        return;
    }
    deinit_String(d);
    free(d);
}
184
/* Initializes a string by initializing its character block with size zero. */
void init_String(iString *d) {
    iBlock *chars = &d->chars;
    init_Block(chars, 0);
}
188
/* Initializes a string so that its character block is a copy of `chars`
   (via initCopy_Block()). */
void initBlock_String(iString *d, const iBlock *chars) {
    iBlock *dest = &d->chars;
    initCopy_Block(dest, chars);
}
192
/* Initializes a string from a NUL-terminated C string; the length is measured with
   strlen(). */
void initCStr_String(iString *d, const char *cstr) {
    const size_t numBytes = strlen(cstr);
    initCStrN_String(d, cstr, numBytes);
}
196
/* Initializes a string's character block from the first `size` bytes of `cstr`. */
void initCStrN_String(iString *d, const char *cstr, size_t size) {
    iBlock *dest = &d->chars;
    initData_Block(dest, cstr, size);
}
200
/* Initializes a string from zero-terminated UTF-16 text; the number of code units is
   measured with u16_strlen(). */
void initUtf16_String(iString *d, const uint16_t *utf16Str) {
    const size_t numUnits = u16_strlen(utf16Str);
    initUtf16N_String(d, utf16Str, numUnits);
}
204
/* Initializes a string from `n` UTF-16 code units. The text is converted to UTF-8
   with u16_to_u8(), a terminating zero byte is added, and the resulting buffer is
   handed to the character block with initPrealloc_Block(). */
void initUtf16N_String(iString *d, const uint16_t *utf16Str, size_t n) {
    size_t utf8Size = 0;
    uint8_t *utf8 = u16_to_u8(utf16Str, n, NULL, &utf8Size);
    const size_t allocSize = utf8Size + 1;
    /* Grow by one byte to make room for the terminator. */
    utf8 = realloc(utf8, allocSize);
    utf8[utf8Size] = 0;
    initPrealloc_Block(&d->chars, utf8, utf8Size, allocSize);
}
212
/* Initializes a string from a NUL-terminated, locale-encoded C string; the length is
   measured with strlen(). */
void initLocalCStr_String(iString *d, const char *localCStr) {
    const size_t numBytes = strlen(localCStr);
    initLocalCStrN_String(d, localCStr, numBytes);
}
216
/* Initializes a string from `size` bytes of locale-encoded text. The bytes are
   wrapped in a block literal and converted from the character set stored in
   localeCharSet_. */
void initLocalCStrN_String(iString *d, const char *localCStr, size_t size) {
    const iBlock *localBytes = &iBlockLiteral(localCStr, size, size);
    initBlockEncoding_String(d, localBytes, localeCharSet_);
}
220
/* Initializes a string from the bytes of `chars`, which are interpreted in the given
   `encoding`. Conversion is done by u8_conv_from_encoding() with the
   iconveh_question_mark handler; a terminating zero byte is then added and the
   buffer is handed to the character block with initPrealloc_Block(). */
void initBlockEncoding_String(iString *d, const iBlock *chars, const char *encoding) {
    size_t utf8Size = 0;
    uint8_t *utf8 = u8_conv_from_encoding(
        encoding, iconveh_question_mark, constData_Block(chars), size_Block(chars),
        NULL, NULL, &utf8Size);
    const size_t allocSize = utf8Size + 1;
    /* Grow by one byte to make room for the terminator. */
    utf8 = realloc(utf8, allocSize);
    utf8[utf8Size] = 0;
    initPrealloc_Block(&d->chars, utf8, utf8Size, allocSize);
}
234
/* Initializes a string from a zero-terminated array of code points; the length is
   measured with u32_strlen(). */
void initUnicode_String(iString *d, const iChar *ucs) {
    const size_t numChars = u32_strlen(ucs);
    initUnicodeN_String(d, ucs, numChars);
}
238
/* Initializes a string from `n` code points. The code points are converted to UTF-8
   with u32_to_u8() and zero-terminated; the buffer is placed in a temporary block,
   the string is initialized as a copy of that block, and the temporary is then
   deinitialized. */
void initUnicodeN_String(iString *d, const iChar *ucs, size_t n) {
    size_t utf8Size = 0;
    uint8_t *utf8 = u32_to_u8(ucs, n, NULL, &utf8Size);
    const size_t allocSize = utf8Size + 1;
    /* Grow by one byte to make room for the terminator. */
    utf8 = realloc(utf8, allocSize);
    utf8[utf8Size] = 0;
    iBlock converted;
    initPrealloc_Block(&converted, utf8, utf8Size, allocSize);
    initBlock_String(d, &converted);
    deinit_Block(&converted);
}
249
/* Initializes a string so that its character block is a copy of the character block
   of `other` (via initCopy_Block()). */
void initCopy_String(iString *d, const iString *other) {
    const iBlock *source = &other->chars;
    initCopy_Block(&d->chars, source);
}
253
/* Deinitializes a string by deinitializing its character block. The iString object
   itself is not freed here. */
void deinit_String(iString *d) {
    iBlock *chars = &d->chars;
    deinit_Block(chars);
}
257
/* Writes the string to `outs` by serializing its character block. */
void serialize_String(const iString *d, iStream *outs) {
    const iBlock *chars = &d->chars;
    serialize_Block(chars, outs);
}
261
/* Reads the string from `ins` by deserializing into its character block. */
void deserialize_String(iString *d, iStream *ins) {
    iBlock *chars = &d->chars;
    deserialize_Block(chars, ins);
}
265
/* Clears the string by clearing its character block. */
void clear_String(iString *d) {
    iBlock *chars = &d->chars;
    clear_Block(chars);
}
269
/* Shortens the string so that at most `charCount` characters (as visited by the
   string iterator) remain. The byte offset just past the last kept character is
   located by iterating, and the character block is truncated at that offset. */
void truncate_String(iString *d, size_t charCount) {
    const char *const begin = constData_Block(&d->chars);
    const char *cut = begin;
    size_t remaining = charCount;
    iConstForEach(String, i, d) {
        if (remaining == 0) {
            break;
        }
        remaining--;
        /* `i.next` points just past the character that was kept. */
        cut = i.next;
    }
    truncate_Block(&d->chars, (size_t) (cut - begin));
}
279
/* Removes `charCount` characters from the end of the string. When the count is equal
   to or larger than the string's length, the string is cleared. */
void removeEnd_String(iString *d, size_t charCount) {
    if (charCount == 0) {
        return;
    }
    const size_t total = length_String(d);
    if (charCount >= total) {
        clear_String(d);
        return;
    }
    truncate_String(d, total - charCount);
}
291
/* Removes leading characters that trimStart_Rangecc() skips over. Nothing is done
   for an empty string. */
void trimStart_String(iString *d) {
    if (isEmpty_String(d)) {
        return;
    }
    iRangecc remaining = range_String(d);
    const char *const origStart = remaining.start;
    trimStart_Rangecc(&remaining);
    /* The number of bytes skipped is how far the range start moved. */
    remove_Block(&d->chars, 0, (size_t) (remaining.start - origStart));
}
300
/* Advances the start of the range past leading whitespace and variation selectors.
   Characters are decoded one at a time with u8_next(); the start is only moved after
   a character has been accepted as skippable. */
void trimStart_Rangecc(iRangecc *d) {
    const uint8_t *const stop = (const uint8_t *) d->end;
    for (const uint8_t *cursor = (const uint8_t *) d->start; cursor != stop; ) {
        iChar decoded;
        cursor = u8_next(&decoded, cursor);
        /* Variation selectors follow the main codepoint, so one found at the beginning
           is skipped as well. */
        if (isSpace_Char(decoded) || isVariationSelector_Char(decoded)) {
            d->start = (const char *) cursor;
            continue;
        }
        break;
    }
}
312
/* Removes trailing characters that trimEnd_Rangecc() skips over. Nothing is done for
   an empty string. */
void trimEnd_String(iString *d) {
    if (isEmpty_String(d)) {
        return;
    }
    iRangecc kept = range_String(d);
    trimEnd_Rangecc(&kept);
    truncate_Block(&d->chars, (size_t) (kept.end - kept.start));
}
320
/* Moves the end of the range backwards past trailing zero bytes, whitespace, and
   bytes for which u8_prev() cannot produce a character. Stops at the first
   non-whitespace character or when the range becomes empty. */
void trimEnd_Rangecc(iRangecc *d) {
    while (d->end != d->start) {
        iAssert(d->end > d->start);
        /* Skip over any extra NULL characters. */
        if (d->end[-1] == 0) {
            d->end--;
            continue;
        }
        iChar last = 0;
        const uint8_t *prev =
            u8_prev(&last, (const uint8_t *) d->end, (const uint8_t *) d->start);
        if (prev == NULL) {
            /* u8_prev() gives NULL at the beginning of the string (which the loop
               condition already rules out) or for an invalid codepoint. Invalid ones
               are trimmed, too, one byte at a time. */
            d->end--;
            continue;
        }
        if (!isSpace_Char(last)) {
            break;
        }
        d->end = (const char *) prev;
    }
}
344
/* Trims the range at both ends: first the start, then the end. */
void trim_Rangecc(iRangecc *d) {
    trimStart_Rangecc(d); /* leading part */
    trimEnd_Rangecc(d);   /* trailing part */
}
349
/* Trims the string in place at both ends: first the start, then the end. */
void trim_String(iString *d) {
    trimStart_String(d); /* leading part */
    trimEnd_String(d);   /* trailing part */
}
354
/* Returns a new string that is a copy of `d` trimmed at both ends. The original is
   not modified. */
iString *trimmed_String(const iString *d) {
    iString *result = copy_String(d);
    trim_String(result);
    return result;
}
360
/* Replaces every occurrence of `src` in the string with `dst`. Occurrences are
   searched left to right; after each replacement the search continues after the
   inserted text, so text inserted from `dst` is never rescanned.

   An empty `src` leaves the string unchanged: an empty pattern is expected to match
   at every search position (as with strstr()) without consuming any input, so
   without this guard the loop would never terminate. */
void replace_String(iString *d, const char *src, const char *dst) {
    const size_t srcLen = strlen(src);
    if (srcLen == 0) {
        return;
    }
    const size_t dstLen = strlen(dst);
    for (size_t pos = indexOfCStr_String(d, src); pos != iInvalidPos;
         pos = indexOfCStrFrom_String(d, src, pos)) {
        remove_Block(&d->chars, pos, srcLen);
        insertData_Block(&d->chars, pos, dst, dstLen);
        /* Continue searching after the replacement text. */
        pos += dstLen;
    }
}
371
/* Replaces the string's contents with the result of u8_normalize() using the
   UNINORM_NFC form. The normalized buffer is zero-terminated, wrapped in a temporary
   block, and assigned to the character block. */
void normalize_String(iString *d) {
    size_t normSize = 0;
    uint8_t *normalized = u8_normalize(
        UNINORM_NFC, constData_Block(&d->chars), size_Block(&d->chars), NULL, &normSize);
    const size_t allocSize = normSize + 1;
    /* Grow by one byte so the buffer can be null-terminated. */
    normalized = realloc(normalized, allocSize);
    normalized[normSize] = 0;
    iBlock replacement;
    initPrealloc_Block(&replacement, normalized, normSize, allocSize);
    set_Block(&d->chars, &replacement);
    deinit_Block(&replacement);
}
384
cstr_String(const iString * d)385 const char *cstr_String(const iString *d) {
386 return constData_Block(&d->chars);
387 }
388
/* Returns the number of characters in the string as counted by u8_mbsnlen() over
   the string's bytes. */
size_t length_String(const iString *d) {
    const uint8_t *utf8 = (const uint8_t *) cstr_String(d);
    const size_t numBytes = size_String(d);
    return u8_mbsnlen(utf8, numBytes);
}
392
isUtf8_Rangecc(iRangecc d)393 iBool isUtf8_Rangecc(iRangecc d) {
394 return u8_check((const uint8_t *) d.start, size_Range(&d)) == NULL;
395 }
396
length_Rangecc(const iRangecc d)397 size_t length_Rangecc(const iRangecc d) {
398 /*
399 size_t n = 0;
400 for (const char *i = d.start; i < d.end; ) {
401 iChar ch;
402 const int chLen = decodeBytes_MultibyteChar(i, d.end, &ch);
403 if (chLen <= 0) break;
404 i += chLen;
405 n++;
406 }
407 return n;*/
408 return u8_mbsnlen((const uint8_t *) d.start, size_Range(&d));
409 }
410
size_String(const iString * d)411 size_t size_String(const iString *d) {
412 return d ? size_Block(&d->chars) : 0;
413 }
414
mid_String(const iString * d,size_t charStartPos,size_t charCount)415 iString *mid_String(const iString *d, size_t charStartPos, size_t charCount) {
416 if (charCount == 0) return new_String();
417 const char *chars = constData_Block(&d->chars);
418 iRanges range = { 0, size_Block(&d->chars) };
419 size_t pos = 0;
420 iConstForEach(String, i, d) {
421 if (pos > charStartPos && pos == charStartPos + charCount) {
422 range.end = i.pos - chars;
423 break;
424 }
425 else if (pos == charStartPos) {
426 range.start = i.pos - chars;
427 if (charCount == iInvalidSize) break;
428 }
429 pos++;
430 }
431 iBlock *midChars = midRange_Block(&d->chars, range);
432 iString *mid = newBlock_String(midChars);
433 delete_Block(midChars);
434 return mid;
435 }
436
upper_String(const iString * d)437 iString *upper_String(const iString *d) {
438 size_t len = 0;
439 uint8_t *str = u8_toupper((const uint8_t *) cstr_String(d),
440 size_String(d),
441 currentLocaleLanguage_(),
442 NULL,
443 NULL,
444 &len);
445 str = realloc(str, len + 1);
446 str[len] = 0;
447 iBlock data;
448 initPrealloc_Block(&data, str, len, len + 1);
449 if (cmp_Block(&data, &d->chars) == 0) {
450 /* Memory optimization: nothing changed so just use a reference. */
451 deinit_Block(&data);
452 return copy_String(d);
453 }
454 iString *up = newBlock_String(&data);
455 deinit_Block(&data);
456 return up;
457 }
458
lower_String(const iString * d)459 iString *lower_String(const iString *d) {
460 size_t len = 0;
461 uint8_t *str = u8_tolower((const uint8_t *) cstr_String(d),
462 size_String(d),
463 currentLocaleLanguage_(),
464 NULL,
465 NULL,
466 &len);
467 str = realloc(str, len + 1);
468 str[len] = 0;
469 iBlock data;
470 initPrealloc_Block(&data, str, len, len + 1);
471 if (cmp_Block(&data, &d->chars) == 0) {
472 /* Memory optimization: nothing changed so just use a reference. */
473 deinit_Block(&data);
474 return copy_String(d);
475 }
476 iString *lwr = newBlock_String(&data);
477 deinit_Block(&data);
478 return lwr;
479 }
480
split_String(const iString * d,const char * separator)481 iStringList *split_String(const iString *d, const char *separator) {
482 const iRangecc range = range_String(d);
483 return split_Rangecc(range, separator);
484 }
485
/* URL-encodes the string, leaving the characters in `excluded` as they are. Always
   returns a string: if maybeUrlEncodeExclude_String() returns NULL, a copy of `d` is
   returned instead. */
iString *urlEncodeExclude_String(const iString *d, const char *excluded) {
    iString *encoded = maybeUrlEncodeExclude_String(d, excluded);
    if (encoded) {
        return encoded;
    }
    return copy_String(d);
}
490
/* URL-encodes the string with an empty exclusion list. */
iString *urlEncode_String(const iString *d) {
    static const char noExclusions[] = "";
    return urlEncodeExclude_String(d, noExclusions);
}
494
maybeUrlEncodeExclude_String(const iString * d,const char * excluded)495 iString *maybeUrlEncodeExclude_String(const iString *d, const char *excluded) {
496 /* TODO: Return NULL if nothing to encode. */
497 iString *encoded = new_String();
498 /* Note: Any UTF-8 code points are encoded as multiple %NN sequences. */
499 for (const char *i = constBegin_String(d), *end = constEnd_String(d); i != end; ++i) {
500 char ch = *i;
501 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9') ||
502 ch == '-' || ch == '_' || ch == '.' || ch == '~' || strchr(excluded, ch)) {
503 appendData_Block(&encoded->chars, i, 1);
504 }
505 else {
506 static const char hex[16] = {
507 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
508 char escaped[3] = {'%', hex[(ch >> 4) & 0xf], hex[ch & 0xf]};
509 appendCStrN_String(encoded, escaped, 3);
510 }
511 }
512 return encoded;
513 }
514
/* Converts one hexadecimal digit character (0-9, A-F, a-f) to its value 0..15.
   Returns -1 for any other character. */
static int fromHex_(char ch) {
    int value = -1;
    if (ch >= 'a' && ch <= 'f') {
        value = ch - 'a' + 10;
    }
    else if (ch >= 'A' && ch <= 'F') {
        value = ch - 'A' + 10;
    }
    else if (ch >= '0' && ch <= '9') {
        value = ch - '0';
    }
    return value;
}
521
/* URL-decodes the string with an empty exclusion list. */
iString *urlDecode_String(const iString *d) {
    static const char noExclusions[] = "";
    return urlDecodeExclude_String(d, noExclusions);
}
525
maybeUrlDecodeExclude_String(const iString * d,const char * excluded)526 iString *maybeUrlDecodeExclude_String(const iString *d, const char *excluded) {
527 if (indexOf_String(d, '%') == iInvalidPos) {
528 return NULL;
529 }
530 iString *decoded = new_String();
531 for (const char *i = constBegin_String(d), *end = constEnd_String(d); i != end; ++i) {
532 if (*i == '%' && i + 3 <= end) {
533 const int values[2] = { fromHex_(i[1]), fromHex_(i[2]) };
534 if (values[0] >= 0 && values[1] >= 0) {
535 const char ch = (char) ((values[0] << 4) | values[1]);
536 if (!strchr(excluded, ch)) {
537 appendData_Block(&decoded->chars, &ch, 1);
538 i += 2;
539 continue;
540 }
541 }
542 }
543 appendData_Block(&decoded->chars, i, 1);
544 }
545 return decoded;
546 }
547
/* URL-decodes the string, keeping sequences that decode to a character in `excluded`
   as they are. Always returns a string: if maybeUrlDecodeExclude_String() returns
   NULL, a copy of `d` is returned instead. */
iString *urlDecodeExclude_String(const iString *d, const char *excluded) {
    iString *decoded = maybeUrlDecodeExclude_String(d, excluded);
    if (decoded) {
        return decoded;
    }
    return copy_String(d);
}
552
first_String(const iString * d)553 iChar first_String(const iString *d) {
554 iStringConstIterator iter;
555 init_StringConstIterator(&iter, d);
556 return iter.value;
557 }
558
last_String(const iString * d)559 iChar last_String(const iString *d) {
560 iStringReverseConstIterator iter;
561 init_StringReverseConstIterator(&iter, d);
562 return iter.value;
563 }
564
toLocal_String(const iString * d)565 iBlock *toLocal_String(const iString *d) {
566 size_t len = 0;
567 char * str = u8_conv_to_encoding(localeCharSet_,
568 iconveh_question_mark,
569 (const uint8_t *) cstr_String(d),
570 size_String(d),
571 NULL,
572 NULL,
573 &len);
574 str = realloc(str, len + 1);
575 str[len] = 0;
576 return newPrealloc_Block(str, len, len + 1);
577 }
578
toUtf16_String(const iString * d)579 iBlock *toUtf16_String(const iString *d) {
580 size_t len = 0;
581 uint16_t *u16 = u8_to_u16((const uint8_t *) cstr_String(d),
582 size_String(d),
583 NULL,
584 &len);
585 /* Make it null-terminated. */
586 const size_t bytes = 2 * len;
587 u16 = realloc(u16, bytes + 2);
588 u16[len] = 0;
589 return newPrealloc_Block(u16, bytes, bytes + 2);
590 }
591
toUnicode_String(const iString * d)592 iBlock *toUnicode_String(const iString *d) {
593 size_t len = 0;
594 uint32_t *u32 = u8_to_u32((const uint8_t *) cstr_String(d),
595 size_String(d),
596 NULL,
597 &len);
598 /* Make it null-terminated. */
599 const size_t bytes = 4 * len;
600 u32 = realloc(u32, bytes + 4);
601 u32[len] = 0;
602 return newPrealloc_Block(u32, bytes, bytes + 4);
603 }
604
cmpSc_String(const iString * d,const char * cstr,const iStringComparison * sc)605 int cmpSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
606 return sc->cmp(constData_Block(&d->chars), cstr);
607 }
608
cmpNSc_String(const iString * d,const char * cstr,size_t n,const iStringComparison * sc)609 int cmpNSc_String(const iString *d, const char *cstr, size_t n, const iStringComparison *sc) {
610 return sc->cmpN(constData_Block(&d->chars), cstr, n);
611 }
612
startsWithSc_String(const iString * d,const char * cstr,const iStringComparison * sc)613 iBool startsWithSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
614 const iRangecc rc = range_String(d);
615 return startsWithSc_Rangecc(rc, cstr, sc);
616 }
617
startsWithSc_Rangecc(const iRangecc d,const char * cstr,const iStringComparison * sc)618 iBool startsWithSc_Rangecc(const iRangecc d, const char *cstr, const iStringComparison *sc) {
619 const size_t len = strlen(cstr);
620 if (size_Range(&d) < len) return iFalse;
621 return !sc->cmpN(d.start, cstr, len);
622 }
623
endsWithSc_Rangecc(const iRangecc d,const char * cstr,const iStringComparison * sc)624 iBool endsWithSc_Rangecc(const iRangecc d, const char *cstr, const iStringComparison *sc) {
625 const size_t len = strlen(cstr);
626 if (size_Range(&d) < len) return iFalse;
627 return !sc->cmpN(d.end - len, cstr, len);
628 }
629
endsWithSc_String(const iString * d,const char * cstr,const iStringComparison * sc)630 iBool endsWithSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
631 const size_t len = strlen(cstr);
632 if (size_String(d) < len) return iFalse;
633 return !sc->cmp(constEnd_Block(&d->chars) - len, cstr);
634 }
635
/* Assigns the contents of `other` to the string by assigning the character block. */
void set_String(iString *d, const iString *other) {
    const iBlock *source = &other->chars;
    set_Block(&d->chars, source);
}
639
/* Assigns a C string to the string's character block via setCStr_Block(). */
void setCStr_String(iString *d, const char *cstr) {
    iBlock *chars = &d->chars;
    setCStr_Block(chars, cstr);
}
643
/* Assigns the first `n` bytes of `cstr` to the string's character block via
   setData_Block(). */
void setCStrN_String(iString *d, const char *cstr, size_t n) {
    iBlock *chars = &d->chars;
    setData_Block(chars, cstr, n);
}
647
/* Assigns `block` to the string's character block via set_Block(). */
void setBlock_String(iString *d, const iBlock *block) {
    iBlock *chars = &d->chars;
    set_Block(chars, block);
}
651
/* Formats text printf-style into the string's character block via vprintf_Block(). */
void format_String(iString *d, const char *format, ...) {
    va_list ap;
    va_start(ap, format);
    vprintf_Block(&d->chars, format, ap);
    va_end(ap);
}
658
/* Appends printf-style formatted text to the string. The text is first formatted
   into a temporary block, which is then appended and deinitialized. */
void appendFormat_String(iString *d, const char *format, ...) {
    iBlock formatted;
    init_Block(&formatted, 0);
    va_list ap;
    va_start(ap, format);
    vprintf_Block(&formatted, format, ap);
    va_end(ap);
    append_Block(&d->chars, &formatted);
    deinit_Block(&formatted);
}
670
/* Finds the first occurrence of the character `ch`. The character is encoded with
   init_MultibyteChar() and its bytes are searched for with indexOfCStr_String(). */
size_t indexOf_String(const iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    const size_t found = indexOfCStr_String(d, encoded.bytes);
    return found;
}
676
indexOfCStr_String(const iString * d,const char * cstr)677 size_t indexOfCStr_String(const iString *d, const char *cstr) {
678 return indexOfCStrFromSc_String(d, cstr, 0, &iCaseSensitive);
679 }
680
indexOfCStrFrom_String(const iString * d,const char * cstr,size_t from)681 size_t indexOfCStrFrom_String(const iString *d, const char *cstr, size_t from) {
682 return indexOfCStrFromSc_String(d, cstr, from, &iCaseSensitive);
683 }
684
/* Finds `cstr` in the string, starting from offset 0, using the comparison `sc`. */
size_t indexOfCStrSc_String(const iString *d, const char *cstr, const iStringComparison *sc) {
    const size_t from = 0;
    return indexOfCStrFromSc_String(d, cstr, from, sc);
}
688
indexOfCStrFromSc_String(const iString * d,const char * cstr,size_t from,const iStringComparison * sc)689 size_t indexOfCStrFromSc_String(const iString *d, const char *cstr, size_t from,
690 const iStringComparison *sc) {
691 if (from >= size_String(d)) return iInvalidPos;
692 const char *chars = cstr_String(d) + from;
693 const char *found = sc->locate(chars, cstr);
694 if (found) {
695 return found - chars + from;
696 }
697 return iInvalidPos;
698 }
699
/* Finds the last occurrence of the character `ch`. The character is encoded with
   init_MultibyteChar() and its bytes are searched for with lastIndexOfCStr_String(). */
size_t lastIndexOf_String(const iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    const size_t found = lastIndexOfCStr_String(d, encoded.bytes);
    return found;
}
705
lastIndexOfCStr_Rangecc(const iRangecc d,const char * cstr)706 size_t lastIndexOfCStr_Rangecc(const iRangecc d, const char *cstr) {
707 const size_t len = strlen(cstr);
708 if (len > size_Range(&d)) return iInvalidPos;
709 for (const char *i = d.end - len; i >= d.start; --i) {
710 if (iCmpStrN(i, cstr, len) == 0) {
711 return i - d.start;
712 }
713 }
714 return iInvalidPos;
715 }
716
lastIndexOfCStr_String(const iString * d,const char * cstr)717 size_t lastIndexOfCStr_String(const iString *d, const char *cstr) {
718 return lastIndexOfCStr_Rangecc((iRangecc){ constBegin_String(d), constEnd_String(d) }, cstr);
719 }
720
/* Appends the contents of `other` to the string via append_Block(). */
void append_String(iString *d, const iString *other) {
    const iBlock *tail = &other->chars;
    append_Block(&d->chars, tail);
}
724
/* Appends a C string to the string via appendCStr_Block(). */
void appendCStr_String(iString *d, const char *cstr) {
    iBlock *chars = &d->chars;
    appendCStr_Block(chars, cstr);
}
728
/* Appends the first `size` bytes of `cstr` to the string via appendData_Block(). */
void appendCStrN_String(iString *d, const char *cstr, size_t size) {
    iBlock *chars = &d->chars;
    appendData_Block(chars, cstr, size);
}
732
/* Appends the character `ch` to the string. The character is encoded with
   init_MultibyteChar() and its bytes are appended as a C string. */
void appendChar_String(iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    appendCStr_String(d, encoded.bytes);
}
738
/* Appends the bytes of `range` to the string via appendData_Block(). */
void appendRange_String(iString *d, const iRangecc range) {
    const size_t numBytes = size_Range(&range);
    appendData_Block(&d->chars, range.start, numBytes);
}
742
/* Inserts the contents of `other` in front of the string. A temporary string is
   built as `other` followed by `d`, assigned to `d`, and then deinitialized. */
void prepend_String(iString *d, const iString *other) {
    iString combined;
    initCopy_String(&combined, other);
    append_String(&combined, d);
    set_String(d, &combined);
    deinit_String(&combined);
}
750
/* Inserts the character `ch` at the beginning of the string. The character is
   encoded with init_MultibyteChar() and its bytes (up to the terminator) are
   inserted at offset 0 of the character block. */
void prependChar_String(iString *d, iChar ch) {
    iMultibyteChar encoded;
    init_MultibyteChar(&encoded, ch);
    const size_t numBytes = strlen(encoded.bytes);
    insertData_Block(&d->chars, 0, encoded.bytes, numBytes);
}
756
/* Inserts a C string in front of the string. A temporary string is built as `cstr`
   followed by `d`, assigned to `d`, and then deinitialized. */
void prependCStr_String(iString *d, const char *cstr) {
    iString combined;
    initCStr_String(&combined, cstr);
    append_String(&combined, d);
    set_String(d, &combined);
    deinit_String(&combined);
}
764
nextSplit_Rangecc(const iRangecc str,const char * separator,iRangecc * range)765 iBool nextSplit_Rangecc(const iRangecc str, const char *separator, iRangecc *range) {
766 iAssert(range->start == NULL || contains_Range(&str, range->start));
767 const size_t separatorSize = strlen(separator);
768 iAssert(separatorSize > 0);
769 if (range->start == NULL) {
770 if (separatorSize > size_Range(&str)) {
771 /* Doesn't fit in the string. */
772 return iFalse;
773 }
774 if (!cmpCStrSc_Rangecc(str, separator, &iCaseSensitive)) {
775 return iFalse;
776 }
777 range->start = range->end = str.start;
778 if (!iCmpStrN(range->start, separator, separatorSize)) {
779 /* Skip the first separator. */
780 range->start += separatorSize;
781 }
782 }
783 else if (range->start == str.end) {
784 return iFalse;
785 }
786 else {
787 range->start = range->end + separatorSize;
788 if (range->start >= str.end) {
789 return iFalse;
790 }
791 }
792 const char *found = strstr(range->start, separator);
793 range->end = (found && found < str.end ? found : str.end);
794 iAssert(range->start <= range->end);
795 return iTrue;
796 }
797
/* Returns a zero-terminated copy of the bytes in `range`. The copy is allocated with
   malloc() and passed through iCollectMem() before being returned. */
const char *cstr_Rangecc(iRangecc range) {
    const size_t numBytes = size_Range(&range);
    char *buffer = malloc(numBytes + 1);
    memcpy(buffer, range.start, numBytes);
    buffer[numBytes] = 0;
    return iCollectMem(buffer);
}
805
string_Rangecc(iRangecc range)806 const iString *string_Rangecc(iRangecc range) {
807 return collect_String(newRange_String(range));
808 }
809
cmpNullRange_(const char * cstr)810 iLocalDef int cmpNullRange_(const char *cstr) {
811 return (cstr == NULL || *cstr == 0 ? 0 : -1);
812 }
813
/* Compares the range with the whole of `cstr` under the comparison `sc`. A null
   range is handled by cmpNullRange_(). */
int cmpCStrSc_Rangecc(const iRangecc d, const char *cstr, const iStringComparison *sc) {
    if (!isNull_Rangecc(d)) {
        const size_t cstrLen = strlen(cstr);
        return cmpCStrNSc_Rangecc(d, cstr, cstrLen, sc);
    }
    return cmpNullRange_(cstr);
}
820
cmpCStrNSc_Rangecc(const iRangecc d,const char * cstr,size_t n,const iStringComparison * sc)821 int cmpCStrNSc_Rangecc(const iRangecc d, const char *cstr, size_t n, const iStringComparison *sc) {
822 if (isNull_Rangecc(d)) {
823 return cmpNullRange_(cstr);
824 }
825 const size_t size = size_Range(&d);
826 int cmp = sc->cmpN(d.start, cstr, iMin(n, size));
827 if (cmp == 0) {
828 if (n == size) {
829 return 0;
830 }
831 return size < n ? -1 : 1;
832 }
833 return cmp;
834 }
835
split_Rangecc(const iRangecc d,const char * separator)836 iStringList *split_Rangecc(const iRangecc d, const char *separator) {
837 iStringList *parts = new_StringList();
838 iRangecc range = iNullRange;
839 while (nextSplit_Rangecc(d, separator, &range)) {
840 pushBackRange_StringList(parts, range);
841 }
842 return parts;
843 }
844
/* Converts the string to an integer. Text beginning with "0x" or "0X" is parsed as
   hexadecimal with strtol(); anything else is parsed with atoi(). */
int toInt_String(const iString *d) {
    if (!startsWith_String(d, "0x") && !startsWith_String(d, "0X")) {
        return atoi(cstr_String(d));
    }
    return strtol(cstr_String(d), NULL, 16);
}
851
/* Converts the string to a float with strtof(). */
float toFloat_String(const iString *d) {
    const char *text = cstr_String(d);
    return strtof(text, NULL);
}
855
/* Converts the string to a double with strtod(). */
double toDouble_String(const iString *d) {
    const char *text = cstr_String(d);
    return strtod(text, NULL);
}
859
/* Returns a new string in which special characters of `d` are backslash-escaped:
   double quote, backslash, newline, carriage return, and tab become \", \\, \n, \r,
   and \t.

   When `numericUnicode` is set, every character at or above 0x80 is written as a
   \uXXXX escape (lowercase hex); characters in the surrogate range or at/above
   0x10000 are converted to UTF-16 and written as two \uXXXX escapes. Otherwise such
   characters are copied unchanged. */
iString *quote_String(const iString *d, iBool numericUnicode) {
    iString *quot = new_String();
    iConstForEach(String, i, d) {
        const iChar ch = i.value;
        if (ch == '"') {
            appendCStr_String(quot, "\\\"");
        }
        else if (ch == '\\') {
            appendCStr_String(quot, "\\\\");
        }
        else if (ch == '\n') {
            appendCStr_String(quot, "\\n");
        }
        else if (ch == '\r') {
            appendCStr_String(quot, "\\r");
        }
        else if (ch == '\t') {
            appendCStr_String(quot, "\\t");
        }
        else if (numericUnicode && ch >= 0x80) {
            if ((ch >= 0xD800 && ch < 0xE000) || ch >= 0x10000) {
                /* TODO: Add a helper function? */
                /* UTF-16 surrogate pair */
                iString *chs = newUnicodeN_String(&ch, 1);
                iBlock *u16 = toUtf16_String(chs);
                delete_String(chs);
                const uint16_t *ch16 = constData_Block(u16);
                appendFormat_String(quot, "\\u%04x\\u%04x", ch16[0], ch16[1]);
                /* The UTF-16 block was created for this conversion only; release it. */
                delete_Block(u16);
            }
            else {
                appendFormat_String(quot, "\\u%04x", ch);
            }
        }
        else {
            appendChar_String(quot, ch);
        }
    }
    return quot;
}
899
unquote_String(const iString * d)900 iString *unquote_String(const iString *d) {
901 iString *unquot = new_String();
902 iConstForEach(String, i, d) {
903 const iChar ch = i.value;
904 if (ch == '\\') {
905 next_StringConstIterator(&i);
906 const iChar esc = i.value;
907 if (esc == '\\') {
908 appendChar_String(unquot, esc);
909 }
910 else if (esc == 'n') {
911 appendChar_String(unquot, '\n');
912 }
913 else if (esc == 'r') {
914 appendChar_String(unquot, '\r');
915 }
916 else if (esc == 't') {
917 appendChar_String(unquot, '\t');
918 }
919 else if (esc == '"') {
920 appendChar_String(unquot, '"');
921 }
922 else if (esc == 'u') {
923 char digits[5];
924 iZap(digits);
925 for (size_t j = 0; j < 4; j++) {
926 next_StringConstIterator(&i);
927 digits[j] = *i.pos;
928 }
929 uint16_t ch16[2] = { strtoul(digits, NULL, 16), 0 };
930 if (ch16[0] < 0xD800 || ch16[0] >= 0xE000) {
931 appendChar_String(unquot, ch16[0]);
932 }
933 else {
934 /* UTF-16 surrogate pair */
935 next_StringConstIterator(&i);
936 next_StringConstIterator(&i);
937 iZap(digits);
938 for (size_t j = 0; j < 4; j++) {
939 next_StringConstIterator(&i);
940 digits[j] = *i.pos;
941 }
942 ch16[1] = strtoul(digits, NULL, 16);
943 iString *u16 = newUtf16N_String(ch16, 2);
944 append_String(unquot, u16);
945 delete_String(u16);
946 }
947 }
948 else {
949 iAssert(0);
950 }
951 }
952 else {
953 appendChar_String(unquot, ch);
954 }
955 }
956 return unquot;
957 }
958
/* Skips leading whitespace of a null-terminated string.
 *
 * Returns a pointer to the first character for which isspace() is false, or
 * to the terminating null character if the string is all whitespace.
 *
 * The byte is converted to `unsigned char` before being handed to isspace():
 * passing a negative `char` value (e.g., a UTF-8 lead/continuation byte on
 * platforms where `char` is signed) is undefined behavior.
 */
const char *skipSpace_CStr(const char *cstr) {
    while (*cstr && isspace((unsigned char) *cstr)) {
        cstr++;
    }
    return cstr;
}
965
findAscii_Rangecc(const iRangecc str,char ch)966 const char *findAscii_Rangecc(const iRangecc str, char ch) {
967 const char *pos = strchr(str.start, ch);
968 if (!pos || pos >= str.end) return NULL;
969 return pos;
970 }
971
split_CStr(const char * cstr,const char * separator)972 iStringList *split_CStr(const char *cstr, const char *separator) {
973 return split_Rangecc((iRangecc){ cstr, cstr + strlen(cstr) }, separator);
974 }
975
976 /*-------------------------------------------------------------------------------------*/
977
/* Decodes the character at `d->next` into `d->value` and moves `d->next`
 * past it.
 *
 * When nothing more can be decoded, `d->value` is zero and `d->next` is NULL.
 * Calling this again in that state is harmless: the NULL position is never
 * passed on to u8_next().
 */
static void decodeNextMultibyte_StringConstIterator_(iStringConstIterator *d) {
    d->value = 0;
    if (!d->next) {
        /* The end was already reached by an earlier step. */
        return;
    }
    /* u8_next() returns NULL when end is reached. */
    d->next = (const char *) u8_next(&d->value, (const uint8_t *) d->next);
}
983
/* Decodes the character that precedes `d->next` into `d->value` and moves
 * `d->next` back to the start of that character. The beginning of the
 * string's data block is used as the lower bound for u8_prev().
 *
 * If `d->next` is already NULL (a previous step found nothing more to
 * decode), `d->value` is set to zero and u8_prev() is not called, so stepping
 * once too often does not touch memory before the string.
 */
static void decodePrecedingMultibyte_StringConstIterator_(iStringConstIterator *d) {
    d->value = 0;
    if (!d->next) {
        return;
    }
    d->next = (const char *) u8_prev(
        &d->value, (const uint8_t *) d->next, constData_Block(&d->str->chars));
}
989
/* Prepares a forward iterator over `str`.
 *
 * With a non-NULL string, `pos` is set to the beginning of the string's data
 * and the first character is decoded right away, leaving it in `value` and
 * the position following it in `next`. With a NULL string, the iterator is
 * left with NULL positions and a zero `value`.
 */
void init_StringConstIterator(iStringConstIterator *d, const iString *str) {
    d->str   = str;
    d->value = 0;
    if (!str) {
        d->pos = d->next = NULL;
        return;
    }
    const char *start = constData_Block(&str->chars);
    d->next = start;
    d->pos  = start;
    /* Make the first character available immediately. */
    decodeNextMultibyte_StringConstIterator_(d);
}
1002
/* Advances a forward iterator by one character: the previously computed
   `next` position becomes the current `pos`, and the character found there is
   decoded into `value`. */
void next_StringConstIterator(iStringConstIterator *d) {
    const char *upcoming = d->next;
    d->pos = upcoming;
    decodeNextMultibyte_StringConstIterator_(d);
}
1007
/* Prepares a reverse iterator over `str`.
 *
 * With a non-NULL string, both positions start at the end of the string's
 * data and the last character is decoded right away into `value`. With a
 * NULL string, the iterator is left with NULL positions and a zero `value`,
 * matching the behavior of init_StringConstIterator().
 */
void init_StringReverseConstIterator(iStringConstIterator *d, const iString *str) {
    d->str = str;
    d->value = 0;
    if (str) {
        d->pos = d->next = constEnd_Block(&str->chars);
        /* Decode the first (last) character. */
        decodePrecedingMultibyte_StringConstIterator_(d);
    }
    else {
        d->pos = d->next = NULL;
    }
}
1015
/* Steps a reverse iterator one character toward the beginning: the previously
   computed `next` position becomes the current `pos`, and the character that
   precedes it is decoded into `value`. */
void next_StringReverseConstIterator(iStringConstIterator *d) {
    const char *upcoming = d->next;
    d->pos = upcoming;
    decodePrecedingMultibyte_StringConstIterator_(d);
}
1020
1021 /*-------------------------------------------------------------------------------------*/
1022
/* Stores the UTF-8 encoding of `ch` in `d->bytes` as a null-terminated string.
   If u8_uctomb() reports a non-positive length, the terminator is written at
   index zero and the result is an empty string. */
void init_MultibyteChar(iMultibyteChar *d, iChar ch) {
    const int encodedLen = u8_uctomb((uint8_t *) d->bytes, ch, sizeof(d->bytes));
    const int terminatorPos = (encodedLen > 0 ? encodedLen : 0);
    d->bytes[terminatorPos] = 0;
}
1027
/* Decodes one UTF-8 character from the bytes in [bytes, end).
 *
 * The decoded code point is written to `ch_out`. The return value is whatever
 * u8_mbtouc() returned, except that a decoded value of U+FFFD is reported as
 * a failure (-1).
 */
int decodeBytes_MultibyteChar(const char *bytes, const char *end, iChar *ch_out) {
    const int consumed = u8_mbtouc(ch_out, (const uint8_t *) bytes, end - bytes);
    /* U+FFFD (replacement character) signals that decoding failed. */
    return (*ch_out == 0xfffd ? -1 : consumed);
}
1035
/* Decodes the UTF-8 character that ends right before `bytes`, looking no
 * further back than `start`.
 *
 * The code point is written to `ch_out` (zero if nothing was decoded).
 * Returns the distance in bytes from the start of the preceding character to
 * `bytes`, or zero when u8_prev() found no preceding character.
 */
int decodePrecedingBytes_MultibyteChar(const char *bytes, const char *start, iChar *ch_out) {
    *ch_out = 0;
    const char *prevChar =
        (const char *) u8_prev(ch_out, (const uint8_t *) bytes, (const uint8_t *) start);
    return prevChar ? (int) (bytes - prevChar) : 0;
}
1045
threadLocalCharBuffer_(void)1046 static char *threadLocalCharBuffer_(void) {
1047 static tss_t bufKey = 0;
1048 if (!bufKey) {
1049 tss_create(&bufKey, free);
1050 }
1051 char *buf = tss_get(bufKey);
1052 if (!buf) {
1053 tss_set(bufKey, buf = malloc(iMultibyteCharMaxSize + 1));
1054 }
1055 return buf;
1056 }
1057
/* Converts a single character to the character set named by localeCharSet_.
 *
 * The result is a null-terminated string stored in the calling thread's
 * scratch buffer, so it is overwritten by the next call on the same thread.
 * Characters that cannot be represented are handled according to
 * iconveh_question_mark.
 *
 * An empty string is returned if the conversion fails, if the converted form
 * does not fit in the scratch buffer, or if the scratch buffer is unavailable.
 */
const char *cstrLocal_Char(iChar ch) {
    char *chBuf = threadLocalCharBuffer_();
    if (!chBuf) {
        return "";
    }
    const iChar ucs[2] = { ch, 0 };
    size_t len = iMultibyteCharMaxSize;
    char *converted = u32_conv_to_encoding(
        localeCharSet_, iconveh_question_mark, ucs, 1, NULL, chBuf, &len);
    if (converted != chBuf) {
        /* Either the conversion failed (NULL) or the result was too long for
           chBuf and was placed in a freshly allocated array. In both cases
           `len` does not describe the contents of chBuf. */
        free(converted);
        len = 0;
    }
    chBuf[len] = 0;
    return chBuf;
}
1066
iCmpStrRange(const iRangecc range,const char * cstr)1067 int iCmpStrRange(const iRangecc range, const char *cstr) {
1068 const size_t clen = strlen(cstr);
1069 const int cmp = iCmpStrN(range.start, cstr, size_Range(&range));
1070 if (clen == size_Range(&range)) {
1071 return cmp;
1072 }
1073 if (cmp == 0) return (size_Range(&range) < clen? -1 : 1);
1074 return cmp;
1075 }
1076
/* Case-insensitive comparison of two null-terminated UTF-8 strings.
 *
 * The comparison is done by u8_casecmp() using the language returned by
 * currentLocaleLanguage_(). If u8_casecmp() reports a failure, the strings
 * are compared with strcasecmp() instead of being silently treated as equal.
 *
 * Returns a negative value, zero, or a positive value when `a` orders before,
 * equal to, or after `b`.
 */
int iCmpStrCase(const char *a, const char *b) {
    int rc = 0;
    const int status = u8_casecmp((const uint8_t *) a,
                                  strlen(a),
                                  (const uint8_t *) b,
                                  strlen(b),
                                  currentLocaleLanguage_(),
                                  NULL,
                                  &rc);
    if (status != 0) {
        /* `rc` was not set; fall back to a byte-wise comparison. */
        return strcasecmp(a, b);
    }
    return rc;
}
1088
/* Case-insensitive comparison of at most `len` bytes of two UTF-8 strings.
 *
 * Each operand's length is limited with strnlen(), so neither has to be
 * null-terminated within the first `len` bytes. The comparison is done by
 * u8_casecmp() using the language returned by currentLocaleLanguage_(). If
 * u8_casecmp() reports a failure, the strings are compared with
 * strncasecmp() instead of being silently treated as equal.
 *
 * Returns a negative value, zero, or a positive value when `a` orders before,
 * equal to, or after `b`.
 */
int iCmpStrNCase(const char *a, const char *b, size_t len) {
    int rc = 0;
    const int status = u8_casecmp((const uint8_t *) a,
                                  strnlen(a, len),
                                  (const uint8_t *) b,
                                  strnlen(b, len),
                                  currentLocaleLanguage_(),
                                  NULL,
                                  &rc);
    if (status != 0) {
        /* `rc` was not set; fall back to a byte-wise comparison. */
        return strncasecmp(a, b, len);
    }
    return rc;
}
1100
/* Case-insensitive substring search, used as the `locate` function of
 * iCaseInsensitive.
 *
 * Characters are compared after conversion with lower_Char(). Returns a
 * pointer to the position in `haystack` where the first match begins, or
 * NULL if there is no match.
 *
 * NOTE(review): both arguments are wrapped with iStringLiteral(), which
 * presumably creates a non-owning iString view of the C string -- confirm.
 */
static char *strcasestr_(const char *haystack, const char *needle) {
    const iString hay = iStringLiteral(haystack);
    const iString ndl = iStringLiteral(needle);
    const iChar ndlFirstChar = lower_Char(first_String(&ndl));
    if (size_String(&ndl) > size_String(&hay)) {
        /* Too long to be able to find it. */
        return NULL;
    }
    iConstForEach(String, i, &hay) {
        if (lower_Char(i.value) == ndlFirstChar) {
            /* Check if the full needle matches. */
            /* Remember where this candidate match begins so that the outer
               iteration can be resumed from here if the match fails. */
            iStringConstIterator hayStart;
            memcpy(&hayStart, &i, sizeof(i));
            iStringConstIterator j;
            init_StringConstIterator(&j, &ndl);
            for (;;) {
                /* The first characters are already known to match, so both
                   iterators are advanced before comparing. */
                next_StringConstIterator(&j);
                next_StringConstIterator(&i);
                if (!j.value) return iConstCast(char *, hayStart.pos); // Matched full needle.
                if (!i.value) return NULL; // Not long enough for needle.
                if (lower_Char(i.value) != lower_Char(j.value)) {
                    /* Must match all needle characters. */
                    break;
                }
            }
            /* Mismatch: rewind to the candidate's start; the loop increment
               then moves on to the following character. */
            memcpy(&i, &hayStart, sizeof(i));
        }
    }
    return NULL;
}
1131
/* Compares two null-terminated UTF-8 strings with u8_strcmp() and returns
   its result unchanged. */
int iCmpStr(const char *a, const char *b) {
    const uint8_t *lhs = (const uint8_t *) a;
    const uint8_t *rhs = (const uint8_t *) b;
    return u8_strcmp(lhs, rhs);
}
1135
/* Compares at most `n` bytes of two UTF-8 strings.
   Each operand's length is first limited with strnlen(), and the two
   (possibly different) lengths are then passed to u8_cmp2(), whose result is
   returned unchanged. */
int iCmpStrN(const char *a, const char *b, size_t n) {
    const size_t lenA = strnlen(a, n);
    const size_t lenB = strnlen(b, n);
    const uint8_t *lhs = (const uint8_t *) a;
    const uint8_t *rhs = (const uint8_t *) b;
    return u8_cmp2(lhs, lenA, rhs, lenB);
}
1141
1142 iStringComparison iCaseSensitive = {
1143 .cmp = iCmpStr,
1144 .cmpN = iCmpStrN,
1145 .locate = strstr,
1146 };
1147
1148 iStringComparison iCaseInsensitive = {
1149 .cmp = iCmpStrCase,
1150 .cmpN = iCmpStrNCase,
1151 .locate = strcasestr_,
1152 };
1153
/* Returns a copy of the null-terminated string `a` made with strdup().
   The result of strdup() is passed through unchanged (including NULL). */
char *iDupStr(const char *a) {
    char *copy = strdup(a);
    return copy;
}
1157
/* Length-limited substring search: forwards to strnstr() with the same
   arguments and returns its result unchanged. */
char *iStrStrN(const char *a, const char *b, size_t n) {
    char *found = strnstr(a, b, n);
    return found;
}
1161