1 #include <stdio.h>
2 #include "log/log.h"
3 #include "string_utils.h"
4 #include "strndup.h"
5
6 #define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
7
string_compare_case_insensitive(const char * str1,const char * str2)8 int string_compare_case_insensitive(const char *str1, const char *str2) {
9 int c1, c2;
10
11 do {
12 c1 = tolower(*str1++);
13 c2 = tolower(*str2++);
14 } while (c1 == c2 && c1 != 0);
15
16 return c1 - c2;
17 }
18
string_compare_len_case_insensitive(const char * str1,const char * str2,size_t len)19 int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len) {
20 register unsigned char *s1 = (unsigned char *) str1;
21 register unsigned char *s2 = (unsigned char *) str2;
22
23 unsigned char c1, c2;
24
25 if (len == 0) return 0;
26
27 do {
28 c1 = *s1++;
29 c2 = *s2++;
30 if (!c1 || !c2)
31 break;
32 if (c1 == c2)
33 continue;
34 c1 = tolower(c1);
35 c2 = tolower(c2);
36 if (c1 != c2)
37 break;
38 } while (--len);
39
40 return (int)c1 - (int)c2;
41
42 }
43
string_common_prefix(const char * str1,const char * str2)44 inline size_t string_common_prefix(const char *str1, const char *str2) {
45 size_t common_prefix;
46 for (common_prefix = 0; *str1 && *str2 && *str1 == *str2; str1++, str2++)
47 common_prefix++;
48 return common_prefix;
49 }
50
string_common_suffix(const char * str1,const char * str2)51 inline size_t string_common_suffix(const char *str1, const char *str2) {
52 size_t common_suffix = 0;
53 size_t str1_len = strlen(str1);
54 size_t str2_len = strlen(str2);
55 size_t min_len = (str1_len < str2_len) ? str1_len : str2_len;
56 for (int i=1; i <= min_len && str1[str1_len-i] == str2[str2_len-i]; i++)
57 common_suffix++;
58 return common_suffix;
59 }
60
string_starts_with(const char * str,const char * start)61 inline bool string_starts_with(const char *str, const char *start) {
62 for (; *start; str++, start++)
63 if (*str != *start)
64 return false;
65 return true;
66 }
67
string_ends_with(const char * str,const char * ending)68 inline bool string_ends_with(const char *str, const char *ending) {
69 size_t end_len = strlen(ending);
70 size_t str_len = strlen(str);
71
72 return str_len < end_len ? false : !strcmp(str + str_len - end_len, ending);
73 }
74
string_equals(const char * s1,const char * s2)75 inline bool string_equals(const char *s1, const char *s2) {
76 if (s1 == NULL || s2 == NULL) return false;
77 return strcmp(s1, s2) == 0;
78 }
79
string_upper(char * s)80 inline void string_upper(char *s) {
81 for (; *s; ++s) *s = toupper(*s);
82 }
83
string_replace_char(char * str,char c1,char c2)84 inline char *string_replace_char(char *str, char c1, char c2) {
85 char *repl = strdup(str);
86 if (repl == NULL) return NULL;
87 char *ptr = repl;
88 for (; *ptr; ++ptr) {
89 if (*ptr == c1) *ptr = c2;
90 }
91 return repl;
92 }
93
string_replace_with_array(char * str,char * replace,char * with,char_array * result)94 bool string_replace_with_array(char *str, char *replace, char *with, char_array *result) {
95 if (str == NULL) return false;
96 if (result == NULL) return false;
97
98 if (replace == NULL) {
99 replace = "";
100 }
101
102 size_t len_replace = strlen(replace);
103 if (len_replace == 0) {
104 return true;
105 }
106
107 if (with == NULL) {
108 with = "";
109 }
110
111 size_t len_with = strlen(with);
112
113 char *temp;
114 char *start = str;
115
116 for (size_t count = 0; (temp = strstr(start, replace)); count++) {
117 char_array_cat_len(result, start, temp - start);
118 char_array_cat_len(result, with, len_with);
119 start = temp + len_replace;
120 }
121
122 char_array_cat(result, start);
123
124 return true;
125 }
126
string_replace(char * str,char * replace,char * with)127 char *string_replace(char *str, char *replace, char *with) {
128 if (str == NULL) return NULL;
129
130 char_array *array = char_array_new_size(strlen(str));
131 if (!string_replace_with_array(str, replace, with, array)) {
132 char_array_destroy(array);
133 return NULL;
134 }
135 return char_array_to_string(array);
136 }
137
138
string_is_upper(char * s)139 inline bool string_is_upper(char *s) {
140 for (; *s; ++s) {
141 if (*s != toupper(*s)) return false;
142 }
143 return true;
144 }
145
string_lower(char * s)146 inline void string_lower(char *s) {
147 for (; *s; ++s) *s = tolower(*s);
148 }
149
string_is_lower(char * s)150 inline bool string_is_lower(char *s) {
151 for (; *s; ++s) {
152 if (*s != tolower(*s)) return false;
153 }
154 return true;
155 }
156
string_translate(char * str,size_t len,char * word_chars,char * word_repls,size_t trans_len)157 uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
158 uint32_t num_replacements = 0;
159
160 for (int i = 0; i < len; i++) {
161 for (int j = 0; j < trans_len; j++) {
162 if (str[i] == word_chars[j]) {
163 str[i] = word_repls[j];
164 num_replacements++;
165 break;
166 }
167 }
168 }
169 return num_replacements;
170 }
171
utf8proc_iterate_reversed(const uint8_t * str,ssize_t start,int32_t * dst)172 ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst) {
173 ssize_t len = 0;
174
175 const uint8_t *ptr = str + start;
176
177 *dst = -1;
178
179 do {
180 if (ptr <= str) return 0;
181 ptr--; len++;
182 } while ((*ptr & 0xC0) == 0x80);
183
184 int32_t ch = 0;
185
186 ssize_t ret_len = utf8proc_iterate(ptr, len, &ch);
187 *dst = ch;
188 return ret_len;
189 }
190
utf8_reversed_string(const char * s)191 char *utf8_reversed_string(const char *s) {
192 int32_t unich;
193 ssize_t len, remaining;
194
195 size_t slen = strlen(s);
196
197 char *out = malloc(slen + 1);
198
199 uint8_t *ptr = (uint8_t *)s;
200 char *rev_ptr = out + slen;
201
202 while(1) {
203 len = utf8proc_iterate(ptr, -1, &unich);
204 remaining = len;
205 if (len <= 0 || unich == 0) break;
206 if (!(utf8proc_codepoint_valid(unich))) goto error_free_output;
207
208 rev_ptr -= len;
209 memcpy(rev_ptr, (char *)ptr, len);
210
211 ptr += len;
212
213 }
214
215 out[slen] = '\0';
216 return out;
217
218 error_free_output:
219 free(out);
220 return NULL;
221 }
222
223 typedef enum casing_option {
224 UTF8_LOWER,
225 UTF8_UPPER
226 } casing_option_t;
227
utf8_case(const char * s,casing_option_t casing,utf8proc_option_t options)228 char *utf8_case(const char *s, casing_option_t casing, utf8proc_option_t options) {
229 ssize_t len = (ssize_t)strlen(s);
230 utf8proc_uint8_t *str = (utf8proc_uint8_t *)s;
231
232 utf8proc_ssize_t result;
233 result = utf8proc_decompose(str, len, NULL, 0, options);
234
235 if (result < 0) return NULL;
236 utf8proc_int32_t *buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
237 if (buffer == NULL) return NULL;
238
239 result = utf8proc_decompose(str, len, buffer, result, options);
240 if (result < 0) {
241 free(buffer);
242 return NULL;
243 }
244
245 for (utf8proc_ssize_t i = 0; i < result; i++) {
246 utf8proc_int32_t uc = buffer[i];
247 utf8proc_int32_t norm = uc;
248
249 if (casing == UTF8_LOWER) {
250 norm = utf8proc_tolower(uc);
251 } else if (casing == UTF8_UPPER) {
252 norm = utf8proc_toupper(uc);
253 }
254 buffer[i] = norm;
255 }
256
257 result = utf8proc_reencode(buffer, result, options);
258 if (result < 0) {
259 free(buffer);
260 return NULL;
261 }
262
263 utf8proc_int32_t *newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
264 if (newptr != NULL) {
265 buffer = newptr;
266 } else {
267 free(buffer);
268 return NULL;
269 }
270
271 return (char *)buffer;
272 }
273
utf8_lower_options(const char * s,utf8proc_option_t options)274 inline char *utf8_lower_options(const char *s, utf8proc_option_t options) {
275 return utf8_case(s, UTF8_LOWER, options);
276 }
277
utf8_lower(const char * s)278 inline char *utf8_lower(const char *s) {
279 return utf8_case(s, UTF8_LOWER, UTF8PROC_OPTIONS_NFC);
280 }
281
utf8_upper_options(const char * s,utf8proc_option_t options)282 inline char *utf8_upper_options(const char *s, utf8proc_option_t options) {
283 return utf8_case(s, UTF8_UPPER, options);
284 }
285
utf8_upper(const char * s)286 inline char *utf8_upper(const char *s) {
287 return utf8_case(s, UTF8_UPPER, UTF8PROC_OPTIONS_NFC);
288 }
289
290
utf8_is_letter(int cat)291 inline bool utf8_is_letter(int cat) {
292 return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
293 || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
294 || cat == UTF8PROC_CATEGORY_LM;
295 }
296
utf8_is_digit(int cat)297 inline bool utf8_is_digit(int cat) {
298 return cat == UTF8PROC_CATEGORY_ND;
299 }
300
utf8_is_number(int cat)301 inline bool utf8_is_number(int cat) {
302 return cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO;
303 }
304
utf8_is_letter_or_number(int cat)305 inline bool utf8_is_letter_or_number(int cat) {
306 return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
307 || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
308 || cat == UTF8PROC_CATEGORY_LM || cat == UTF8PROC_CATEGORY_ND \
309 || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO;
310 }
311
utf8_is_hyphen(int32_t ch)312 inline bool utf8_is_hyphen(int32_t ch) {
313 int cat = utf8proc_category(ch);
314 return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212;
315 }
316
317 #define PERIOD_CODEPOINT 46
318
utf8_is_period(int32_t codepoint)319 inline bool utf8_is_period(int32_t codepoint) {
320 return codepoint == PERIOD_CODEPOINT;
321 }
322
utf8_is_punctuation(int cat)323 inline bool utf8_is_punctuation(int cat) {
324 return cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PE \
325 || cat == UTF8PROC_CATEGORY_PF || cat == UTF8PROC_CATEGORY_PI \
326 || cat == UTF8PROC_CATEGORY_PO || cat == UTF8PROC_CATEGORY_PS;
327 }
328
utf8_is_symbol(int cat)329 inline bool utf8_is_symbol(int cat) {
330 return cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_SC \
331 || cat == UTF8PROC_CATEGORY_SM || cat == UTF8PROC_CATEGORY_SO;
332 }
333
utf8_is_separator(int cat)334 inline bool utf8_is_separator(int cat) {
335 return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
336 }
337
utf8_is_whitespace(int32_t ch)338 inline bool utf8_is_whitespace(int32_t ch) {
339 int cat = utf8proc_category(ch);
340 return utf8_is_separator(cat) ||
341 ch == 9 || // character tabulation
342 ch == 10 || // line feed
343 ch == 11 || // line tabulation
344 ch == 12 || // form feed
345 ch == 13 || // carriage return
346 ch == 133 // next line
347 ;
348 }
349
350
utf8_len(const char * str,size_t len)351 ssize_t utf8_len(const char *str, size_t len) {
352 if (str == NULL) return -1;
353 if (len == 0) return 0;
354
355 int32_t ch = 0;
356 ssize_t num_utf8_chars = 0;
357 ssize_t char_len;
358
359 uint8_t *ptr = (uint8_t *)str;
360
361 size_t remaining = len;
362
363 while (1) {
364 char_len = utf8proc_iterate(ptr, -1, &ch);
365
366 if (ch == 0) break;
367 remaining -= char_len;
368 if (remaining == 0) break;
369
370 ptr += char_len;
371 num_utf8_chars++;
372 }
373
374 return num_utf8_chars;
375 }
376
unicode_codepoints(const char * str)377 uint32_array *unicode_codepoints(const char *str) {
378 if (str == NULL) return NULL;
379
380 uint32_array *a = uint32_array_new();
381
382 int32_t ch = 0;
383 ssize_t num_utf8_chars = 0;
384 ssize_t char_len;
385
386 uint8_t *ptr = (uint8_t *)str;
387
388 while (1) {
389 char_len = utf8proc_iterate(ptr, -1, &ch);
390
391 if (ch == 0) break;
392
393 uint32_array_push(a, (uint32_t)ch);
394 ptr += char_len;
395 }
396
397 return a;
398 }
399
unicode_equals(uint32_array * u1_array,uint32_array * u2_array)400 bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array) {
401 size_t len1 = u1_array->n;
402 size_t len2 = u2_array->n;
403 if (len1 != len2) return false;
404
405 uint32_t *u1 = u1_array->a;
406 uint32_t *u2 = u2_array->a;
407 for (size_t i = 0; i < len1; i++) {
408 if (u1[i] != u2[i]) return false;
409 }
410 return true;
411 }
412
unicode_common_prefix(uint32_array * u1_array,uint32_array * u2_array)413 size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array) {
414 size_t len1 = u1_array->n;
415 size_t len2 = u2_array->n;
416
417 size_t min_len = len1 <= len2 ? len1 : len2;
418
419 uint32_t *u1 = u1_array->a;
420 uint32_t *u2 = u2_array->a;
421 size_t common_prefix = 0;
422
423 for (size_t i = 0; i < min_len; i++) {
424 if (u1[i] == u2[i]) {
425 common_prefix++;
426 } else {
427 break;
428 }
429 }
430 return common_prefix;
431 }
432
unicode_common_suffix(uint32_array * u1_array,uint32_array * u2_array)433 size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array) {
434 size_t len1 = u1_array->n;
435 size_t len2 = u2_array->n;
436
437 size_t min_len = len1 <= len2 ? len1 : len2;
438
439 uint32_t *u1 = u1_array->a;
440 uint32_t *u2 = u2_array->a;
441 size_t common_suffix = 0;
442
443 for (size_t i = 0; i < min_len; i++) {
444 if (u1[len1 - i - 1] == u2[len2 - i - 1]) {
445 common_suffix++;
446 } else {
447 break;
448 }
449 }
450 return common_suffix;
451 }
452
453
454
455
utf8_compare_len_option(const char * str1,const char * str2,size_t len,bool case_insensitive)456 int utf8_compare_len_option(const char *str1, const char *str2, size_t len, bool case_insensitive) {
457 if (len == 0) return 0;
458
459 int32_t c1, c2;
460 ssize_t len1, len2;
461
462 uint8_t *ptr1 = (uint8_t *)str1;
463 uint8_t *ptr2 = (uint8_t *)str2;
464
465 size_t remaining = len;
466
467 while (1) {
468 len1 = utf8proc_iterate(ptr1, -1, &c1);
469 len2 = utf8proc_iterate(ptr2, -1, &c2);
470
471 if (c1 == 0 || c2 == 0) break;
472
473 if (c1 == c2 || (case_insensitive && utf8proc_tolower(c1) == utf8proc_tolower(c2))) {
474 ptr1 += len1;
475 ptr2 += len2;
476 remaining -= len1;
477 } else {
478 break;
479 }
480
481 if (remaining == 0) break;
482
483 }
484
485 return (int) c1 - c2;
486 }
487
utf8_compare_len(const char * str1,const char * str2,size_t len)488 inline int utf8_compare_len(const char *str1, const char *str2, size_t len) {
489 return utf8_compare_len_option(str1, str2, len, false);
490 }
491
utf8_compare(const char * str1,const char * str2)492 inline int utf8_compare(const char *str1, const char *str2) {
493 size_t len1 = strlen(str1);
494 size_t len2 = strlen(str2);
495 size_t max_len = len1 >= len2 ? len1 : len2;
496 return utf8_compare_len_option(str1, str2, max_len, false);
497 }
498
utf8_compare_len_case_insensitive(const char * str1,const char * str2,size_t len)499 inline int utf8_compare_len_case_insensitive(const char *str1, const char *str2, size_t len) {
500 return utf8_compare_len_option(str1, str2, len, true);
501 }
502
utf8_compare_case_insensitive(const char * str1,const char * str2,size_t len)503 inline int utf8_compare_case_insensitive(const char *str1, const char *str2, size_t len) {
504 size_t len1 = strlen(str1);
505 size_t len2 = strlen(str2);
506 size_t max_len = len1 >= len2 ? len1 : len2;
507 return utf8_compare_len_option(str1, str2, max_len, true);
508 }
509
510
utf8_common_prefix_len(const char * str1,const char * str2,size_t len)511 size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len) {
512 size_t common_prefix = 0;
513
514 if (len == 0) return common_prefix;
515
516 int32_t c1 = 0;
517 int32_t c2 = 0;
518
519 size_t remaining = len;
520
521 ssize_t len1, len2;
522
523 uint8_t *ptr1 = (uint8_t *)str1;
524 uint8_t *ptr2 = (uint8_t *)str2;
525
526 while (1) {
527 len1 = utf8proc_iterate(ptr1, -1, &c1);
528 len2 = utf8proc_iterate(ptr2, -1, &c2);
529
530 if (c1 <= 0 || c2 <= 0) break;
531 if (c1 == c2) {
532 ptr1 += len1;
533 ptr2 += len2;
534 common_prefix += len1;
535 if (common_prefix >= len) {
536 return common_prefix;
537 }
538 } else {
539 break;
540 }
541 }
542
543 return common_prefix;
544 }
545
utf8_common_prefix(const char * str1,const char * str2)546 size_t utf8_common_prefix(const char *str1, const char *str2) {
547 size_t len1 = strlen(str1);
548 size_t len2 = strlen(str2);
549
550 size_t len = len1 <= len2 ? len1 : len2;
551
552 return utf8_common_prefix_len(str1, str2, len);
553 }
554
555
utf8_common_prefix_len_ignore_separators(const char * str1,const char * str2,size_t len)556 size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len) {
557 if (len == 0) return 0;
558
559 int32_t c1 = -1, c2 = -1;
560 ssize_t len1, len2;
561
562 uint8_t *ptr1 = (uint8_t *)str1;
563 uint8_t *ptr2 = (uint8_t *)str2;
564
565 size_t remaining = len;
566
567 size_t match_len = 0;
568
569 bool one_char_match = false;
570
571 while (1) {
572 len1 = utf8proc_iterate(ptr1, -1, &c1);
573 len2 = utf8proc_iterate(ptr2, -1, &c2);
574
575 if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) {
576 ptr1++;
577 ptr2++;
578 remaining--;
579 match_len++;
580 one_char_match = true;
581 if (remaining == 0) break;
582 continue;
583 }
584
585 if (c1 == 0 || c2 == 0) break;
586
587 if (c1 == c2) {
588 ptr1 += len1;
589 ptr2 += len2;
590 remaining -= len1;
591 match_len += len1;
592 one_char_match = true;
593 } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
594 ptr1 += len1;
595 match_len += len1;
596 if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
597 ptr2 += len2;
598 remaining -= len2;
599 }
600 } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
601 ptr2 += len2;
602 remaining -= len2;
603 } else {
604 break;
605 }
606
607 if (remaining == 0) break;
608
609 }
610
611 return one_char_match ? match_len : 0;
612
613 }
614
utf8_common_prefix_ignore_separators(const char * str1,const char * str2)615 inline size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2) {
616 return utf8_common_prefix_len_ignore_separators(str1, str2, strlen(str2));
617 }
618
utf8_equal_ignore_separators_len(const char * str1,const char * str2,size_t len)619 bool utf8_equal_ignore_separators_len(const char *str1, const char *str2, size_t len) {
620 if (len == 0) return false;
621
622 int32_t c1 = -1, c2 = -1;
623 ssize_t len1, len2;
624
625 uint8_t *ptr1 = (uint8_t *)str1;
626 uint8_t *ptr2 = (uint8_t *)str2;
627
628 size_t remaining = len;
629
630 while (1) {
631 len1 = utf8proc_iterate(ptr1, -1, &c1);
632 len2 = utf8proc_iterate(ptr2, -1, &c2);
633
634 if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) {
635 ptr1++;
636 ptr2++;
637 remaining--;
638 if (remaining == 0) return true;
639 continue;
640 }
641
642 if (c1 != 0 && c2 != 0 && c1 == c2) {
643 ptr1 += len1;
644 ptr2 += len2;
645 remaining -= len1;
646 } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
647 ptr1 += len1;
648 if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
649 ptr2 += len2;
650 }
651 remaining -= len1;
652 } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
653 ptr2 += len2;
654 remaining -= len2;
655 } else {
656 break;
657 }
658
659 if (remaining == 0) return true;
660
661 }
662
663 return false;
664 }
665
utf8_equal_ignore_separators(const char * str1,const char * str2)666 inline bool utf8_equal_ignore_separators(const char *str1, const char *str2) {
667 size_t len1 = strlen(str1);
668 size_t len2 = strlen(str2);
669 size_t len = len1 > len2 ? len1 : len2;
670
671 return utf8_equal_ignore_separators_len(str1, str2, len);
672 }
673
string_is_digit(char * str,size_t len)674 bool string_is_digit(char *str, size_t len) {
675 uint8_t *ptr = (uint8_t *)str;
676 size_t idx = 0;
677
678 bool ignorable = true;
679
680 while (idx < len) {
681 int32_t ch;
682 ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
683
684 if (char_len <= 0) break;
685 if (ch == 0) break;
686 if (!(utf8proc_codepoint_valid(ch))) return false;
687
688 int cat = utf8proc_category(ch);
689 if (cat != UTF8PROC_CATEGORY_ND) {
690 return false;
691 }
692
693 ptr += char_len;
694 idx += char_len;
695 }
696
697 return true;
698 }
699
string_is_ignorable(char * str,size_t len)700 bool string_is_ignorable(char *str, size_t len) {
701 uint8_t *ptr = (uint8_t *)str;
702 size_t idx = 0;
703
704 bool ignorable = true;
705
706 while (idx < len) {
707 int32_t ch;
708 ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
709
710 if (char_len <= 0) break;
711 if (ch == 0) break;
712 if (!(utf8proc_codepoint_valid(ch))) return false;
713
714 int cat = utf8proc_category(ch);
715 if (!utf8_is_separator(cat) && !utf8_is_hyphen(ch)) {
716 return false;
717 }
718
719 ptr += char_len;
720 idx += char_len;
721 }
722
723 return true;
724 }
725
string_next_hyphen_index(char * str,size_t len)726 ssize_t string_next_hyphen_index(char *str, size_t len) {
727 uint8_t *ptr = (uint8_t *)str;
728 int32_t codepoint;
729 ssize_t idx = 0;
730
731 while (idx < len) {
732 ssize_t char_len = utf8proc_iterate(ptr, len, &codepoint);
733
734 if (char_len <= 0 || codepoint == 0) break;
735
736 if (utf8_is_hyphen(codepoint)) return idx;
737 ptr += char_len;
738 idx += char_len;
739 }
740 return -1;
741 }
742
string_contains(char * str,char * sub)743 inline bool string_contains(char *str, char *sub) {
744 return str != NULL && sub != NULL && strstr(str, sub) != NULL;
745 }
746
string_contains_hyphen_len(char * str,size_t len)747 inline bool string_contains_hyphen_len(char *str, size_t len) {
748 return string_next_hyphen_index(str, len) >= 0;
749 }
750
string_contains_hyphen(char * str)751 inline bool string_contains_hyphen(char *str) {
752 return string_next_hyphen_index(str, strlen(str)) >= 0;
753 }
754
string_next_codepoint_len(char * str,uint32_t codepoint,size_t len)755 ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len) {
756 uint8_t *ptr = (uint8_t *)str;
757 int32_t ch;
758 ssize_t idx = 0;
759
760 while (idx < len) {
761 ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
762
763 if (char_len <= 0 || ch == 0) break;
764
765 if ((uint32_t)ch == codepoint) return idx;
766 ptr += char_len;
767 idx += char_len;
768 }
769 return -1;
770 }
771
string_next_codepoint(char * str,uint32_t codepoint)772 ssize_t string_next_codepoint(char *str, uint32_t codepoint) {
773 return string_next_codepoint_len(str, codepoint, strlen(str));
774 }
775
string_next_period_len(char * str,size_t len)776 ssize_t string_next_period_len(char *str, size_t len) {
777 return string_next_codepoint_len(str, PERIOD_CODEPOINT, len);
778 }
779
string_next_period(char * str)780 ssize_t string_next_period(char *str) {
781 return string_next_codepoint(str, PERIOD_CODEPOINT);
782 }
783
string_contains_period_len(char * str,size_t len)784 inline bool string_contains_period_len(char *str, size_t len) {
785 return string_next_codepoint_len(str, PERIOD_CODEPOINT, len) >= 0;
786 }
787
string_contains_period(char * str)788 inline bool string_contains_period(char *str) {
789 return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0;
790 }
791
string_next_whitespace_len(char * str,size_t len)792 ssize_t string_next_whitespace_len(char *str, size_t len) {
793 uint8_t *ptr = (uint8_t *)str;
794 int32_t ch;
795 ssize_t idx = 0;
796
797 while (idx < len) {
798 ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
799
800 if (char_len <= 0 || ch == 0) break;
801
802 if (utf8_is_whitespace(ch)) return idx;
803 ptr += char_len;
804 idx += char_len;
805 }
806 return -1;
807 }
808
string_next_whitespace(char * str)809 ssize_t string_next_whitespace(char *str) {
810 return string_next_whitespace_len(str, strlen(str));
811 }
812
813
string_right_spaces_len(char * str,size_t len)814 size_t string_right_spaces_len(char *str, size_t len) {
815 size_t spaces = 0;
816
817 uint8_t *ptr = (uint8_t *)str;
818 int32_t ch = 0;
819 ssize_t index = len;
820
821 while (1) {
822 ssize_t char_len = utf8proc_iterate_reversed(ptr, index, &ch);
823
824 if (ch <= 0) break;
825
826 if (!utf8_is_whitespace(ch)) {
827 break;
828 }
829
830 index -= char_len;
831 spaces += char_len;
832 }
833
834 return spaces;
835
836 }
837
string_hyphen_prefix_len(char * str,size_t len)838 inline size_t string_hyphen_prefix_len(char *str, size_t len) {
839 // Strip beginning hyphens
840 int32_t unichr;
841 uint8_t *ptr = (uint8_t *)str;
842 ssize_t char_len = utf8proc_iterate(ptr, len, &unichr);
843 if (utf8_is_hyphen(unichr)) {
844 return (size_t)char_len;
845 }
846 return 0;
847 }
848
string_hyphen_suffix_len(char * str,size_t len)849 inline size_t string_hyphen_suffix_len(char *str, size_t len) {
850 // Strip ending hyphens
851 int32_t unichr;
852 uint8_t *ptr = (uint8_t *)str;
853 ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr);
854 if (utf8_is_hyphen(unichr)) {
855 return (size_t)char_len;
856 }
857 return 0;
858 }
859
string_left_spaces_len(char * str,size_t len)860 size_t string_left_spaces_len(char *str, size_t len) {
861 size_t spaces = 0;
862
863 uint8_t *ptr = (uint8_t *)str;
864 int32_t ch = 0;
865 ssize_t index = 0;
866
867 while (1) {
868 ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
869
870 if (ch <= 0) break;
871
872 if (!utf8_is_whitespace(ch)) {
873 break;
874 }
875 index += char_len;
876 ptr += char_len;
877 spaces += char_len;
878 }
879
880 return spaces;
881 }
882
string_trim(char * str)883 char *string_trim(char *str) {
884 size_t len = strlen(str);
885 size_t left_spaces = string_left_spaces_len(str, len);
886 size_t right_spaces = string_right_spaces_len(str, len);
887 char *ret = strndup(str + left_spaces, len - left_spaces - right_spaces);
888 return ret;
889 }
890
char_array_from_string(char * str)891 char_array *char_array_from_string(char *str) {
892 size_t len = strlen(str);
893 char_array *array = char_array_new_size(len+1);
894 strcpy(array->a, str);
895 array->n = len;
896 return array;
897 }
898
char_array_from_string_no_copy(char * str,size_t n)899 char_array *char_array_from_string_no_copy(char *str, size_t n) {
900 char_array *array = malloc(sizeof(char_array));
901 array->a = str;
902 array->m = n;
903 array->n = n;
904 return array;
905 }
906
char_array_get_string(char_array * array)907 inline char *char_array_get_string(char_array *array) {
908 if (array->n == 0 || array->a[array->n - 1] != '\0') {
909 char_array_terminate(array);
910 }
911 return array->a;
912 }
913
char_array_to_string(char_array * array)914 inline char *char_array_to_string(char_array *array) {
915 if (array->n == 0 || array->a[array->n - 1] != '\0') {
916 char_array_terminate(array);
917 }
918 char *a = array->a;
919 free(array);
920 return a;
921 }
922
923
char_array_strip_nul_byte(char_array * array)924 inline void char_array_strip_nul_byte(char_array *array) {
925 if (array->n > 0 && array->a[array->n - 1] == '\0') {
926 array->a[array->n - 1] = '\0';
927 array->n--;
928 }
929 }
930
char_array_len(char_array * array)931 inline size_t char_array_len(char_array *array) {
932 if (array->n > 0 && array->a[array->n - 1] == '\0') {
933 return array->n - 1;
934 } else {
935 return array->n;
936 }
937 }
938
char_array_append(char_array * array,char * str)939 inline void char_array_append(char_array *array, char *str) {
940 while(*str) {
941 char_array_push(array, *str++);
942 }
943 }
944
char_array_append_len(char_array * array,char * str,size_t len)945 inline void char_array_append_len(char_array *array, char *str, size_t len) {
946 for (size_t i = 0; i < len; i++) {
947 char_array_push(array, *str++);
948 }
949 }
950
char_array_append_reversed_len(char_array * array,char * str,size_t len)951 inline void char_array_append_reversed_len(char_array *array, char *str, size_t len) {
952 int32_t unich;
953 ssize_t char_len;
954
955 size_t idx = len;
956 uint8_t *ptr = (uint8_t *)str;
957
958 while(idx > 0) {
959 char_len = utf8proc_iterate_reversed(ptr, idx, &unich);
960 if (char_len <= 0 || unich == 0) break;
961 if (!(utf8proc_codepoint_valid(unich))) break;
962
963 idx -= char_len;
964 char_array_append_len(array, str + idx, char_len);
965 }
966 }
967
char_array_append_reversed(char_array * array,char * str)968 inline void char_array_append_reversed(char_array *array, char *str) {
969 size_t len = strlen(str);
970 char_array_append_reversed_len(array, str, len);
971 }
972
char_array_terminate(char_array * array)973 inline void char_array_terminate(char_array *array) {
974 char_array_push(array, '\0');
975 }
976
char_array_cat(char_array * array,char * str)977 inline void char_array_cat(char_array *array, char *str) {
978 char_array_strip_nul_byte(array);
979 char_array_append(array, str);
980 char_array_terminate(array);
981 }
982
char_array_cat_len(char_array * array,char * str,size_t len)983 inline void char_array_cat_len(char_array *array, char *str, size_t len) {
984 char_array_strip_nul_byte(array);
985 char_array_append_len(array, str, len);
986 char_array_terminate(array);
987 }
988
989
char_array_cat_reversed(char_array * array,char * str)990 inline void char_array_cat_reversed(char_array *array, char *str) {
991 char_array_strip_nul_byte(array);
992 char_array_append_reversed(array, str);
993 char_array_terminate(array);
994 }
995
996
char_array_cat_reversed_len(char_array * array,char * str,size_t len)997 inline void char_array_cat_reversed_len(char_array *array, char *str, size_t len) {
998 char_array_strip_nul_byte(array);
999 char_array_append_reversed_len(array, str, len);
1000 char_array_terminate(array);
1001 }
1002
char_array_add(char_array * array,char * str)1003 inline void char_array_add(char_array *array, char *str) {
1004 char_array_append(array, str);
1005 char_array_terminate(array);
1006 }
1007
char_array_add_len(char_array * array,char * str,size_t len)1008 inline void char_array_add_len(char_array *array, char *str, size_t len) {
1009 char_array_append_len(array, str, len);
1010 char_array_terminate(array);
1011 }
1012
1013
char_array_add_vjoined(char_array * array,char * separator,bool strip_separator,int count,va_list args)1014 void char_array_add_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args) {
1015 if (count <= 0) {
1016 return;
1017 }
1018
1019 size_t separator_len = strlen(separator);
1020
1021 for (size_t i = 0; i < count - 1; i++) {
1022 char *arg = va_arg(args, char *);
1023 size_t len = strlen(arg);
1024
1025 if (strip_separator &&
1026 ((separator_len == 1 && arg[len-1] == separator[0]) ||
1027 (len > separator_len && strncmp(arg + len - separator_len, separator, separator_len) == 0))) {
1028 len -= separator_len;
1029 }
1030
1031 char_array_append_len(array, arg, len);
1032 char_array_append(array, separator);
1033 }
1034
1035 char *arg = va_arg(args, char *);
1036 char_array_append(array, arg);
1037 char_array_terminate(array);
1038
1039 }
1040
char_array_add_joined(char_array * array,char * separator,bool strip_separator,int count,...)1041 inline void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...) {
1042 va_list args;
1043 va_start(args, count);
1044 char_array_add_vjoined(array, separator, strip_separator, count, args);
1045 va_end(args);
1046 }
1047
char_array_cat_joined(char_array * array,char * separator,bool strip_separator,int count,...)1048 inline void char_array_cat_joined(char_array *array, char *separator, bool strip_separator, int count, ...) {
1049 char_array_strip_nul_byte(array);
1050 va_list args;
1051 va_start(args, count);
1052 char_array_add_vjoined(array, separator, strip_separator, count, args);
1053 va_end(args);
1054 }
1055
char_array_cat_vprintf(char_array * array,char * format,va_list args)1056 void char_array_cat_vprintf(char_array *array, char *format, va_list args) {
1057 char_array_strip_nul_byte(array);
1058
1059 va_list cpy;
1060
1061 char *buf;
1062 size_t buflen;
1063
1064 size_t last_n = array->n;
1065 size_t size = array->m - array->n <= 2 ? array->m * 2 : array->m;
1066
1067 while(1) {
1068 char_array_resize(array, size);
1069 buf = array->a + last_n;
1070 buflen = size - last_n;
1071 if (buf == NULL) return;
1072 array->a[size - 2] = '\0';
1073 va_copy(cpy, args);
1074 vsnprintf(buf, buflen, format, cpy);
1075 if (array->a[size - 2] != '\0') {
1076 size *= 2;
1077 continue;
1078 } else {
1079 array->n += strlen(buf);
1080 }
1081 break;
1082 }
1083 }
1084
char_array_cat_printf(char_array * array,char * format,...)1085 void char_array_cat_printf(char_array *array, char *format, ...) {
1086 va_list args;
1087 va_start(args, format);
1088 char_array_cat_vprintf(array, format, args);
1089 va_end(args);
1090 }
1091
cstring_array_new(void)1092 cstring_array *cstring_array_new(void) {
1093 cstring_array *array = malloc(sizeof(cstring_array));
1094 if (array == NULL) return NULL;
1095
1096 array->indices = uint32_array_new();
1097 if (array->indices == NULL) {
1098 cstring_array_destroy(array);
1099 return NULL;
1100 }
1101
1102 array->str = char_array_new();
1103 if (array->str == NULL) {
1104 cstring_array_destroy(array);
1105 return NULL;
1106 }
1107
1108 return array;
1109 }
1110
cstring_array_destroy(cstring_array * self)1111 void cstring_array_destroy(cstring_array *self) {
1112 if (self == NULL) return;
1113 if (self->indices) {
1114 uint32_array_destroy(self->indices);
1115 }
1116 if (self->str) {
1117 char_array_destroy(self->str);
1118 }
1119 free(self);
1120 }
1121
cstring_array_new_size(size_t size)1122 cstring_array *cstring_array_new_size(size_t size) {
1123 cstring_array *array = cstring_array_new();
1124 char_array_resize(array->str, size);
1125 return array;
1126 }
1127
cstring_array_from_char_array(char_array * str)1128 cstring_array *cstring_array_from_char_array(char_array *str) {
1129 if (str == NULL) return NULL;
1130 if (str->n == 0)
1131 return cstring_array_new();
1132
1133 cstring_array *array = malloc(sizeof(cstring_array));
1134 if (array == NULL) return NULL;
1135
1136 array->str = str;
1137 array->indices = uint32_array_new_size(1);
1138
1139 uint32_array_push(array->indices, 0);
1140 char *ptr = str->a;
1141 for (uint32_t i = 0; i < str->n - 1; i++, ptr++) {
1142 if (*ptr == '\0') {
1143 uint32_array_push(array->indices, i + 1);
1144 }
1145 }
1146 return array;
1147 }
1148
cstring_array_from_strings(char ** strings,size_t n)1149 cstring_array *cstring_array_from_strings(char **strings, size_t n) {
1150 cstring_array *array = cstring_array_new();
1151 for (size_t i = 0; i < n; i++) {
1152 cstring_array_start_token(array);
1153 cstring_array_add_string(array, strings[i]);
1154 }
1155 return array;
1156 }
1157
cstring_array_extend(cstring_array * array,cstring_array * other)1158 bool cstring_array_extend(cstring_array *array, cstring_array *other) {
1159 if (array == NULL || other == NULL) return false;
1160 size_t n = cstring_array_num_strings(other);
1161
1162 for (size_t i = 0; i < n; i++) {
1163 char *s_i = cstring_array_get_string(other, i);
1164 cstring_array_add_string(array, s_i);
1165 }
1166 return true;
1167 }
1168
1169
cstring_array_capacity(cstring_array * self)1170 inline size_t cstring_array_capacity(cstring_array *self) {
1171 return self->str->m;
1172 }
1173
cstring_array_used(cstring_array * self)1174 inline size_t cstring_array_used(cstring_array *self) {
1175 return self->str->n;
1176 }
1177
cstring_array_num_strings(cstring_array * self)1178 inline size_t cstring_array_num_strings(cstring_array *self) {
1179 if (self == NULL) return 0;
1180 return self->indices->n;
1181 }
1182
cstring_array_resize(cstring_array * self,size_t size)1183 inline void cstring_array_resize(cstring_array *self, size_t size) {
1184 if (size < cstring_array_capacity(self)) return;
1185 char_array_resize(self->str, size);
1186 }
1187
cstring_array_clear(cstring_array * self)1188 void cstring_array_clear(cstring_array *self) {
1189 if (self == NULL) return;
1190
1191 if (self->indices != NULL) {
1192 uint32_array_clear(self->indices);
1193 }
1194
1195 if (self->str != NULL) {
1196 char_array_clear(self->str);
1197 }
1198 }
1199
cstring_array_start_token(cstring_array * self)1200 inline uint32_t cstring_array_start_token(cstring_array *self) {
1201 uint32_t index = (uint32_t)self->str->n;
1202 uint32_array_push(self->indices, index);
1203 return index;
1204 }
1205
cstring_array_terminate(cstring_array * self)1206 inline void cstring_array_terminate(cstring_array *self) {
1207 char_array_terminate(self->str);
1208 }
1209
cstring_array_add_string(cstring_array * self,char * str)1210 inline uint32_t cstring_array_add_string(cstring_array *self, char *str) {
1211 uint32_t index = cstring_array_start_token(self);
1212 char_array_append(self->str, str);
1213 char_array_terminate(self->str);
1214 return index;
1215 }
1216
cstring_array_add_string_len(cstring_array * self,char * str,size_t len)1217 inline uint32_t cstring_array_add_string_len(cstring_array *self, char *str, size_t len) {
1218 uint32_t index = cstring_array_start_token(self);
1219 char_array_append_len(self->str, str, len);
1220 char_array_terminate(self->str);
1221 return index;
1222 }
1223
cstring_array_append_string(cstring_array * self,char * str)1224 inline void cstring_array_append_string(cstring_array *self, char *str) {
1225 char_array_append(self->str, str);
1226 }
1227
cstring_array_append_string_len(cstring_array * self,char * str,size_t len)1228 inline void cstring_array_append_string_len(cstring_array *self, char *str, size_t len) {
1229 char_array_append_len(self->str, str, len);
1230 }
1231
cstring_array_cat_string(cstring_array * self,char * str)1232 inline void cstring_array_cat_string(cstring_array *self, char *str) {
1233 char_array_cat(self->str, str);
1234 }
1235
cstring_array_cat_string_len(cstring_array * self,char * str,size_t len)1236 inline void cstring_array_cat_string_len(cstring_array *self, char *str, size_t len) {
1237 char_array_cat_len(self->str, str, len);
1238 }
1239
cstring_array_get_offset(cstring_array * self,uint32_t i)1240 inline int32_t cstring_array_get_offset(cstring_array *self, uint32_t i) {
1241 if (INVALID_INDEX(i, self->indices->n)) {
1242 return -1;
1243 }
1244 return (int32_t)self->indices->a[i];
1245 }
1246
cstring_array_get_string(cstring_array * self,uint32_t i)1247 inline char *cstring_array_get_string(cstring_array *self, uint32_t i) {
1248 int32_t data_index = cstring_array_get_offset(self, i);
1249 if (data_index < 0) return NULL;
1250 return self->str->a + data_index;
1251 }
1252
cstring_array_token_length(cstring_array * self,uint32_t i)1253 inline int64_t cstring_array_token_length(cstring_array *self, uint32_t i) {
1254 if (INVALID_INDEX(i, self->indices->n)) {
1255 return -1;
1256 }
1257 if (i < self->indices->n - 1) {
1258 return self->indices->a[i+1] - self->indices->a[i] - 1;
1259 } else {
1260 return self->str->n - self->indices->a[i] - 1;
1261 }
1262 }
1263
cstring_array_split_options(char * str,const char * separator,size_t separator_len,bool ignore_consecutive,size_t * count)1264 static cstring_array *cstring_array_split_options(char *str, const char *separator, size_t separator_len, bool ignore_consecutive, size_t *count) {
1265 *count = 0;
1266 char_array *array = char_array_new_size(strlen(str));
1267
1268 bool last_was_separator = false;
1269 bool first_char = false;
1270
1271 while (*str) {
1272 if ((separator_len == 1 && *str == separator[0]) || (memcmp(str, separator, separator_len) == 0)) {
1273 if (first_char && (!ignore_consecutive || !last_was_separator)) {
1274 char_array_push(array, '\0');
1275 }
1276 str += separator_len;
1277 last_was_separator = true;
1278 } else {
1279 char_array_push(array, *str);
1280 str++;
1281 last_was_separator = false;
1282 first_char = true;
1283 }
1284 }
1285 char_array_push(array, '\0');
1286
1287 cstring_array *string_array = cstring_array_from_char_array(array);
1288 *count = cstring_array_num_strings(string_array);
1289
1290 return string_array;
1291 }
1292
1293
cstring_array_split(char * str,const char * separator,size_t separator_len,size_t * count)1294 cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) {
1295 return cstring_array_split_options(str, separator, separator_len, false, count);
1296 }
1297
1298
cstring_array_split_ignore_consecutive(char * str,const char * separator,size_t separator_len,size_t * count)1299 cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count) {
1300 return cstring_array_split_options(str, separator, separator_len, true, count);
1301 }
1302
1303
cstring_array_split_no_copy(char * str,char separator,size_t * count)1304 cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count) {
1305 *count = 0;
1306 char *ptr = str;
1307 size_t len = strlen(str);
1308
1309 for (int i = 0; i < len; i++, ptr++) {
1310 if (*ptr == separator) {
1311 *ptr = '\0';
1312 }
1313 }
1314
1315 char_array *array = char_array_from_string_no_copy(str, len);
1316 cstring_array *string_array = cstring_array_from_char_array(array);
1317 *count = cstring_array_num_strings(string_array);
1318
1319 return string_array;
1320 }
1321
1322
cstring_array_to_strings(cstring_array * self)1323 char **cstring_array_to_strings(cstring_array *self) {
1324 char **strings = malloc(self->indices->n * sizeof(char *));
1325
1326 for (int i = 0; i < cstring_array_num_strings(self); i++) {
1327 char *str = cstring_array_get_string(self, i);
1328 strings[i] = strdup(str);
1329 }
1330
1331 cstring_array_destroy(self);
1332 return strings;
1333 }
1334
1335
string_tree_new_size(size_t size)1336 string_tree_t *string_tree_new_size(size_t size) {
1337 string_tree_t *self = malloc(sizeof(string_tree_t));
1338 if (self == NULL) {
1339 return NULL;
1340 }
1341
1342 self->token_indices = uint32_array_new_size(size);
1343 if (self->token_indices == NULL) {
1344 free(self);
1345 return NULL;
1346 }
1347
1348 uint32_array_push(self->token_indices, 0);
1349
1350 self->strings = cstring_array_new();
1351 if (self->strings == NULL) {
1352 uint32_array_destroy(self->token_indices);
1353 free(self);
1354 return NULL;
1355 }
1356
1357 return self;
1358 }
1359
1360 #define DEFAULT_STRING_TREE_SIZE 8
1361
string_tree_new(void)1362 string_tree_t *string_tree_new(void) {
1363 return string_tree_new_size((size_t)DEFAULT_STRING_TREE_SIZE);
1364 }
1365
string_tree_get_alternative(string_tree_t * self,size_t token_index,uint32_t alternative)1366 inline char *string_tree_get_alternative(string_tree_t *self, size_t token_index, uint32_t alternative) {
1367 if (token_index >= self->token_indices->n) return NULL;
1368
1369 uint32_t token_start = self->token_indices->a[token_index];
1370
1371 return cstring_array_get_string(self->strings, token_start + alternative);
1372 }
1373
string_tree_finalize_token(string_tree_t * self)1374 inline void string_tree_finalize_token(string_tree_t *self) {
1375 uint32_array_push(self->token_indices, (uint32_t)cstring_array_num_strings(self->strings));
1376 }
1377
string_tree_clear(string_tree_t * self)1378 void string_tree_clear(string_tree_t *self) {
1379 uint32_array_clear(self->token_indices);
1380 uint32_array_push(self->token_indices, 0);
1381 cstring_array_clear(self->strings);
1382 }
1383
1384 // terminated
string_tree_add_string(string_tree_t * self,char * str)1385 inline void string_tree_add_string(string_tree_t *self, char *str) {
1386 cstring_array_add_string(self->strings, str);
1387 }
1388
string_tree_add_string_len(string_tree_t * self,char * str,size_t len)1389 inline void string_tree_add_string_len(string_tree_t *self, char *str, size_t len) {
1390 cstring_array_add_string_len(self->strings, str, len);
1391 }
1392
1393 // unterminated
string_tree_append_string(string_tree_t * self,char * str)1394 inline void string_tree_append_string(string_tree_t *self, char *str) {
1395 cstring_array_append_string(self->strings, str);
1396 }
1397
string_tree_append_string_len(string_tree_t * self,char * str,size_t len)1398 inline void string_tree_append_string_len(string_tree_t *self, char *str, size_t len) {
1399 cstring_array_append_string_len(self->strings, str, len);
1400 }
1401
string_tree_num_tokens(string_tree_t * self)1402 inline uint32_t string_tree_num_tokens(string_tree_t *self) {
1403 return (uint32_t)self->token_indices->n - 1;
1404 }
1405
string_tree_num_strings(string_tree_t * self)1406 inline uint32_t string_tree_num_strings(string_tree_t *self) {
1407 return (uint32_t)cstring_array_num_strings(self->strings);
1408 }
1409
string_tree_num_alternatives(string_tree_t * self,uint32_t i)1410 inline uint32_t string_tree_num_alternatives(string_tree_t *self, uint32_t i) {
1411 if (i >= self->token_indices->n) return 0;
1412 uint32_t n = self->token_indices->a[i + 1] - self->token_indices->a[i];
1413 return n > 0 ? n : 1;
1414 }
1415
string_tree_destroy(string_tree_t * self)1416 void string_tree_destroy(string_tree_t *self) {
1417 if (self == NULL) return;
1418
1419 if (self->token_indices != NULL) {
1420 uint32_array_destroy(self->token_indices);
1421 }
1422
1423 if (self->strings != NULL) {
1424 cstring_array_destroy(self->strings);
1425 }
1426
1427 free(self);
1428 }
1429
string_tree_iterator_new(string_tree_t * tree)1430 string_tree_iterator_t *string_tree_iterator_new(string_tree_t *tree) {
1431 string_tree_iterator_t *self = malloc(sizeof(string_tree_iterator_t));
1432 self->tree = tree;
1433
1434 uint32_t num_tokens = string_tree_num_tokens(tree);
1435 self->num_tokens = num_tokens;
1436
1437 // calloc since the first path through the tree is all zeros
1438 self->path = calloc(num_tokens, sizeof(uint32_t));
1439
1440 uint32_t permutations = 1;
1441 uint32_t num_strings;
1442
1443 for (int i = 0; i < num_tokens; i++) {
1444 // N + 1 indices stored in token_indices, so this is always valid
1445 num_strings = string_tree_num_alternatives(tree, i);
1446 if (num_strings > 0) {
1447 // 1 or more strings in the string_tree means use those instead of the actual token
1448 permutations *= num_strings;
1449 }
1450 }
1451
1452 if (permutations > 1) {
1453 self->remaining = (uint32_t)permutations;
1454 } else{
1455 self->remaining = 1;
1456 }
1457
1458 return self;
1459 }
1460
string_tree_iterator_next(string_tree_iterator_t * self)1461 void string_tree_iterator_next(string_tree_iterator_t *self) {
1462 if (self->remaining > 0) {
1463 int i;
1464 for (i = self->num_tokens - 1; i >= 0; i--) {
1465 self->path[i]++;
1466 if (self->path[i] == string_tree_num_alternatives(self->tree, i)) {
1467 self->path[i] = 0;
1468 } else {
1469 self->remaining--;
1470 break;
1471 }
1472 }
1473
1474 if (i < 0) {
1475 self->remaining = 0;
1476 }
1477 }
1478 }
1479
string_tree_iterator_get_string(string_tree_iterator_t * self,uint32_t i)1480 char *string_tree_iterator_get_string(string_tree_iterator_t *self, uint32_t i) {
1481 if (i >= self->num_tokens) {
1482 return NULL;
1483 }
1484 uint32_t base_index = self->tree->token_indices->a[i];
1485 uint32_t offset = self->path[i];
1486
1487 return cstring_array_get_string(self->tree->strings, base_index + offset);
1488 }
1489
string_tree_iterator_done(string_tree_iterator_t * self)1490 bool string_tree_iterator_done(string_tree_iterator_t *self) {
1491 return self->remaining == 0;
1492 }
1493
string_tree_iterator_destroy(string_tree_iterator_t * self)1494 void string_tree_iterator_destroy(string_tree_iterator_t *self) {
1495 if (self == NULL) return;
1496
1497 if (self->path) {
1498 free(self->path);
1499 }
1500
1501 free(self);
1502 }
1503