1 #include <stdio.h>
2 #include "log/log.h"
3 #include "string_utils.h"
4 #include "strndup.h"
5 
6 #define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
7 
string_compare_case_insensitive(const char * str1,const char * str2)8 int string_compare_case_insensitive(const char *str1, const char *str2) {
9     int c1, c2;
10 
11     do {
12         c1 =  tolower(*str1++);
13         c2 = tolower(*str2++);
14     } while (c1 == c2 && c1 != 0);
15 
16     return c1 - c2;
17 }
18 
string_compare_len_case_insensitive(const char * str1,const char * str2,size_t len)19 int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len) {
20     register unsigned char *s1 = (unsigned char *) str1;
21     register unsigned char *s2 = (unsigned char *) str2;
22 
23     unsigned char c1, c2;
24 
25     if (len == 0) return 0;
26 
27     do {
28         c1 = *s1++;
29         c2 = *s2++;
30         if (!c1 || !c2)
31             break;
32         if (c1 == c2)
33             continue;
34         c1 = tolower(c1);
35         c2 = tolower(c2);
36         if (c1 != c2)
37             break;
38     } while (--len);
39 
40     return (int)c1 - (int)c2;
41 
42 }
43 
string_common_prefix(const char * str1,const char * str2)44 inline size_t string_common_prefix(const char *str1, const char *str2) {
45     size_t common_prefix;
46     for (common_prefix = 0; *str1 && *str2 && *str1 == *str2; str1++, str2++)
47         common_prefix++;
48     return common_prefix;
49 }
50 
string_common_suffix(const char * str1,const char * str2)51 inline size_t string_common_suffix(const char *str1, const char *str2) {
52     size_t common_suffix = 0;
53     size_t str1_len = strlen(str1);
54     size_t str2_len = strlen(str2);
55     size_t min_len = (str1_len < str2_len) ? str1_len : str2_len;
56     for (int i=1; i <= min_len && str1[str1_len-i] == str2[str2_len-i]; i++)
57         common_suffix++;
58     return common_suffix;
59 }
60 
string_starts_with(const char * str,const char * start)61 inline bool string_starts_with(const char *str, const char *start) {
62     for (; *start; str++, start++)
63         if (*str != *start)
64             return false;
65     return true;
66 }
67 
string_ends_with(const char * str,const char * ending)68 inline bool string_ends_with(const char *str, const char *ending) {
69     size_t end_len = strlen(ending);
70     size_t str_len = strlen(str);
71 
72     return str_len < end_len ? false : !strcmp(str + str_len - end_len, ending);
73 }
74 
string_equals(const char * s1,const char * s2)75 inline bool string_equals(const char *s1, const char *s2) {
76     if (s1 == NULL || s2  == NULL) return false;
77     return strcmp(s1, s2) == 0;
78 }
79 
string_upper(char * s)80 inline void string_upper(char *s) {
81     for (; *s; ++s) *s = toupper(*s);
82 }
83 
string_replace_char(char * str,char c1,char c2)84 inline char *string_replace_char(char *str, char c1, char c2) {
85     char *repl = strdup(str);
86     if (repl == NULL) return NULL;
87     char *ptr = repl;
88     for (; *ptr; ++ptr) {
89         if (*ptr == c1) *ptr = c2;
90     }
91     return repl;
92 }
93 
string_replace_with_array(char * str,char * replace,char * with,char_array * result)94 bool string_replace_with_array(char *str, char *replace, char *with, char_array *result) {
95     if (str == NULL) return false;
96     if (result == NULL) return false;
97 
98     if (replace == NULL) {
99         replace = "";
100     }
101 
102     size_t len_replace = strlen(replace);
103     if (len_replace == 0) {
104         return true;
105     }
106 
107     if (with == NULL) {
108         with = "";
109     }
110 
111     size_t len_with = strlen(with);
112 
113     char *temp;
114     char *start = str;
115 
116     for (size_t count = 0; (temp = strstr(start, replace)); count++) {
117         char_array_cat_len(result, start, temp - start);
118         char_array_cat_len(result, with, len_with);
119         start = temp + len_replace;
120     }
121 
122     char_array_cat(result, start);
123 
124     return true;
125 }
126 
string_replace(char * str,char * replace,char * with)127 char *string_replace(char *str, char *replace, char *with) {
128     if (str == NULL) return NULL;
129 
130     char_array *array = char_array_new_size(strlen(str));
131     if (!string_replace_with_array(str, replace, with, array)) {
132         char_array_destroy(array);
133         return NULL;
134     }
135     return char_array_to_string(array);
136 }
137 
138 
string_is_upper(char * s)139 inline bool string_is_upper(char *s) {
140     for (; *s; ++s) {
141         if (*s != toupper(*s)) return false;
142     }
143     return true;
144 }
145 
string_lower(char * s)146 inline void string_lower(char *s) {
147     for (; *s; ++s) *s = tolower(*s);
148 }
149 
string_is_lower(char * s)150 inline bool string_is_lower(char *s) {
151     for (; *s; ++s) {
152         if (*s != tolower(*s)) return false;
153     }
154     return true;
155 }
156 
string_translate(char * str,size_t len,char * word_chars,char * word_repls,size_t trans_len)157 uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
158     uint32_t num_replacements = 0;
159 
160     for (int i = 0; i < len; i++) {
161         for (int j = 0; j < trans_len; j++) {
162             if (str[i] == word_chars[j]) {
163                 str[i] = word_repls[j];
164                 num_replacements++;
165                 break;
166             }
167         }
168     }
169     return num_replacements;
170 }
171 
utf8proc_iterate_reversed(const uint8_t * str,ssize_t start,int32_t * dst)172 ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst) {
173     ssize_t len = 0;
174 
175     const uint8_t *ptr = str + start;
176 
177     *dst = -1;
178 
179     do {
180         if (ptr <= str) return 0;
181         ptr--; len++;
182     } while ((*ptr & 0xC0) == 0x80);
183 
184     int32_t ch = 0;
185 
186     ssize_t ret_len = utf8proc_iterate(ptr, len, &ch);
187     *dst = ch;
188     return ret_len;
189 }
190 
utf8_reversed_string(const char * s)191 char *utf8_reversed_string(const char *s) {
192     int32_t unich;
193     ssize_t len, remaining;
194 
195     size_t slen = strlen(s);
196 
197     char *out = malloc(slen + 1);
198 
199     uint8_t *ptr =  (uint8_t *)s;
200     char *rev_ptr = out + slen;
201 
202     while(1) {
203         len = utf8proc_iterate(ptr, -1, &unich);
204         remaining = len;
205         if (len <= 0 || unich == 0) break;
206         if (!(utf8proc_codepoint_valid(unich))) goto error_free_output;
207 
208         rev_ptr -= len;
209         memcpy(rev_ptr, (char *)ptr, len);
210 
211         ptr += len;
212 
213     }
214 
215     out[slen] = '\0';
216     return out;
217 
218 error_free_output:
219     free(out);
220     return NULL;
221 }
222 
223 typedef enum casing_option {
224     UTF8_LOWER,
225     UTF8_UPPER
226 } casing_option_t;
227 
utf8_case(const char * s,casing_option_t casing,utf8proc_option_t options)228 char *utf8_case(const char *s, casing_option_t casing, utf8proc_option_t options) {
229     ssize_t len = (ssize_t)strlen(s);
230     utf8proc_uint8_t *str = (utf8proc_uint8_t *)s;
231 
232     utf8proc_ssize_t result;
233     result = utf8proc_decompose(str, len, NULL, 0, options);
234 
235     if (result < 0) return NULL;
236     utf8proc_int32_t *buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
237     if (buffer == NULL) return NULL;
238 
239     result = utf8proc_decompose(str, len, buffer, result, options);
240     if (result < 0) {
241         free(buffer);
242         return NULL;
243     }
244 
245     for (utf8proc_ssize_t i = 0; i < result; i++) {
246         utf8proc_int32_t uc = buffer[i];
247         utf8proc_int32_t norm = uc;
248 
249         if (casing == UTF8_LOWER) {
250             norm = utf8proc_tolower(uc);
251         } else if (casing == UTF8_UPPER) {
252             norm = utf8proc_toupper(uc);
253         }
254         buffer[i] = norm;
255     }
256 
257     result = utf8proc_reencode(buffer, result, options);
258     if (result < 0) {
259         free(buffer);
260         return NULL;
261     }
262 
263     utf8proc_int32_t *newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
264     if (newptr != NULL) {
265         buffer = newptr;
266     } else {
267         free(buffer);
268         return NULL;
269     }
270 
271     return (char *)buffer;
272 }
273 
utf8_lower_options(const char * s,utf8proc_option_t options)274 inline char *utf8_lower_options(const char *s, utf8proc_option_t options) {
275     return utf8_case(s, UTF8_LOWER, options);
276 }
277 
utf8_lower(const char * s)278 inline char *utf8_lower(const char *s) {
279     return utf8_case(s, UTF8_LOWER, UTF8PROC_OPTIONS_NFC);
280 }
281 
utf8_upper_options(const char * s,utf8proc_option_t options)282 inline char *utf8_upper_options(const char *s, utf8proc_option_t options) {
283     return utf8_case(s, UTF8_UPPER, options);
284 }
285 
utf8_upper(const char * s)286 inline char *utf8_upper(const char *s) {
287     return utf8_case(s, UTF8_UPPER, UTF8PROC_OPTIONS_NFC);
288 }
289 
290 
utf8_is_letter(int cat)291 inline bool utf8_is_letter(int cat) {
292     return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU        \
293             || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO    \
294             || cat == UTF8PROC_CATEGORY_LM;
295 }
296 
utf8_is_digit(int cat)297 inline bool utf8_is_digit(int cat) {
298     return cat == UTF8PROC_CATEGORY_ND;
299 }
300 
utf8_is_number(int cat)301 inline bool utf8_is_number(int cat) {
302     return cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO;
303 }
304 
utf8_is_letter_or_number(int cat)305 inline bool utf8_is_letter_or_number(int cat) {
306     return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU        \
307             || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO    \
308             || cat == UTF8PROC_CATEGORY_LM || cat == UTF8PROC_CATEGORY_ND    \
309             || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO;
310 }
311 
utf8_is_hyphen(int32_t ch)312 inline bool utf8_is_hyphen(int32_t ch) {
313     int cat = utf8proc_category(ch);
314     return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212;
315 }
316 
317 #define PERIOD_CODEPOINT 46
318 
utf8_is_period(int32_t codepoint)319 inline bool utf8_is_period(int32_t codepoint) {
320     return codepoint == PERIOD_CODEPOINT;
321 }
322 
utf8_is_punctuation(int cat)323 inline bool utf8_is_punctuation(int cat) {
324     return cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PE        \
325            || cat == UTF8PROC_CATEGORY_PF || cat == UTF8PROC_CATEGORY_PI    \
326            || cat == UTF8PROC_CATEGORY_PO || cat == UTF8PROC_CATEGORY_PS;
327 }
328 
utf8_is_symbol(int cat)329 inline bool utf8_is_symbol(int cat) {
330     return cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_SC       \
331            || cat == UTF8PROC_CATEGORY_SM || cat == UTF8PROC_CATEGORY_SO;
332 }
333 
utf8_is_separator(int cat)334 inline bool utf8_is_separator(int cat) {
335     return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
336 }
337 
utf8_is_whitespace(int32_t ch)338 inline bool utf8_is_whitespace(int32_t ch) {
339     int cat = utf8proc_category(ch);
340     return utf8_is_separator(cat) ||
341            ch == 9 || // character tabulation
342            ch == 10 || // line feed
343            ch == 11 || // line tabulation
344            ch == 12 || // form feed
345            ch == 13 || // carriage return
346            ch == 133 // next line
347            ;
348 }
349 
350 
utf8_len(const char * str,size_t len)351 ssize_t utf8_len(const char *str, size_t len) {
352     if (str == NULL) return -1;
353     if (len == 0) return 0;
354 
355     int32_t ch = 0;
356     ssize_t num_utf8_chars = 0;
357     ssize_t char_len;
358 
359     uint8_t *ptr = (uint8_t *)str;
360 
361     size_t remaining = len;
362 
363     while (1) {
364         char_len = utf8proc_iterate(ptr, -1, &ch);
365 
366         if (ch == 0) break;
367         remaining -= char_len;
368         if (remaining == 0) break;
369 
370         ptr += char_len;
371         num_utf8_chars++;
372     }
373 
374     return num_utf8_chars;
375 }
376 
unicode_codepoints(const char * str)377 uint32_array *unicode_codepoints(const char *str) {
378     if (str == NULL) return NULL;
379 
380     uint32_array *a = uint32_array_new();
381 
382     int32_t ch = 0;
383     ssize_t num_utf8_chars = 0;
384     ssize_t char_len;
385 
386     uint8_t *ptr = (uint8_t *)str;
387 
388     while (1) {
389         char_len = utf8proc_iterate(ptr, -1, &ch);
390 
391         if (ch == 0) break;
392 
393         uint32_array_push(a, (uint32_t)ch);
394         ptr += char_len;
395     }
396 
397     return a;
398 }
399 
unicode_equals(uint32_array * u1_array,uint32_array * u2_array)400 bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array) {
401     size_t len1 = u1_array->n;
402     size_t len2 = u2_array->n;
403     if (len1 != len2) return false;
404 
405     uint32_t *u1 = u1_array->a;
406     uint32_t *u2 = u2_array->a;
407     for (size_t i = 0; i < len1; i++) {
408         if (u1[i] != u2[i]) return false;
409     }
410     return true;
411 }
412 
unicode_common_prefix(uint32_array * u1_array,uint32_array * u2_array)413 size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array) {
414     size_t len1 = u1_array->n;
415     size_t len2 = u2_array->n;
416 
417     size_t min_len = len1 <= len2 ? len1 : len2;
418 
419     uint32_t *u1 = u1_array->a;
420     uint32_t *u2 = u2_array->a;
421     size_t common_prefix = 0;
422 
423     for (size_t i = 0; i < min_len; i++) {
424         if (u1[i] == u2[i]) {
425             common_prefix++;
426         } else {
427             break;
428         }
429     }
430     return common_prefix;
431 }
432 
unicode_common_suffix(uint32_array * u1_array,uint32_array * u2_array)433 size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array) {
434     size_t len1 = u1_array->n;
435     size_t len2 = u2_array->n;
436 
437     size_t min_len = len1 <= len2 ? len1 : len2;
438 
439     uint32_t *u1 = u1_array->a;
440     uint32_t *u2 = u2_array->a;
441     size_t common_suffix = 0;
442 
443     for (size_t i = 0; i < min_len; i++) {
444         if (u1[len1 - i - 1] == u2[len2 - i - 1]) {
445             common_suffix++;
446         } else {
447             break;
448         }
449     }
450     return common_suffix;
451 }
452 
453 
454 
455 
utf8_compare_len_option(const char * str1,const char * str2,size_t len,bool case_insensitive)456 int utf8_compare_len_option(const char *str1, const char *str2, size_t len, bool case_insensitive) {
457     if (len == 0) return 0;
458 
459     int32_t c1, c2;
460     ssize_t len1, len2;
461 
462     uint8_t *ptr1 = (uint8_t *)str1;
463     uint8_t *ptr2 = (uint8_t *)str2;
464 
465     size_t remaining = len;
466 
467     while (1) {
468         len1 = utf8proc_iterate(ptr1, -1, &c1);
469         len2 = utf8proc_iterate(ptr2, -1, &c2);
470 
471         if (c1 == 0 || c2 == 0) break;
472 
473         if (c1 == c2 || (case_insensitive && utf8proc_tolower(c1) == utf8proc_tolower(c2))) {
474             ptr1 += len1;
475             ptr2 += len2;
476             remaining -= len1;
477         } else {
478             break;
479         }
480 
481         if (remaining == 0) break;
482 
483     }
484 
485     return (int) c1 - c2;
486 }
487 
utf8_compare_len(const char * str1,const char * str2,size_t len)488 inline int utf8_compare_len(const char *str1, const char *str2, size_t len) {
489     return utf8_compare_len_option(str1, str2, len, false);
490 }
491 
utf8_compare(const char * str1,const char * str2)492 inline int utf8_compare(const char *str1, const char *str2) {
493     size_t len1 = strlen(str1);
494     size_t len2 = strlen(str2);
495     size_t max_len = len1 >= len2 ? len1 : len2;
496     return utf8_compare_len_option(str1, str2, max_len, false);
497 }
498 
utf8_compare_len_case_insensitive(const char * str1,const char * str2,size_t len)499 inline int utf8_compare_len_case_insensitive(const char *str1, const char *str2, size_t len) {
500     return utf8_compare_len_option(str1, str2, len, true);
501 }
502 
utf8_compare_case_insensitive(const char * str1,const char * str2,size_t len)503 inline int utf8_compare_case_insensitive(const char *str1, const char *str2, size_t len) {
504     size_t len1 = strlen(str1);
505     size_t len2 = strlen(str2);
506     size_t max_len = len1 >= len2 ? len1 : len2;
507     return utf8_compare_len_option(str1, str2, max_len, true);
508 }
509 
510 
utf8_common_prefix_len(const char * str1,const char * str2,size_t len)511 size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len) {
512     size_t common_prefix = 0;
513 
514     if (len == 0) return common_prefix;
515 
516     int32_t c1 = 0;
517     int32_t c2 = 0;
518 
519     size_t remaining = len;
520 
521     ssize_t len1, len2;
522 
523     uint8_t *ptr1 = (uint8_t *)str1;
524     uint8_t *ptr2 = (uint8_t *)str2;
525 
526     while (1) {
527         len1 = utf8proc_iterate(ptr1, -1, &c1);
528         len2 = utf8proc_iterate(ptr2, -1, &c2);
529 
530         if (c1 <= 0 || c2 <= 0) break;
531         if (c1 == c2) {
532             ptr1 += len1;
533             ptr2 += len2;
534             common_prefix += len1;
535             if (common_prefix >= len) {
536                 return common_prefix;
537             }
538         } else {
539             break;
540         }
541     }
542 
543     return common_prefix;
544 }
545 
utf8_common_prefix(const char * str1,const char * str2)546 size_t utf8_common_prefix(const char *str1, const char *str2) {
547     size_t len1 = strlen(str1);
548     size_t len2 = strlen(str2);
549 
550     size_t len = len1 <= len2 ? len1 : len2;
551 
552     return utf8_common_prefix_len(str1, str2, len);
553 }
554 
555 
utf8_common_prefix_len_ignore_separators(const char * str1,const char * str2,size_t len)556 size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len) {
557     if (len == 0) return 0;
558 
559     int32_t c1 = -1, c2 = -1;
560     ssize_t len1, len2;
561 
562     uint8_t *ptr1 = (uint8_t *)str1;
563     uint8_t *ptr2 = (uint8_t *)str2;
564 
565     size_t remaining = len;
566 
567     size_t match_len = 0;
568 
569     bool one_char_match = false;
570 
571     while (1) {
572         len1 = utf8proc_iterate(ptr1, -1, &c1);
573         len2 = utf8proc_iterate(ptr2, -1, &c2);
574 
575         if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) {
576             ptr1++;
577             ptr2++;
578             remaining--;
579             match_len++;
580             one_char_match = true;
581             if (remaining == 0) break;
582             continue;
583         }
584 
585         if (c1 == 0 || c2 == 0) break;
586 
587         if (c1 == c2) {
588             ptr1 += len1;
589             ptr2 += len2;
590             remaining -= len1;
591             match_len += len1;
592             one_char_match = true;
593         } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
594             ptr1 += len1;
595             match_len += len1;
596             if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
597                 ptr2 += len2;
598                 remaining -= len2;
599             }
600         } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
601             ptr2 += len2;
602             remaining -= len2;
603         } else {
604             break;
605         }
606 
607         if (remaining == 0) break;
608 
609     }
610 
611     return one_char_match ? match_len : 0;
612 
613 }
614 
utf8_common_prefix_ignore_separators(const char * str1,const char * str2)615 inline size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2) {
616     return utf8_common_prefix_len_ignore_separators(str1, str2, strlen(str2));
617 }
618 
utf8_equal_ignore_separators_len(const char * str1,const char * str2,size_t len)619 bool utf8_equal_ignore_separators_len(const char *str1, const char *str2, size_t len) {
620     if (len == 0) return false;
621 
622     int32_t c1 = -1, c2 = -1;
623     ssize_t len1, len2;
624 
625     uint8_t *ptr1 = (uint8_t *)str1;
626     uint8_t *ptr2 = (uint8_t *)str2;
627 
628     size_t remaining = len;
629 
630     while (1) {
631         len1 = utf8proc_iterate(ptr1, -1, &c1);
632         len2 = utf8proc_iterate(ptr2, -1, &c2);
633 
634         if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) {
635             ptr1++;
636             ptr2++;
637             remaining--;
638             if (remaining == 0) return true;
639             continue;
640         }
641 
642         if (c1 != 0 && c2 != 0 && c1 == c2) {
643             ptr1 += len1;
644             ptr2 += len2;
645             remaining -= len1;
646         } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
647             ptr1 += len1;
648             if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
649                 ptr2 += len2;
650             }
651             remaining -= len1;
652         } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
653             ptr2 += len2;
654             remaining -= len2;
655         } else {
656             break;
657         }
658 
659         if (remaining == 0) return true;
660 
661     }
662 
663     return false;
664 }
665 
utf8_equal_ignore_separators(const char * str1,const char * str2)666 inline bool utf8_equal_ignore_separators(const char *str1, const char *str2) {
667     size_t len1 = strlen(str1);
668     size_t len2 = strlen(str2);
669     size_t len = len1 > len2 ? len1 : len2;
670 
671     return utf8_equal_ignore_separators_len(str1, str2, len);
672 }
673 
string_is_digit(char * str,size_t len)674 bool string_is_digit(char *str, size_t len) {
675     uint8_t *ptr = (uint8_t *)str;
676     size_t idx = 0;
677 
678     bool ignorable = true;
679 
680     while (idx < len) {
681         int32_t ch;
682         ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
683 
684         if (char_len <= 0) break;
685         if (ch == 0) break;
686         if (!(utf8proc_codepoint_valid(ch))) return false;
687 
688         int cat = utf8proc_category(ch);
689         if (cat != UTF8PROC_CATEGORY_ND) {
690             return false;
691         }
692 
693         ptr += char_len;
694         idx += char_len;
695     }
696 
697     return true;
698 }
699 
string_is_ignorable(char * str,size_t len)700 bool string_is_ignorable(char *str, size_t len) {
701     uint8_t *ptr = (uint8_t *)str;
702     size_t idx = 0;
703 
704     bool ignorable = true;
705 
706     while (idx < len) {
707         int32_t ch;
708         ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
709 
710         if (char_len <= 0) break;
711         if (ch == 0) break;
712         if (!(utf8proc_codepoint_valid(ch))) return false;
713 
714         int cat = utf8proc_category(ch);
715         if (!utf8_is_separator(cat) && !utf8_is_hyphen(ch)) {
716             return false;
717         }
718 
719         ptr += char_len;
720         idx += char_len;
721     }
722 
723     return true;
724 }
725 
string_next_hyphen_index(char * str,size_t len)726 ssize_t string_next_hyphen_index(char *str, size_t len) {
727     uint8_t *ptr = (uint8_t *)str;
728     int32_t codepoint;
729     ssize_t idx = 0;
730 
731     while (idx < len) {
732         ssize_t char_len = utf8proc_iterate(ptr, len, &codepoint);
733 
734         if (char_len <= 0 || codepoint == 0) break;
735 
736         if (utf8_is_hyphen(codepoint)) return idx;
737         ptr += char_len;
738         idx += char_len;
739     }
740     return -1;
741 }
742 
string_contains(char * str,char * sub)743 inline bool string_contains(char *str, char *sub) {
744     return str != NULL && sub != NULL && strstr(str, sub) != NULL;
745 }
746 
string_contains_hyphen_len(char * str,size_t len)747 inline bool string_contains_hyphen_len(char *str, size_t len) {
748     return string_next_hyphen_index(str, len) >= 0;
749 }
750 
string_contains_hyphen(char * str)751 inline bool string_contains_hyphen(char *str) {
752     return string_next_hyphen_index(str, strlen(str)) >= 0;
753 }
754 
string_next_codepoint_len(char * str,uint32_t codepoint,size_t len)755 ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len) {
756     uint8_t *ptr = (uint8_t *)str;
757     int32_t ch;
758     ssize_t idx = 0;
759 
760     while (idx < len) {
761         ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
762 
763         if (char_len <= 0 || ch == 0) break;
764 
765         if ((uint32_t)ch == codepoint) return idx;
766         ptr += char_len;
767         idx += char_len;
768     }
769     return -1;
770 }
771 
string_next_codepoint(char * str,uint32_t codepoint)772 ssize_t string_next_codepoint(char *str, uint32_t codepoint) {
773     return string_next_codepoint_len(str, codepoint, strlen(str));
774 }
775 
string_next_period_len(char * str,size_t len)776 ssize_t string_next_period_len(char *str, size_t len) {
777     return string_next_codepoint_len(str, PERIOD_CODEPOINT, len);
778 }
779 
string_next_period(char * str)780 ssize_t string_next_period(char *str) {
781     return string_next_codepoint(str, PERIOD_CODEPOINT);
782 }
783 
string_contains_period_len(char * str,size_t len)784 inline bool string_contains_period_len(char *str, size_t len) {
785     return string_next_codepoint_len(str, PERIOD_CODEPOINT, len) >= 0;
786 }
787 
string_contains_period(char * str)788 inline bool string_contains_period(char *str) {
789     return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0;
790 }
791 
string_next_whitespace_len(char * str,size_t len)792 ssize_t string_next_whitespace_len(char *str, size_t len) {
793     uint8_t *ptr = (uint8_t *)str;
794     int32_t ch;
795     ssize_t idx = 0;
796 
797     while (idx < len) {
798         ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
799 
800         if (char_len <= 0 || ch == 0) break;
801 
802         if (utf8_is_whitespace(ch)) return idx;
803         ptr += char_len;
804         idx += char_len;
805     }
806     return -1;
807 }
808 
string_next_whitespace(char * str)809 ssize_t string_next_whitespace(char *str) {
810     return string_next_whitespace_len(str, strlen(str));
811 }
812 
813 
string_right_spaces_len(char * str,size_t len)814 size_t string_right_spaces_len(char *str, size_t len) {
815     size_t spaces = 0;
816 
817     uint8_t *ptr = (uint8_t *)str;
818     int32_t ch = 0;
819     ssize_t index = len;
820 
821     while (1) {
822         ssize_t char_len = utf8proc_iterate_reversed(ptr, index, &ch);
823 
824         if (ch <= 0) break;
825 
826         if (!utf8_is_whitespace(ch)) {
827             break;
828         }
829 
830         index -= char_len;
831         spaces += char_len;
832     }
833 
834     return spaces;
835 
836 }
837 
string_hyphen_prefix_len(char * str,size_t len)838 inline size_t string_hyphen_prefix_len(char *str, size_t len) {
839     // Strip beginning hyphens
840     int32_t unichr;
841     uint8_t *ptr = (uint8_t *)str;
842     ssize_t char_len = utf8proc_iterate(ptr, len, &unichr);
843     if (utf8_is_hyphen(unichr)) {
844         return (size_t)char_len;
845     }
846     return 0;
847 }
848 
string_hyphen_suffix_len(char * str,size_t len)849 inline size_t string_hyphen_suffix_len(char *str, size_t len) {
850     // Strip ending hyphens
851     int32_t unichr;
852     uint8_t *ptr = (uint8_t *)str;
853     ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr);
854     if (utf8_is_hyphen(unichr)) {
855         return (size_t)char_len;
856     }
857     return 0;
858 }
859 
string_left_spaces_len(char * str,size_t len)860 size_t string_left_spaces_len(char *str, size_t len) {
861     size_t spaces = 0;
862 
863     uint8_t *ptr = (uint8_t *)str;
864     int32_t ch = 0;
865     ssize_t index = 0;
866 
867     while (1) {
868         ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
869 
870         if (ch <= 0) break;
871 
872         if (!utf8_is_whitespace(ch)) {
873             break;
874         }
875         index += char_len;
876         ptr += char_len;
877         spaces += char_len;
878     }
879 
880     return spaces;
881 }
882 
string_trim(char * str)883 char *string_trim(char *str) {
884     size_t len = strlen(str);
885     size_t left_spaces = string_left_spaces_len(str, len);
886     size_t right_spaces = string_right_spaces_len(str, len);
887     char *ret = strndup(str + left_spaces, len - left_spaces - right_spaces);
888     return ret;
889 }
890 
char_array_from_string(char * str)891 char_array *char_array_from_string(char *str) {
892     size_t len = strlen(str);
893     char_array *array = char_array_new_size(len+1);
894     strcpy(array->a, str);
895     array->n = len;
896     return array;
897 }
898 
char_array_from_string_no_copy(char * str,size_t n)899 char_array *char_array_from_string_no_copy(char *str, size_t n) {
900     char_array *array = malloc(sizeof(char_array));
901     array->a = str;
902     array->m = n;
903     array->n = n;
904     return array;
905 }
906 
char_array_get_string(char_array * array)907 inline char *char_array_get_string(char_array *array) {
908     if (array->n == 0 || array->a[array->n - 1] != '\0') {
909         char_array_terminate(array);
910     }
911     return array->a;
912 }
913 
char_array_to_string(char_array * array)914 inline char *char_array_to_string(char_array *array) {
915     if (array->n == 0 || array->a[array->n - 1] != '\0') {
916         char_array_terminate(array);
917     }
918     char *a = array->a;
919     free(array);
920     return a;
921 }
922 
923 
char_array_strip_nul_byte(char_array * array)924 inline void char_array_strip_nul_byte(char_array *array) {
925     if (array->n > 0 && array->a[array->n - 1] == '\0') {
926         array->a[array->n - 1] = '\0';
927         array->n--;
928     }
929 }
930 
char_array_len(char_array * array)931 inline size_t char_array_len(char_array *array) {
932     if (array->n > 0 && array->a[array->n - 1] == '\0') {
933         return array->n - 1;
934     } else {
935         return array->n;
936     }
937 }
938 
char_array_append(char_array * array,char * str)939 inline void char_array_append(char_array *array, char *str) {
940     while(*str) {
941         char_array_push(array, *str++);
942     }
943 }
944 
char_array_append_len(char_array * array,char * str,size_t len)945 inline void char_array_append_len(char_array *array, char *str, size_t len) {
946     for (size_t i = 0; i < len; i++) {
947         char_array_push(array, *str++);
948     }
949 }
950 
char_array_append_reversed_len(char_array * array,char * str,size_t len)951 inline void char_array_append_reversed_len(char_array *array, char *str, size_t len) {
952     int32_t unich;
953     ssize_t char_len;
954 
955     size_t idx = len;
956     uint8_t *ptr = (uint8_t *)str;
957 
958     while(idx > 0) {
959         char_len = utf8proc_iterate_reversed(ptr, idx, &unich);
960         if (char_len <= 0 || unich == 0) break;
961         if (!(utf8proc_codepoint_valid(unich))) break;
962 
963         idx -= char_len;
964         char_array_append_len(array, str + idx, char_len);
965     }
966 }
967 
char_array_append_reversed(char_array * array,char * str)968 inline void char_array_append_reversed(char_array *array, char *str) {
969     size_t len = strlen(str);
970     char_array_append_reversed_len(array, str, len);
971 }
972 
char_array_terminate(char_array * array)973 inline void char_array_terminate(char_array *array) {
974     char_array_push(array, '\0');
975 }
976 
char_array_cat(char_array * array,char * str)977 inline void char_array_cat(char_array *array, char *str) {
978     char_array_strip_nul_byte(array);
979     char_array_append(array, str);
980     char_array_terminate(array);
981 }
982 
char_array_cat_len(char_array * array,char * str,size_t len)983 inline void char_array_cat_len(char_array *array, char *str, size_t len) {
984     char_array_strip_nul_byte(array);
985     char_array_append_len(array, str, len);
986     char_array_terminate(array);
987 }
988 
989 
char_array_cat_reversed(char_array * array,char * str)990 inline void char_array_cat_reversed(char_array *array, char *str) {
991     char_array_strip_nul_byte(array);
992     char_array_append_reversed(array, str);
993     char_array_terminate(array);
994 }
995 
996 
char_array_cat_reversed_len(char_array * array,char * str,size_t len)997 inline void char_array_cat_reversed_len(char_array *array, char *str, size_t len) {
998     char_array_strip_nul_byte(array);
999     char_array_append_reversed_len(array, str, len);
1000     char_array_terminate(array);
1001 }
1002 
char_array_add(char_array * array,char * str)1003 inline void char_array_add(char_array *array, char *str) {
1004     char_array_append(array, str);
1005     char_array_terminate(array);
1006 }
1007 
char_array_add_len(char_array * array,char * str,size_t len)1008 inline void char_array_add_len(char_array *array, char *str, size_t len) {
1009     char_array_append_len(array, str, len);
1010     char_array_terminate(array);
1011 }
1012 
1013 
char_array_add_vjoined(char_array * array,char * separator,bool strip_separator,int count,va_list args)1014 void char_array_add_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args) {
1015     if (count <= 0) {
1016         return;
1017     }
1018 
1019     size_t separator_len = strlen(separator);
1020 
1021     for (size_t i = 0; i < count - 1; i++) {
1022         char *arg = va_arg(args, char *);
1023         size_t len = strlen(arg);
1024 
1025         if (strip_separator &&
1026             ((separator_len == 1 && arg[len-1] == separator[0]) ||
1027             (len > separator_len && strncmp(arg + len - separator_len, separator, separator_len) == 0))) {
1028             len -= separator_len;
1029         }
1030 
1031         char_array_append_len(array, arg, len);
1032         char_array_append(array, separator);
1033     }
1034 
1035     char *arg = va_arg(args, char *);
1036     char_array_append(array, arg);
1037     char_array_terminate(array);
1038 
1039 }
1040 
char_array_add_joined(char_array * array,char * separator,bool strip_separator,int count,...)1041 inline void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...) {
1042     va_list args;
1043     va_start(args, count);
1044     char_array_add_vjoined(array, separator, strip_separator, count, args);
1045     va_end(args);
1046 }
1047 
char_array_cat_joined(char_array * array,char * separator,bool strip_separator,int count,...)1048 inline void char_array_cat_joined(char_array *array, char *separator, bool strip_separator, int count, ...) {
1049     char_array_strip_nul_byte(array);
1050     va_list args;
1051     va_start(args, count);
1052     char_array_add_vjoined(array, separator, strip_separator, count, args);
1053     va_end(args);
1054 }
1055 
char_array_cat_vprintf(char_array * array,char * format,va_list args)1056 void char_array_cat_vprintf(char_array *array, char *format, va_list args) {
1057     char_array_strip_nul_byte(array);
1058 
1059     va_list cpy;
1060 
1061     char *buf;
1062     size_t buflen;
1063 
1064     size_t last_n = array->n;
1065     size_t size = array->m - array->n <= 2 ? array->m * 2 : array->m;
1066 
1067     while(1) {
1068         char_array_resize(array, size);
1069         buf = array->a + last_n;
1070         buflen = size - last_n;
1071         if (buf == NULL) return;
1072         array->a[size - 2] = '\0';
1073         va_copy(cpy, args);
1074         vsnprintf(buf, buflen, format, cpy);
1075         if (array->a[size - 2] != '\0') {
1076             size *= 2;
1077             continue;
1078         } else {
1079             array->n += strlen(buf);
1080         }
1081         break;
1082     }
1083 }
1084 
char_array_cat_printf(char_array * array,char * format,...)1085 void char_array_cat_printf(char_array *array, char *format, ...) {
1086     va_list args;
1087     va_start(args, format);
1088     char_array_cat_vprintf(array, format, args);
1089     va_end(args);
1090 }
1091 
cstring_array_new(void)1092 cstring_array *cstring_array_new(void) {
1093     cstring_array *array = malloc(sizeof(cstring_array));
1094     if (array == NULL) return NULL;
1095 
1096     array->indices = uint32_array_new();
1097     if (array->indices == NULL) {
1098         cstring_array_destroy(array);
1099         return NULL;
1100     }
1101 
1102     array->str = char_array_new();
1103     if (array->str == NULL) {
1104         cstring_array_destroy(array);
1105         return NULL;
1106     }
1107 
1108     return array;
1109 }
1110 
cstring_array_destroy(cstring_array * self)1111 void cstring_array_destroy(cstring_array *self) {
1112     if (self == NULL) return;
1113     if (self->indices) {
1114         uint32_array_destroy(self->indices);
1115     }
1116     if (self->str) {
1117         char_array_destroy(self->str);
1118     }
1119     free(self);
1120 }
1121 
cstring_array_new_size(size_t size)1122 cstring_array *cstring_array_new_size(size_t size) {
1123     cstring_array *array = cstring_array_new();
1124     char_array_resize(array->str, size);
1125     return array;
1126 }
1127 
cstring_array_from_char_array(char_array * str)1128 cstring_array *cstring_array_from_char_array(char_array *str) {
1129     if (str == NULL) return NULL;
1130     if (str->n == 0)
1131         return cstring_array_new();
1132 
1133     cstring_array *array = malloc(sizeof(cstring_array));
1134     if (array == NULL) return NULL;
1135 
1136     array->str = str;
1137     array->indices = uint32_array_new_size(1);
1138 
1139     uint32_array_push(array->indices, 0);
1140     char *ptr = str->a;
1141     for (uint32_t i = 0; i < str->n - 1; i++, ptr++) {
1142         if (*ptr == '\0') {
1143             uint32_array_push(array->indices, i + 1);
1144         }
1145     }
1146     return array;
1147 }
1148 
cstring_array_from_strings(char ** strings,size_t n)1149 cstring_array *cstring_array_from_strings(char **strings, size_t n) {
1150     cstring_array *array = cstring_array_new();
1151     for (size_t i = 0; i < n; i++) {
1152         cstring_array_start_token(array);
1153         cstring_array_add_string(array, strings[i]);
1154     }
1155     return array;
1156 }
1157 
cstring_array_extend(cstring_array * array,cstring_array * other)1158 bool cstring_array_extend(cstring_array *array, cstring_array *other) {
1159     if (array == NULL || other == NULL) return false;
1160     size_t n = cstring_array_num_strings(other);
1161 
1162     for (size_t i = 0; i < n; i++) {
1163         char *s_i = cstring_array_get_string(other, i);
1164         cstring_array_add_string(array, s_i);
1165     }
1166     return true;
1167 }
1168 
1169 
cstring_array_capacity(cstring_array * self)1170 inline size_t cstring_array_capacity(cstring_array *self) {
1171     return self->str->m;
1172 }
1173 
cstring_array_used(cstring_array * self)1174 inline size_t cstring_array_used(cstring_array *self) {
1175     return self->str->n;
1176 }
1177 
cstring_array_num_strings(cstring_array * self)1178 inline size_t cstring_array_num_strings(cstring_array *self) {
1179     if (self == NULL) return 0;
1180     return self->indices->n;
1181 }
1182 
cstring_array_resize(cstring_array * self,size_t size)1183 inline void cstring_array_resize(cstring_array *self, size_t size) {
1184     if (size < cstring_array_capacity(self)) return;
1185     char_array_resize(self->str, size);
1186 }
1187 
cstring_array_clear(cstring_array * self)1188 void cstring_array_clear(cstring_array *self) {
1189     if (self == NULL) return;
1190 
1191     if (self->indices != NULL) {
1192         uint32_array_clear(self->indices);
1193     }
1194 
1195     if (self->str != NULL) {
1196         char_array_clear(self->str);
1197     }
1198 }
1199 
cstring_array_start_token(cstring_array * self)1200 inline uint32_t cstring_array_start_token(cstring_array *self) {
1201     uint32_t index = (uint32_t)self->str->n;
1202     uint32_array_push(self->indices, index);
1203     return index;
1204 }
1205 
cstring_array_terminate(cstring_array * self)1206 inline void cstring_array_terminate(cstring_array *self) {
1207     char_array_terminate(self->str);
1208 }
1209 
cstring_array_add_string(cstring_array * self,char * str)1210 inline uint32_t cstring_array_add_string(cstring_array *self, char *str) {
1211     uint32_t index = cstring_array_start_token(self);
1212     char_array_append(self->str, str);
1213     char_array_terminate(self->str);
1214     return index;
1215 }
1216 
cstring_array_add_string_len(cstring_array * self,char * str,size_t len)1217 inline uint32_t cstring_array_add_string_len(cstring_array *self, char *str, size_t len) {
1218     uint32_t index = cstring_array_start_token(self);
1219     char_array_append_len(self->str, str, len);
1220     char_array_terminate(self->str);
1221     return index;
1222 }
1223 
cstring_array_append_string(cstring_array * self,char * str)1224 inline void cstring_array_append_string(cstring_array *self, char *str) {
1225     char_array_append(self->str, str);
1226 }
1227 
cstring_array_append_string_len(cstring_array * self,char * str,size_t len)1228 inline void cstring_array_append_string_len(cstring_array *self, char *str, size_t len) {
1229     char_array_append_len(self->str, str, len);
1230 }
1231 
cstring_array_cat_string(cstring_array * self,char * str)1232 inline void cstring_array_cat_string(cstring_array *self, char *str) {
1233     char_array_cat(self->str, str);
1234 }
1235 
cstring_array_cat_string_len(cstring_array * self,char * str,size_t len)1236 inline void cstring_array_cat_string_len(cstring_array *self, char *str, size_t len) {
1237     char_array_cat_len(self->str, str, len);
1238 }
1239 
cstring_array_get_offset(cstring_array * self,uint32_t i)1240 inline int32_t cstring_array_get_offset(cstring_array *self, uint32_t i) {
1241     if (INVALID_INDEX(i, self->indices->n)) {
1242         return -1;
1243     }
1244     return (int32_t)self->indices->a[i];
1245 }
1246 
cstring_array_get_string(cstring_array * self,uint32_t i)1247 inline char *cstring_array_get_string(cstring_array *self, uint32_t i) {
1248     int32_t data_index = cstring_array_get_offset(self, i);
1249     if (data_index < 0) return NULL;
1250     return self->str->a + data_index;
1251 }
1252 
cstring_array_token_length(cstring_array * self,uint32_t i)1253 inline int64_t cstring_array_token_length(cstring_array *self, uint32_t i) {
1254     if (INVALID_INDEX(i, self->indices->n)) {
1255         return -1;
1256     }
1257     if (i < self->indices->n - 1) {
1258         return self->indices->a[i+1] - self->indices->a[i] - 1;
1259     } else {
1260         return self->str->n - self->indices->a[i] - 1;
1261     }
1262 }
1263 
cstring_array_split_options(char * str,const char * separator,size_t separator_len,bool ignore_consecutive,size_t * count)1264 static cstring_array *cstring_array_split_options(char *str, const char *separator, size_t separator_len, bool ignore_consecutive, size_t *count) {
1265     *count = 0;
1266     char_array *array = char_array_new_size(strlen(str));
1267 
1268     bool last_was_separator = false;
1269     bool first_char = false;
1270 
1271     while (*str) {
1272         if ((separator_len == 1 && *str == separator[0]) || (memcmp(str, separator, separator_len) == 0)) {
1273             if (first_char && (!ignore_consecutive || !last_was_separator)) {
1274                 char_array_push(array, '\0');
1275             }
1276             str += separator_len;
1277             last_was_separator = true;
1278         } else {
1279             char_array_push(array, *str);
1280             str++;
1281             last_was_separator = false;
1282             first_char = true;
1283         }
1284     }
1285     char_array_push(array, '\0');
1286 
1287     cstring_array *string_array = cstring_array_from_char_array(array);
1288     *count = cstring_array_num_strings(string_array);
1289 
1290     return string_array;
1291 }
1292 
1293 
cstring_array_split(char * str,const char * separator,size_t separator_len,size_t * count)1294 cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) {
1295     return cstring_array_split_options(str, separator, separator_len, false, count);
1296 }
1297 
1298 
cstring_array_split_ignore_consecutive(char * str,const char * separator,size_t separator_len,size_t * count)1299 cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count) {
1300     return cstring_array_split_options(str, separator, separator_len, true, count);
1301 }
1302 
1303 
cstring_array_split_no_copy(char * str,char separator,size_t * count)1304 cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count) {
1305     *count = 0;
1306     char *ptr = str;
1307     size_t len = strlen(str);
1308 
1309     for (int i = 0; i < len; i++, ptr++) {
1310         if (*ptr == separator) {
1311             *ptr = '\0';
1312         }
1313     }
1314 
1315     char_array *array = char_array_from_string_no_copy(str, len);
1316     cstring_array *string_array = cstring_array_from_char_array(array);
1317     *count = cstring_array_num_strings(string_array);
1318 
1319     return string_array;
1320 }
1321 
1322 
cstring_array_to_strings(cstring_array * self)1323 char **cstring_array_to_strings(cstring_array *self) {
1324     char **strings = malloc(self->indices->n * sizeof(char *));
1325 
1326     for (int i = 0; i < cstring_array_num_strings(self); i++) {
1327         char *str = cstring_array_get_string(self, i);
1328         strings[i] = strdup(str);
1329     }
1330 
1331     cstring_array_destroy(self);
1332     return strings;
1333 }
1334 
1335 
string_tree_new_size(size_t size)1336 string_tree_t *string_tree_new_size(size_t size) {
1337     string_tree_t *self = malloc(sizeof(string_tree_t));
1338     if (self == NULL) {
1339         return NULL;
1340     }
1341 
1342     self->token_indices = uint32_array_new_size(size);
1343     if (self->token_indices == NULL) {
1344         free(self);
1345         return NULL;
1346     }
1347 
1348     uint32_array_push(self->token_indices, 0);
1349 
1350     self->strings = cstring_array_new();
1351     if (self->strings == NULL) {
1352         uint32_array_destroy(self->token_indices);
1353         free(self);
1354         return NULL;
1355     }
1356 
1357     return self;
1358 }
1359 
1360 #define DEFAULT_STRING_TREE_SIZE 8
1361 
string_tree_new(void)1362 string_tree_t *string_tree_new(void) {
1363     return string_tree_new_size((size_t)DEFAULT_STRING_TREE_SIZE);
1364 }
1365 
string_tree_get_alternative(string_tree_t * self,size_t token_index,uint32_t alternative)1366 inline char *string_tree_get_alternative(string_tree_t *self, size_t token_index, uint32_t alternative) {
1367     if (token_index >= self->token_indices->n) return NULL;
1368 
1369     uint32_t token_start = self->token_indices->a[token_index];
1370 
1371     return cstring_array_get_string(self->strings, token_start + alternative);
1372 }
1373 
string_tree_finalize_token(string_tree_t * self)1374 inline void string_tree_finalize_token(string_tree_t *self) {
1375     uint32_array_push(self->token_indices, (uint32_t)cstring_array_num_strings(self->strings));
1376 }
1377 
string_tree_clear(string_tree_t * self)1378 void string_tree_clear(string_tree_t *self) {
1379     uint32_array_clear(self->token_indices);
1380     uint32_array_push(self->token_indices, 0);
1381     cstring_array_clear(self->strings);
1382 }
1383 
1384 // terminated
string_tree_add_string(string_tree_t * self,char * str)1385 inline void string_tree_add_string(string_tree_t *self, char *str) {
1386     cstring_array_add_string(self->strings, str);
1387 }
1388 
string_tree_add_string_len(string_tree_t * self,char * str,size_t len)1389 inline void string_tree_add_string_len(string_tree_t *self, char *str, size_t len) {
1390     cstring_array_add_string_len(self->strings, str, len);
1391 }
1392 
1393 // unterminated
string_tree_append_string(string_tree_t * self,char * str)1394 inline void string_tree_append_string(string_tree_t *self, char *str) {
1395     cstring_array_append_string(self->strings, str);
1396 }
1397 
string_tree_append_string_len(string_tree_t * self,char * str,size_t len)1398 inline void string_tree_append_string_len(string_tree_t *self, char *str, size_t len) {
1399     cstring_array_append_string_len(self->strings, str, len);
1400 }
1401 
string_tree_num_tokens(string_tree_t * self)1402 inline uint32_t string_tree_num_tokens(string_tree_t *self) {
1403     return (uint32_t)self->token_indices->n - 1;
1404 }
1405 
string_tree_num_strings(string_tree_t * self)1406 inline uint32_t string_tree_num_strings(string_tree_t *self) {
1407     return (uint32_t)cstring_array_num_strings(self->strings);
1408 }
1409 
string_tree_num_alternatives(string_tree_t * self,uint32_t i)1410 inline uint32_t string_tree_num_alternatives(string_tree_t *self, uint32_t i) {
1411     if (i >= self->token_indices->n) return 0;
1412     uint32_t n = self->token_indices->a[i + 1] - self->token_indices->a[i];
1413     return n > 0 ? n : 1;
1414 }
1415 
string_tree_destroy(string_tree_t * self)1416 void string_tree_destroy(string_tree_t *self) {
1417     if (self == NULL) return;
1418 
1419     if (self->token_indices != NULL) {
1420         uint32_array_destroy(self->token_indices);
1421     }
1422 
1423     if (self->strings != NULL) {
1424         cstring_array_destroy(self->strings);
1425     }
1426 
1427     free(self);
1428 }
1429 
string_tree_iterator_new(string_tree_t * tree)1430 string_tree_iterator_t *string_tree_iterator_new(string_tree_t *tree) {
1431     string_tree_iterator_t *self = malloc(sizeof(string_tree_iterator_t));
1432     self->tree = tree;
1433 
1434     uint32_t num_tokens = string_tree_num_tokens(tree);
1435     self->num_tokens = num_tokens;
1436 
1437     // calloc since the first path through the tree is all zeros
1438     self->path = calloc(num_tokens, sizeof(uint32_t));
1439 
1440     uint32_t permutations = 1;
1441     uint32_t num_strings;
1442 
1443     for (int i = 0; i < num_tokens; i++) {
1444         // N + 1 indices stored in token_indices, so this is always valid
1445         num_strings = string_tree_num_alternatives(tree, i);
1446         if (num_strings > 0) {
1447             // 1 or more strings in the string_tree means use those instead of the actual token
1448             permutations *= num_strings;
1449         }
1450     }
1451 
1452     if (permutations > 1) {
1453         self->remaining = (uint32_t)permutations;
1454     } else{
1455         self->remaining = 1;
1456     }
1457 
1458     return self;
1459 }
1460 
string_tree_iterator_next(string_tree_iterator_t * self)1461 void string_tree_iterator_next(string_tree_iterator_t *self) {
1462     if (self->remaining > 0) {
1463         int i;
1464         for (i = self->num_tokens - 1; i >= 0; i--) {
1465             self->path[i]++;
1466             if (self->path[i] == string_tree_num_alternatives(self->tree, i)) {
1467                 self->path[i] = 0;
1468             } else {
1469                 self->remaining--;
1470                 break;
1471             }
1472         }
1473 
1474         if (i < 0) {
1475             self->remaining = 0;
1476         }
1477     }
1478 }
1479 
string_tree_iterator_get_string(string_tree_iterator_t * self,uint32_t i)1480 char *string_tree_iterator_get_string(string_tree_iterator_t *self, uint32_t i) {
1481     if (i >= self->num_tokens) {
1482         return NULL;
1483     }
1484     uint32_t base_index = self->tree->token_indices->a[i];
1485     uint32_t offset = self->path[i];
1486 
1487     return cstring_array_get_string(self->tree->strings, base_index + offset);
1488 }
1489 
string_tree_iterator_done(string_tree_iterator_t * self)1490 bool string_tree_iterator_done(string_tree_iterator_t *self) {
1491     return self->remaining == 0;
1492 }
1493 
string_tree_iterator_destroy(string_tree_iterator_t * self)1494 void string_tree_iterator_destroy(string_tree_iterator_t *self) {
1495     if (self == NULL) return;
1496 
1497     if (self->path) {
1498         free(self->path);
1499     }
1500 
1501     free(self);
1502 }
1503