1 /* 2 string_utils.h 3 -------------- 4 5 Utilities for manipulating strings in C. 6 */ 7 #ifndef STRING_UTILS_H 8 #define STRING_UTILS_H 9 10 #include <stdlib.h> 11 #include <stdint.h> 12 #include <string.h> 13 #include <ctype.h> 14 #include <stdbool.h> 15 #include <stdarg.h> 16 #include "collections.h" 17 #include "utf8proc/utf8proc.h" 18 #include "vector.h" 19 20 #define MAX_UTF8_CHAR_SIZE 4 21 22 #define INT8_MAX_STRING_LEN 5 23 #define INT8_MAX_STRING_SIZE INT8_MAX_STRING_LEN + 1 24 #define INT16_MAX_STRING_LEN 7 25 #define INT16_MAX_STRING_SIZE INT16_MAX_STRING_LEN + 1 26 #define INT32_MAX_STRING_LEN 11 27 #define INT32_MAX_STRING_SIZE INT32_MAX_STRING_LEN + 1 28 #define INT64_MAX_STRING_LEN 21 29 #define INT64_MAX_STRING_SIZE INT64_MAX_STRING_LEN + 1 30 31 32 #define UTF8PROC_OPTIONS_BASE UTF8PROC_NULLTERM | UTF8PROC_STABLE 33 34 // Unicode normalization forms 35 #define UTF8PROC_OPTIONS_NFD UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE 36 #define UTF8PROC_OPTIONS_NFC UTF8PROC_OPTIONS_BASE | UTF8PROC_COMPOSE 37 #define UTF8PROC_OPTIONS_NFKD UTF8PROC_OPTIONS_NFD | UTF8PROC_COMPAT 38 #define UTF8PROC_OPTIONS_NFKC UTF8PROC_OPTIONS_NFC | UTF8PROC_COMPAT 39 40 // Strip accents 41 #define UTF8PROC_OPTIONS_STRIP_ACCENTS UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE | UTF8PROC_STRIPMARK 42 43 // Lowercase 44 #define UTF8PROC_OPTIONS_CASE_FOLD UTF8PROC_OPTIONS_BASE | UTF8PROC_CASEFOLD 45 46 47 // ASCII string methods 48 int string_compare_case_insensitive(const char *str1, const char *str2); 49 int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len); 50 size_t string_common_prefix(const char *str1, const char *str2); 51 size_t string_common_suffix(const char *str1, const char *str2); 52 53 bool string_is_lower(char *s); 54 void string_lower(char *s); 55 bool string_is_upper(char *s); 56 void string_upper(char *s); 57 58 char *string_replace_char(char *str, char c1, char c2); 59 bool string_replace_with_array(char *str, char *replace, char *with, char_array *result); 60 char *string_replace(char *str, char *replace, char *with); 61 62 bool string_starts_with(const char *str, const char *start); 63 bool string_ends_with(const char *str, const char *ending); 64 65 bool string_equals(const char *s1, const char *s2); 66 67 uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len); 68 69 // UTF-8 string methods 70 char *utf8_reversed_string(const char *s); // returns a copy, caller frees 71 ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst); 72 73 // Casing functions return a copy, caller frees 74 char *utf8_lower_options(const char *s, utf8proc_option_t options); 75 char *utf8_lower(const char *s); 76 char *utf8_upper_options(const char *s, utf8proc_option_t options); 77 char *utf8_upper(const char *s); 78 79 int utf8_compare(const char *str1, const char *str2); 80 int utf8_compare_len(const char *str1, const char *str2, size_t len); 81 int utf8_compare_case_insensitive(const char *str1, const char *str2, size_t len); 82 int utf8_compare_len_case_insensitive(const char *str1, const char *str2, size_t len); 83 size_t utf8_common_prefix(const char *str1, const char *str2); 84 size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len); 85 size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2); 86 size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len); 87 88 bool utf8_equal_ignore_separators(const char *str1, const char *str2); 89 90 ssize_t utf8_len(const char *str, size_t len); 91 92 uint32_array *unicode_codepoints(const char *str); 93 bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array); 94 size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array); 95 size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array); 96 97 bool utf8_is_hyphen(int32_t ch); 98 bool utf8_is_period(int32_t ch); 99 bool utf8_is_letter(int cat); 100 bool utf8_is_number(int cat); 101 bool utf8_is_digit(int cat); 102 bool utf8_is_letter_or_number(int cat); 103 bool utf8_is_punctuation(int cat); 104 bool utf8_is_symbol(int cat); 105 bool utf8_is_separator(int cat); 106 bool utf8_is_whitespace(int32_t ch); 107 108 bool string_is_digit(char *str, size_t len); 109 bool string_is_ignorable(char *str, size_t len); 110 111 ssize_t string_next_hyphen_index(char *str, size_t len); 112 bool string_contains(char *str, char *sub); 113 bool string_contains_hyphen(char *str); 114 bool string_contains_hyphen_len(char *str, size_t len); 115 116 ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len); 117 ssize_t string_next_codepoint(char *str, uint32_t codepoint); 118 119 ssize_t string_next_period_len(char *str, size_t len); 120 ssize_t string_next_period(char *str); 121 122 bool string_contains_period_len(char *str, size_t len); 123 bool string_contains_period(char *str); 124 125 ssize_t string_next_whitespace_len(char *str, size_t len); 126 ssize_t string_next_whitespace(char *str); 127 128 size_t string_left_spaces_len(char *str, size_t len); 129 size_t string_right_spaces_len(char *str, size_t len); 130 char *string_trim(char *str); 131 132 size_t string_hyphen_prefix_len(char *str, size_t len); 133 size_t string_hyphen_suffix_len(char *str, size_t len); 134 135 /* char_array is a dynamic character array defined in collections.h 136 but has a few additional methods related to string manipulation. 137 138 The array pointer can be treated as a plain old C string for methods 139 expecting NUL-terminated char pointers, but operations like 140 concatenation are cheap and safe. 141 */ 142 char_array *char_array_from_string(char *str); 143 char_array *char_array_from_string_no_copy(char *str, size_t n); 144 145 // Gets the underlying C string for a char_array 146 char *char_array_get_string(char_array *array); 147 148 // Frees the char_array and returns a standard NUL-terminated string 149 char *char_array_to_string(char_array *array); 150 151 // Can use strlen(array->a) but this is faster 152 size_t char_array_len(char_array *array); 153 154 // append_* methods do not NUL-terminate 155 void char_array_append(char_array *array, char *str); 156 void char_array_append_len(char_array *array, char *str, size_t len); 157 void char_array_append_reversed(char_array *array, char *str); 158 void char_array_append_reversed_len(char_array *array, char *str, size_t len); 159 // add NUL terminator to a char_array 160 void char_array_strip_nul_byte(char_array *array); 161 void char_array_terminate(char_array *array); 162 163 // add_* methods NUL-terminate without stripping NUL-byte 164 void char_array_add(char_array *array, char *str); 165 void char_array_add_len(char_array *array, char *str, size_t len); 166 167 // Similar to strcat but with dynamic resizing, guaranteed NUL-terminated 168 void char_array_cat(char_array *array, char *str); 169 void char_array_cat_len(char_array *array, char *str, size_t len); 170 void char_array_cat_reversed(char_array *array, char *str); 171 void char_array_cat_reversed_len(char_array *array, char *str, size_t len); 172 173 // Similar to cat methods but with printf args 174 void char_array_cat_vprintf(char_array *array, char *format, va_list args); 175 void char_array_cat_printf(char_array *array, char *format, ...); 176 177 // Mainly for paths or delimited strings 178 void char_array_add_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args); 179 void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...); 180 void char_array_cat_joined(char_array *array, char *separator, bool strip_separator, int count, ...); 181 182 183 /* 184 cstring_arrays represent n strings stored contiguously, delimited by the NUL byte. 185 186 Instead of storing an array of char pointers (char **), cstring_arrays use this format: 187 188 array->indices = {0, 4, 9}; 189 array->str = {'f', 'o', 'o', '\0', 'b', 'a', 'r', '\0', 'b', 'a', 'z', '\0'}; 190 191 Each value in array->indices is the start position of a token in array->str. Each string 192 is NUL-terminated, so array->str->a + 4 is "bar", a valid NUL-terminated C string 193 194 array->str is a char_array, so all of the powerful methods like char_array_cat_printf above 195 can be used when building the contiguous string arrays as well. 196 197 */ 198 199 typedef struct { 200 uint32_array *indices; 201 char_array *str; 202 } cstring_array; 203 204 cstring_array *cstring_array_new(void); 205 206 cstring_array *cstring_array_new_size(size_t size); 207 208 size_t cstring_array_capacity(cstring_array *self); 209 size_t cstring_array_used(cstring_array *self); 210 size_t cstring_array_num_strings(cstring_array *self); 211 void cstring_array_resize(cstring_array *self, size_t size); 212 void cstring_array_clear(cstring_array *self); 213 214 cstring_array *cstring_array_from_char_array(char_array *str); 215 cstring_array *cstring_array_from_strings(char **strings, size_t n); 216 217 bool cstring_array_extend(cstring_array *array, cstring_array *other); 218 219 // Convert cstring_array to an array of n C strings and destroy the cstring_array 220 char **cstring_array_to_strings(cstring_array *self); 221 222 // Split on delimiter 223 cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count); 224 // Split on delimiter, ignore multiple consecutive delimiters 225 cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count); 226 227 // Split on delimiter by replacing (single character) separator with the NUL byte in the original string 228 cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count); 229 230 uint32_t cstring_array_start_token(cstring_array *self); 231 uint32_t cstring_array_add_string(cstring_array *self, char *str); 232 uint32_t cstring_array_add_string_len(cstring_array *self, char *str, size_t len); 233 234 void cstring_array_append_string(cstring_array *self, char *str); 235 void cstring_array_append_string_len(cstring_array *self, char *str, size_t len); 236 237 void cstring_array_cat_string(cstring_array *self, char *str); 238 void cstring_array_cat_string_len(cstring_array *self, char *str, size_t len); 239 240 void cstring_array_terminate(cstring_array *self); 241 int32_t cstring_array_get_offset(cstring_array *self, uint32_t i); 242 char *cstring_array_get_string(cstring_array *self, uint32_t i); 243 int64_t cstring_array_token_length(cstring_array *self, uint32_t i); 244 245 void cstring_array_destroy(cstring_array *self); 246 247 #define cstring_array_foreach(array, i, s, code) { \ 248 for (int __si = 0; __si < array->indices->n; __si++) { \ 249 (i) = __si; \ 250 (s) = array->str->a + array->indices->a[__si]; \ 251 code; \ 252 } \ 253 } 254 255 /* 256 String trees are a way of storing alternative representations of a tokenized string concisely 257 258 Particularly with hyphens, we may want the string "twenty-five" to normalize to both: 259 260 twenty five 261 twentyfive 262 263 so when we encounter "twenty-five", we'd propose both alternative representations as possible 264 normalizations of the token. 265 266 string_tree is similar to a CSR (compressed sparse row) sparse matrix. 267 268 @tokens - for token i, tree->tokens[i] is the index in strings->indices where token i's alternatives begin 269 @strings - a contiguous string array which only contains as many tokens as there are alternatives 270 271 Since we typically only normalize on mid-word hyphens, periods and non-ASCII characters, a string_tree 272 might not need to store anything at all in many languages. 273 274 */ 275 276 typedef struct string_tree { 277 uint32_array *token_indices; 278 cstring_array *strings; 279 } string_tree_t; 280 281 string_tree_t *string_tree_new(void); 282 string_tree_t *string_tree_new_size(size_t size); 283 284 // get 285 char *string_tree_get_alternative(string_tree_t *self, size_t token_index, uint32_t alternative); 286 287 // finalize 288 void string_tree_finalize_token(string_tree_t *self); 289 // terminated 290 void string_tree_add_string(string_tree_t *self, char *str); 291 void string_tree_add_string_len(string_tree_t *self, char *str, size_t len); 292 // unterminated 293 void string_tree_append_string(string_tree_t *self, char *str); 294 void string_tree_append_string_len(string_tree_t *self, char *str, size_t len); 295 296 void string_tree_clear(string_tree_t *self); 297 298 uint32_t string_tree_num_tokens(string_tree_t *self); 299 uint32_t string_tree_num_strings(string_tree_t *self); 300 301 uint32_t string_tree_num_alternatives(string_tree_t *self, uint32_t i); 302 303 void string_tree_destroy(string_tree_t *self); 304 305 typedef struct string_tree_iterator { 306 string_tree_t *tree; 307 uint32_t *path; 308 uint32_t num_tokens; 309 uint32_t remaining; 310 } string_tree_iterator_t; 311 312 string_tree_iterator_t *string_tree_iterator_new(string_tree_t *tree); 313 void string_tree_iterator_next(string_tree_iterator_t *self); 314 char *string_tree_iterator_get_string(string_tree_iterator_t *self, uint32_t i); 315 bool string_tree_iterator_done(string_tree_iterator_t *self); 316 void string_tree_iterator_destroy(string_tree_iterator_t *self); 317 318 319 #define string_tree_iterator_foreach_token(iter, s, code) { \ 320 string_tree_t *tree = iter->tree; \ 321 for (int __pi = 0; __pi < iter->num_tokens; __pi++) { \ 322 (s) = string_tree_get_alternative(tree, __pi, iter->path[__pi]); \ 323 code; \ 324 } \ 325 } 326 327 328 329 330 #endif 331