1 /*
2 string_utils.h
3 --------------
4 
5 Utilities for manipulating strings in C.
6 */
7 #ifndef STRING_UTILS_H
8 #define STRING_UTILS_H
9 
10 #include <stdlib.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <ctype.h>
14 #include <stdbool.h>
15 #include <stdarg.h>
16 #include "collections.h"
17 #include "utf8proc/utf8proc.h"
18 #include "vector.h"
19 
20 #define MAX_UTF8_CHAR_SIZE 4
21 
22 #define INT8_MAX_STRING_LEN 5
23 #define INT8_MAX_STRING_SIZE INT8_MAX_STRING_LEN + 1
24 #define INT16_MAX_STRING_LEN 7
25 #define INT16_MAX_STRING_SIZE INT16_MAX_STRING_LEN + 1
26 #define INT32_MAX_STRING_LEN 11
27 #define INT32_MAX_STRING_SIZE INT32_MAX_STRING_LEN + 1
28 #define INT64_MAX_STRING_LEN 21
29 #define INT64_MAX_STRING_SIZE INT64_MAX_STRING_LEN + 1
30 
31 
32 #define UTF8PROC_OPTIONS_BASE UTF8PROC_NULLTERM | UTF8PROC_STABLE
33 
34 // Unicode normalization forms
35 #define UTF8PROC_OPTIONS_NFD UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE
36 #define UTF8PROC_OPTIONS_NFC UTF8PROC_OPTIONS_BASE | UTF8PROC_COMPOSE
37 #define UTF8PROC_OPTIONS_NFKD UTF8PROC_OPTIONS_NFD | UTF8PROC_COMPAT
38 #define UTF8PROC_OPTIONS_NFKC UTF8PROC_OPTIONS_NFC | UTF8PROC_COMPAT
39 
40 // Strip accents
41 #define UTF8PROC_OPTIONS_STRIP_ACCENTS UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE | UTF8PROC_STRIPMARK
42 
43 // Lowercase
44 #define UTF8PROC_OPTIONS_CASE_FOLD UTF8PROC_OPTIONS_BASE | UTF8PROC_CASEFOLD
45 
46 
47 // ASCII string methods
48 int string_compare_case_insensitive(const char *str1, const char *str2);
49 int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len);
50 size_t string_common_prefix(const char *str1, const char *str2);
51 size_t string_common_suffix(const char *str1, const char *str2);
52 
53 bool string_is_lower(char *s);
54 void string_lower(char *s);
55 bool string_is_upper(char *s);
56 void string_upper(char *s);
57 
58 char *string_replace_char(char *str, char c1, char c2);
59 bool string_replace_with_array(char *str, char *replace, char *with, char_array *result);
60 char *string_replace(char *str, char *replace, char *with);
61 
62 bool string_starts_with(const char *str, const char *start);
63 bool string_ends_with(const char *str, const char *ending);
64 
65 bool string_equals(const char *s1, const char *s2);
66 
67 uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len);
68 
69 // UTF-8 string methods
70 char *utf8_reversed_string(const char *s); // returns a copy, caller frees
71 ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst);
72 
73 // Casing functions return a copy, caller frees
74 char *utf8_lower_options(const char *s, utf8proc_option_t options);
75 char *utf8_lower(const char *s);
76 char *utf8_upper_options(const char *s, utf8proc_option_t options);
77 char *utf8_upper(const char *s);
78 
79 int utf8_compare(const char *str1, const char *str2);
80 int utf8_compare_len(const char *str1, const char *str2, size_t len);
81 int utf8_compare_case_insensitive(const char *str1, const char *str2, size_t len);
82 int utf8_compare_len_case_insensitive(const char *str1, const char *str2, size_t len);
83 size_t utf8_common_prefix(const char *str1, const char *str2);
84 size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len);
85 size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2);
86 size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len);
87 
88 bool utf8_equal_ignore_separators(const char *str1, const char *str2);
89 
90 ssize_t utf8_len(const char *str, size_t len);
91 
92 uint32_array *unicode_codepoints(const char *str);
93 bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array);
94 size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array);
95 size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array);
96 
97 bool utf8_is_hyphen(int32_t ch);
98 bool utf8_is_period(int32_t ch);
99 bool utf8_is_letter(int cat);
100 bool utf8_is_number(int cat);
101 bool utf8_is_digit(int cat);
102 bool utf8_is_letter_or_number(int cat);
103 bool utf8_is_punctuation(int cat);
104 bool utf8_is_symbol(int cat);
105 bool utf8_is_separator(int cat);
106 bool utf8_is_whitespace(int32_t ch);
107 
108 bool string_is_digit(char *str, size_t len);
109 bool string_is_ignorable(char *str, size_t len);
110 
111 ssize_t string_next_hyphen_index(char *str, size_t len);
112 bool string_contains(char *str, char *sub);
113 bool string_contains_hyphen(char *str);
114 bool string_contains_hyphen_len(char *str, size_t len);
115 
116 ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len);
117 ssize_t string_next_codepoint(char *str, uint32_t codepoint);
118 
119 ssize_t string_next_period_len(char *str, size_t len);
120 ssize_t string_next_period(char *str);
121 
122 bool string_contains_period_len(char *str, size_t len);
123 bool string_contains_period(char *str);
124 
125 ssize_t string_next_whitespace_len(char *str, size_t len);
126 ssize_t string_next_whitespace(char *str);
127 
128 size_t string_left_spaces_len(char *str, size_t len);
129 size_t string_right_spaces_len(char *str, size_t len);
130 char *string_trim(char *str);
131 
132 size_t string_hyphen_prefix_len(char *str, size_t len);
133 size_t string_hyphen_suffix_len(char *str, size_t len);
134 
135 /* char_array is a dynamic character array defined in collections.h
136 but has a few additional methods related to string manipulation.
137 
138 The array pointer can be treated as a plain old C string for methods
139 expecting NUL-terminated char pointers, but operations like
140 concatenation are cheap and safe.
141 */
142 char_array *char_array_from_string(char *str);
143 char_array *char_array_from_string_no_copy(char *str, size_t n);
144 
145 // Gets the underlying C string for a char_array
146 char *char_array_get_string(char_array *array);
147 
148 // Frees the char_array and returns a standard NUL-terminated string
149 char *char_array_to_string(char_array *array);
150 
151 // Can use strlen(array->a) but this is faster
152 size_t char_array_len(char_array *array);
153 
154 // append_* methods do not NUL-terminate
155 void char_array_append(char_array *array, char *str);
156 void char_array_append_len(char_array *array, char *str, size_t len);
157 void char_array_append_reversed(char_array *array, char *str);
158 void char_array_append_reversed_len(char_array *array, char *str, size_t len);
159 // add NUL terminator to a char_array
160 void char_array_strip_nul_byte(char_array *array);
161 void char_array_terminate(char_array *array);
162 
163 // add_* methods NUL-terminate without stripping NUL-byte
164 void char_array_add(char_array *array, char *str);
165 void char_array_add_len(char_array *array, char *str, size_t len);
166 
167 // Similar to strcat but with dynamic resizing, guaranteed NUL-terminated
168 void char_array_cat(char_array *array, char *str);
169 void char_array_cat_len(char_array *array, char *str, size_t len);
170 void char_array_cat_reversed(char_array *array, char *str);
171 void char_array_cat_reversed_len(char_array *array, char *str, size_t len);
172 
173 // Similar to cat methods but with printf args
174 void char_array_cat_vprintf(char_array *array, char *format, va_list args);
175 void char_array_cat_printf(char_array *array, char *format, ...);
176 
177 // Mainly for paths or delimited strings
178 void char_array_add_vjoined(char_array *array, char *separator, bool strip_separator, int count, va_list args);
179 void char_array_add_joined(char_array *array, char *separator, bool strip_separator, int count, ...);
180 void char_array_cat_joined(char_array *array, char *separator, bool strip_separator, int count, ...);
181 
182 
183 /*
184 cstring_arrays represent n strings stored contiguously, delimited by the NUL byte.
185 
186 Instead of storing an array of char pointers (char **), cstring_arrays use this format:
187 
188 array->indices = {0, 4, 9};
189 array->str = {'f', 'o', 'o', '\0', 'b', 'a', 'r', '\0', 'b', 'a', 'z', '\0'};
190 
191 Each value in array->indices is the start position of a token in array->str. Each string
192 is NUL-terminated, so array->str->a + 4 is "bar", a valid NUL-terminated C string
193 
194 array->str is a char_array, so all of the powerful methods like char_array_cat_printf above
195 can be used when building the contiguous string arrays as well.
196 
197 */
198 
199 typedef struct {
200     uint32_array *indices;
201     char_array *str;
202 } cstring_array;
203 
204 cstring_array *cstring_array_new(void);
205 
206 cstring_array *cstring_array_new_size(size_t size);
207 
208 size_t cstring_array_capacity(cstring_array *self);
209 size_t cstring_array_used(cstring_array *self);
210 size_t cstring_array_num_strings(cstring_array *self);
211 void cstring_array_resize(cstring_array *self, size_t size);
212 void cstring_array_clear(cstring_array *self);
213 
214 cstring_array *cstring_array_from_char_array(char_array *str);
215 cstring_array *cstring_array_from_strings(char **strings, size_t n);
216 
217 bool cstring_array_extend(cstring_array *array, cstring_array *other);
218 
219 // Convert cstring_array to an array of n C strings and destroy the cstring_array
220 char **cstring_array_to_strings(cstring_array *self);
221 
222 // Split on delimiter
223 cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count);
224 // Split on delimiter, ignore multiple consecutive delimiters
225 cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count);
226 
227 // Split on delimiter by replacing (single character) separator with the NUL byte in the original string
228 cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count);
229 
230 uint32_t cstring_array_start_token(cstring_array *self);
231 uint32_t cstring_array_add_string(cstring_array *self, char *str);
232 uint32_t cstring_array_add_string_len(cstring_array *self, char *str, size_t len);
233 
234 void cstring_array_append_string(cstring_array *self, char *str);
235 void cstring_array_append_string_len(cstring_array *self, char *str, size_t len);
236 
237 void cstring_array_cat_string(cstring_array *self, char *str);
238 void cstring_array_cat_string_len(cstring_array *self, char *str, size_t len);
239 
240 void cstring_array_terminate(cstring_array *self);
241 int32_t cstring_array_get_offset(cstring_array *self, uint32_t i);
242 char *cstring_array_get_string(cstring_array *self, uint32_t i);
243 int64_t cstring_array_token_length(cstring_array *self, uint32_t i);
244 
245 void cstring_array_destroy(cstring_array *self);
246 
247 #define cstring_array_foreach(array, i, s, code) {                                      \
248     for (int __si = 0; __si < array->indices->n; __si++) {                              \
249         (i) = __si;                                                                     \
250         (s) = array->str->a + array->indices->a[__si];                                  \
251         code;                                                                           \
252     }                                                                                   \
253 }
254 
255 /*
256 String trees are a way of storing alternative representations of a tokenized string concisely
257 
258 Particularly with hyphens, we may want the string "twenty-five" to normalize to both:
259 
260 twenty five
261 twentyfive
262 
263 so when we encounter "twenty-five", we'd propose both alternative representations as possible
264 normalizations of the token.
265 
266 string_tree is similar to a CSR (compressed sparse row) sparse matrix.
267 
268 @tokens - for token i, tree->tokens[i] is the index in strings->indices where token i's alternatives begin
269 @strings - a contiguous string array which only contains as many tokens as there are alternatives
270 
271 Since we typically only normalize on mid-word hyphens, periods and non-ASCII characters, a string_tree
272 might not need to store anything at all in many languages.
273 
274 */
275 
276 typedef struct string_tree {
277     uint32_array *token_indices;
278     cstring_array *strings;
279 } string_tree_t;
280 
281 string_tree_t *string_tree_new(void);
282 string_tree_t *string_tree_new_size(size_t size);
283 
284 // get
285 char *string_tree_get_alternative(string_tree_t *self, size_t token_index, uint32_t alternative);
286 
287 // finalize
288 void string_tree_finalize_token(string_tree_t *self);
289 // terminated
290 void string_tree_add_string(string_tree_t *self, char *str);
291 void string_tree_add_string_len(string_tree_t *self, char *str, size_t len);
292 // unterminated
293 void string_tree_append_string(string_tree_t *self, char *str);
294 void string_tree_append_string_len(string_tree_t *self, char *str, size_t len);
295 
296 void string_tree_clear(string_tree_t *self);
297 
298 uint32_t string_tree_num_tokens(string_tree_t *self);
299 uint32_t string_tree_num_strings(string_tree_t *self);
300 
301 uint32_t string_tree_num_alternatives(string_tree_t *self, uint32_t i);
302 
303 void string_tree_destroy(string_tree_t *self);
304 
305 typedef struct string_tree_iterator {
306     string_tree_t *tree;
307     uint32_t *path;
308     uint32_t num_tokens;
309     uint32_t remaining;
310 } string_tree_iterator_t;
311 
312 string_tree_iterator_t *string_tree_iterator_new(string_tree_t *tree);
313 void string_tree_iterator_next(string_tree_iterator_t *self);
314 char *string_tree_iterator_get_string(string_tree_iterator_t *self, uint32_t i);
315 bool string_tree_iterator_done(string_tree_iterator_t *self);
316 void string_tree_iterator_destroy(string_tree_iterator_t *self);
317 
318 
319 #define string_tree_iterator_foreach_token(iter, s, code) {                             \
320     string_tree_t *tree = iter->tree;                                                   \
321     for (int __pi = 0; __pi < iter->num_tokens; __pi++) {                               \
322         (s) = string_tree_get_alternative(tree, __pi, iter->path[__pi]);                \
323         code;                                                                           \
324     }                                                                                   \
325 }
326 
327 
328 
329 
330 #endif
331