1 #include "text.h"
2 
3 #include "macros.h"
4 
5 #include <ctype.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 
/* Formats `bytes` as a whole number followed by a binary unit suffix
 * ("B", "KiB", "MiB", ...), e.g. 1536 -> "1KiB". The value is repeatedly divided by
 * 1024 with integer division, so the result is truncated; no decimals are printed.
 *
 * dest  - output buffer; always NUL-terminated on return when `size` > 0
 * size  - capacity of `dest` in bytes, terminator included
 * bytes - the quantity to format
 *
 * returns the length of the string written to `dest` (terminator excluded). If the
 * text does not fit it is truncated and `size - 1` is returned. Returns 0 when `dest`
 * is NULL, `size` is 0, or formatting fails.
 */
int sprint_humanread_bytes(char *dest, unsigned int size, uint64_t bytes) {
    static const char *const units[] = { "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" };
    const size_t last_unit = sizeof(units) / sizeof(units[0]) - 1;

    // Nothing can be written, not even a terminator; `size - 1` below would wrap around.
    if (!dest || size == 0) {
        return 0;
    }

    size_t unit = 0;
    while (bytes >= 1024 && unit < last_unit) {
        bytes /= 1024;
        unit++;
    }

    // A uint64_t is exhausted well before the last unit, so `bytes` is below 1024 here
    // and the narrowing cast cannot lose information.
    const int written = snprintf(dest, size, "%u%s", (unsigned int)bytes, units[unit]);

    if (written < 0) { // encoding error
        dest[0] = '\0';
        return 0;
    }

    if ((unsigned int)written >= size) { // truncated
        return (int)(size - 1);
    }

    return written;
}
39 
/* Returns the sequence length announced by the first byte at `data`: 1 for an ASCII
 * byte, otherwise the number of consecutive 1 bits at the top of the byte (so a stray
 * continuation byte yields 1 and 0xFF yields 8). Only the first byte is inspected.
 */
uint8_t utf8_len(const char *data) {
    const unsigned char lead = (unsigned char)*data;

    // High bit clear: plain single-byte character.
    if (lead < 0x80) {
        return 1;
    }

    // The high bit accounts for one byte; every further set bit directly below it
    // announces one more.
    uint8_t count = 1;
    for (unsigned char mask = 0x40; mask != 0 && (lead & mask) != 0; mask >>= 1) {
        count++;
    }

    // no validation, instead validate all utf8 when received
    return count;
}
55 
/* Decodes the UTF-8 sequence starting at `data` and stores its code point in `*ch`.
 *
 * The length is taken from the lead byte alone (legacy 5- and 6-byte forms included);
 * continuation bytes are masked with 0x3F but not checked, so the input must have
 * been validated beforehand and must contain the whole sequence.
 *
 * returns the number of bytes consumed (1-6), or 0 with `*ch` set to 0 when the lead
 * byte is 0xFE or 0xFF, which cannot start any sequence.
 */
uint8_t utf8_len_read(const char *data, uint32_t *ch) {
    // Work on unsigned bytes so the result never depends on the signedness of `char`.
    const unsigned char *bytes = (const unsigned char *)data;
    const unsigned char  lead  = bytes[0];

    if (lead < 0x80) {
        *ch = lead;
        return 1;
    }

    // Find the first clear bit at or below 0x20; each set bit passed on the way
    // announces one more byte in the sequence.
    uint8_t       len    = 2;
    unsigned char marker = 0x20;
    while (marker >= 0x02 && (lead & marker)) {
        marker >>= 1;
        len++;
    }

    if (marker < 0x02) {
        // 0xFE / 0xFF: give the caller a defined value instead of leaving *ch untouched.
        *ch = 0;
        return 0;
    }

    // The payload bits of the lead byte are the ones below the marker bit.
    uint32_t code = (uint32_t)(lead & (marker - 1));
    for (uint8_t k = 1; k < len; k++) {
        code = (code << 6) | (uint32_t)(bytes[k] & 0x3F);
    }

    *ch = code;
    return len;
}
93 
/* Returns the byte length of the character that ends immediately before `data`.
 *
 * If the preceding byte has its high bit clear the answer is 1. Otherwise the scan
 * walks backwards, starting two bytes back, until it meets a byte with bit 0x40 set.
 * There is no lower bound on the scan, so `data` must be preceded by a complete
 * sequence.
 */
uint8_t utf8_unlen(char *data) {
    // Single-byte character right before the cursor.
    if (!(data[-1] & 0x80)) {
        return 1;
    }

    // Multi-byte: keep stepping back while the byte lacks the 0x40 bit.
    uint8_t back = 2;
    while (!(data[-back] & 0x40)) {
        back++;
    }

    return back;
}
104 
/* Returns the number of leading bytes of `data` that consist solely of complete,
 * structurally well-formed UTF-8 sequences; scanning stops at the first stray
 * continuation byte, impossible lead byte, truncated sequence or bad continuation.
 * Returns 0 when `data` is NULL or `len` is not positive.
 *
 * The UTF-8 standard requires applications to handle and correctly respond to malformed
 * strings, as they have been used in the past to create security exploits. This function
 * therefore must terminate and stay inside [data, data + len) for every input; earlier
 * versions were reported to hang or segfault on bad strings.
 *
 * Lead bytes announcing more than 6 bytes (0xFE, 0xFF) are rejected, matching the
 * longest sequence utf8_len_read() can decode.
 *
 * TODO(grayhatter) TODO(anyone): overlong encodings, UTF-16 surrogates and code points
 * above U+10FFFF are still accepted. */
int utf8_validate(const uint8_t *data, int len) {
    // A negative length would put `end` before `data` and the scan would never meet it.
    if (!data || len <= 0) {
        return 0;
    }

    const uint8_t       *cur = data;
    const uint8_t *const end = data + len;

    while (cur < end) {
        const uint8_t lead = *cur;

        if (lead < 0x80) { // ASCII
            cur++;
            continue;
        }

        // The number of leading 1 bits is the announced sequence length.
        int need = 0;
        for (uint8_t mask = 0x80; mask != 0 && (lead & mask) != 0; mask >>= 1) {
            need++;
        }

        // 1 leading bit: continuation byte with no lead. More than 6: 0xFE / 0xFF.
        if (need < 2 || need > 6) {
            break;
        }

        // Compare distances rather than forming a pointer past the buffer.
        if (end - cur < need) {
            break;
        }

        // Every trailing byte must look like 10xxxxxx.
        int k = 1;
        while (k < need && (cur[k] & 0xC0) == 0x80) {
            k++;
        }
        if (k != need) {
            break;
        }

        cur += need;
    }

    return (int)(cur - data);
}
148 
/* Returns how many bytes unicode_to_utf8() writes for code point `ch`: 1 to 4, or 0
 * when `ch` is above 0x1FFFFF and therefore does not fit a 4-byte sequence.
 */
uint8_t unicode_to_utf8_len(uint32_t ch) {
    if (ch <= 0x7F) {
        return 1;
    }
    if (ch <= 0x7FF) {
        return 2;
    }
    if (ch <= 0xFFFF) {
        return 3;
    }
    if (ch <= 0x1FFFFF) {
        return 4;
    }
    return 0;
}
155 
/* Encodes code point `ch` as UTF-8 into `dst`, writing 1 to 4 bytes and no terminator.
 * Nothing is written when `ch` is above 0x1FFFFF. `dst` must have room for the
 * sequence; unicode_to_utf8_len() reports the size for the same ranges.
 */
void unicode_to_utf8(uint32_t ch, char *dst) {
    // ASCII is stored verbatim.
    if (ch <= 0x7F) {
        dst[0] = (char)(uint8_t)ch;
        return;
    }

    // Pick the sequence length and the marker bits of its lead byte.
    uint8_t  total;
    uint32_t lead_bits;
    if (ch <= 0x7FF) {
        total     = 2;
        lead_bits = 0xC0;
    } else if (ch <= 0xFFFF) {
        total     = 3;
        lead_bits = 0xE0;
    } else if (ch <= 0x1FFFFF) {
        total     = 4;
        lead_bits = 0xF0;
    } else {
        return; // not representable, leave dst untouched
    }

    // Fill the continuation bytes from the last one backwards, six payload bits each.
    for (uint8_t slot = (uint8_t)(total - 1); slot > 0; slot--) {
        dst[slot] = (char)(uint8_t)((ch & 0x3F) | 0x80);
        ch >>= 6;
    }

    // Whatever is left of `ch` belongs in the lead byte.
    dst[0] = (char)(uint8_t)(ch | lead_bits);
}
183 
/* Compares the first `n` bytes of `s1` and `s2`, treating ASCII letters a-z and A-Z
 * as equal. Bytes outside a-z are compared as-is.
 *
 * returns false (0) when the two ranges match and true (1) when they differ, i.e.
 * zero means "equal" as with memcmp().
 */
bool memcmp_case(const char *s1, const char *s2, uint32_t n) {
    const int to_upper = 'A' - 'a';

    uint32_t pos = 0;
    while (pos < n) {
        char left  = s1[pos];
        char right = s2[pos];

        // Fold lowercase ASCII onto uppercase before comparing.
        if (left >= 'a' && left <= 'z') {
            left = (char)(left + to_upper);
        }
        if (right >= 'a' && right <= 'z') {
            right = (char)(right + to_upper);
        }

        if (left != right) {
            return true;
        }

        pos++;
    }

    return false;
}
208 
/* Returns a newly allocated, NUL-terminated copy of the first `length` bytes of `str`
 * with the HTML metacharacters replaced: '<' -> "&lt;", '>' -> "&gt;", '&' -> "&amp;".
 * All other bytes are copied unchanged.
 *
 * The input is processed byte by byte. The three metacharacters are ASCII and can never
 * be part of a multi-byte UTF-8 sequence, so valid UTF-8 passes through intact, while
 * truncated or malformed sequences can neither push the scan past `length` nor hide a
 * metacharacter from escaping.
 *
 * returns the escaped string, which the caller must free(), or NULL if the allocation
 * fails.
 */
char *tohtml(const char *str, uint16_t length) {
    // First pass: work out how much the escapes add to the output.
    size_t out_len = length;
    for (uint16_t i = 0; i < length; i++) {
        switch (str[i]) {
            case '<':
            case '>': {
                out_len += 3; // 4-byte entity replaces 1 byte
                break;
            }

            case '&': {
                out_len += 4; // 5-byte entity replaces 1 byte
                break;
            }

            default: {
                break;
            }
        }
    }

    char *out = malloc(out_len + 1);
    if (!out) {
        return NULL;
    }

    // Second pass: emit the escaped text.
    char *cursor = out;
    for (uint16_t i = 0; i < length; i++) {
        switch (str[i]) {
            case '<': {
                memcpy(cursor, "&lt;", 4);
                cursor += 4;
                break;
            }

            case '>': {
                memcpy(cursor, "&gt;", 4);
                cursor += 4;
                break;
            }

            case '&': {
                memcpy(cursor, "&amp;", 5);
                cursor += 5;
                break;
            }

            default: {
                *cursor++ = str[i];
                break;
            }
        }
    }

    *cursor = '\0';

    return out;
}
263 
/* Writes the uppercase hexadecimal form of `size` bytes from `in` to `out`, two
 * characters per byte, high nibble first.
 *
 * `out` must have room for 2 * size characters; no NUL terminator is appended.
 * Nothing is written when `size` is zero or negative.
 */
void to_hex(char *out, uint8_t *in, int size) {
    static const char digits[] = "0123456789ABCDEF";

    // Counting upwards makes a negative size a no-op instead of a runaway loop.
    for (int k = 0; k < size; k++) {
        const uint8_t byte = in[k];

        *out++ = digits[byte >> 4];
        *out++ = digits[byte & 0x0F];
    }
}
280 
/* Reports whether `b` occurs anywhere inside `a`, ignoring case as defined by
 * tolower(). Both arguments must be NUL-terminated strings.
 *
 * returns true when a match exists, false otherwise. An empty `b` never matches
 * (unlike strstr()); that long-standing behaviour is kept for existing callers.
 */
bool strstr_case(const char *a, const char *b) {
    if (!*b) {
        return false;
    }

    // Try every start position in turn. Restarting the needle from each position is what
    // lets "aab" be found in "aaab", where a match begins inside a failed partial match.
    for (; *a; a++) {
        const char *hay    = a;
        const char *needle = b;

        // <ctype.h> functions require values in the unsigned char range. A NUL in `hay`
        // never equals a non-NUL needle byte, so the scan stops at the end of `a`.
        while (*needle && tolower((unsigned char)*hay) == tolower((unsigned char)*needle)) {
            hay++;
            needle++;
        }

        if (!*needle) {
            return true;
        }
    }

    return false;
}
299 
/* Walks `string` one character at a time (character sizes come from utf8_len()) and
 * returns a byte length that ends on a character boundary.
 *
 * The walk stops before the first character whose end offset would reach or exceed
 * `shrink_length`, so when that happens the result is strictly below `shrink_length`.
 * If the walk reaches `string_length` first, the offset reached is returned.
 * Returns 0 when `string` is NULL.
 */
uint16_t safe_shrink(const char *string, uint16_t string_length, uint16_t shrink_length) {
    if (!string) {
        return 0;
    }

    uint16_t kept = 0;
    while (kept < string_length) {
        const uint8_t step = utf8_len(string + kept);

        // Taking this character would hit the limit: stop on the boundary before it.
        if ((uint16_t)(kept + step) >= shrink_length) {
            break;
        }

        kept = (uint16_t)(kept + step);
    }

    return kept;
}
318