1 #include "text.h"
2
3 #include "macros.h"
4
5 #include <ctype.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9
/* Writes `bytes` to `dest` as a whole number followed by a binary unit suffix
 * ("B", "KiB", "MiB", ...), e.g. 1536 -> "1KiB". The value is divided by 1024 until it drops
 * below 1024; the fractional part is discarded (no decimals are printed).
 *
 * dest  - output buffer; always NUL-terminated on return when `size` is non-zero.
 * size  - capacity of `dest` in bytes, including room for the terminator.
 * bytes - the quantity to format.
 *
 * returns the length of the string written to `dest` (terminator excluded). When the text does
 * not fit it is truncated and `size - 1` is returned. Returns 0, writing nothing, when `dest`
 * is NULL or `size` is 0.
 */
int sprint_humanread_bytes(char *dest, unsigned int size, uint64_t bytes) {
    static const char *const units[] = { "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" };
    const size_t last_unit = sizeof units / sizeof units[0] - 1;

    // Without room for even the terminator nothing can be written, and the `size - 1`
    // truncation result below would wrap around.
    if (!dest || size == 0) {
        return 0;
    }

    size_t unit = 0;
    while (bytes >= 1024 && unit < last_unit) {
        bytes /= 1024;
        unit++;
    }

    // TODO: missing decimals
    const int written = snprintf(dest, size, "%llu%s", (unsigned long long)bytes, units[unit]);

    if (written < 0) { // encoding error: leave an empty string behind
        dest[0] = '\0';
        return 0;
    }

    if ((unsigned int)written >= size) { // truncated
        return (int)(size - 1);
    }

    return written;
}
39
/* Returns the byte length announced by the lead byte at `data`: the number of consecutive
 * 1 bits at the top of that byte, with a minimum of 1.
 *
 * ASCII bytes and stray continuation bytes (10xxxxxx) therefore both report 1, and the byte
 * 0xFF reports 8. Only `data[0]` is read. */
uint8_t utf8_len(const char *data) {
    const unsigned char lead = (unsigned char)*data;
    uint8_t ones = 0;

    // Walk a single-bit mask down from the top bit until the first 0 bit is found.
    for (unsigned char mask = 0x80; mask != 0 && (lead & mask); mask >>= 1) {
        ones++;
    }

    // no validation, instead validate all utf8 when received
    return ones > 0 ? ones : 1;
}
55
/* Decodes the UTF-8 sequence starting at `data` into `*ch`.
 *
 * data - points at the lead byte; the caller must guarantee that as many bytes as the lead
 *        byte announces (up to 6) are readable.
 * ch   - receives the decoded value.
 *
 * returns the number of bytes consumed (1 to 6). Returns 0 and stores 0 in `*ch` when the
 * lead byte is 0xFE or 0xFF, which cannot start a sequence.
 *
 * No validation is done: continuation bytes are not checked, and a stray continuation byte
 * used as lead byte is decoded as if it started a 2-byte sequence. Validate input first. */
uint8_t utf8_len_read(const char *data, uint32_t *ch) {
    const unsigned char *bytes = (const unsigned char *)data;
    const unsigned char  lead  = bytes[0];

    if (!(lead & 0x80)) {
        *ch = lead;
        return 1;
    }

    // The first 0 bit below the top bit fixes both the sequence length and how many payload
    // bits the lead byte carries.
    uint8_t  len;
    uint32_t value;
    if (!(lead & 0x20)) {
        len   = 2;
        value = lead & 0x1F;
    } else if (!(lead & 0x10)) {
        len   = 3;
        value = lead & 0x0F;
    } else if (!(lead & 0x08)) {
        len   = 4;
        value = lead & 0x07;
    } else if (!(lead & 0x04)) {
        len   = 5;
        value = lead & 0x03;
    } else if (!(lead & 0x02)) {
        len   = 6;
        value = lead & 0x01;
    } else {
        // 0xFE / 0xFF: never part of UTF-8. Give the caller a defined value instead of
        // leaving *ch unassigned.
        *ch = 0;
        return 0;
    }

    // Each following byte contributes its low 6 bits.
    for (uint8_t i = 1; i < len; i++) {
        value = (value << 6) | (uint32_t)(bytes[i] & 0x3F);
    }

    *ch = value;
    return len;
}
93
/* Returns the byte length of the character that ends immediately before `data`.
 *
 * If the preceding byte is ASCII the answer is 1. Otherwise the scan walks backwards, starting
 * two bytes before `data`, until it meets a byte with bit 0x40 set, and returns the distance
 * to that byte.
 *
 * No bounds are known here: the caller must guarantee that the bytes before `data` are
 * readable and that the text is valid UTF-8. */
uint8_t utf8_unlen(char *data) {
    if (!(data[-1] & 0x80)) {
        return 1;
    }

    uint8_t back = 2;
    while (!(data[-back] & 0x40)) {
        back++;
    }

    return back;
}
104
/* Returns the number of leading bytes of `data` (at most `len`) that form well-formed UTF-8
 * sequences. Scanning stops at the first byte that starts an invalid or incomplete sequence,
 * so a return value equal to `len` means the whole buffer passed. Returns 0 when `data` is
 * NULL or `len` is not positive.
 *
 * Checked: the lead byte announces 2 to 6 bytes, the whole sequence lies inside the buffer,
 * and every following byte has the form 10xxxxxx. The lead bytes 0xFE and 0xFF are rejected;
 * utf8_len_read() cannot decode them and returns 0 for them.
 * NOT checked: overlong encodings, surrogates and code points above U+10FFFF.
 *
 * History: there have been issues with this function in the past when it was given malformed
 * data. irungentoo has previously said it will never fail when given a valid utf-8 string;
 * however, the utf8 standard says that applications are required to handle and correctly
 * respond to malformed strings, as they have been used in the past to create security
 * exploits. The function was reported to enter an endless state or segv on bad strings.
 * TODO(grayhatter) TODO(anyone) also reject overlong and out-of-range sequences. */
int utf8_validate(const uint8_t *data, int len) {
    // A negative length would put `end` before `data`, and the scan would never reach it.
    if (!data || len <= 0) {
        return 0;
    }

    const uint8_t *a = data;
    const uint8_t *const end = data + len;

    // stops when an invalid character is reached
    while (a < end) {
        if (!(*a & 0x80)) {
            a++;
            continue;
        }

        // The number of leading 1 bits in the lead byte is the announced sequence length.
        uint8_t bytes = 0;
        for (uint8_t mask = 0x80; mask != 0 && (*a & mask); mask >>= 1) {
            bytes++;
        }

        // 1 is a stray continuation byte; 7 and 8 are 0xFE / 0xFF.
        if (bytes < 2 || bytes > 6) {
            break;
        }

        // Compare distances rather than computing `a + bytes`, which could point beyond the
        // end of the buffer.
        if (end - a < bytes) {
            break;
        }

        // Validate the utf8: every following byte must be 10xxxxxx.
        for (uint8_t i = 1; i < bytes; i++) {
            if ((a[i] & 0xC0) != 0x80) {
                return (int)(a - data);
            }
        }

        a += bytes;
    }

    return (int)(a - data);
}
148
/* Returns how many bytes the UTF-8 encoding of `ch` occupies (1 to 4), or 0 when `ch` is
 * larger than 0x1FFFFF and so does not fit in four bytes. */
uint8_t unicode_to_utf8_len(uint32_t ch) {
    if (ch <= 0x7F) {
        return 1;
    }

    if (ch <= 0x7FF) {
        return 2;
    }

    if (ch <= 0xFFFF) {
        return 3;
    }

    if (ch <= 0x1FFFFF) {
        return 4;
    }

    return 0;
}
155
/* Encodes `ch` as UTF-8 into `dst`, writing 1 to 4 bytes and no terminator.
 *
 * Values above 0x1FFFFF cannot be encoded in four bytes; for those nothing is written.
 * `dst` must have room for the encoded length. The length itself is not returned. */
void unicode_to_utf8(uint32_t ch, char *dst) {
    uint8_t  count;    // total bytes in the encoding
    uint32_t lead_tag; // marker bits OR-ed into the first byte

    if (ch > 0x1FFFFF) {
        return;
    } else if (ch > 0xFFFF) {
        count    = 4;
        lead_tag = 0xF0;
    } else if (ch > 0x7FF) {
        count    = 3;
        lead_tag = 0xE0;
    } else if (ch > 0x7F) {
        count    = 2;
        lead_tag = 0xC0;
    } else {
        count    = 1;
        lead_tag = 0x00;
    }

    // Fill continuation bytes from the tail: each takes the low 6 bits still left in `ch`.
    for (uint8_t pos = count - 1; pos > 0; pos--) {
        dst[pos] = (uint8_t)((ch & (uint32_t)0x3F) | (uint32_t)0x80);
        ch >>= 6;
    }

    // Whatever remains belongs in the lead byte.
    dst[0] = (uint8_t)(ch | lead_tag);
}
183
/* Compares the first `n` bytes of `s1` and `s2`, treating the ASCII letters a-z and A-Z as
 * equal. Other bytes are compared exactly.
 *
 * returns false (0) when the two ranges match and true (1) when they differ. Unlike memcmp()
 * the result carries no ordering information. */
bool memcmp_case(const char *s1, const char *s2, uint32_t n) {
    const int to_upper = 'A' - 'a';

    while (n--) {
        char lhs = *s1++;
        char rhs = *s2++;

        // Fold lowercase ASCII onto uppercase before comparing.
        if (lhs >= 'a' && lhs <= 'z') {
            lhs = (char)(lhs + to_upper);
        }

        if (rhs >= 'a' && rhs <= 'z') {
            rhs = (char)(rhs + to_upper);
        }

        if (lhs != rhs) {
            return true;
        }
    }

    return false;
}
208
/* Returns the HTML entity that must replace `c` and stores its length in `*entity_len`.
 * Returns NULL (and a length of 0) when `c` can be copied through unchanged. */
static const char *tohtml_entity(char c, size_t *entity_len) {
    // Spelled as adjacent string literals so that tooling which decodes HTML entities found
    // in source text cannot silently turn them back into the raw characters.
    static const char lt[]  = "&" "lt;";
    static const char gt[]  = "&" "gt;";
    static const char amp[] = "&" "amp;";

    switch (c) {
        case '<': {
            *entity_len = sizeof lt - 1;
            return lt;
        }

        case '>': {
            *entity_len = sizeof gt - 1;
            return gt;
        }

        case '&': {
            *entity_len = sizeof amp - 1;
            return amp;
        }

        default: {
            *entity_len = 0;
            return NULL;
        }
    }
}

/* Returns a newly allocated, NUL-terminated copy of the first `length` bytes of `str` in which
 * '<', '>' and '&' are replaced by their HTML entities. All other bytes are copied verbatim.
 *
 * str    - input text, UTF-8; need not be NUL-terminated. NULL is treated as empty.
 * length - number of bytes of `str` to convert.
 *
 * returns a buffer the caller must free(), or NULL if the allocation fails.
 *
 * The input is scanned byte by byte. That is safe for UTF-8 because every byte of a multi-byte
 * sequence has its top bit set and can never equal one of the three ASCII characters being
 * escaped; it also means truncated or malformed sequences cannot push the scan past `length`. */
char *tohtml(const char *str, uint16_t length) {
    if (!str) {
        length = 0;
    }

    // First pass: size of the escaped text.
    size_t out_len = 0;
    for (uint16_t i = 0; i < length; i++) {
        size_t entity_len;
        out_len += tohtml_entity(str[i], &entity_len) ? entity_len : 1;
    }

    char *out = malloc(out_len + 1);
    if (!out) {
        return NULL;
    }

    // Second pass: emit.
    size_t pos = 0;
    for (uint16_t i = 0; i < length; i++) {
        size_t      entity_len;
        const char *entity = tohtml_entity(str[i], &entity_len);

        if (entity) {
            memcpy(out + pos, entity, entity_len);
            pos += entity_len;
        } else {
            out[pos++] = str[i];
        }
    }

    out[pos] = '\0';

    return out;
}
263
/* Writes the uppercase hexadecimal form of `size` bytes from `in` to `out`, two characters per
 * byte, high nibble first.
 *
 * Exactly `2 * size` characters are written; no NUL terminator is appended, so the caller must
 * add one if a C string is wanted. */
void to_hex(char *out, uint8_t *in, int size) {
    static const char digits[] = "0123456789ABCDEF";

    for (int i = 0; i != size; i++) {
        const uint8_t byte = in[i];

        out[2 * i]     = digits[byte >> 4];
        out[2 * i + 1] = digits[byte & 0xF];
    }
}
280
/* Reports whether `b` occurs somewhere inside `a`, ignoring case as defined by tolower().
 *
 * a - NUL-terminated text to search in.
 * b - NUL-terminated text to search for.
 *
 * returns true when a match is found. Returns false when there is none, when `b` is the empty
 * string, or when either argument is NULL.
 *
 * Every start position in `a` is tried in turn; a partial match that fails must not consume
 * those characters, otherwise overlapping candidates are missed (e.g. "aab" inside "aaab"). */
bool strstr_case(const char *a, const char *b) {
    if (!a || !b || !*b) {
        return false;
    }

    for (; *a; a++) {
        const char *hay    = a;
        const char *needle = b;

        // The unsigned char casts keep tolower() defined for bytes above 0x7F. Reaching the
        // end of `hay` ends the run too, because its NUL cannot equal a non-NUL needle byte.
        while (*needle && tolower((unsigned char)*hay) == tolower((unsigned char)*needle)) {
            hay++;
            needle++;
        }

        if (!*needle) {
            return true;
        }
    }

    return false;
}
299
/* Returns the length, in bytes, of the longest prefix of `string` that ends on a character
 * boundary and is strictly shorter than `shrink_length`.
 *
 * string        - UTF-8 text; NULL yields 0.
 * string_length - number of bytes available in `string`.
 * shrink_length - exclusive upper bound for the result.
 *
 * Character sizes come from utf8_len(), which looks only at each lead byte. A final character
 * whose announced size runs past `string_length` is left out, so the result never exceeds
 * `string_length`. */
uint16_t safe_shrink(const char *string, uint16_t string_length, uint16_t shrink_length) {
    if (!string) {
        return 0;
    }

    uint16_t length = 0;
    while (length < string_length) {
        // Summed in a wider type so a length close to UINT16_MAX cannot wrap around.
        const uint32_t next = (uint32_t)length + utf8_len(&string[length]);

        // Stop before a character that would reach the limit or that is cut off by the end
        // of the input.
        if (next >= shrink_length || next > string_length) {
            break;
        }

        length = (uint16_t)next;
    }

    return length;
}
318