1 /*
2 * Copyright (c) 2017, Herbert Valerio Riedel
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * * Redistributions in binary form must reproduce the above
13 * copyright notice, this list of conditions and the following
14 * disclaimer in the documentation and/or other materials provided
15 * with the distribution.
16 *
17 * * Neither the name of Herbert Valerio Riedel nor the names of other
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #if !defined(NDEBUG)
35 # warning assert(3) checks enabled
36 #endif
37
38 #include <stdint.h>
39 #include <stdbool.h>
40 #include <string.h>
41 #include <stdlib.h>
42 #include <assert.h>
43 #include <HsFFI.h>
44
45 #if !defined(SIZEOF_VOID_P)
46 # error <HsFFI.h> SIZEOF_VOID_P not defined
47 #endif
48
49 #if (SIZEOF_VOID_P) == 8
50 const static bool is_64bit = true;
51 #elif (SIZEOF_VOID_P) == 4
52 const static bool is_64bit = false;
53 #else
54 # error unexpected SIZEOF_VOID_P value
55 #endif
56
57 #if (WORDS_BIGENDIAN)
58 const static bool is_bigendian = true;
59 #else
60 const static bool is_bigendian = false;
61 #endif
62
63 #if defined(__GNUC__)
64 # define likely(x) __builtin_expect(!!(x),1)
65 # define unlikely(x) __builtin_expect(!!(x),0)
66 #else
67 # define likely(x) (x)
68 # define unlikely(x) (x)
69 #endif
70
71 /* test whether octet in UTF-8 steam is not a continuation byte, i.e. a leading byte */
72 #define utf8_lead_p(octet) (((octet) & 0xc0) != 0x80)
73
74 /* 0 <= x <= 0x110000 */
75 typedef HsWord codepoint_t;
76
77 /* Count number of code-points in well-formed utf8 string */
78 size_t
hs_text_short_length(const uint8_t buf[],const size_t n)79 hs_text_short_length(const uint8_t buf[], const size_t n)
80 {
81 size_t j = 0;
82 size_t l = 0;
83
84 /* Both GCC & Clang are able to optimise the code below quite well at -O3 */
85 for (j = 0; j < n; j++)
86 if (utf8_lead_p(buf[j]))
87 l++;
88
89 return l;
90 }
91
92 /* Locate offset of j-th code-point in well-formed utf8 string
93 *
94 */
95 size_t
hs_text_short_index_ofs(const uint8_t buf[],const size_t n,const size_t i)96 hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i)
97 {
98 if (!n)
99 return n;
100
101 size_t m = 0;
102 size_t j = 0;
103
104 for (;;) {
105 assert(m >= 0);
106 assert(j <= i);
107 assert(j <= m);
108
109 if (j == i)
110 return m; /* found */
111
112 if (i-j >= n-m)
113 return n; /* i-th char is >= buf+n */
114
115 assert(m < n);
116 const uint8_t b0 = buf[m];
117
118 if (!(b0 & 0x80))
119 m += 1; /* 0_______ */
120 else
121 switch(b0 >> 4) {
122 case 0xf: /* 11110___ */
123 m += 4;
124 break;
125 case 0xe: /* 1110____ */
126 m += 3;
127 break;
128 default: /* 110_____ */
129 m += 2;
130 break;
131 }
132
133 j += 1;
134 }
135
136 assert(0);
137 }
138
139 /* Locate offset of j-th code-point (in reverse direction) in
140 * well-formed utf8 string starting at end of buffer.
141 *
142 * The 0-th character from the end is the last character in the utf8
143 * string (if it exists).
144 *
145 * Returns original 'n' if out of bounds.
146 *
147 */
148 size_t
hs_text_short_index_ofs_rev(const uint8_t buf[],const size_t n,const size_t i)149 hs_text_short_index_ofs_rev(const uint8_t buf[], const size_t n, const size_t i)
150 {
151 size_t m = n;
152 size_t j = i;
153
154 for (;;) {
155 assert(m <= n);
156 assert(j >= 0);
157
158 if (j >= m)
159 return n; /* i-th char is < buf */
160
161 /* if (m == i-j) /\* suffix is made up only of ASCII chars, so we can shortcut *\/ */
162 /* return 0; */
163
164 /* scan until octet does not match 10_ */
165 assert(m > 0);
166 if (!(buf[--m] & 0x80))
167 goto l_cont;
168
169 assert(m > 0);
170 if (utf8_lead_p(buf[--m])) {
171 assert ((buf[m] & 0xe0) == 0xc0); /* 110_ */
172 goto l_cont;
173 }
174
175 assert(m > 0);
176 if (utf8_lead_p(buf[--m])) {
177 assert ((buf[m] & 0xf0) == 0xe0); /* 1110_ */
178 goto l_cont;
179 }
180
181 /* this must be a non-10_ octet in a well-formed stream */
182 assert(m > 0);
183 m -= 1;
184
185 assert ((buf[m] & 0xf8) == 0xf0); /* 11110_ */
186
187 l_cont:
188 assert(utf8_lead_p(buf[m]));
189
190 if (!j)
191 return m; /* found */
192
193 j -= 1;
194 }
195
196 assert(0);
197 }
198
199 /* Decode UTF8 code units into code-point
200 * Assumes buf[] points to start of a valid UTF8-encoded code-point
201 */
202 static inline uint32_t
hs_text_short_decode_cp(const uint8_t buf[])203 hs_text_short_decode_cp(const uint8_t buf[])
204 {
205 /* 7 bits | 0xxxxxxx
206 * 11 bits | 110yyyyx 10xxxxxx
207 * 16 bits | 1110yyyy 10yxxxxx 10xxxxxx
208 * 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
209 */
210
211 const uint8_t b0 = buf[0];
212
213 if (!(b0 & 0x80))
214 return b0;
215
216 uint32_t cp = 0;
217
218 switch(b0 >> 4) {
219 case 0xf: /* 11110___ */
220 assert((b0 & 0xf8) == 0xf0);
221 assert(!utf8_lead_p(buf[1]));
222 assert(!utf8_lead_p(buf[2]));
223 assert(!utf8_lead_p(buf[3]));
224 cp = ((uint32_t)(b0 & 0x07)) << (6+6+6);
225 cp |= ((uint32_t)(buf[1] & 0x3f)) << (6+6);
226 cp |= ((uint32_t)(buf[2] & 0x3f)) << 6;
227 cp |= buf[3] & 0x3f;
228 assert (cp > 0xffff); assert (cp < 0x110000);
229 return cp;
230
231 case 0xe: /* 1110____ */
232 assert(!utf8_lead_p(buf[1]));
233 assert(!utf8_lead_p(buf[2]));
234 cp = ((uint32_t)(b0 & 0x0f)) << (6+6);
235 cp |= ((uint32_t)(buf[1] & 0x3f)) << 6;
236 cp |= buf[2] & 0x3f;
237 assert (cp > 0x7ff); assert (cp < 0x10000);
238 assert (cp < 0xd800 || cp > 0xdfff);
239 return cp;
240
241 default: /* 110_____ */
242 assert((b0 & 0xe0) == 0xc0);
243 assert(!utf8_lead_p(buf[1]));
244 cp = ((uint32_t)(b0 & 0x1f)) << 6;
245 cp |= buf[1] & 0x3f;
246 assert (cp > 0x7f); assert (cp < 0x800);
247 return cp;
248 }
249 }
250
251 /* decode codepoint starting at buf[ofs] */
252 codepoint_t
hs_text_short_ofs_cp(const uint8_t buf[],const size_t ofs)253 hs_text_short_ofs_cp(const uint8_t buf[], const size_t ofs)
254 {
255 return hs_text_short_decode_cp(buf+ofs);
256 }
257
258 /* reverse-decode codepoint starting at offset right after a code-point */
259 codepoint_t
hs_text_short_ofs_cp_rev(const uint8_t * buf,const size_t ofs)260 hs_text_short_ofs_cp_rev(const uint8_t *buf, const size_t ofs)
261 {
262 /* 7 bits | 0xxxxxxx
263 * 11 bits | 110yyyyx 10xxxxxx
264 * 16 bits | 1110yyyy 10yxxxxx 10xxxxxx
265 * 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
266 */
267
268 buf = buf + ofs - 1;
269
270 /* this octet is either 10_ or 0_ */
271 uint32_t cp = *buf;
272
273 if (!(cp & 0x80))
274 return cp;
275
276 assert (!utf8_lead_p(cp));
277 cp &= 0x3f;
278
279 /* this octet is either 10_ or 110_ */
280 {
281 const uint8_t b = *(--buf);
282 assert (!utf8_lead_p(b) || ((b & 0xe0) == 0xc0));
283
284 cp |= (b & 0x3f) << 6;
285
286 if (b & 0x40) {
287 assert (cp > 0x7f); assert (cp < 0x800);
288 return cp;
289 }
290 }
291
292 /* this octet is either 10_ or 1110_ */
293 {
294 const uint8_t b = *(--buf);
295 assert (!utf8_lead_p(b) || ((b & 0xf0) == 0xe0));
296
297 if (b & 0x40) {
298 cp |= (b & 0xf) << 12;
299
300 assert (cp > 0x7ff); assert (cp < 0x10000);
301 assert (cp < 0xd800 || cp > 0xdfff);
302 return cp;
303 }
304
305 cp |= (b & 0x3f) << 12;
306 }
307
308 /* this octet must be 11110_ */
309 const uint8_t b = *(buf-1);
310 assert ((b & 0xf8) == 0xf0);
311
312 cp |= (b & 0x7) << 18;
313
314 assert (cp > 0xffff); assert (cp < 0x110000);
315 return cp;
316 }
317
318 /* Retrieve i-th code-point in (valid) UTF8 stream
319 *
320 * Returns -1 if out of bounds
321 */
322 codepoint_t
hs_text_short_index_cp(const uint8_t buf[],const size_t n,const size_t i)323 hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i)
324 {
325 const size_t ofs = hs_text_short_index_ofs(buf, n, i);
326
327 if (ofs >= n)
328 return -1;
329
330 return hs_text_short_decode_cp(&buf[ofs]);
331 }
332
333 /* Retrieve i-th code-point in (valid) UTF8 stream
334 *
335 * Returns -1 if out of bounds
336 */
337 codepoint_t
hs_text_short_index_cp_rev(const uint8_t buf[],const size_t n,const size_t i)338 hs_text_short_index_cp_rev(const uint8_t buf[], const size_t n, const size_t i)
339 {
340 const size_t ofs = hs_text_short_index_ofs_rev(buf, n, i);
341
342 if (ofs >= n)
343 return -1;
344
345 return hs_text_short_decode_cp(&buf[ofs]);
346 }
347
348 /* Validate UTF8 encoding
349
350 7 bits | 0xxxxxxx
351 11 bits | 110yyyyx 10xxxxxx
352 16 bits | 1110yyyy 10yxxxxx 10xxxxxx
353 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
354
355 Valid code-points:
356
357 [U+0000 .. U+D7FF] + [U+E000 .. U+10FFFF]
358
359 Return values:
360
361 0 -> ok
362
363 1 -> invalid byte/code-point
364
365 -1 -> truncated (1 byte missing)
366 -2 -> truncated (2 byte missing)
367 -3 -> truncated (3 byte missing)
368
369 */
370
371 int
hs_text_short_is_valid_utf8(const uint8_t buf[],const size_t n)372 hs_text_short_is_valid_utf8(const uint8_t buf[], const size_t n)
373 {
374 size_t j = 0;
375
376 while (j < n) {
377 const uint8_t b0 = buf[j++];
378
379 if (!(b0 & 0x80))
380 continue; /* b0 elem [ 0x00 .. 0x7f ] */
381
382 if ((b0 & 0xe0) == 0xc0) { /* [ 0xc0 .. 0xdf ] */
383 if (!(b0 & 0x1e)) return 1; /* 0xc0 or 0xc1; denorm */
384 if (j >= n) return -1;
385
386 goto l_trail1; /* b1 */
387 }
388
389 if ((b0 & 0xf0) == 0xe0) { /* [ 0xe0 .. 0xef ] */
390 if ((j+1) >= n) return (n-(j+2));
391
392 const uint8_t b1 = buf[j++];
393 if (utf8_lead_p(b1)) return 1; /* b1 elem [ 0x80 .. 0xbf ] */
394
395 /* if b0==0xe0: b1 elem [ 0xa0 .. 0xbf ] */
396 if (!((b0 & 0x0f) | (b1 & 0x20))) return 1; /* denorm */
397
398 /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
399 /* if b0==0xed: b1 elem [ 0x80 .. 0x9f ] */
400 if ((b0 == 0xed) && (b1 & 0x20)) return 1;
401
402 goto l_trail1; /* b2 */
403 }
404
405 if ((b0 & 0xfc) == 0xf0) { /* [ 0xf0 .. 0xf3 ] */
406 if ((j+2) >= n) return (n-(j+3));
407
408 const uint8_t b1 = buf[j++];
409
410 if (utf8_lead_p(b1)) /* b1 elem [ 0x80 .. 0xbf ] */
411 return 1;
412
413 if (!((b0 & 0x03) | (b1 & 0x30))) /* if b0==0xf0: b1 elem [ 0x90 .. 0xbf ] */
414 return 1;
415
416 goto l_trail2; /* b1, b2 */
417 }
418
419 if (b0 == 0xf4) {
420 if ((j+2) >= n) return (n-(j+3));
421
422 /* b1 */
423 if ((buf[j++] & 0xf0) != 0x80) return 1;
424 /* b1 elem [ 0x80 .. 0x8f ] */
425
426 l_trail2:
427 /* b2 */
428 if (utf8_lead_p(buf[j++])) return 1;
429 /* b2 elem [ 0x80 .. 0xbf ] */
430
431 l_trail1:
432 /* b3 */
433 if (utf8_lead_p(buf[j++])) return 1;
434 /* b3 elem [ 0x80 .. 0xbf ] */
435
436 continue;
437 }
438
439 /* invalid b0 byte */
440 return 1;
441 }
442
443 assert(j == n);
444
445 return 0;
446 }
447
448
449 /* Returns length of longest ASCII-code-point prefix.
450 */
451 size_t
hs_text_short_ascii_length(const uint8_t buf[],const size_t n)452 hs_text_short_ascii_length(const uint8_t buf[], const size_t n)
453 {
454 size_t j = 0;
455
456 if (is_64bit) {
457 /* "vectorized" optimisation checking 8 octets at once
458 *
459 * NB: A 64-bit aligned buffer is assumed. This is assumption is
460 * justified when the buffer is the payload of a `ByteArray#`.
461 */
462 const uint64_t *buf64 = (const uint64_t*)buf;
463
464 for (; (j+7) < n; j+=8, ++buf64)
465 if (*buf64 & UINT64_C(0x8080808080808080))
466 break;
467 } else {
468 /* "vectorized" optimisation checking 4 octets at once */
469 const uint32_t *buf32 = (const uint32_t*)buf;
470
471 for (; (j+3) < n; j+=4, ++buf32)
472 if (*buf32 & UINT64_C(0x80808080))
473 break;
474 }
475
476 for (; j < n; ++j)
477 if (buf[j] & 0x80)
478 return j;
479
480 return j;
481 }
482
483 /* Test whether well-formed UTF8 string contains only ASCII code-points
484 * returns 0 if not ASCII
485 *
486 * This code assumes a naturally aligned buf[]
487 */
488 int
hs_text_short_is_ascii(const uint8_t buf[],const size_t n)489 hs_text_short_is_ascii(const uint8_t buf[], const size_t n)
490 {
491 size_t j = 0;
492
493 if (n < 2)
494 return 1;
495
496 if (is_64bit) {
497 /* "vectorized" optimisation checking 8 octets at once
498 *
499 * NB: A 64-bit aligned buffer is assumed. This is assumption is
500 * justified when the buffer is the payload of a `ByteArray#`.
501 *
502 */
503 const uint64_t *buf64 = (const uint64_t*)buf;
504
505 for (; (j+7) < n; j+=8, ++buf64)
506 if (*buf64 & UINT64_C(0x8080808080808080))
507 return 0;
508
509 if (j < n) {
510 const int maskshift = (8 - (n - j)) << 3;
511 const uint64_t mask = is_bigendian ? (UINT64_C(0x8080808080808080) << maskshift) /* big endian */
512 : (UINT64_C(0x8080808080808080) >> maskshift); /* little endian */
513
514 if (*buf64 & mask)
515 return 0;
516 }
517 } else {
518 /* "vectorized" optimisation checking 4 octets at once */
519 const uint32_t *buf32 = (const uint32_t*)buf;
520
521 for (; (j+3) < n; j+=4, ++buf32)
522 if (*buf32 & UINT64_C(0x80808080))
523 return 0;
524
525 for (; j < n; ++j)
526 if (buf[j] & 0x80)
527 return 0;
528 }
529
530 return 1;
531 }
532
533 /*
534 * Compute length of (transcoded) mutf8 literal
535 *
536 * If the mutf8 literal does not contain either surrogates nor escaped
537 * NULs, a positive length is returned which matches what strlen(3)
538 * would have returned.
539 *
540 * Otherwise, a negated size is returned which corresponds to the size
541 * of a the mutf8->utf8 transcoded string.
542 *
543 */
544 HsInt
hs_text_short_mutf8_strlen(const uint8_t buf[])545 hs_text_short_mutf8_strlen(const uint8_t buf[])
546 {
547 size_t j = 0;
548 size_t nulls = 0;
549 bool surr_seen = false;
550
551 for (;;) {
552 const uint8_t b0 = buf[j];
553
554 if (unlikely(!b0))
555 break;
556
557 if (likely(!(b0 & 0x80)))
558 j += 1; /* 0_______ */
559 else
560 switch(b0 >> 4) {
561 case 0xf: /* 11110___ */
562 j += 4;
563 break;
564 case 0xe: /* 1110____ */
565 /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
566 if (unlikely(!surr_seen && (b0 == 0xed) && (buf[j+1] & 0x20)))
567 surr_seen = true;
568 j += 3;
569 break;
570 default: /* 110_____ */
571 /* escaped NUL */
572 if (unlikely((b0 == 0xc0) && (buf[j+1] == 0x80)))
573 nulls += 1;
574 j += 2;
575 break;
576 }
577 } /* for */
578
579
580 if ((nulls > 0) || surr_seen)
581 return -(HsInt)(j - nulls);
582
583 return j;
584 }
585
586 /* Transcode Modified UTF-8 to proper UTF-8
587 *
588 * This involves
589 *
590 * 1. Unescape denormal 2-byte NULs (0xC0 0x80)
591 * 2. Rewrite surrogate pairs to U+FFFD
592 */
593 void
hs_text_short_mutf8_trans(const uint8_t src0[],uint8_t dst0[])594 hs_text_short_mutf8_trans(const uint8_t src0[], uint8_t dst0[])
595 {
596 const uint8_t *src = src0;
597 uint8_t *dst = dst0;
598
599 for (;;) {
600 const uint8_t b0 = *src++;
601 assert(utf8_lead_p(b0));
602
603 if (likely(!(b0 & 0x80))) { /* 0_______ */
604 if (unlikely(!b0))
605 break;
606
607 *dst++ = b0;
608 continue;
609 }
610
611 switch(b0 >> 4) {
612 case 0xf: /* 11110___ */
613 assert(!utf8_lead_p(src[0]));
614 assert(!utf8_lead_p(src[1]));
615 assert(!utf8_lead_p(src[2]));
616 *dst++ = b0;
617 *dst++ = *src++;
618 *dst++ = *src++;
619 *dst++ = *src++;
620 break;
621
622 case 0xe: { /* 1110____ */
623 const uint8_t b1 = *src++;
624 const uint8_t b2 = *src++;
625 assert(!utf8_lead_p(b1));
626 assert(!utf8_lead_p(b2));
627 if (unlikely((b0 == 0xed) && (b1 & 0x20))) {
628 /* UTF16 Surrogate pairs [U+D800 .. U+DFFF]
629 * -> translate into U+FFFD
630 */
631 *dst++ = 0xef;
632 *dst++ = 0xbf;
633 *dst++ = 0xbd;
634 } else {
635 *dst++ = b0;
636 *dst++ = b1;
637 *dst++ = b2;
638 }
639 break;
640 }
641 default: { /* 110_____ */
642 const uint8_t b1 = *src++;
643 assert(!utf8_lead_p(b1));
644 if (unlikely((b0 == 0xc0) && (b1 == 0x80))) {
645 /* escaped/denormal U+0000 -> normalize */
646 *dst++ = 0x00;
647 } else {
648 *dst++ = b0;
649 *dst++ = b1;
650 }
651 break;
652 }
653 } /* switch */
654 } /* for */
655
656 assert(labs(hs_text_short_mutf8_strlen(src0)) == (dst - dst0));
657 }
658