1 /*
2  * Copyright (c) 2017, Herbert Valerio Riedel
3  *
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  *     * Redistributions of source code must retain the above copyright
10  *       notice, this list of conditions and the following disclaimer.
11  *
12  *     * Redistributions in binary form must reproduce the above
13  *       copyright notice, this list of conditions and the following
14  *       disclaimer in the documentation and/or other materials provided
15  *       with the distribution.
16  *
17  *     * Neither the name of Herbert Valerio Riedel nor the names of other
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #if !defined(NDEBUG)
35 # warning assert(3) checks enabled
36 #endif
37 
38 #include <stdint.h>
39 #include <stdbool.h>
40 #include <string.h>
41 #include <stdlib.h>
42 #include <assert.h>
43 #include <HsFFI.h>
44 
45 #if !defined(SIZEOF_VOID_P)
46 # error <HsFFI.h> SIZEOF_VOID_P not defined
47 #endif
48 
49 #if (SIZEOF_VOID_P) == 8
50 const static bool is_64bit = true;
51 #elif (SIZEOF_VOID_P) == 4
52 const static bool is_64bit = false;
53 #else
54 # error unexpected SIZEOF_VOID_P value
55 #endif
56 
57 #if (WORDS_BIGENDIAN)
58 const static bool is_bigendian = true;
59 #else
60 const static bool is_bigendian = false;
61 #endif
62 
63 #if defined(__GNUC__)
64 # define likely(x)     __builtin_expect(!!(x),1)
65 # define unlikely(x)   __builtin_expect(!!(x),0)
66 #else
67 # define likely(x)     (x)
68 # define unlikely(x)   (x)
69 #endif
70 
71 /* test whether octet in UTF-8 steam is not a continuation byte, i.e. a leading byte */
72 #define utf8_lead_p(octet) (((octet) & 0xc0) != 0x80)
73 
74 /* 0 <= x <= 0x110000 */
75 typedef HsWord codepoint_t;
76 
77 /* Count number of code-points in well-formed utf8 string */
78 size_t
hs_text_short_length(const uint8_t buf[],const size_t n)79 hs_text_short_length(const uint8_t buf[], const size_t n)
80 {
81   size_t j = 0;
82   size_t l = 0;
83 
84   /* Both GCC & Clang are able to optimise the code below quite well at -O3 */
85   for (j = 0; j < n; j++)
86     if (utf8_lead_p(buf[j]))
87       l++;
88 
89   return l;
90 }
91 
92 /* Locate offset of j-th code-point in well-formed utf8 string
93  *
94  */
95 size_t
hs_text_short_index_ofs(const uint8_t buf[],const size_t n,const size_t i)96 hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i)
97 {
98   if (!n)
99     return n;
100 
101   size_t m = 0;
102   size_t j = 0;
103 
104   for (;;) {
105     assert(m >= 0);
106     assert(j <= i);
107     assert(j <= m);
108 
109     if (j == i)
110       return m; /* found */
111 
112     if (i-j >= n-m)
113       return n; /* i-th char is >= buf+n */
114 
115     assert(m < n);
116     const uint8_t b0 = buf[m];
117 
118     if (!(b0 & 0x80))
119       m += 1;   /* 0_______ */
120     else
121       switch(b0 >> 4) {
122       case 0xf: /* 11110___ */
123         m += 4;
124         break;
125       case 0xe: /* 1110____ */
126         m += 3;
127         break;
128       default:  /* 110_____ */
129         m += 2;
130         break;
131       }
132 
133     j += 1;
134   }
135 
136   assert(0);
137 }
138 
139 /* Locate offset of j-th code-point (in reverse direction) in
140  * well-formed utf8 string starting at end of buffer.
141  *
142  * The 0-th character from the end is the last character in the utf8
143  * string (if it exists).
144  *
145  * Returns original 'n' if out of bounds.
146  *
147  */
148 size_t
hs_text_short_index_ofs_rev(const uint8_t buf[],const size_t n,const size_t i)149 hs_text_short_index_ofs_rev(const uint8_t buf[], const size_t n, const size_t i)
150 {
151   size_t m = n;
152   size_t j = i;
153 
154   for (;;) {
155     assert(m <= n);
156     assert(j >= 0);
157 
158     if (j >= m)
159       return n; /* i-th char is < buf */
160 
161     /* if (m == i-j) /\* suffix is made up only of ASCII chars, so we can shortcut *\/ */
162     /*   return 0; */
163 
164     /* scan until octet does not match 10_ */
165     assert(m > 0);
166     if (!(buf[--m] & 0x80))
167       goto l_cont;
168 
169     assert(m > 0);
170     if (utf8_lead_p(buf[--m])) {
171       assert ((buf[m] & 0xe0) == 0xc0); /* 110_ */
172       goto l_cont;
173     }
174 
175     assert(m > 0);
176     if (utf8_lead_p(buf[--m])) {
177       assert ((buf[m] & 0xf0) == 0xe0); /* 1110_ */
178       goto l_cont;
179     }
180 
181     /* this must be a non-10_ octet in a well-formed stream */
182     assert(m > 0);
183     m -= 1;
184 
185     assert ((buf[m] & 0xf8) == 0xf0); /* 11110_ */
186 
187   l_cont:
188     assert(utf8_lead_p(buf[m]));
189 
190     if (!j)
191       return m; /* found */
192 
193     j -= 1;
194   }
195 
196   assert(0);
197 }
198 
199 /* Decode UTF8 code units into code-point
200  * Assumes buf[] points to start of a valid UTF8-encoded code-point
201  */
202 static inline uint32_t
hs_text_short_decode_cp(const uint8_t buf[])203 hs_text_short_decode_cp(const uint8_t buf[])
204 {
205   /*  7 bits | 0xxxxxxx
206    * 11 bits | 110yyyyx  10xxxxxx
207    * 16 bits | 1110yyyy  10yxxxxx  10xxxxxx
208    * 21 bits | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
209    */
210 
211   const uint8_t b0 = buf[0];
212 
213   if (!(b0 & 0x80))
214     return b0;
215 
216   uint32_t cp = 0;
217 
218   switch(b0 >> 4) {
219   case 0xf: /* 11110___ */
220     assert((b0 & 0xf8) == 0xf0);
221     assert(!utf8_lead_p(buf[1]));
222     assert(!utf8_lead_p(buf[2]));
223     assert(!utf8_lead_p(buf[3]));
224     cp  = ((uint32_t)(b0     & 0x07)) << (6+6+6);
225     cp |= ((uint32_t)(buf[1] & 0x3f)) << (6+6);
226     cp |= ((uint32_t)(buf[2] & 0x3f)) << 6;
227     cp |=             buf[3] & 0x3f;
228     assert (cp > 0xffff); assert (cp < 0x110000);
229     return cp;
230 
231   case 0xe: /* 1110____ */
232     assert(!utf8_lead_p(buf[1]));
233     assert(!utf8_lead_p(buf[2]));
234     cp  = ((uint32_t)(b0     & 0x0f)) << (6+6);
235     cp |= ((uint32_t)(buf[1] & 0x3f)) << 6;
236     cp |=             buf[2] & 0x3f;
237     assert (cp > 0x7ff); assert (cp < 0x10000);
238     assert (cp < 0xd800 || cp > 0xdfff);
239     return cp;
240 
241   default:  /* 110_____ */
242     assert((b0 & 0xe0) == 0xc0);
243     assert(!utf8_lead_p(buf[1]));
244     cp  = ((uint32_t)(b0     & 0x1f)) << 6;
245     cp |=             buf[1] & 0x3f;
246     assert (cp > 0x7f); assert (cp < 0x800);
247     return cp;
248   }
249 }
250 
251 /* decode codepoint starting at buf[ofs] */
252 codepoint_t
hs_text_short_ofs_cp(const uint8_t buf[],const size_t ofs)253 hs_text_short_ofs_cp(const uint8_t buf[], const size_t ofs)
254 {
255   return hs_text_short_decode_cp(buf+ofs);
256 }
257 
258 /* reverse-decode codepoint starting at offset right after a code-point */
259 codepoint_t
hs_text_short_ofs_cp_rev(const uint8_t * buf,const size_t ofs)260 hs_text_short_ofs_cp_rev(const uint8_t *buf, const size_t ofs)
261 {
262   /*  7 bits | 0xxxxxxx
263    * 11 bits | 110yyyyx  10xxxxxx
264    * 16 bits | 1110yyyy  10yxxxxx  10xxxxxx
265    * 21 bits | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
266    */
267 
268   buf = buf + ofs - 1;
269 
270   /* this octet is either 10_ or 0_ */
271   uint32_t cp = *buf;
272 
273   if (!(cp & 0x80))
274     return cp;
275 
276   assert (!utf8_lead_p(cp));
277   cp &= 0x3f;
278 
279   /* this octet is either 10_ or 110_ */
280   {
281     const uint8_t b = *(--buf);
282     assert (!utf8_lead_p(b) || ((b & 0xe0) == 0xc0));
283 
284     cp |=  (b & 0x3f) << 6;
285 
286     if (b & 0x40) {
287       assert (cp > 0x7f); assert (cp < 0x800);
288       return cp;
289     }
290   }
291 
292   /* this octet is either 10_ or 1110_ */
293   {
294     const uint8_t b = *(--buf);
295     assert (!utf8_lead_p(b) || ((b & 0xf0) == 0xe0));
296 
297     if (b & 0x40) {
298       cp |= (b & 0xf) << 12;
299 
300       assert (cp > 0x7ff); assert (cp < 0x10000);
301       assert (cp < 0xd800 || cp > 0xdfff);
302       return cp;
303     }
304 
305     cp |= (b & 0x3f) << 12;
306   }
307 
308   /* this octet must be 11110_ */
309   const uint8_t b = *(buf-1);
310   assert ((b & 0xf8) == 0xf0);
311 
312   cp |= (b & 0x7) << 18;
313 
314   assert (cp > 0xffff); assert (cp < 0x110000);
315   return cp;
316 }
317 
318 /* Retrieve i-th code-point in (valid) UTF8 stream
319  *
320  * Returns -1 if out of bounds
321  */
322 codepoint_t
hs_text_short_index_cp(const uint8_t buf[],const size_t n,const size_t i)323 hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i)
324 {
325   const size_t ofs = hs_text_short_index_ofs(buf, n, i);
326 
327   if (ofs >= n)
328     return -1;
329 
330   return hs_text_short_decode_cp(&buf[ofs]);
331 }
332 
333 /* Retrieve i-th code-point in (valid) UTF8 stream
334  *
335  * Returns -1 if out of bounds
336  */
337 codepoint_t
hs_text_short_index_cp_rev(const uint8_t buf[],const size_t n,const size_t i)338 hs_text_short_index_cp_rev(const uint8_t buf[], const size_t n, const size_t i)
339 {
340   const size_t ofs = hs_text_short_index_ofs_rev(buf, n, i);
341 
342   if (ofs >= n)
343     return -1;
344 
345   return hs_text_short_decode_cp(&buf[ofs]);
346 }
347 
348 /* Validate UTF8 encoding
349 
350  7 bits | 0xxxxxxx
351 11 bits | 110yyyyx  10xxxxxx
352 16 bits | 1110yyyy  10yxxxxx  10xxxxxx
353 21 bits | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
354 
355 Valid code-points:
356 
357  [U+0000 .. U+D7FF] + [U+E000 .. U+10FFFF]
358 
359 Return values:
360 
361   0 -> ok
362 
363   1 -> invalid byte/code-point
364 
365  -1 -> truncated (1 byte missing)
366  -2 -> truncated (2 byte missing)
367  -3 -> truncated (3 byte missing)
368 
369 */
370 
371 int
hs_text_short_is_valid_utf8(const uint8_t buf[],const size_t n)372 hs_text_short_is_valid_utf8(const uint8_t buf[], const size_t n)
373 {
374   size_t j = 0;
375 
376   while (j < n) {
377     const uint8_t b0 = buf[j++];
378 
379     if (!(b0 & 0x80))
380       continue; /* b0 elem [ 0x00 .. 0x7f ] */
381 
382     if ((b0 & 0xe0) == 0xc0) { /* [ 0xc0 .. 0xdf ] */
383       if (!(b0 & 0x1e)) return 1; /* 0xc0 or 0xc1; denorm */
384       if (j >= n) return -1;
385 
386       goto l_trail1; /* b1 */
387     }
388 
389     if ((b0 & 0xf0) == 0xe0) { /* [ 0xe0 .. 0xef ] */
390       if ((j+1) >= n) return (n-(j+2));
391 
392       const uint8_t b1 = buf[j++];
393       if (utf8_lead_p(b1)) return 1; /* b1 elem [ 0x80 .. 0xbf ] */
394 
395       /* if b0==0xe0: b1 elem [ 0xa0 .. 0xbf ] */
396       if (!((b0 & 0x0f) | (b1 & 0x20))) return 1; /* denorm */
397 
398       /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
399       /* if b0==0xed: b1 elem [ 0x80 .. 0x9f ] */
400       if ((b0 == 0xed) && (b1 & 0x20)) return 1;
401 
402       goto l_trail1; /* b2 */
403     }
404 
405     if ((b0 & 0xfc) == 0xf0) { /* [ 0xf0 .. 0xf3 ] */
406       if ((j+2) >= n) return (n-(j+3));
407 
408       const uint8_t b1 = buf[j++];
409 
410       if (utf8_lead_p(b1))         /* b1 elem [ 0x80 .. 0xbf ] */
411         return 1;
412 
413       if (!((b0 & 0x03) | (b1 & 0x30))) /* if b0==0xf0: b1 elem [ 0x90 .. 0xbf ] */
414         return 1;
415 
416       goto l_trail2; /* b1, b2 */
417     }
418 
419     if (b0 == 0xf4) {
420       if ((j+2) >= n) return (n-(j+3));
421 
422       /* b1 */
423       if ((buf[j++] & 0xf0) != 0x80) return 1;
424       /* b1 elem [ 0x80 .. 0x8f ] */
425 
426     l_trail2:
427       /* b2 */
428       if (utf8_lead_p(buf[j++])) return 1;
429       /* b2 elem [ 0x80 .. 0xbf ] */
430 
431     l_trail1:
432       /* b3 */
433       if (utf8_lead_p(buf[j++])) return 1;
434       /* b3 elem [ 0x80 .. 0xbf ] */
435 
436       continue;
437     }
438 
439     /* invalid b0 byte */
440     return 1;
441   }
442 
443   assert(j == n);
444 
445   return 0;
446 }
447 
448 
449 /* Returns length of longest ASCII-code-point prefix.
450  */
451 size_t
hs_text_short_ascii_length(const uint8_t buf[],const size_t n)452 hs_text_short_ascii_length(const uint8_t buf[], const size_t n)
453 {
454   size_t j = 0;
455 
456   if (is_64bit) {
457     /* "vectorized" optimisation checking 8 octets at once
458      *
459      * NB: A 64-bit aligned buffer is assumed. This is assumption is
460      * justified when the buffer is the payload of a `ByteArray#`.
461      */
462     const uint64_t *buf64 = (const uint64_t*)buf;
463 
464     for (; (j+7) < n; j+=8, ++buf64)
465       if (*buf64 & UINT64_C(0x8080808080808080))
466         break;
467   } else {
468     /* "vectorized" optimisation checking 4 octets at once */
469     const uint32_t *buf32 = (const uint32_t*)buf;
470 
471     for (; (j+3) < n; j+=4, ++buf32)
472       if (*buf32 & UINT64_C(0x80808080))
473         break;
474   }
475 
476   for (; j < n; ++j)
477     if (buf[j] & 0x80)
478       return j;
479 
480   return j;
481 }
482 
483 /* Test whether well-formed UTF8 string contains only ASCII code-points
484  * returns 0 if not ASCII
485  *
486  * This code assumes a naturally aligned buf[]
487  */
488 int
hs_text_short_is_ascii(const uint8_t buf[],const size_t n)489 hs_text_short_is_ascii(const uint8_t buf[], const size_t n)
490 {
491   size_t j = 0;
492 
493   if (n < 2)
494     return 1;
495 
496   if (is_64bit) {
497     /* "vectorized" optimisation checking 8 octets at once
498      *
499      * NB: A 64-bit aligned buffer is assumed. This is assumption is
500      * justified when the buffer is the payload of a `ByteArray#`.
501      *
502      */
503     const uint64_t *buf64 = (const uint64_t*)buf;
504 
505     for (; (j+7) < n; j+=8, ++buf64)
506       if (*buf64 & UINT64_C(0x8080808080808080))
507         return 0;
508 
509     if (j < n) {
510       const int maskshift = (8 - (n - j)) << 3;
511       const uint64_t mask = is_bigendian ? (UINT64_C(0x8080808080808080) << maskshift)  /* big endian */
512                                          : (UINT64_C(0x8080808080808080) >> maskshift); /* little endian */
513 
514       if (*buf64 & mask)
515         return 0;
516     }
517   } else {
518     /* "vectorized" optimisation checking 4 octets at once */
519     const uint32_t *buf32 = (const uint32_t*)buf;
520 
521     for (; (j+3) < n; j+=4, ++buf32)
522       if (*buf32 & UINT64_C(0x80808080))
523         return 0;
524 
525     for (; j < n; ++j)
526       if (buf[j] & 0x80)
527         return 0;
528   }
529 
530   return 1;
531 }
532 
533 /*
534  * Compute length of (transcoded) mutf8 literal
535  *
536  * If the mutf8 literal does not contain either surrogates nor escaped
537  * NULs, a positive length is returned which matches what strlen(3)
538  * would have returned.
539  *
540  * Otherwise, a negated size is returned which corresponds to the size
541  * of a the mutf8->utf8 transcoded string.
542  *
543  */
544 HsInt
hs_text_short_mutf8_strlen(const uint8_t buf[])545 hs_text_short_mutf8_strlen(const uint8_t buf[])
546 {
547   size_t j = 0;
548   size_t nulls = 0;
549   bool surr_seen = false;
550 
551   for (;;) {
552     const uint8_t b0 = buf[j];
553 
554     if (unlikely(!b0))
555       break;
556 
557     if (likely(!(b0 & 0x80)))
558       j += 1;   /* 0_______ */
559     else
560       switch(b0 >> 4) {
561       case 0xf: /* 11110___ */
562         j += 4;
563         break;
564       case 0xe: /* 1110____ */
565         /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
566         if (unlikely(!surr_seen && (b0 == 0xed) && (buf[j+1] & 0x20)))
567           surr_seen = true;
568         j += 3;
569         break;
570       default:  /* 110_____ */
571         /* escaped NUL */
572         if (unlikely((b0 == 0xc0) && (buf[j+1] == 0x80)))
573           nulls += 1;
574         j += 2;
575         break;
576       }
577   } /* for */
578 
579 
580   if ((nulls > 0) || surr_seen)
581     return -(HsInt)(j - nulls);
582 
583   return j;
584 }
585 
586 /* Transcode Modified UTF-8 to proper UTF-8
587  *
588  * This involves
589  *
590  *  1. Unescape denormal 2-byte NULs (0xC0 0x80)
591  *  2. Rewrite surrogate pairs to U+FFFD
592  */
593 void
hs_text_short_mutf8_trans(const uint8_t src0[],uint8_t dst0[])594 hs_text_short_mutf8_trans(const uint8_t src0[], uint8_t dst0[])
595 {
596   const uint8_t *src = src0;
597   uint8_t *dst = dst0;
598 
599   for (;;) {
600     const uint8_t b0 = *src++;
601     assert(utf8_lead_p(b0));
602 
603     if (likely(!(b0 & 0x80))) { /* 0_______ */
604       if (unlikely(!b0))
605         break;
606 
607       *dst++ = b0;
608       continue;
609     }
610 
611     switch(b0 >> 4) {
612     case 0xf: /* 11110___ */
613       assert(!utf8_lead_p(src[0]));
614       assert(!utf8_lead_p(src[1]));
615       assert(!utf8_lead_p(src[2]));
616       *dst++ = b0;
617       *dst++ = *src++;
618       *dst++ = *src++;
619       *dst++ = *src++;
620       break;
621 
622     case 0xe: { /* 1110____ */
623       const uint8_t b1 = *src++;
624       const uint8_t b2 = *src++;
625       assert(!utf8_lead_p(b1));
626       assert(!utf8_lead_p(b2));
627       if (unlikely((b0 == 0xed) && (b1 & 0x20))) {
628         /* UTF16 Surrogate pairs [U+D800 .. U+DFFF]
629          * -> translate into U+FFFD
630          */
631         *dst++ = 0xef;
632         *dst++ = 0xbf;
633         *dst++ = 0xbd;
634       } else {
635         *dst++ = b0;
636         *dst++ = b1;
637         *dst++ = b2;
638       }
639       break;
640     }
641     default: { /* 110_____ */
642       const uint8_t b1 = *src++;
643       assert(!utf8_lead_p(b1));
644       if (unlikely((b0 == 0xc0) && (b1 == 0x80))) {
645         /* escaped/denormal U+0000 -> normalize */
646         *dst++ = 0x00;
647       } else {
648         *dst++ = b0;
649         *dst++ = b1;
650       }
651       break;
652     }
653     } /* switch */
654   } /* for */
655 
656   assert(labs(hs_text_short_mutf8_strlen(src0)) == (dst - dst0));
657 }
658