1 /*
2  * Copyright (c) 2014-2020 Pavel Kalvoda <me@pavelkalvoda.com>
3  *
4  * libcbor is free software; you can redistribute it and/or modify
5  * it under the terms of the MIT license. See LICENSE for details.
6  */
7 
8 #include "unicode.h"
9 #include <stdint.h>
10 
/* Decoder states that callers compare against. Any other state value means a
 * multi-byte sequence has been started but not yet completed. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table driving the UTF-8 decoding automaton. It has two parts:
 *
 *   - entries [0, 256): maps a byte value to its character class;
 *   - entries [256, 400): one row of 16 entries per decoder state, giving the
 *     next state for (state, character class), i.e. the entry at
 *     256 + state * 16 + class.
 *
 * Each line below holds exactly 16 entries.
 */
static const uint8_t utf8d[] = {
    /* Byte value -> character class */
    /* 00..0f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 10..1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 20..2f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 30..3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 40..4f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 50..5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 60..6f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 70..7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 80..8f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 90..9f */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* a0..af */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* b0..bf */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* c0..cf */ 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* d0..df */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* e0..ef */ 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
    /* f0..ff */ 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

    /* (state, character class) -> next state */
    /* s0 */ 0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1,
    /* s1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* s2 */ 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
    /* s3 */ 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
    /* s4 */ 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
    /* s5 */ 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
    /* s6 */ 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
    /* s7 */ 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
    /* s8 */ 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
55 
56 /* Copyright of this function: (c) 2008-2009 Bjoern Hoehrmann
57  * <bjoern@hoehrmann.de> */
58 /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
59 uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
60   uint32_t type = utf8d[byte];
61 
62   *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
63                                    : (0xff >> type) & (byte);
64 
65   *state = utf8d[256 + *state * 16 + type];
66   return *state;
67 }
68 
69 uint64_t _cbor_unicode_codepoint_count(cbor_data source, uint64_t source_length,
70                                        struct _cbor_unicode_status* status) {
71   *status =
72       (struct _cbor_unicode_status){.location = 0, .status = _CBOR_UNICODE_OK};
73   uint32_t codepoint, state = UTF8_ACCEPT, res;
74   uint64_t pos = 0, count = 0;
75 
76   for (; pos < source_length; pos++) {
77     res = _cbor_unicode_decode(&state, &codepoint, source[pos]);
78 
79     if (res == UTF8_ACCEPT) {
80       count++;
81     } else if (res == UTF8_REJECT) {
82       goto error;
83     }
84   }
85 
86   /* Unfinished multibyte codepoint */
87   if (state != UTF8_ACCEPT) goto error;
88 
89   return count;
90 
91 error:
92   *status = (struct _cbor_unicode_status){.location = pos,
93                                           .status = _CBOR_UNICODE_BADCP};
94   return 0;
95 }
96