1 /* Copyright (c) 2018, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 #include <openssl/bytestring.h>
16 
17 #include "internal.h"
18 
19 
// is_valid_code_point returns one if |v| is acceptable as a Unicode code point
// and zero otherwise. Section numbers below refer to Unicode 9.0.0.
static int is_valid_code_point(uint32_t v) {
  // Nothing above 0x10ffff is part of the Unicode space (3.4 D9).
  if (v > 0x10ffff) {
    return 0;
  }

  // Any value whose low sixteen bits are 0xfffe or 0xffff is permanently
  // reserved (3.4 D14).
  if ((v & 0xfffe) == 0xfffe) {
    return 0;
  }

  // So is the contiguous range 0xfdd0-0xfdef (3.4 D14).
  if (0xfdd0 <= v && v <= 0xfdef) {
    return 0;
  }

  // Surrogate code points are never valid on their own (3.2 C1).
  if (0xd800 <= v && v <= 0xdfff) {
    return 0;
  }

  return 1;
}
34 
// BOTTOM_BITS returns a byte with the bottom |n| bits set. |n| must be in the
// range 0-8. The whole expansion is parenthesized so the macro behaves as a
// single primary expression wherever it is used (e.g. as a |sizeof| operand).
#define BOTTOM_BITS(n) ((uint8_t)((1u << (n)) - 1))

// TOP_BITS returns a byte with the top |n| bits set. |n| must be in the range
// 0-8. The cast back to |uint8_t| discards the high bits that integer
// promotion introduces when the complement is taken.
#define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n)))
40 
// cbs_get_utf8 reads one UTF-8 encoded code point from |cbs| and stores it in
// |*out|. It returns one on success. It returns zero, leaving |*out| untouched,
// if |cbs| runs out of bytes, the lead or a continuation byte is malformed, the
// encoding is longer than the value requires, or the decoded value is rejected
// by |is_valid_code_point|.
int cbs_get_utf8(CBS *cbs, uint32_t *out) {
  uint8_t lead;
  if (!CBS_get_u8(cbs, &lead)) {
    return 0;
  }

  // A byte with the top bit clear stands for itself.
  if (lead < 0x80) {
    *out = lead;
    return 1;
  }

  // Classify the lead byte. |min_value| is the smallest code point that
  // genuinely needs a sequence of this length; anything lower is an overlong
  // encoding.
  uint32_t code_point, min_value;
  size_t remaining;
  if ((lead & TOP_BITS(3)) == TOP_BITS(2)) {
    // 110xxxxx: one continuation byte follows.
    remaining = 1;
    min_value = 0x80;
    code_point = lead & BOTTOM_BITS(5);
  } else if ((lead & TOP_BITS(4)) == TOP_BITS(3)) {
    // 1110xxxx: two continuation bytes follow.
    remaining = 2;
    min_value = 0x800;
    code_point = lead & BOTTOM_BITS(4);
  } else if ((lead & TOP_BITS(5)) == TOP_BITS(4)) {
    // 11110xxx: three continuation bytes follow.
    remaining = 3;
    min_value = 0x10000;
    code_point = lead & BOTTOM_BITS(3);
  } else {
    // A stray continuation byte or an unsupported lead byte.
    return 0;
  }

  // Each continuation byte must look like 10xxxxxx and contributes six bits.
  while (remaining > 0) {
    uint8_t cont;
    if (!CBS_get_u8(cbs, &cont)) {
      return 0;
    }
    if ((cont & TOP_BITS(2)) != TOP_BITS(1)) {
      return 0;
    }
    code_point = (code_point << 6) | (cont & BOTTOM_BITS(6));
    remaining--;
  }

  // Reject overlong forms and values that are not valid code points.
  if (code_point < min_value || !is_valid_code_point(code_point)) {
    return 0;
  }

  *out = code_point;
  return 1;
}
82 
// cbs_get_latin1 reads a single byte from |cbs| and stores its value, widened
// to 32 bits, in |*out|. It returns one on success and zero, without writing
// |*out|, if the byte could not be read.
int cbs_get_latin1(CBS *cbs, uint32_t *out) {
  uint8_t byte;
  if (CBS_get_u8(cbs, &byte)) {
    *out = byte;
    return 1;
  }
  return 0;
}
91 
// cbs_get_ucs2_be reads one 16-bit unit from |cbs| with |CBS_get_u16| and
// stores it in |*out|. It returns one on success and zero, without writing
// |*out|, if the read fails or the value is rejected by |is_valid_code_point|.
int cbs_get_ucs2_be(CBS *cbs, uint32_t *out) {
  uint16_t unit;
  if (!CBS_get_u16(cbs, &unit)) {
    return 0;
  }

  // UCS-2 (used by BMPString) has no surrogate pairs, so a surrogate value is
  // rejected here along with the other invalid code points.
  if (!is_valid_code_point(unit)) {
    return 0;
  }

  *out = unit;
  return 1;
}
102 
// cbs_get_utf32_be reads one 32-bit value from |cbs| with |CBS_get_u32| and
// returns one if the read succeeded and the value is accepted by
// |is_valid_code_point|, or zero otherwise.
int cbs_get_utf32_be(CBS *cbs, uint32_t *out) {
  // The value is read straight into |*out| and then validated in place.
  if (!CBS_get_u32(cbs, out)) {
    return 0;
  }
  return is_valid_code_point(*out) ? 1 : 0;
}
106 
// cbb_get_utf8_len returns the number of bytes in the UTF-8 encoding of |u|.
// It does not validate |u|: every value above 0xffff is reported as needing
// four bytes.
size_t cbb_get_utf8_len(uint32_t u) {
  // Test the thresholds from the largest down; the first one exceeded decides
  // the length.
  if (u > 0xffff) {
    return 4;
  }
  if (u > 0x7ff) {
    return 3;
  }
  if (u > 0x7f) {
    return 2;
  }
  return 1;
}
119 
// cbb_add_utf8 appends the UTF-8 encoding of |u| to |cbb|, one byte at a time
// with |CBB_add_u8|. It returns zero if |u| is rejected by
// |is_valid_code_point| or if any byte could not be added, and a non-zero
// value otherwise.
int cbb_add_utf8(CBB *cbb, uint32_t u) {
  if (!is_valid_code_point(u)) {
    return 0;
  }

  // One byte: 0xxxxxxx.
  if (u < 0x80) {
    return CBB_add_u8(cbb, (uint8_t)u);
  }

  // Continuation bytes have the form 10xxxxxx and carry six bits each.
  // |cont0| holds the lowest six bits of |u|, |cont1| the next six, and
  // |cont2| the six above those.
  const uint8_t cont0 = (uint8_t)(TOP_BITS(1) | (u & BOTTOM_BITS(6)));
  const uint8_t cont1 = (uint8_t)(TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6)));
  const uint8_t cont2 = (uint8_t)(TOP_BITS(1) | ((u >> 12) & BOTTOM_BITS(6)));

  // Two bytes: 110xxxxx 10xxxxxx.
  if (u < 0x800) {
    const uint8_t lead = (uint8_t)(TOP_BITS(2) | (u >> 6));
    return CBB_add_u8(cbb, lead) &&
           CBB_add_u8(cbb, cont0);
  }

  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
  if (u < 0x10000) {
    const uint8_t lead = (uint8_t)(TOP_BITS(3) | (u >> 12));
    return CBB_add_u8(cbb, lead) &&
           CBB_add_u8(cbb, cont1) &&
           CBB_add_u8(cbb, cont0);
  }

  // Anything larger than the Unicode maximum cannot be encoded.
  if (u > 0x10ffff) {
    return 0;
  }

  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  const uint8_t lead = (uint8_t)(TOP_BITS(4) | (u >> 18));
  return CBB_add_u8(cbb, lead) &&
         CBB_add_u8(cbb, cont2) &&
         CBB_add_u8(cbb, cont1) &&
         CBB_add_u8(cbb, cont0);
}
144 
// cbb_add_latin1 appends |u| to |cbb| as a single byte. It returns one on
// success and zero if |u| does not fit in one byte or the byte could not be
// added.
int cbb_add_latin1(CBB *cbb, uint32_t u) {
  if (u > 0xff) {
    return 0;
  }
  return CBB_add_u8(cbb, (uint8_t)u) ? 1 : 0;
}
148 
// cbb_add_ucs2_be appends |u| to |cbb| as one 16-bit unit using |CBB_add_u16|.
// It returns one on success and zero if |u| exceeds 0xffff, is rejected by
// |is_valid_code_point|, or could not be added.
int cbb_add_ucs2_be(CBB *cbb, uint32_t u) {
  if (u > 0xffff || !is_valid_code_point(u)) {
    return 0;
  }
  return CBB_add_u16(cbb, (uint16_t)u) ? 1 : 0;
}
152 
// cbb_add_utf32_be appends |u| to |cbb| as one 32-bit value using
// |CBB_add_u32|. It returns one on success and zero if |u| is rejected by
// |is_valid_code_point| or could not be added.
int cbb_add_utf32_be(CBB *cbb, uint32_t u) {
  if (!is_valid_code_point(u)) {
    return 0;
  }
  return CBB_add_u32(cbb, u) ? 1 : 0;
}
156