1 /*
2  * RESTinio
3  */
4 
5 /*!
6  * @file
7  * @brief An implementation of checker for UTF-8 sequences.
8  *
9  * @since v.0.6.5
10  */
11 
12 #pragma once
13 
14 #include <restinio/compiler_features.hpp>
15 
16 #include <cstdint>
17 
18 namespace restinio
19 {
20 
21 namespace utils
22 {
23 
24 //
25 // utf8_checker_t
26 //
27 
/*!
 * @brief Helper class for checking UTF-8 byte sequence during parsing
 * URI or incoming byte stream.
 *
 * The checker is fed one byte at a time via process_byte(). It decodes
 * the code point being transmitted and rejects:
 * - bytes that can't start a sequence (stray continuation bytes and
 *   lead bytes of the obsolete 5- and 6-byte forms);
 * - missing continuation bytes;
 * - overlong encodings;
 * - UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF.
 *
 * Once an error is detected it is sticky: every subsequent call to
 * process_byte() returns false until reset() is called.
 *
 * Note: this class is moved to restinio::utils namespace in v.0.6.5.
 */
class utf8_checker_t
{
public:
	utf8_checker_t() = default;

	/*!
	 * @brief Process the next byte of the sequence.
	 *
	 * @param byte The next byte of the checked stream.
	 *
	 * @return true if all bytes processed since construction (or since
	 * the last reset()) can be a prefix of a valid UTF-8 sequence,
	 * false if an error has been detected.
	 */
	RESTINIO_NODISCARD
	bool
	process_byte( std::uint8_t byte ) noexcept
	{
		// A detected error must not be masked by the following bytes.
		if( is_failed() )
			return false;

		if( m_current_symbol_rest_bytes > 0 )
			on_continuation_byte( byte );
		else
			on_lead_byte( byte );

		return !is_failed();
	}

	/*!
	 * @return true if the current sequence finalized.
	 *
	 * @note This only tells that no continuation bytes are expected.
	 * Validity of the processed bytes is reported by process_byte().
	 */
	RESTINIO_NODISCARD
	bool
	finalized() const noexcept
	{
		return m_current_symbol_rest_bytes == 0;
	}

	/*!
	 * @brief Return the checker to its initial state.
	 *
	 * Drops the partially decoded symbol and any detected error, so
	 * the object can be reused for a new byte sequence.
	 */
	void
	reset() noexcept
	{
		m_current_symbol = 0;
		m_current_symbol_rest_bytes = 0;
		m_state = state_t::valid;
	}

	/*!
	 * @return The code point decoded so far. The value is complete
	 * only when finalized() returns true.
	 */
	RESTINIO_NODISCARD
	std::uint32_t
	current_symbol() const noexcept { return m_current_symbol; }

private:

	enum class state_t
	{
		//! No errors detected so far.
		valid,
		//! Malformed sequence or forbidden code point.
		invalid,
		//! Lead byte was 0xE0 or 0xF0: the next byte decides whether
		//! the encoding is overlong.
		may_be_overlong,
		//! The symbol is encoded with more bytes than necessary.
		overlong
	};

	//! Has an unrecoverable error been detected?
	bool
	is_failed() const noexcept
	{
		return m_state == state_t::invalid || m_state == state_t::overlong;
	}

	//! Handle a byte that has to start a new symbol.
	void
	on_lead_byte( std::uint8_t byte ) noexcept
	{
		if( (byte & 0x80) == 0x00 )
		{
			// mask 0xxxxxxx
			m_current_symbol = byte;
		}
		else if( (byte & 0xE0) == 0xC0 )
		{
			// mask 110xxxxx
			// 0xC0 and 0xC1 can only encode values below 0x80.
			if( byte == 0xC0 || byte == 0xC1 )
				m_state = state_t::overlong;

			m_current_symbol_rest_bytes = 1;
			m_current_symbol = byte & 0x1Fu;
		}
		else if( (byte & 0xF0) == 0xE0 )
		{
			// mask 1110xxxx
			if( byte == 0xE0 )
				m_state = state_t::may_be_overlong;

			m_current_symbol_rest_bytes = 2;
			m_current_symbol = byte & 0x0Fu;
		}
		else if( (byte & 0xF8) == 0xF0 )
		{
			// mask 11110xxx
			if( byte == 0xF0 )
				m_state = state_t::may_be_overlong;

			m_current_symbol_rest_bytes = 3;
			m_current_symbol = byte & 0x07u;
		}
		else
		{
			// A stray continuation byte (10xxxxxx) or a lead byte of
			// 5/6-byte form (0xF8..0xFF). The latter can only encode
			// values above 0x10FFFF, so it is rejected right away.
			m_state = state_t::invalid;
			m_current_symbol = byte;
		}
	}

	//! Handle a byte that has to continue the current symbol.
	void
	on_continuation_byte( std::uint8_t byte ) noexcept
	{
		// check byte is 10xxxxxx.
		if( (byte & 0xC0) != 0x80 )
		{
			m_state = state_t::invalid;
			return;
		}

		if( m_state == state_t::may_be_overlong )
		{
			// It is the byte right after 0xE0 (two bytes were expected)
			// or after 0xF0 (three bytes were expected). The minimal
			// non-overlong forms are E0 A0 xx and F0 90 xx xx.
			const bool is_overlong =
				( m_current_symbol_rest_bytes == 2 &&
					(byte & 0xE0) == 0x80 ) ||
				( m_current_symbol_rest_bytes == 3 &&
					(byte & 0xF0) == 0x80 );

			m_state = is_overlong ? state_t::overlong : state_t::valid;
		}

		m_current_symbol = (m_current_symbol << 6) | (byte & 0x3Fu);

		if( --m_current_symbol_rest_bytes == 0 )
		{
			validate_current_symbol();
		}
	}

	//! Reject surrogates and values outside of the Unicode range.
	void
	validate_current_symbol() noexcept
	{
		if( (m_current_symbol >= 0xD800 && m_current_symbol <= 0xDFFF) ||
			(m_current_symbol >= 0x110000) )
		{
			m_state = state_t::invalid;
		}
	}

	//! The code point decoded so far.
	std::uint32_t m_current_symbol = 0u;

	//! How many continuation bytes are still expected.
	std::size_t m_current_symbol_rest_bytes = 0u;

	state_t m_state = state_t::valid;
};
208 
209 } /* namespace utils */
210 
211 } /* namespace restinio */
212 
213