1 /*
2 Copyright (C) 2015-2021, Dirk Krause
3 SPDX-License-Identifier: BSD-3-Clause
4 */
5
6 /*
7 WARNING: This file was generated by the dkct program (see
8 http://dktools.sourceforge.net/ for details).
9 Changes you make here will be lost if dkct is run again!
10 You should modify the original source and run dkct on it.
11 Original source: dk4utf8.ctr
12 */
13
14 /** @file dk4utf8.c The dk4utf8 module.
15 */
16
17
18 #include "dk4conf.h"
19
20 #if DK4_HAVE_ASSERT_H
21 #ifndef ASSERT_H_INCLUDED
22 #include <assert.h>
23 #define ASSERT_H_INCLUDED 1
24 #endif
25 #endif
26
27 #include <libdk4base/dk4types.h>
28 #include <libdk4base/dk4error.h>
29 #include <libdk4c/dk4utf8.h>
30 #include <libdk4c/dk4rec26.h>
31
32
33
34
35
36
37
38 void
dk4utf8_init(dk4_utf8_decoder_t * ptr)39 dk4utf8_init(dk4_utf8_decoder_t *ptr)
40 {
41 #if DK4_USE_ASSERT
42 assert(NULL != ptr);
43 #endif
44 if (NULL != ptr) {
45 ptr->uc = dkC32(0);
46 ptr->bfc = 0;
47 ptr->bfn = 0;
48 }
49 }
50
51
52
53 int
dk4utf8_add(dk4_utf8_decoder_t * ptr,unsigned char byte)54 dk4utf8_add(dk4_utf8_decoder_t *ptr, unsigned char byte)
55 {
56 int back = DK4_EDSTM_ERROR;
57 #if DK4_USE_ASSERT
58 assert(NULL != ptr);
59 #endif
60 if (NULL != ptr) {
61 switch (ptr->bfc) {
62 case 0: {
63 if (0x7F < byte) { /* more than 1 byte */
64 if ((0xE0 & byte) == 0xC0) { /* 2 bytes */
65 ptr->uc = (((dk4_c32_t)byte) << 6) & dkC32(0x000007C0);
66 ptr->bfc = 2;
67 ptr->bfn = 1;
68 back = DK4_EDSTM_ACCEPT;
69 } else {
70 if ((0xF0 & byte) == 0xE0) { /* 3 bytes */
71 ptr->uc = (((dk4_c32_t)byte) << 12) & dkC32(0x0000F000);
72 ptr->bfc = 3;
73 ptr->bfn = 1;
74 back = DK4_EDSTM_ACCEPT;
75 } else {
76 if ((0xF8 & byte) == 0xF0) { /* 4 bytes */
77 ptr->uc = (((dk4_c32_t)byte) << 18) & dkC32(0x001C0000);
78 ptr->bfc = 4;
79 ptr->bfn = 1;
80 back = DK4_EDSTM_ACCEPT;
81 } else {
82 if ((0xFC & byte) == 0xF8) { /* 5 bytes */
83 ptr->uc = ((((dk4_c32_t)byte) << 24) & dkC32(0x03000000));
84 ptr->bfc = 5;
85 ptr->bfn = 1;
86 back = DK4_EDSTM_ACCEPT;
87 } else {
88 if ((0xFE & byte) == 0xFC) { /* 6 bytes */
89 ptr->uc = dkC32(0x40000000);
90 ptr->bfc = 6;
91 ptr->bfn = 1;
92 back = DK4_EDSTM_ACCEPT;
93 } else {
94 /* ERROR: Illegal input! */
95 }
96 }
97 }
98 }
99 }
100 } else { /* just one byte */
101 ptr->uc = (dk4_c32_t)byte;
102 ptr->uc &= dkC32(0xFF);
103 back = DK4_EDSTM_FINISHED;
104 ptr->bfn = 0;
105 }
106 } break;
107 case 2: {
108 switch (ptr->bfn) {
109 case 1: {
110 if ((0xC0 & byte) == 0x80) {
111 back = DK4_EDSTM_FINISHED;
112 ptr->uc |= (((dk4_c32_t)byte) & dkC32(0x0000003F));
113 }
114 ptr->bfc = 0;
115 ptr->bfn = 0;
116 } break;
117 }
118 } break;
119 case 3: {
120 switch (ptr->bfn) {
121 case 1: {
122 if ((0xC0 & byte) == 0x80) {
123 back = DK4_EDSTM_ACCEPT;
124 ptr->uc |= ((((dk4_c32_t)byte) << 6) & dkC32(0x00000FC0));
125 }
126 ptr->bfn = 2;
127 } break;
128 case 2: {
129 if ((0xC0 & byte) == 0x80) {
130 back = DK4_EDSTM_FINISHED;
131 ptr->uc |= (((dk4_c32_t)byte) & dkC32(0x0000003F));
132 }
133 ptr->bfc = 0;
134 ptr->bfn = 0;
135 } break;
136 }
137 } break;
138 case 4: {
139 switch (ptr->bfn) {
140 case 1: {
141 if ((0xC0 & byte) == 0x80) {
142 back = DK4_EDSTM_ACCEPT;
143 ptr->uc |= ((((dk4_c32_t)byte) << 12) & dkC32(0x0003F000));
144 }
145 ptr->bfn = 2;
146 } break;
147 case 2: {
148 if ((0xC0 & byte) == 0x80) {
149 back = DK4_EDSTM_ACCEPT;
150 ptr->uc |= ((((dk4_c32_t)byte) << 6) & dkC32(0x00000FC0));
151 }
152 ptr->bfn = 3;
153 } break;
154 case 3: {
155 if ((0xC0 & byte) == 0x80) {
156 back = DK4_EDSTM_FINISHED;
157 ptr->uc |= (((dk4_c32_t)byte) & dkC32(0x0000003F));
158 }
159 ptr->bfc = 0;
160 ptr->bfn = 0;
161 } break;
162 }
163 } break;
164 case 5: {
165 switch (ptr->bfn) {
166 case 1: {
167 if ((0xC0 & byte) == 0x80) {
168 back = DK4_EDSTM_ACCEPT;
169 ptr->uc |= ((((dk4_c32_t)byte) << 18) & dkC32(0x00FC0000));
170 }
171 ptr->bfn = 2;
172 } break;
173 case 2: {
174 if ((0xC0 & byte) == 0x80) {
175 back = DK4_EDSTM_ACCEPT;
176 ptr->uc |= ((((dk4_c32_t)byte) << 12) & dkC32(0x0003F000));
177 }
178 ptr->bfn = 3;
179 } break;
180 case 3: {
181 if ((0xC0 & byte) == 0x80) {
182 back = DK4_EDSTM_ACCEPT;
183 ptr->uc |= ((((dk4_c32_t)byte) << 6) & dkC32(0x00000FC0));
184 }
185 ptr->bfn = 4;
186 } break;
187 case 4: {
188 if ((0xC0 & byte) == 0x80) {
189 back = DK4_EDSTM_FINISHED;
190 ptr->uc |= (((dk4_c32_t)byte) & dkC32(0x0000003F));
191 }
192 ptr->bfc = 0;
193 ptr->bfn = 0;
194 } break;
195 }
196 } break;
197 case 6: {
198 switch (ptr->bfn) {
199 case 1: {
200 if ((0xC0 & byte) == 0x80) {
201 back = DK4_EDSTM_ACCEPT;
202 ptr->uc |= ((((dk4_c32_t)byte) << 24) & dkC32(0x3F000000));
203 }
204 ptr->bfn = 2;
205 } break;
206 case 2: {
207 if ((0xC0 & byte) == 0x80) {
208 back = DK4_EDSTM_ACCEPT;
209 ptr->uc |= ((((dk4_c32_t)byte) << 18) & dkC32(0x00FC0000));
210 }
211 ptr->bfn = 3;
212 } break;
213 case 3: {
214 if ((0xC0 & byte) == 0x80) {
215 back = DK4_EDSTM_ACCEPT;
216 ptr->uc |= ((((dk4_c32_t)byte) << 12) & dkC32(0x0003F000));
217 }
218 ptr->bfn = 4;
219 } break;
220 case 4: {
221 if ((0xC0 & byte) == 0x80) {
222 back = DK4_EDSTM_ACCEPT;
223 ptr->uc |= ((((dk4_c32_t)byte) << 6) & dkC32(0x00000FC0));
224 }
225 ptr->bfn = 5;
226 } break;
227 case 5: {
228 if ((0xC0 & byte) == 0x80) {
229 back = DK4_EDSTM_FINISHED;
230 ptr->uc |= (((dk4_c32_t)byte) & dkC32(0x0000003F));
231 }
232 ptr->bfc = 0;
233 ptr->bfn = 0;
234 } break;
235 }
236 } break;
237 }
238 }
239 return back;
240 }
241
242
243
244 dk4_c32_t
dk4utf8_get(dk4_utf8_decoder_t const * ptr)245 dk4utf8_get(dk4_utf8_decoder_t const *ptr)
246 {
247 dk4_c32_t back = dkC32(0);
248 #if DK4_USE_ASSERT
249 assert(NULL != ptr);
250 #endif
251 if (NULL != ptr) {
252 back = ptr->uc;
253 }
254 return back;
255 }
256
257
258
259 int
dk4utf8_is_empty(dk4_utf8_decoder_t const * ptr)260 dk4utf8_is_empty(dk4_utf8_decoder_t const *ptr)
261 {
262 int back = 0;
263 #if DK4_USE_ASSERT
264 assert(NULL != ptr);
265 #endif
266 if (NULL != ptr) {
267 if (0 == ptr->bfc) {
268 back = 1;
269 }
270 }
271 return back;
272 }
273
274
275
276 /** Extract part of character by right-shift and mask.
277 @param c Character to retrieve bits from.
278 @param w Shift width (bits to the right).
279 @param m Mask to apply.
280 @param o Or-value (bits to add).
281 @return Operations result.
282 */
283 static
284 unsigned char
dk4utf8_rshift_mask_or(dk4_c32_t c,unsigned w,unsigned long m,unsigned long o)285 dk4utf8_rshift_mask_or(dk4_c32_t c, unsigned w, unsigned long m, unsigned long o)
286 {
287 unsigned long u;
288
289 u = (unsigned long)c;
290 if (0 < w) {
291 u = u >> w;
292 }
293 u &= m;
294 u |= o;
295 return ((unsigned char)u);
296 }
297
298
299
300 int
dk4utf8_encode(unsigned char * rb,size_t * pszrb,dk4_c32_t c,dk4_er_t * erp)301 dk4utf8_encode(unsigned char *rb, size_t *pszrb, dk4_c32_t c, dk4_er_t *erp)
302 {
303 int back = 0;
304
305 #if DK4_USE_ASSERT
306 assert(NULL != rb);
307 assert(NULL != pszrb);
308 assert(6 < *pszrb);
309 #endif
310 if ((NULL != rb) && (NULL != pszrb)) {
311 if (6 < *pszrb) {
312 if (dkC32(0x7F) < c) {
313 if (dkC32(0x000007FF) < c) {
314 if (dkC32(0x001FFFFF) < c) {
315 if (dkC32(0x03FFFFFF) < c) {
316 if (dkC32(0x80000000) < c) { /* 6 bytes */
317 back = 1;
318 *pszrb = 6;
319 #if VERSION_BEFORE_20210729
320 /*
321 2021-07-29: Modified to silence compiler
322 warnings.
323 Not a bug fix.
324 */
325 rb[0] = ((unsigned char)((c >> 30) & dkC32(0x00000001))) | 0xFC;
326 rb[1] = ((unsigned char)((c >> 24) & dkC32(0x0000003F))) | 0x80;
327 rb[2] = ((unsigned char)((c >> 18) & dkC32(0x0000003F))) | 0x80;
328 rb[3] = ((unsigned char)((c >> 12) & dkC32(0x0000003F))) | 0x80;
329 rb[4] = ((unsigned char)((c >> 6) & dkC32(0x0000003F))) | 0x80;
330 rb[5] = ((unsigned char)( c & dkC32(0x0000003F))) | 0x80;
331 #else
332 rb[0] = dk4utf8_rshift_mask_or(c, 30, 0x00000001UL, 0xFCUL);
333 rb[1] = dk4utf8_rshift_mask_or(c, 24, 0x0000003FUL, 0x80UL);
334 rb[2] = dk4utf8_rshift_mask_or(c, 18, 0x0000003FUL, 0x80UL);
335 rb[3] = dk4utf8_rshift_mask_or(c, 12, 0x0000003FUL, 0x80UL);
336 rb[4] = dk4utf8_rshift_mask_or(c, 6, 0x0000003FUL, 0x80UL);
337 rb[5] = dk4utf8_rshift_mask_or(c, 0, 0x0000003FUL, 0x80UL);
338 #endif
339 rb[6] = '\0';
340 } else { /* 5 bytes */
341 back = 1;
342 *pszrb = 5;
343 #if VERSION_BEFORE_20210729
344 rb[0] = ((unsigned char)((c >> 24) & dkC32(0x00000003))) | 0xF8;
345 rb[1] = ((unsigned char)((c >> 18) & dkC32(0x0000003F))) | 0x80;
346 rb[2] = ((unsigned char)((c >> 12) & dkC32(0x0000003F))) | 0x80;
347 rb[3] = ((unsigned char)((c >> 6) & dkC32(0x0000003F))) | 0x80;
348 rb[4] = ((unsigned char)( c & dkC32(0x0000003F))) | 0x80;
349 #else
350 rb[0] = dk4utf8_rshift_mask_or(c, 24, 0x00000003UL, 0xF8UL);
351 rb[1] = dk4utf8_rshift_mask_or(c, 18, 0x0000003FUL, 0x80UL);
352 rb[2] = dk4utf8_rshift_mask_or(c, 12, 0x0000003FUL, 0x80UL);
353 rb[3] = dk4utf8_rshift_mask_or(c, 6, 0x0000003FUL, 0x80UL);
354 rb[4] = dk4utf8_rshift_mask_or(c, 0, 0x0000003FUL, 0x80UL);
355 #endif
356 rb[5] = '\0';
357 }
358 } else { /* 4 bytes */
359 back = 1;
360 *pszrb = 4;
361 #if VERSION_BEFORE_20210729
362 rb[0] = ((unsigned char)((c >> 18) & dkC32(0x00000007))) | 0xF0;
363 rb[1] = ((unsigned char)((c >> 12) & dkC32(0x0000003F))) | 0x80;
364 rb[2] = ((unsigned char)((c >> 6) & dkC32(0x0000003F))) | 0x80;
365 rb[3] = ((unsigned char)( c & dkC32(0x0000003F))) | 0x80;
366 #else
367 rb[0] = dk4utf8_rshift_mask_or(c, 18, 0x00000007UL, 0xF0UL);
368 rb[1] = dk4utf8_rshift_mask_or(c, 12, 0x0000003FUL, 0x80UL);
369 rb[2] = dk4utf8_rshift_mask_or(c, 6, 0x0000003FUL, 0x80UL);
370 rb[3] = dk4utf8_rshift_mask_or(c, 0, 0x0000003FUL, 0x80UL);
371 #endif
372 rb[4] = '\0';
373 }
374 } else { /* 3 bytes */
375 back = 1;
376 *pszrb = 3;
377 #if VERSION_BEFORE_20210729
378 rb[0] = ((unsigned char)((c >> 12) & dkC32(0x0000000F))) | 0xE0;
379 rb[1] = ((unsigned char)((c >> 6) & dkC32(0x0000003F))) | 0x80;
380 rb[2] = ((unsigned char)( c & dkC32(0x0000003F))) | 0x80;
381 #else
382 rb[0] = dk4utf8_rshift_mask_or(c, 12, 0x0000000FUL, 0xE0UL);
383 rb[1] = dk4utf8_rshift_mask_or(c, 6, 0x0000003FUL, 0x80UL);
384 rb[2] = dk4utf8_rshift_mask_or(c, 0, 0x0000003FUL, 0x80UL);
385 #endif
386 rb[3] = '\0';
387 }
388 } else { /* 2 bytes */
389 back = 1;
390 *pszrb = 2;
391 #if VERSION_BEFORE_20210729
392 rb[0] = ((unsigned char)((c >> 6) & dkC32(0x0000001F))) | 0xC0;
393 rb[1] = ((unsigned char)( c & dkC32(0x0000003F))) | 0x80;
394 #else
395 rb[0] = dk4utf8_rshift_mask_or(c, 6, 0x0000001FUL, 0xC0UL);
396 rb[1] = dk4utf8_rshift_mask_or(c, 0, 0x0000003FUL, 0x80UL);
397 #endif
398 rb[2] = '\0';
399 }
400 } else { /* 1 byte */
401 back = 1;
402 *pszrb = 1;
403 #if 0
404 rb[0] = (unsigned char)(c & dkC32(0x0000007F));
405 #endif
406 rb[0] = dk4utf8_rshift_mask_or(c, 0, 0x0000007FUL, 0UL);
407 rb[1] = '\0';
408 }
409 } else {
410 dk4error_set_simple_error_code(erp, DK4_E_BUFFER_TOO_SMALL);
411 }
412 } else {
413 dk4error_set_simple_error_code(erp, DK4_E_INVALID_ARGUMENTS);
414 }
415 return back;
416 }
417
418