#include <errno.h>
#include <stdlib.h>
#include "../../src/bsdconv.h"
3
4 // Ref: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
5
/* Private per-codec state of the UTF-8 decoder (stored in THIS_CODEC(ins)->priv). */
struct my_s{
	/* State-machine position used by cbconv: 0 means a lead byte is expected;
	 * a two-digit value NM (21, 31, 32, 41, ...) means the M-th continuation
	 * byte of an N-byte sequence is expected next. */
	int status;
	/* High surrogate (U+D800..U+DBFF) held back by COMMIT in CESU mode until
	 * the following code point is known; 0 when nothing is pending. */
	ucs_t lead_surrogate;
	/* Code point under assembly. cbconv fills byte[0]..byte[3] most
	 * significant byte first, and COMMIT reads the value via be32toh(). */
	ucs_t ucs;
	/* Set by the "CESU" argument: combine surrogate pairs into one code point. */
	int cesu;
	/* Set by the "LOOSE" argument: pass unpaired surrogates through instead
	 * of dead-ending. */
	int loose;
	/* Set by the "NUL" argument: allow a zero code point to be emitted by PASS. */
	int nul;
	/* Set by the "OVERLONG" argument: accept overlong encodings. */
	int overlong;
	/* Set by the "SUPER" argument: accept 5- and 6-byte sequences. */
	int super;
};
16
cbcreate(struct bsdconv_instance * ins,struct bsdconv_hash_entry * arg)17 int cbcreate(struct bsdconv_instance *ins, struct bsdconv_hash_entry *arg){
18 struct my_s *r = malloc(sizeof(struct my_s));
19 THIS_CODEC(ins)->priv = r;
20 r->cesu = 0;
21 r->loose = 0;
22 r->nul = 0;
23 r->overlong = 0;
24 r->super = 0;
25
26 while(arg){
27 if(strcasecmp(arg->key, "CESU")==0){
28 r->cesu = 1;
29 }else if(strcasecmp(arg->key, "LOOSE")==0){
30 r->loose = 1;
31 }else if(strcasecmp(arg->key, "NUL")==0){
32 r->nul = 1;
33 }else if(strcasecmp(arg->key, "OVERLONG")==0){
34 r->overlong = 1;
35 }else if(strcasecmp(arg->key, "SUPER")==0){
36 r->super = 1;
37 }
38 arg=arg->next;
39 }
40 return 0;
41 }
42
cbinit(struct bsdconv_instance * ins)43 void cbinit(struct bsdconv_instance *ins){
44 struct my_s *r=THIS_CODEC(ins)->priv;
45 r->status = 0;
46 r->lead_surrogate.ucs4 = 0;
47 r->ucs.ucs4 = 0;
48 }
49
cbdestroy(struct bsdconv_instance * ins)50 void cbdestroy(struct bsdconv_instance *ins){
51 struct my_s *r=THIS_CODEC(ins)->priv;
52 free(r);
53 }
54
/*
 * DEADEND(): reject the current input.
 * Marks the phase state as DEADEND, discards the partially assembled code
 * point and any pending lead surrogate, rewinds the state machine to
 * "expect lead byte", and returns from the enclosing (void) function.
 * Expects `t` (struct my_s *) and `this_phase` to be in scope.
 */
#define DEADEND() do{ \
	this_phase->state.status = DEADEND; \
	t->ucs.ucs4 = 0; \
	t->lead_surrogate.ucs4 = 0; \
	t->status = 0; \
	return; \
}while(0)
62
/*
 * COMMIT(): finish the code point assembled in t->ucs (stored big-endian),
 * emit it through PASS() where appropriate, and return from the enclosing
 * (void) function. Expects `ins`, `t` and `this_phase` to be in scope.
 *
 * The phase status is NEXTPHASE unless a branch below overrides it:
 *  - High surrogate (U+D800..U+DBFF):
 *      CESU mode : remember it in t->lead_surrogate and wait for the low
 *                  half (status SUBMATCH when loose, CONTINUE otherwise).
 *                  A previously pending lead is passed through when loose,
 *                  otherwise the input is a dead end.
 *      otherwise : passed through when loose, dead end when strict.
 *  - Low surrogate (U+DC00..U+DFFF):
 *      CESU mode with a pending lead: combine both halves into one
 *                  supplementary code point and emit it.
 *      otherwise : passed through when loose, dead end when strict.
 *  - Anything else: a pending lead surrogate is first passed through (loose)
 *      or causes a dead end (strict); then the code point itself is emitted.
 *
 * The surrogate pair is combined by ADDITION:
 *     cp = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00)
 * OR-ing onto 0x10000 would drop the carry whenever bit 16 of the shifted
 * high part is set (e.g. D840 DC00 would give U+10000 instead of U+20000).
 */
#define COMMIT() do{ \
	this_phase->state.status=NEXTPHASE; \
	t->status = 0; \
	uint32_t ucs = be32toh(t->ucs.ucs4); \
	if(ucs >= 0xD800 && ucs <= 0xDBFF){ \
		if(t->cesu){ \
			if(t->lead_surrogate.ucs4){ \
				if(t->loose){ \
					PASS(t->lead_surrogate); \
				}else{ \
					DEADEND(); \
				} \
			} \
			t->lead_surrogate.ucs4 = t->ucs.ucs4; \
			t->ucs.ucs4 = 0; \
			if(t->loose){ \
				this_phase->state.status=SUBMATCH; \
			}else{ \
				this_phase->state.status=CONTINUE; \
			} \
		}else if(t->loose){ \
			PASS(t->ucs); \
			t->ucs.ucs4 = 0; \
		}else{ \
			DEADEND(); \
		} \
	}else if(ucs >= 0xDC00 && ucs <= 0xDFFF){ \
		if(t->cesu){ \
			if(t->lead_surrogate.ucs4){ \
				uint32_t cp = 0x10000; \
				cp += ((be32toh(t->lead_surrogate.ucs4) - 0xD800) << 10) & bb11111111110000000000; \
				cp += (ucs - 0xDC00) & bb1111111111; \
				t->ucs.ucs4 = htobe32(cp); \
				PASS(t->ucs); \
				t->lead_surrogate.ucs4 = 0; \
				t->ucs.ucs4 = 0; \
			}else if(t->loose){ \
				PASS(t->ucs); \
				t->ucs.ucs4 = 0; \
			}else{ \
				DEADEND(); \
			} \
		}else if(t->loose){ \
			PASS(t->ucs); \
			t->ucs.ucs4 = 0; \
		}else{ \
			DEADEND(); \
		} \
	}else{ \
		if(t->lead_surrogate.ucs4){ \
			if(t->loose){ \
				PASS(t->lead_surrogate); \
				t->lead_surrogate.ucs4 = 0; \
			}else{ \
				DEADEND(); \
			} \
		} \
		PASS(t->ucs); \
		t->ucs.ucs4 = 0; \
	} \
	return; \
}while(0)
125
/*
 * PASS(x): append the code point held in the ucs_t `x` to this phase's
 * output list. Expects `ins`, `t` and `this_phase` to be in scope.
 *
 * A zero code point is a dead end unless the NUL option is set.
 * The emitted buffer is the byte 0x01 followed by the bytes of `x` with
 * leading zero bytes dropped; at most three are dropped, so a zero code
 * point is emitted as 01 00. The buffer is heap-allocated and the new
 * data entry is flagged F_FREE.
 */
#define PASS(x) do{ \
	if(!t->nul && (x).ucs4==0){ \
		DEADEND(); \
	} \
	int skip = 0; \
	while(skip<3 && (x).byte[skip]==0){ \
		skip += 1; \
	} \
	int out_len = 1 + (4 - skip); \
	char *out = malloc(out_len); \
	out[0] = 0x01; \
	int k; \
	for(k=skip;k<4;k+=1){ \
		out[1 + k - skip] = (x).byte[k]; \
	} \
	DATA_MALLOC(ins, this_phase->data_tail->next); \
	this_phase->data_tail=this_phase->data_tail->next; \
	this_phase->data_tail->next=NULL; \
	this_phase->data_tail->len=out_len; \
	this_phase->data_tail->flags=F_FREE; \
	this_phase->data_tail->data=out; \
}while(0)
152
cbflush(struct bsdconv_instance * ins)153 void cbflush(struct bsdconv_instance *ins){
154 struct bsdconv_phase *this_phase=THIS_PHASE(ins);
155 struct my_s *t=THIS_CODEC(ins)->priv;
156 if(t->lead_surrogate.ucs4 && t->loose){
157 PASS(t->lead_surrogate);
158 t->lead_surrogate.ucs4 = 0;
159 }
160 }
161
cbconv(struct bsdconv_instance * ins)162 void cbconv(struct bsdconv_instance *ins){
163 struct bsdconv_phase *this_phase=THIS_PHASE(ins);
164 struct my_s *t=THIS_CODEC(ins)->priv;
165 unsigned char d;
166
167 for(;this_phase->i<this_phase->curr->len;this_phase->i+=1){
168 d=UCP(this_phase->curr->data)[this_phase->i];
169 switch(t->status){
170 case 0:
171 if((d & bb10000000) == 0){ // *0₃₃₃₃₃₃₃, total 7
172 /* exclude ASCII */
173 DEADEND();
174
175 /* Unreachable */
176 t->ucs.ucs4 = d;
177 COMMIT();
178 }else if((d & bb11100000) == bb11000000){ // *110₂₂₂₃₃ 10₃₃₃₃₃₃, total 11 (+4)
179 t->status = 21;
180 t->ucs.byte[2] |= (d >> 2) & bb00000111;
181 t->ucs.byte[3] |= (d << 6) & bb10000000;
182 if(!t->overlong && t->ucs.ucs4==0){
183 DEADEND();
184 }
185 t->ucs.byte[3] |= (d << 6) & bb01000000;
186 }else if((d & bb11110000) == bb11100000){ // *1110₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 16 (+5)
187 t->status = 31;
188 t->ucs.byte[2] |= (d << 4) & bb11110000;
189 }else if((d & bb11111000) == bb11110000){ // *11110₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 21 (+5)
190 t->status = 41;
191 t->ucs.byte[1] |= (d << 2) & bb00011100;
192 }else if((d & bb11111100) == bb11111000){ // *111110₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
193 if(!t->super){
194 DEADEND();
195 }
196 t->status = 51;
197 t->ucs.byte[0] |= d & bb00000011;
198 }else if((d & bb11111110) == bb11111100){ // *1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
199 if(!t->super){
200 DEADEND();
201 }
202 t->status = 61;
203 t->ucs.byte[0] |= (d << 6) & bb01000000;
204 }else{
205 DEADEND();
206 }
207 break;
208 case 21:
209 if((d & bb11000000) == bb10000000){ // 110₂₂₂₃₃ *10₃₃₃₃₃₃, total 11 (+4)
210 t->ucs.byte[3] |= d & bb00111111;
211 COMMIT();
212 }else{
213 DEADEND();
214 }
215 break;
216 case 31:
217 if((d & bb11000000) == bb10000000){ // 1110₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 16 (+5)
218 t->status = 32;
219 t->ucs.byte[2] |= (d >> 2) & bb00001000;
220 if(!t->overlong && t->ucs.ucs4==0){
221 DEADEND();
222 }
223 t->ucs.byte[2] |= (d >> 2) & bb00000111;
224 t->ucs.byte[3] |= (d << 6) & bb11000000;
225 }else{
226 DEADEND();
227 }
228 break;
229 case 32:
230 if((d & bb11000000) == bb10000000){ // 1110₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 16 (+5)
231 t->ucs.byte[3] |= d & bb00111111;
232 COMMIT();
233 }else{
234 DEADEND();
235 }
236 break;
237 case 41:
238 if((d & bb11000000) == bb10000000){ // 11110₁₁₁ *10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 21 (+5)
239 t->status = 42;
240 t->ucs.byte[1] |= (d >> 4) & bb00000011;
241 if(!t->overlong && t->ucs.ucs4==0){
242 DEADEND();
243 }
244 t->ucs.byte[2] |= (d << 4) & bb11110000;
245 }else{
246 DEADEND();
247 }
248 break;
249 case 42:
250 if((d & bb11000000) == bb10000000){ // 11110₁₁₁ 10₁₁₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 21 (+5)
251 t->status = 43;
252 t->ucs.byte[2] |= (d >> 2) & bb00001111;
253 t->ucs.byte[3] |= (d << 6) & bb11000000;
254 }else{
255 DEADEND();
256 }
257 break;
258 case 43:
259 if((d & bb11000000) == bb10000000){ // 11110₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 21 (+5)
260 t->ucs.byte[3] |= d & bb00111111;
261 COMMIT();
262 }else{
263 DEADEND();
264 }
265 break;
266 case 51:
267 if((d & bb11000000) == bb10000000){ // 111110₀₀ *10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
268 t->status = 52;
269 t->ucs.byte[1] |= (d << 2) & bb11100000;
270 if(!t->overlong && t->ucs.ucs4==0){
271 DEADEND();
272 }
273 t->ucs.byte[1] |= (d << 2) & bb00011100;
274 }else{
275 DEADEND();
276 }
277 break;
278 case 52:
279 if((d & bb11000000) == bb10000000){ // 111110₀₀ 10₁₁₁₁₁₁ *10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
280 t->status = 53;
281 t->ucs.byte[1] |= (d >> 4) & bb00000011;
282 t->ucs.byte[2] |= (d << 4) & bb11110000;
283 }else{
284 DEADEND();
285 }
286 break;
287 case 53:
288 if((d & bb11000000) == bb10000000){ // 111110₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
289 t->status = 54;
290 t->ucs.byte[2] |= (d >> 2) & bb00001111;
291 t->ucs.byte[3] |= (d << 6) & bb11000000;
292 }else{
293 DEADEND();
294 }
295 break;
296 case 54:
297 if((d & bb11000000) == bb10000000){ // 111110₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 26 (+5)
298 t->ucs.byte[3] |= d & bb00111111;
299 COMMIT();
300 }else{
301 DEADEND();
302 }
303 break;
304 case 61:
305 if((d & bb11000000) == bb10000000){ // 1111110₀ *10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
306 t->status = 62;
307 t->ucs.byte[0] |= d & bb00111100;
308 if(!t->overlong && t->ucs.ucs4==0){
309 DEADEND();
310 }
311 t->ucs.byte[0] |= d & bb00000011;
312 }else{
313 DEADEND();
314 }
315 break;
316 case 62:
317 if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ *10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
318 t->status = 63;
319 t->ucs.byte[1] |= (d << 2) & bb11111100;
320 }else{
321 DEADEND();
322 }
323 break;
324 case 63:
325 if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ *10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
326 t->status = 64;
327 t->ucs.byte[1] |= (d >> 4) & bb00000011;
328 t->ucs.byte[2] |= (d << 4) & bb11110000;
329 }else{
330 DEADEND();
331 }
332 break;
333 case 64:
334 if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
335 t->status = 65;
336 t->ucs.byte[2] |= (d >> 2) & bb00001111;
337 t->ucs.byte[3] |= (d << 6) & bb11000000;
338 }else{
339 DEADEND();
340 }
341 break;
342 case 65:
343 if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 31 (+5)
344 t->ucs.byte[3] |= d & bb00111111;
345 COMMIT();
346 }else{
347 DEADEND();
348 }
349 break;
350 default:
351 DEADEND();
352 }
353 }
354 this_phase->state.status=CONTINUE;
355 return;
356 }
357