#include <errno.h>
#include <stdlib.h>
#include <strings.h>
#include "../../src/bsdconv.h"
3 
4 // Ref: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
5 
6 struct my_s{
7 	int status;
8 	ucs_t lead_surrogate;
9 	ucs_t ucs;
10 	int cesu;
11 	int loose;
12 	int nul;
13 	int overlong;
14 	int super;
15 };
16 
cbcreate(struct bsdconv_instance * ins,struct bsdconv_hash_entry * arg)17 int cbcreate(struct bsdconv_instance *ins, struct bsdconv_hash_entry *arg){
18 	struct my_s *r = malloc(sizeof(struct my_s));
19 	THIS_CODEC(ins)->priv = r;
20 	r->cesu = 0;
21 	r->loose = 0;
22 	r->nul = 0;
23 	r->overlong = 0;
24 	r->super = 0;
25 
26 	while(arg){
27 		if(strcasecmp(arg->key, "CESU")==0){
28 			r->cesu = 1;
29 		}else if(strcasecmp(arg->key, "LOOSE")==0){
30 			r->loose = 1;
31 		}else if(strcasecmp(arg->key, "NUL")==0){
32 			r->nul = 1;
33 		}else if(strcasecmp(arg->key, "OVERLONG")==0){
34 			r->overlong = 1;
35 		}else if(strcasecmp(arg->key, "SUPER")==0){
36 			r->super = 1;
37 		}
38 		arg=arg->next;
39 	}
40 	return 0;
41 }
42 
cbinit(struct bsdconv_instance * ins)43 void cbinit(struct bsdconv_instance *ins){
44 	struct my_s *r=THIS_CODEC(ins)->priv;
45 	r->status = 0;
46 	r->lead_surrogate.ucs4 = 0;
47 	r->ucs.ucs4 = 0;
48 }
49 
cbdestroy(struct bsdconv_instance * ins)50 void cbdestroy(struct bsdconv_instance *ins){
51 	struct my_s *r=THIS_CODEC(ins)->priv;
52 	free(r);
53 }
54 
/*
 * DEADEND(): reject the current input. Marks the phase state as DEADEND,
 * discards any partially decoded code point and any pending lead surrogate,
 * and returns from the enclosing (void) function.
 * Requires `t` (struct my_s *) and `this_phase` to be in scope.
 */
#define DEADEND() do{	\
	this_phase->state.status=DEADEND;	\
	t->ucs.ucs4 = 0;	\
	t->lead_surrogate.ucs4 = 0;	\
	t->status = 0;	\
	return;	\
}while(0)
62 
/*
 * COMMIT(): finish the code point assembled in t->ucs (big-endian) once its
 * last byte has been merged in, then return from the enclosing function.
 *
 * The phase state defaults to NEXTPHASE and the byte-level state is reset.
 * Depending on the decoded value:
 *  - lead surrogate (D800..DBFF): in CESU mode it is remembered in
 *    t->lead_surrogate (state becomes SUBMATCH when LOOSE, CONTINUE
 *    otherwise); without CESU it is emitted as-is when LOOSE, else rejected.
 *  - trail surrogate (DC00..DFFF): in CESU mode with a pending lead the pair
 *    is combined into one code point and emitted; otherwise it is emitted
 *    as-is when LOOSE, else rejected.
 *  - anything else: a pending lead surrogate is emitted first when LOOSE
 *    (rejected otherwise), then the code point itself is emitted.
 *
 * Requires `t`, `this_phase` and `ins` in scope; uses PASS() and DEADEND().
 */
#define COMMIT() do{	\
	this_phase->state.status=NEXTPHASE;	\
	t->status = 0;	\
	uint32_t ucs = be32toh(t->ucs.ucs4);	\
	if(ucs >= 0xD800 && ucs <= 0xDBFF){	\
		/* lead (high) surrogate */	\
		if(t->cesu){	\
			if(t->lead_surrogate.ucs4){	\
				/* two leads in a row: the earlier one is unpaired */	\
				if(t->loose){	\
					PASS(t->lead_surrogate);	\
				}else{	\
					DEADEND();	\
				}	\
			}	\
			t->lead_surrogate.ucs4 = t->ucs.ucs4;	\
			t->ucs.ucs4 = 0;	\
			if(t->loose){	\
				this_phase->state.status=SUBMATCH;	\
			}else{	\
				this_phase->state.status=CONTINUE;	\
			}	\
		}else if(t->loose){	\
			PASS(t->ucs);	\
			t->ucs.ucs4 = 0;	\
		}else{	\
			DEADEND();	\
		}	\
	}else if(ucs >= 0xDC00 && ucs <= 0xDFFF){	\
		/* trail (low) surrogate */	\
		if(t->cesu){	\
			if(t->lead_surrogate.ucs4){	\
				/* The pair offset must be ADDED to 0x10000: the shifted	\
				 * lead offset can itself have bit 16 set (lead >= 0xD840),	\
				 * so OR-ing it into 0x10000 would lose that bit. */	\
				uint32_t lead = be32toh(t->lead_surrogate.ucs4) - 0xD800;	\
				uint32_t trail = ucs - 0xDC00;	\
				uint32_t cp = 0x10000 + ((lead << 10) | trail);	\
				t->ucs.ucs4 = htobe32(cp);	\
				PASS(t->ucs);	\
				t->lead_surrogate.ucs4 = 0;	\
				t->ucs.ucs4 = 0;	\
			}else if(t->loose){	\
				PASS(t->ucs);	\
				t->ucs.ucs4 = 0;	\
			}else{	\
				DEADEND();	\
			}	\
		}else if(t->loose){	\
			PASS(t->ucs);	\
			t->ucs.ucs4 = 0;	\
		}else{	\
			DEADEND();	\
		}	\
	}else{	\
		/* ordinary code point; a pending lead surrogate is unpaired */	\
		if(t->lead_surrogate.ucs4){	\
			if(t->loose){	\
				PASS(t->lead_surrogate);	\
				t->lead_surrogate.ucs4 = 0;	\
			}else{	\
				DEADEND();	\
			}	\
		}	\
		PASS(t->ucs);	\
		t->ucs.ucs4 = 0;	\
	}	\
	return;	\
}while(0)
125 
/*
 * PASS(x): append the code point held in x (a ucs_t, bytes in big-endian
 * order) to this phase's output list.
 *
 * The output buffer is a 0x01 prefix byte followed by the bytes of x with
 * leading zero bytes dropped; the last byte is always kept, so a zero value
 * is emitted as 01 00. A zero value is rejected through DEADEND() unless the
 * NUL option is set.
 *
 * Requires `ins`, `this_phase` and `t` in scope.
 */
#define PASS(x) do{	\
	if(!t->nul && (x).ucs4==0){	\
		DEADEND();	\
	}	\
	/* count leading zero bytes, never skipping the final byte */	\
	int skip = 0;	\
	while(skip<3 && (x).byte[skip] == 0){	\
		skip += 1;	\
	}	\
	int len = 1 + (4 - skip);	\
	char *buf = malloc(len);	\
	buf[0] = 0x01;	\
	int k;	\
	for(k=1;k<len;k+=1){	\
		buf[k] = (x).byte[skip + k - 1];	\
	}	\
	DATA_MALLOC(ins, this_phase->data_tail->next);	\
	this_phase->data_tail=this_phase->data_tail->next;	\
	this_phase->data_tail->data=buf;	\
	this_phase->data_tail->len=len;	\
	this_phase->data_tail->flags=F_FREE;	\
	this_phase->data_tail->next=NULL;	\
}while(0)
152 
cbflush(struct bsdconv_instance * ins)153 void cbflush(struct bsdconv_instance *ins){
154 	struct bsdconv_phase *this_phase=THIS_PHASE(ins);
155 	struct my_s *t=THIS_CODEC(ins)->priv;
156 	if(t->lead_surrogate.ucs4 && t->loose){
157 		PASS(t->lead_surrogate);
158 		t->lead_surrogate.ucs4 = 0;
159 	}
160 }
161 
cbconv(struct bsdconv_instance * ins)162 void cbconv(struct bsdconv_instance *ins){
163 	struct bsdconv_phase *this_phase=THIS_PHASE(ins);
164 	struct my_s *t=THIS_CODEC(ins)->priv;
165 	unsigned char d;
166 
167 	for(;this_phase->i<this_phase->curr->len;this_phase->i+=1){
168 		d=UCP(this_phase->curr->data)[this_phase->i];
169 		switch(t->status){
170 			case 0:
171 				if((d & bb10000000) == 0){ // *0₃₃₃₃₃₃₃, total 7
172 					/* exclude ASCII */
173 					DEADEND();
174 
175 					/* Unreachable */
176 					t->ucs.ucs4 = d;
177 					COMMIT();
178 				}else if((d & bb11100000) == bb11000000){ // *110₂₂₂₃₃ 10₃₃₃₃₃₃, total 11 (+4)
179 					t->status = 21;
180 					t->ucs.byte[2] |= (d >> 2) & bb00000111;
181 					t->ucs.byte[3] |= (d << 6) & bb10000000;
182 					if(!t->overlong && t->ucs.ucs4==0){
183 						DEADEND();
184 					}
185 					t->ucs.byte[3] |= (d << 6) & bb01000000;
186 				}else if((d & bb11110000) == bb11100000){ // *1110₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 16 (+5)
187 					t->status = 31;
188 					t->ucs.byte[2] |= (d << 4) & bb11110000;
189 				}else if((d & bb11111000) == bb11110000){ // *11110₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 21 (+5)
190 					t->status = 41;
191 					t->ucs.byte[1] |= (d << 2) & bb00011100;
192 				}else if((d & bb11111100) == bb11111000){ // *111110₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
193 					if(!t->super){
194 						DEADEND();
195 					}
196 					t->status = 51;
197 					t->ucs.byte[0] |= d & bb00000011;
198 				}else if((d & bb11111110) == bb11111100){ // *1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
199 					if(!t->super){
200 						DEADEND();
201 					}
202 					t->status = 61;
203 					t->ucs.byte[0] |= (d << 6) & bb01000000;
204 				}else{
205 					DEADEND();
206 				}
207 				break;
208 			case 21:
209 				if((d & bb11000000) == bb10000000){ // 110₂₂₂₃₃ *10₃₃₃₃₃₃, total 11 (+4)
210 					t->ucs.byte[3] |= d & bb00111111;
211 					COMMIT();
212 				}else{
213 					DEADEND();
214 				}
215 				break;
216 			case 31:
217 				if((d & bb11000000) == bb10000000){ // 1110₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 16 (+5)
218 					t->status = 32;
219 					t->ucs.byte[2] |= (d >> 2) & bb00001000;
220 					if(!t->overlong && t->ucs.ucs4==0){
221 						DEADEND();
222 					}
223 					t->ucs.byte[2] |= (d >> 2) & bb00000111;
224 					t->ucs.byte[3] |= (d << 6) & bb11000000;
225 				}else{
226 					DEADEND();
227 				}
228 				break;
229 			case 32:
230 				if((d & bb11000000) == bb10000000){ // 1110₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 16 (+5)
231 					t->ucs.byte[3] |= d & bb00111111;
232 					COMMIT();
233 				}else{
234 					DEADEND();
235 				}
236 				break;
237 			case 41:
238 				if((d & bb11000000) == bb10000000){ // 11110₁₁₁ *10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 21 (+5)
239 					t->status = 42;
240 					t->ucs.byte[1] |= (d >> 4) & bb00000011;
241 					if(!t->overlong && t->ucs.ucs4==0){
242 						DEADEND();
243 					}
244 					t->ucs.byte[2] |= (d << 4) & bb11110000;
245 				}else{
246 					DEADEND();
247 				}
248 				break;
249 			case 42:
250 				if((d & bb11000000) == bb10000000){ // 11110₁₁₁ 10₁₁₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 21 (+5)
251 					t->status = 43;
252 					t->ucs.byte[2] |= (d >> 2) & bb00001111;
253 					t->ucs.byte[3] |= (d << 6) & bb11000000;
254 				}else{
255 					DEADEND();
256 				}
257 				break;
258 			case 43:
259 				if((d & bb11000000) == bb10000000){ // 11110₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 21 (+5)
260 					t->ucs.byte[3] |= d & bb00111111;
261 					COMMIT();
262 				}else{
263 					DEADEND();
264 				}
265 				break;
266 			case 51:
267 				if((d & bb11000000) == bb10000000){ // 111110₀₀ *10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
268 					t->status = 52;
269 					t->ucs.byte[1] |= (d << 2) & bb11100000;
270 					if(!t->overlong && t->ucs.ucs4==0){
271 						DEADEND();
272 					}
273 					t->ucs.byte[1] |= (d << 2) & bb00011100;
274 				}else{
275 					DEADEND();
276 				}
277 				break;
278 			case 52:
279 				if((d & bb11000000) == bb10000000){ // 111110₀₀ 10₁₁₁₁₁₁ *10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
280 					t->status = 53;
281 					t->ucs.byte[1] |= (d >> 4) & bb00000011;
282 					t->ucs.byte[2] |= (d << 4) & bb11110000;
283 				}else{
284 					DEADEND();
285 				}
286 				break;
287 			case 53:
288 				if((d & bb11000000) == bb10000000){ // 111110₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 26 (+5)
289 					t->status = 54;
290 					t->ucs.byte[2] |= (d >> 2) & bb00001111;
291 					t->ucs.byte[3] |= (d << 6) & bb11000000;
292 				}else{
293 					DEADEND();
294 				}
295 				break;
296 			case 54:
297 				if((d & bb11000000) == bb10000000){ // 111110₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 26 (+5)
298 					t->ucs.byte[3] |= d & bb00111111;
299 					COMMIT();
300 				}else{
301 					DEADEND();
302 				}
303 				break;
304 			case 61:
305 				if((d & bb11000000) == bb10000000){ // 1111110₀ *10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
306 					t->status = 62;
307 					t->ucs.byte[0] |= d & bb00111100;
308 					if(!t->overlong && t->ucs.ucs4==0){
309 						DEADEND();
310 					}
311 					t->ucs.byte[0] |= d & bb00000011;
312 				}else{
313 					DEADEND();
314 				}
315 				break;
316 			case 62:
317 				if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ *10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
318 					t->status = 63;
319 					t->ucs.byte[1] |= (d << 2) & bb11111100;
320 				}else{
321 					DEADEND();
322 				}
323 				break;
324 			case 63:
325 				if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ *10₁₁₂₂₂₂ 10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
326 					t->status = 64;
327 					t->ucs.byte[1] |= (d >> 4) & bb00000011;
328 					t->ucs.byte[2] |= (d << 4) & bb11110000;
329 				}else{
330 					DEADEND();
331 				}
332 				break;
333 			case 64:
334 				if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ *10₂₂₂₂₃₃ 10₃₃₃₃₃₃, total 31 (+5)
335 					t->status = 65;
336 					t->ucs.byte[2] |= (d >> 2) & bb00001111;
337 					t->ucs.byte[3] |= (d << 6) & bb11000000;
338 				}else{
339 					DEADEND();
340 				}
341 				break;
342 			case 65:
343 				if((d & bb11000000) == bb10000000){ // 1111110₀ 10₀₀₀₀₀₀ 10₁₁₁₁₁₁ 10₁₁₂₂₂₂ 10₂₂₂₂₃₃ *10₃₃₃₃₃₃, total 31 (+5)
344 					t->ucs.byte[3] |= d & bb00111111;
345 					COMMIT();
346 				}else{
347 					DEADEND();
348 				}
349 				break;
350 			default:
351 				DEADEND();
352 		}
353 	}
354 	this_phase->state.status=CONTINUE;
355 	return;
356 }
357