1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this files was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf16.h"
32 
33 static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
34 
/* NULL-terminated list of alternative spellings accepted for "UTF-16". */
35 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
36 
/*
 * Descriptor for generic UTF-16 (byte order not fixed by the name).
 * It ties the encoding id to its name strings, the alias list above and the
 * two conversion tables (UTF-16 -> wchar and wchar -> UTF-16).
 * NOTE(review): the initializer is positional; the exact meaning of each slot
 * (including the NULL one) comes from the mbfl_encoding declaration, which is
 * not part of this file -- confirm against the header before reordering.
 */
37 const mbfl_encoding mbfl_encoding_utf16 = {
38 	mbfl_no_encoding_utf16,
39 	"UTF-16",
40 	"UTF-16",
41 	mbfl_encoding_utf16_aliases,
42 	NULL,
43 	MBFL_ENCTYPE_MWC2,
44 	&vtbl_utf16_wchar,
45 	&vtbl_wchar_utf16
46 };
47 
/*
 * Descriptor for big-endian UTF-16. Unlike the generic UTF-16 descriptor it
 * supplies no alias list (NULL in that slot) and points at the UTF-16BE
 * conversion tables.
 * NOTE(review): positional initializer; slot meanings are defined by the
 * mbfl_encoding declaration, which is not visible in this file.
 */
48 const mbfl_encoding mbfl_encoding_utf16be = {
49 	mbfl_no_encoding_utf16be,
50 	"UTF-16BE",
51 	"UTF-16BE",
52 	NULL,
53 	NULL,
54 	MBFL_ENCTYPE_MWC2,
55 	&vtbl_utf16be_wchar,
56 	&vtbl_wchar_utf16be
57 };
58 
/*
 * Descriptor for little-endian UTF-16. It supplies no alias list (NULL in
 * that slot) and points at the UTF-16LE conversion tables.
 * NOTE(review): positional initializer; slot meanings are defined by the
 * mbfl_encoding declaration, which is not visible in this file.
 */
59 const mbfl_encoding mbfl_encoding_utf16le = {
60 	mbfl_no_encoding_utf16le,
61 	"UTF-16LE",
62 	"UTF-16LE",
63 	NULL,
64 	NULL,
65 	MBFL_ENCTYPE_MWC2,
66 	&vtbl_utf16le_wchar,
67 	&vtbl_wchar_utf16le
68 };
69 
/*
 * Conversion table: UTF-16 of unknown byte order -> wchar.
 * Uses mbfl_filt_conv_utf16_wchar, which inspects the first 16-bit unit and
 * then replaces itself with the BE or LE decoder, and the truncation-aware
 * flush defined at the end of this file.
 * NOTE(review): the role of the two NULL slots is defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
70 const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
71 	mbfl_no_encoding_utf16,
72 	mbfl_no_encoding_wchar,
73 	mbfl_filt_conv_common_ctor,
74 	NULL,
75 	mbfl_filt_conv_utf16_wchar,
76 	mbfl_filt_conv_utf16_wchar_flush,
77 	NULL,
78 };
79 
/*
 * Conversion table: wchar -> generic UTF-16.
 * The encoder listed here is mbfl_filt_conv_wchar_utf16be, i.e. output
 * labelled plain "UTF-16" is written in big-endian byte order; that encoder
 * does not write a byte order mark.
 * NOTE(review): the role of the two NULL slots is defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
80 const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
81 	mbfl_no_encoding_wchar,
82 	mbfl_no_encoding_utf16,
83 	mbfl_filt_conv_common_ctor,
84 	NULL,
85 	mbfl_filt_conv_wchar_utf16be,
86 	mbfl_filt_conv_common_flush,
87 	NULL,
88 };
89 
/*
 * Conversion table: UTF-16BE -> wchar.
 * Uses the big-endian decoder directly (no byte order detection) together
 * with the truncation-aware flush defined at the end of this file.
 * NOTE(review): the role of the two NULL slots is defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
90 const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
91 	mbfl_no_encoding_utf16be,
92 	mbfl_no_encoding_wchar,
93 	mbfl_filt_conv_common_ctor,
94 	NULL,
95 	mbfl_filt_conv_utf16be_wchar,
96 	mbfl_filt_conv_utf16_wchar_flush,
97 	NULL,
98 };
99 
/*
 * Conversion table: wchar -> UTF-16BE.
 * Uses mbfl_filt_conv_wchar_utf16be as the encoder and the shared
 * mbfl_filt_conv_common_flush (defined outside this file) for flushing.
 * NOTE(review): the role of the two NULL slots is defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
100 const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
101 	mbfl_no_encoding_wchar,
102 	mbfl_no_encoding_utf16be,
103 	mbfl_filt_conv_common_ctor,
104 	NULL,
105 	mbfl_filt_conv_wchar_utf16be,
106 	mbfl_filt_conv_common_flush,
107 	NULL,
108 };
109 
/*
 * Conversion table: UTF-16LE -> wchar.
 * Uses the little-endian decoder directly (no byte order detection) together
 * with the truncation-aware flush defined at the end of this file.
 * NOTE(review): the role of the two NULL slots is defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
110 const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
111 	mbfl_no_encoding_utf16le,
112 	mbfl_no_encoding_wchar,
113 	mbfl_filt_conv_common_ctor,
114 	NULL,
115 	mbfl_filt_conv_utf16le_wchar,
116 	mbfl_filt_conv_utf16_wchar_flush,
117 	NULL,
118 };
119 
/*
 * Conversion table: wchar -> UTF-16LE.
 * Uses mbfl_filt_conv_wchar_utf16le as the encoder and the shared
 * mbfl_filt_conv_common_flush (defined outside this file) for flushing.
 * NOTE(review): the role of the two NULL slots is defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
120 const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
121 	mbfl_no_encoding_wchar,
122 	mbfl_no_encoding_utf16le,
123 	mbfl_filt_conv_common_ctor,
124 	NULL,
125 	mbfl_filt_conv_wchar_utf16le,
126 	mbfl_filt_conv_common_flush,
127 	NULL,
128 };
129 
/*
 * CK ("check"): evaluate `statement` once and, if its value is negative,
 * return -1 from the enclosing function immediately. Any code placed after a
 * CK(...) in a filter is therefore skipped when the downstream call fails,
 * which matters for state updates that follow an output call.
 */
130 #define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
131 
/*
 * Decoder for UTF-16 input whose byte order has not been established yet.
 *
 * The first two bytes are read as a big-endian 16-bit unit:
 *   - 0xFFFE is a byte-swapped BOM: the filter re-points itself at the
 *     little-endian decoder and emits nothing;
 *   - anything else commits the filter to the big-endian decoder. A 0xFEFF
 *     BOM is swallowed, a leading surrogate is stored so the big-endian
 *     decoder can finish the pair, a stray trailing surrogate is reported as
 *     MBFL_BAD_INPUT, and every other unit is emitted as is.
 *
 * c      - next input byte (only the low 8 bits are used)
 * filter - conversion state; status, cache and filter_function are updated
 *
 * Returns 0, or -1 as soon as the output function reports a negative value.
 */
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
	int unit;

	if (filter->status == 0) {
		/* Only half a unit so far; keep it until its partner arrives */
		filter->cache = c & 0xFF;
		filter->status = 1;
		return 0;
	}

	unit = (filter->cache << 8) | (c & 0xFF);

	if (unit == 0xFFFE) {
		/* Byte-swapped BOM: everything after it is little-endian */
		filter->filter_function = mbfl_filt_conv_utf16le_wchar;
		filter->cache = filter->status = 0;
		return 0;
	}

	/* No little-endian BOM, so the big-endian assumption stands */
	filter->filter_function = mbfl_filt_conv_utf16be_wchar;

	if (unit >= 0xD800 && unit <= 0xDBFF) {
		/* Leading surrogate: stash its 10 payload bits and wait for the
		 * trailing half (state 2 of the big-endian decoder) */
		filter->cache = unit & 0x3FF;
		filter->status = 2;
		return 0;
	}

	if (unit >= 0xDC00 && unit <= 0xDFFF) {
		/* Trailing surrogate with no leading half before it */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	} else if (unit != 0xFEFF) {
		/* Ordinary code unit; a big-endian BOM is dropped silently */
		CK((*filter->output_function)(unit, filter->data));
	}

	filter->cache = filter->status = 0;
	return 0;
}
163 
/*
 * Decode one byte of big-endian UTF-16 into wchar output.
 *
 * filter->status tracks the position inside the current character:
 *   0  expecting the high byte of a code unit
 *   1  expecting the low byte of a code unit
 *   2  leading surrogate seen; expecting the high byte of the trailing one
 *   3  expecting the low byte of the trailing surrogate
 * Any other status value leaves the filter untouched.
 *
 * Malformed sequences (a trailing surrogate first, or a leading surrogate not
 * followed by a trailing one) are reported by emitting MBFL_BAD_INPUT.
 *
 * c      - next input byte (only the low 8 bits are used)
 * filter - conversion state; status and cache are updated
 *
 * Returns 0, or -1 as soon as the output function reports a negative value.
 */
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
	const int state = filter->status;
	const int byte = c & 0xFF;
	int unit;

	if (state == 0) {
		filter->cache = byte;
		filter->status = 1;
	} else if (state == 1) {
		unit = (filter->cache << 8) | byte;

		if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* Leading surrogate: keep only its 10 payload bits */
			filter->cache = unit & 0x3FF;
			filter->status = 2;
			return 0;
		}

		if (unit >= 0xDC00 && unit <= 0xDFFF) {
			/* Trailing surrogate arrived without a leading one */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			CK((*filter->output_function)(unit, filter->data));
		}
		filter->status = 0;
	} else if (state == 2) {
		/* Payload of the leading surrogate moves up to bits 8..17; the new
		 * byte occupies the low 8 bits */
		filter->cache = (filter->cache << 8) | byte;
		filter->status = 3;
	} else if (state == 3) {
		unit = ((filter->cache & 0xFF) << 8) | byte;

		if (unit >= 0xDC00 && unit <= 0xDFFF) {
			/* Proper pair: combine both 10-bit payloads */
			int codepoint = ((filter->cache & 0x3FF00) << 2) + (unit & 0x3FF) + 0x10000;

			CK((*filter->output_function)(codepoint, filter->data));
			filter->status = 0;
		} else if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* A second leading surrogate: the first one is an error and
			 * this one starts a new pair */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->cache = unit & 0x3FF;
			filter->status = 2;
		} else {
			/* The leading surrogate was followed by a non-surrogate unit:
			 * flag the error, then pass the unit through */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(unit, filter->data));
			filter->status = 0;
		}
	}

	return 0;
}
214 
/*
 * Encode one wchar value as big-endian UTF-16.
 *
 * Values in [0, MBFL_WCSPLANE_UCS2MAX) become a single 16-bit unit, high byte
 * first. Values in [MBFL_WCSPLANE_SUPMIN, MBFL_WCSPLANE_SUPMAX) become a
 * surrogate pair. Everything else is handed to mbfl_filt_conv_illegal_output.
 *
 * NOTE(review): the surrogate arithmetic below only yields a valid leading
 * surrogate for c < 0x110000 -- confirm MBFL_WCSPLANE_SUPMAX does not exceed
 * that value.
 *
 * c      - value to encode
 * filter - conversion state; only its output hook is used here
 *
 * Returns 0, or -1 as soon as a downstream call reports a negative value.
 */
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
{
	int lead, trail;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(c & 0xff, filter->data));
		return 0;
	}

	if (!(c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX)) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	/* Subtracting 0x40 from (c >> 10) is the same as removing 0x10000 from c
	 * before taking its upper 10 bits */
	lead = ((c >> 10) - 0x40) | 0xd800;
	trail = (c & 0x3ff) | 0xdc00;

	CK((*filter->output_function)((lead >> 8) & 0xff, filter->data));
	CK((*filter->output_function)(lead & 0xff, filter->data));
	CK((*filter->output_function)((trail >> 8) & 0xff, filter->data));
	CK((*filter->output_function)(trail & 0xff, filter->data));

	return 0;
}
235 
/*
 * Decode one byte of little-endian UTF-16 into wchar output.
 *
 * filter->status tracks the position inside the current character:
 *   0  expecting the low byte of a code unit
 *   1  expecting the high byte of a code unit
 *   2  leading surrogate seen; expecting the low byte of the trailing one
 *   3  expecting the high byte of the trailing surrogate
 * Any other status value leaves the filter untouched.
 *
 * Malformed sequences (a trailing surrogate first, or a leading surrogate not
 * followed by a trailing one) are reported by emitting MBFL_BAD_INPUT.
 *
 * c      - next input byte (only the low 8 bits are used)
 * filter - conversion state; status and cache are updated
 *
 * Returns 0, or -1 as soon as the output function reports a negative value.
 */
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
{
	const int state = filter->status;
	const int byte = c & 0xff;
	int unit;

	if (state == 0) {
		filter->cache = byte;
		filter->status = 1;
	} else if (state == 1) {
		if ((byte & 0xfc) == 0xd8) {
			/* High byte marks a leading surrogate; add its two payload bits
			 * on top of the eight already cached */
			filter->cache += ((byte & 0x3) << 8);
			filter->status = 2;
			return 0;
		}

		if ((byte & 0xfc) == 0xdc) {
			/* Trailing surrogate arrived without a leading one */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			CK((*filter->output_function)(filter->cache + (byte << 8), filter->data));
		}
		filter->status = 0;
	} else if (state == 2) {
		/* Shift the leading payload into its final position (bits 10..19)
		 * and park the low byte of the next unit beneath it */
		filter->cache = (filter->cache << 10) + byte;
		filter->status = 3;
	} else if (state == 3) {
		unit = (filter->cache & 0xFF) | (byte << 8);

		if (unit >= 0xDC00 && unit <= 0xDFFF) {
			/* Proper pair: the cache already holds everything except the top
			 * two bits of the trailing payload */
			int codepoint = filter->cache + ((byte & 0x3) << 8) + 0x10000;

			CK((*filter->output_function)(codepoint, filter->data));
			filter->status = 0;
		} else if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* A second leading surrogate: the first one is an error and
			 * this one starts a new pair */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->cache = unit & 0x3FF;
			filter->status = 2;
		} else {
			/* The leading surrogate was followed by a non-surrogate unit:
			 * flag the error, then pass the unit through */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(unit, filter->data));
			filter->status = 0;
		}
	}

	return 0;
}
290 
/*
 * Encode one wchar value as little-endian UTF-16.
 *
 * Values in [0, MBFL_WCSPLANE_UCS2MAX) become a single 16-bit unit, low byte
 * first. Values in [MBFL_WCSPLANE_SUPMIN, MBFL_WCSPLANE_SUPMAX) become a
 * surrogate pair, each half low byte first. Everything else is handed to
 * mbfl_filt_conv_illegal_output.
 *
 * NOTE(review): the surrogate arithmetic below only yields a valid leading
 * surrogate for c < 0x110000 -- confirm MBFL_WCSPLANE_SUPMAX does not exceed
 * that value.
 *
 * c      - value to encode
 * filter - conversion state; only its output hook is used here
 *
 * Returns 0, or -1 as soon as a downstream call reports a negative value.
 */
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
{
	int lead, trail;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)(c & 0xff, filter->data));
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		return 0;
	}

	if (!(c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX)) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	/* Subtracting 0x40 from (c >> 10) is the same as removing 0x10000 from c
	 * before taking its upper 10 bits */
	lead = ((c >> 10) - 0x40) | 0xd800;
	trail = (c & 0x3ff) | 0xdc00;

	CK((*filter->output_function)(lead & 0xff, filter->data));
	CK((*filter->output_function)((lead >> 8) & 0xff, filter->data));
	CK((*filter->output_function)(trail & 0xff, filter->data));
	CK((*filter->output_function)((trail >> 8) & 0xff, filter->data));

	return 0;
}
311 
/*
 * Flush handler shared by the UTF-16, UTF-16BE and UTF-16LE decoders.
 *
 * A non-zero status at flush time means the input stopped in the middle of a
 * code unit or a surrogate pair, so the incomplete character is reported as
 * MBFL_BAD_INPUT. The decoder state is cleared before the report is emitted:
 * without that, a second flush would report the same truncation again and
 * any bytes fed afterwards would be combined with the stale cached ones.
 *
 * After that the downstream flush hook, if one is installed, is invoked; its
 * result is not inspected.
 *
 * filter - conversion state; status and cache are reset when input was
 *          truncated
 *
 * Returns 0, or -1 if emitting MBFL_BAD_INPUT reports a negative value (in
 * which case the downstream flush hook is not called).
 */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* Input string was truncated; reset first so the state is clean even
		 * if the output call below makes CK return early */
		filter->cache = filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
325