1 /*	$NetBSD: ucs4.c,v 1.4 2014/12/10 04:37:55 christos Exp $	*/
2 
3 #ifndef lint
4 static char *rcsid = "Id: ucs4.c,v 1.1 2003/06/04 00:26:14 marka Exp ";
5 #endif
6 
7 /*
8  * Copyright (c) 2001 Japan Network Information Center.  All rights reserved.
9  *
10  * By using this file, you agree to the terms and conditions set forth bellow.
11  *
12  * 			LICENSE TERMS AND CONDITIONS
13  *
14  * The following License Terms and Conditions apply, unless a different
15  * license is obtained from Japan Network Information Center ("JPNIC"),
16  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
17  * Chiyoda-ku, Tokyo 101-0047, Japan.
18  *
19  * 1. Use, Modification and Redistribution (including distribution of any
20  *    modified or derived work) in source and/or binary forms is permitted
21  *    under this License Terms and Conditions.
22  *
23  * 2. Redistribution of source code must retain the copyright notices as they
24  *    appear in each source code file, this License Terms and Conditions.
25  *
26  * 3. Redistribution in binary form must reproduce the Copyright Notice,
27  *    this License Terms and Conditions, in the documentation and/or other
28  *    materials provided with the distribution.  For the purposes of binary
29  *    distribution the "Copyright Notice" refers to the following language:
30  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
31  *
32  * 4. The name of JPNIC may not be used to endorse or promote products
33  *    derived from this Software without specific prior written approval of
34  *    JPNIC.
35  *
36  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
37  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
38  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
39  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
40  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
41  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
42  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
45  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
46  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
47  */
48 
49 #include <config.h>
50 
51 #include <stddef.h>
52 #include <stdlib.h>
53 #include <string.h>
54 
55 #include <idn/assert.h>
56 #include <idn/result.h>
57 #include <idn/logmacro.h>
58 #include <idn/util.h>
59 #include <idn/ucs4.h>
60 #include <idn/debug.h>
61 
/*
 * UTF-16 surrogate handling.
 *
 * SURROGATE_BASE is the first code point that needs a surrogate pair;
 * SURROGATE_H_OFF and SURROGATE_L_OFF are the first high and low surrogate
 * code units respectively.
 */
#define SURROGATE_BASE		0x10000
#define SURROGATE_H_OFF		0xd800
#define SURROGATE_L_OFF		0xdc00

/* Range tests.  The argument may be evaluated twice. */
#define IS_SURROGATE_HIGH(v)	((v) >= 0xd800 && (v) <= 0xdbff)
#define IS_SURROGATE_LOW(v)	((v) >= 0xdc00 && (v) <= 0xdfff)

/* Split a code point in the range 0x10000..0x10ffff into its two halves. */
#define SURROGATE_HIGH(v)	((((v) - 0x10000) >> 10) + SURROGATE_H_OFF)
#define SURROGATE_LOW(v)	(((v) & 0x3ff) + SURROGATE_L_OFF)

/* Rebuild the code point from a high/low surrogate pair. */
#define COMBINE_SURROGATE(h, l) \
	(SURROGATE_BASE + (((h) - SURROGATE_H_OFF) << 10) + \
	 ((l) - SURROGATE_L_OFF))

/*
 * ASCII-only case conversion.
 * Beware: the argument is evaluated more than once, so it must not have
 * side effects.
 */
#define ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? ((c) - 'a' + 'A') : (c))
#define ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A' + 'a') : (c))
83 
84 idn_result_t
idn_ucs4_ucs4toutf16(const unsigned long * ucs4,unsigned short * utf16,size_t tolen)85 idn_ucs4_ucs4toutf16(const unsigned long *ucs4, unsigned short *utf16,
86 		     size_t tolen) {
87 	unsigned short *utf16p = utf16;
88 	unsigned long v;
89 	idn_result_t r;
90 
91 	TRACE(("idn_ucs4_ucs4toutf16(ucs4=\"%s\", tolen=%d)\n",
92 	       idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
93 
94 	while (*ucs4 != '\0') {
95 		v = *ucs4++;
96 
97 		if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
98 			WARNING(("idn_ucs4_ucs4toutf16: UCS4 string contains "
99 				 "surrogate pair\n"));
100 			r = idn_invalid_encoding;
101 			goto ret;
102 		} else if (v > 0xffff) {
103 			/* Convert to surrogate pair */
104 			if (v >= 0x110000) {
105 				r = idn_invalid_encoding;
106 				goto ret;
107 			}
108 			if (tolen < 2) {
109 				r = idn_buffer_overflow;
110 				goto ret;
111 			}
112 			*utf16p++ = SURROGATE_HIGH(v);
113 			*utf16p++ = SURROGATE_LOW(v);
114 			tolen -= 2;
115 		} else {
116 			if (tolen < 1) {
117 				r = idn_buffer_overflow;
118 				goto ret;
119 			}
120 			*utf16p++ = v;
121 			tolen--;
122 		}
123 	}
124 
125 	if (tolen < 1) {
126 		r = idn_buffer_overflow;
127 		goto ret;
128 	}
129 	*utf16p = '\0';
130 
131 	r = idn_success;
132 ret:
133 	if (r == idn_success) {
134 		TRACE(("idn_ucs4_ucs4toutf16(): success (utf16=\"%s\")\n",
135 		       idn__debug_utf16xstring(utf16, 50)));
136 	} else {
137 		TRACE(("idn_ucs4_ucs4toutf16(): %s\n",
138 		       idn_result_tostring(r)));
139 	}
140 	return (r);
141 }
142 
143 idn_result_t
idn_ucs4_utf16toucs4(const unsigned short * utf16,unsigned long * ucs4,size_t tolen)144 idn_ucs4_utf16toucs4(const unsigned short *utf16, unsigned long *ucs4,
145 		     size_t tolen) {
146 	unsigned long *ucs4p = ucs4;
147 	unsigned short v0, v1;
148 	idn_result_t r;
149 
150 	TRACE(("idn_ucs4_utf16toucs4(utf16=\"%s\", tolen=%d)\n",
151 	       idn__debug_utf16xstring(utf16, 50), (int)tolen));
152 
153 	while (*utf16 != '\0') {
154 		v0 = *utf16;
155 
156 		if (tolen < 1) {
157 			r = idn_buffer_overflow;
158 			goto ret;
159 		}
160 
161 		if (IS_SURROGATE_HIGH(v0)) {
162 			v1 = *(utf16 + 1);
163 			if (!IS_SURROGATE_LOW(v1)) {
164 				WARNING(("idn_ucs4_utf16toucs4: "
165 					 "corrupted surrogate pair\n"));
166 				r = idn_invalid_encoding;
167 				goto ret;
168 			}
169 			*ucs4p++ = COMBINE_SURROGATE(v0, v1);
170 			tolen--;
171 			utf16 += 2;
172 
173 		} else {
174 			*ucs4p++ = v0;
175 			tolen--;
176 			utf16++;
177 
178 		}
179 	}
180 
181 	if (tolen < 1) {
182 		r = idn_buffer_overflow;
183 		goto ret;
184 	}
185 	*ucs4p = '\0';
186 
187 	r = idn_success;
188 ret:
189 	if (r == idn_success) {
190 		TRACE(("idn_ucs4_utf16toucs4(): success (ucs4=\"%s\")\n",
191 		       idn__debug_ucs4xstring(ucs4, 50)));
192 	} else {
193 		TRACE(("idn_ucs4_utf16toucs4(): %s\n",
194 		       idn_result_tostring(r)));
195 	}
196 	return (r);
197 }
198 
199 idn_result_t
idn_ucs4_utf8toucs4(const char * utf8,unsigned long * ucs4,size_t tolen)200 idn_ucs4_utf8toucs4(const char *utf8, unsigned long *ucs4, size_t tolen) {
201 	const unsigned char *utf8p = (const unsigned char *)utf8;
202 	unsigned long *ucs4p = ucs4;
203 	unsigned long v, min;
204 	unsigned char c;
205 	int width;
206 	int i;
207 	idn_result_t r;
208 
209 	TRACE(("idn_ucs4_utf8toucs4(utf8=\"%s\", tolen=%d)\n",
210 	       idn__debug_xstring(utf8, 50), (int)tolen));
211 
212 	while(*utf8p != '\0') {
213 		c = *utf8p++;
214 		if (c < 0x80) {
215 			v = c;
216 			min = 0;
217 			width = 1;
218 		} else if (c < 0xc0) {
219 			WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
220 			r = idn_invalid_encoding;
221 			goto ret;
222 		} else if (c < 0xe0) {
223 			v = c & 0x1f;
224 			min = 0x80;
225 			width = 2;
226 		} else if (c < 0xf0) {
227 			v = c & 0x0f;
228 			min = 0x800;
229 			width = 3;
230 		} else if (c < 0xf8) {
231 			v = c & 0x07;
232 			min = 0x10000;
233 			width = 4;
234 		} else if (c < 0xfc) {
235 			v = c & 0x03;
236 			min = 0x200000;
237 			width = 5;
238 		} else if (c < 0xfe) {
239 			v = c & 0x01;
240 			min = 0x4000000;
241 			width = 6;
242 		} else {
243 			WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
244 			r = idn_invalid_encoding;
245 			goto ret;
246 		}
247 
248 		for (i = width - 1; i > 0; i--) {
249 			c = *utf8p++;
250 			if (c < 0x80 || 0xc0 <= c) {
251 				WARNING(("idn_ucs4_utf8toucs4: "
252 					 "invalid character\n"));
253 				r = idn_invalid_encoding;
254 				goto ret;
255 			}
256 			v = (v << 6) | (c & 0x3f);
257 		}
258 
259 	        if (v < min) {
260 			WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
261 			r = idn_invalid_encoding;
262 			goto ret;
263 		}
264 		if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
265 			WARNING(("idn_ucs4_utf8toucs4: UTF-8 string contains "
266 				 "surrogate pair\n"));
267 			r = idn_invalid_encoding;
268 			goto ret;
269 		}
270 		if (tolen < 1) {
271 			r = idn_buffer_overflow;
272 			goto ret;
273 		}
274 		tolen--;
275 		*ucs4p++ = v;
276 	}
277 
278 	if (tolen < 1) {
279 		r = idn_buffer_overflow;
280 		goto ret;
281 	}
282 	*ucs4p = '\0';
283 
284 	r = idn_success;
285 ret:
286 	if (r == idn_success) {
287 		TRACE(("idn_ucs4_utf8toucs4(): success (ucs4=\"%s\")\n",
288 		       idn__debug_ucs4xstring(ucs4, 50)));
289 	} else {
290 		TRACE(("idn_ucs4_utf8toucs4(): %s\n",
291 		       idn_result_tostring(r)));
292 	}
293 	return (r);
294 }
295 
296 idn_result_t
idn_ucs4_ucs4toutf8(const unsigned long * ucs4,char * utf8,size_t tolen)297 idn_ucs4_ucs4toutf8(const unsigned long *ucs4, char *utf8, size_t tolen) {
298 	unsigned char *utf8p = (unsigned char *)utf8;
299 	unsigned long v;
300 	int width;
301 	int mask;
302 	int offset;
303 	idn_result_t r;
304 
305 	TRACE(("idn_ucs4_ucs4toutf8(ucs4=\"%s\", tolen=%d)\n",
306 	       idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
307 
308 	while (*ucs4 != '\0') {
309 		v = *ucs4++;
310 		if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
311 			WARNING(("idn_ucs4_ucs4toutf8: UCS4 string contains "
312 				 "surrogate pair\n"));
313 			r = idn_invalid_encoding;
314 			goto ret;
315 		}
316 		if (v < 0x80) {
317 			mask = 0;
318 			width = 1;
319 		} else if (v < 0x800) {
320 			mask = 0xc0;
321 			width = 2;
322 		} else if (v < 0x10000) {
323 			mask = 0xe0;
324 			width = 3;
325 		} else if (v < 0x200000) {
326 			mask = 0xf0;
327 			width = 4;
328 		} else if (v < 0x4000000) {
329 			mask = 0xf8;
330 			width = 5;
331 		} else if (v < 0x80000000) {
332 			mask = 0xfc;
333 			width = 6;
334 		} else {
335 			WARNING(("idn_ucs4_ucs4toutf8: invalid character\n"));
336 			r = idn_invalid_encoding;
337 			goto ret;
338 		}
339 
340 		if (tolen < width) {
341 			r = idn_buffer_overflow;
342 			goto ret;
343 		}
344 		offset = 6 * (width - 1);
345 		*utf8p++ = (v >> offset) | mask;
346 		mask = 0x80;
347 		while (offset > 0) {
348 			offset -= 6;
349 			*utf8p++ = ((v >> offset) & 0x3f) | mask;
350 		}
351 		tolen -= width;
352 	}
353 
354 	if (tolen < 1) {
355 		r = idn_buffer_overflow;
356 		goto ret;
357 	}
358 	*utf8p = '\0';
359 
360 	r = idn_success;
361 ret:
362 	if (r == idn_success) {
363 		TRACE(("idn_ucs4_ucs4toutf8(): success (utf8=\"%s\")\n",
364 		       idn__debug_xstring(utf8, 50)));
365 	} else {
366 		TRACE(("idn_ucs4_ucs4toutf8(): %s\n",
367 		       idn_result_tostring(r)));
368 	}
369 	return (r);
370 }
371 
/*
 * Return the number of elements in the UCS-4 string 'ucs4' that precede
 * its terminating zero element.
 */
size_t
idn_ucs4_strlen(const unsigned long *ucs4) {
	const unsigned long *end = ucs4;

	while (*end != '\0')
		end++;

	return ((size_t)(end - ucs4));
}
381 
/*
 * Copy the UCS-4 string 'from', terminator included, into 'to' and return
 * 'to'.  No bounds checking is performed on the destination.
 */
unsigned long *
idn_ucs4_strcpy(unsigned long *to, const unsigned long *from) {
	size_t i;

	for (i = 0; from[i] != '\0'; i++)
		to[i] = from[i];
	to[i] = '\0';

	return (to);
}
392 
/*
 * Append the UCS-4 string 'from' to the end of the UCS-4 string 'to',
 * re-terminate the result, and return 'to'.  No bounds checking is
 * performed on the destination.
 */
unsigned long *
idn_ucs4_strcat(unsigned long *to, const unsigned long *from) {
	unsigned long *tail = to;

	/* Locate the current terminator of the destination. */
	while (*tail != '\0')
		tail++;

	for (; *from != '\0'; from++, tail++)
		*tail = *from;
	*tail = '\0';

	return (to);
}
406 
/*
 * Compare two UCS-4 strings element by element.
 * Returns 1 if 'str1' sorts after 'str2', -1 if it sorts before, and 0 if
 * the strings are equal.
 */
int
idn_ucs4_strcmp(const unsigned long *str1, const unsigned long *str2) {
	for (;; str1++, str2++) {
		if (*str1 != *str2)
			return ((*str1 > *str2) ? 1 : -1);
		if (*str1 == '\0')
			return (0);
	}
}
425 
/*
 * Compare two UCS-4 strings element by element, treating ASCII 'A'..'Z'
 * as equal to 'a'..'z'.  Elements outside that range are compared
 * unchanged.
 * Returns 1 if 'str1' sorts after 'str2', -1 if it sorts before, and 0 if
 * the strings are equal under that folding.
 */
int
idn_ucs4_strcasecmp(const unsigned long *str1, const unsigned long *str2) {
	unsigned long lhs, rhs;

	for (;; str1++, str2++) {
		lhs = ASCII_TOLOWER(*str1);
		rhs = ASCII_TOLOWER(*str2);
		if (lhs != rhs)
			return ((lhs > rhs) ? 1 : -1);
		/* Folding never yields zero for a nonzero element. */
		if (lhs == '\0')
			return (0);
	}
}
450 
451 
/*
 * Return a malloc()ed copy of the UCS-4 string 'str', terminator included,
 * or NULL if the allocation fails.  The caller is responsible for passing
 * the result to free().
 */
unsigned long *
idn_ucs4_strdup(const unsigned long *str) {
	size_t nbytes = sizeof(*str) * (idn_ucs4_strlen(str) + 1);
	unsigned long *copy = malloc(nbytes);

	if (copy != NULL)
		memcpy(copy, str, nbytes);

	return copy;
}
464