#ifndef lint
/*
 * RCS identification string for this file.  The pointee is const because
 * it is a string literal and must never be written through.
 */
static const char *rcsid = "$Id: race.c,v 1.1 2003/06/04 00:26:07 marka Exp $";
#endif
4 
5 /*
6  * Copyright (c) 2000,2001,2002 Japan Network Information Center.
7  * All rights reserved.
8  *
9  * By using this file, you agree to the terms and conditions set forth bellow.
10  *
11  * 			LICENSE TERMS AND CONDITIONS
12  *
13  * The following License Terms and Conditions apply, unless a different
14  * license is obtained from Japan Network Information Center ("JPNIC"),
15  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
16  * Chiyoda-ku, Tokyo 101-0047, Japan.
17  *
18  * 1. Use, Modification and Redistribution (including distribution of any
19  *    modified or derived work) in source and/or binary forms is permitted
20  *    under this License Terms and Conditions.
21  *
22  * 2. Redistribution of source code must retain the copyright notices as they
23  *    appear in each source code file, this License Terms and Conditions.
24  *
25  * 3. Redistribution in binary form must reproduce the Copyright Notice,
26  *    this License Terms and Conditions, in the documentation and/or other
27  *    materials provided with the distribution.  For the purposes of binary
28  *    distribution the "Copyright Notice" refers to the following language:
29  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
30  *
31  * 4. The name of JPNIC may not be used to endorse or promote products
32  *    derived from this Software without specific prior written approval of
33  *    JPNIC.
34  *
35  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
36  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
38  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
39  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
40  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
41  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
42  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
43  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
44  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
45  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
46  */
47 
48 #include <config.h>
49 
50 #include <stddef.h>
51 #include <stdlib.h>
52 #include <string.h>
53 
54 #include <idn/result.h>
55 #include <idn/assert.h>
56 #include <idn/logmacro.h>
57 #include <idn/converter.h>
58 #include <idn/ucs4.h>
59 #include <idn/debug.h>
60 #include <idn/race.h>
61 #include <idn/util.h>
62 
/*
 * ACE prefix that marks a RACE-encoded string.  A definition supplied
 * before this point (e.g. by the build) takes precedence.
 */
#if !defined(IDN_RACE_PREFIX)
#define IDN_RACE_PREFIX		"bq--"
#endif

/* Header octet used when the data is stored as plain 16-bit units. */
#define RACE_2OCTET_MODE	0xd8
/* Escape octet used inside compressed data, and its companion octet. */
#define RACE_ESCAPE		0xff
#define RACE_ESCAPE_2ND		0x99

#define RACE_BUF_SIZE		128		/* more than enough */

/*
 * Unicode surrogate pair.
 *
 * The offsets and the base are defined first; the predicates and the
 * conversion macros below are written in terms of them.  Note that the
 * macro arguments may be evaluated more than once.
 */
#define SURROGATE_H_OFF		0xd800
#define SURROGATE_L_OFF		0xdc00
#define SURROGATE_BASE		0x10000

#define IS_SURROGATE_HIGH(v)	((v) >= SURROGATE_H_OFF && (v) <= 0xdbff)
#define IS_SURROGATE_LOW(v)	((v) >= SURROGATE_L_OFF && (v) <= 0xdfff)

/* Split a code point at or above SURROGATE_BASE into its two halves. */
#define SURROGATE_HIGH(v) \
	(SURROGATE_H_OFF + (((v) - SURROGATE_BASE) >> 10))
#define SURROGATE_LOW(v) \
	(SURROGATE_L_OFF + ((v) & 0x3ff))

/* Join a high/low surrogate pair back into a single code point. */
#define COMBINE_SURROGATE(h, l) \
	(SURROGATE_BASE + (((h) - SURROGATE_H_OFF) << 10) + \
	 ((l) - SURROGATE_L_OFF))
84 
/*
 * Compression type, as computed by get_compress_mode() and consumed by
 * race_compress_encode().
 */
enum {
	/* all characters are in a single row */
	compress_one = 0,
	/* row 0 and another row */
	compress_two = 1,
	/* nope */
	compress_none = 2
};
93 
94 static idn_result_t	race_decode_decompress(const char *from,
95 					       unsigned short *buf,
96 					       size_t buflen);
97 static idn_result_t	race_compress_encode(const unsigned short *p,
98 					     int compress_mode,
99 					     char *to, size_t tolen);
100 static int		get_compress_mode(unsigned short *p);
101 
102 idn_result_t
idn__race_decode(idn_converter_t ctx,void * privdata,const char * from,unsigned long * to,size_t tolen)103 idn__race_decode(idn_converter_t ctx, void *privdata,
104 		 const char *from, unsigned long *to, size_t tolen) {
105 	unsigned short *buf = NULL;
106 	size_t prefixlen = strlen(IDN_RACE_PREFIX);
107 	size_t fromlen;
108 	size_t buflen;
109 	idn_result_t r;
110 
111 	assert(ctx != NULL);
112 
113 	TRACE(("idn__race_decode(from=\"%s\", tolen=%d)\n",
114 	       idn__debug_xstring(from, 50), (int)tolen));
115 
116 	if (!idn__util_asciihaveaceprefix(from, IDN_RACE_PREFIX)) {
117 		if (*from == '\0') {
118 			r = idn_ucs4_utf8toucs4(from, to, tolen);
119 			goto ret;
120 		}
121 		r = idn_invalid_encoding;
122 		goto ret;
123 	}
124 	from += prefixlen;
125 	fromlen = strlen(from);
126 
127 	/*
128 	 * Allocate sufficient buffer.
129 	 */
130 	buflen = fromlen + 1;
131 	buf = malloc(sizeof(*buf) * buflen);
132 	if (buf == NULL) {
133 		r = idn_nomemory;
134 		goto ret;
135 	}
136 
137 	/*
138 	 * Decode base32 and decompress.
139 	 */
140 	r = race_decode_decompress(from, buf, buflen);
141 	if (r != idn_success)
142 		goto ret;
143 
144 	/*
145 	 * Now 'buf' points the decompressed string, which must contain
146 	 * UTF-16 characters.
147 	 */
148 
149 	/*
150 	 * Convert to UCS4.
151 	 */
152 	r = idn_ucs4_utf16toucs4(buf, to, tolen);
153 	if (r != idn_success)
154 		goto ret;
155 
156 ret:
157 	free(buf);
158 	if (r == idn_success) {
159 		TRACE(("idn__race_decode(): succcess (to=\"%s\")\n",
160 		       idn__debug_ucs4xstring(to, 50)));
161 	} else {
162 		TRACE(("idn__race_decode(): %s\n", idn_result_tostring(r)));
163 	}
164 	return (r);
165 }
166 
167 static idn_result_t
race_decode_decompress(const char * from,unsigned short * buf,size_t buflen)168 race_decode_decompress(const char *from, unsigned short *buf, size_t buflen)
169 {
170 	unsigned short *p = buf;
171 	unsigned int bitbuf = 0;
172 	int bitlen = 0;
173 	int i, j;
174 	size_t len;
175 
176 	while (*from != '\0') {
177 		int c = *from++;
178 		int x;
179 
180 		if ('a' <= c && c <= 'z')
181 			x = c - 'a';
182 		else if ('A' <= c && c <= 'Z')
183 			x = c - 'A';
184 		else if ('2' <= c && c <= '7')
185 			x = c - '2' + 26;
186 		else
187 			return (idn_invalid_encoding);
188 
189 		bitbuf = (bitbuf << 5) + x;
190 		bitlen += 5;
191 		if (bitlen >= 8) {
192 			*p++ = (bitbuf >> (bitlen - 8)) & 0xff;
193 			bitlen -= 8;
194 		}
195 	}
196 	len = p - buf;
197 
198 	/*
199 	 * Now 'buf' holds the decoded string.
200 	 */
201 
202 	/*
203 	 * Decompress.
204 	 */
205 	if (buf[0] == RACE_2OCTET_MODE) {
206 		if ((len - 1) % 2 != 0)
207 			return (idn_invalid_encoding);
208 		for (i = 1, j = 0; i < len; i += 2, j++)
209 			buf[j] = (buf[i] << 8) + buf[i + 1];
210 		len = j;
211 	} else {
212 		unsigned short c = buf[0] << 8;	/* higher octet */
213 
214 		for (i = 1, j = 0; i < len; j++) {
215 			if (buf[i] == RACE_ESCAPE) {
216 				if (i + 1 >= len)
217 					return (idn_invalid_encoding);
218 				else if (buf[i + 1] == RACE_ESCAPE_2ND)
219 					buf[j] = c | 0xff;
220 				else
221 					buf[j] = buf[i + 1];
222 				i += 2;
223 
224 			} else if (buf[i] == 0x99 && c == 0x00) {
225 				/*
226 				 * The RACE specification says this is error.
227 				 */
228 				return (idn_invalid_encoding);
229 
230 			} else {
231 				buf[j] = c | buf[i++];
232 			}
233 		}
234 		len = j;
235 	}
236 	buf[len] = '\0';
237 
238 	return (idn_success);
239 }
240 
241 idn_result_t
idn__race_encode(idn_converter_t ctx,void * privdata,const unsigned long * from,char * to,size_t tolen)242 idn__race_encode(idn_converter_t ctx, void *privdata,
243 		 const unsigned long *from, char *to, size_t tolen) {
244 	char *to_org = to;
245 	unsigned short *p, *buf = NULL;
246 	size_t prefixlen = strlen(IDN_RACE_PREFIX);
247 	size_t buflen;
248 	size_t fromlen;
249 	idn_result_t r;
250 	int compress_mode;
251 
252 	assert(ctx != NULL);
253 
254 	TRACE(("idn__race_encode(from=\"%s\", tolen=%d)\n",
255 	       idn__debug_ucs4xstring(from, 50), (int)tolen));
256 
257 	if (*from == '\0') {
258 		r = idn_ucs4_ucs4toutf8(from, to, tolen);
259 		goto ret;
260 	} else if (idn__util_ucs4haveaceprefix(from, IDN_RACE_PREFIX)) {
261 		r = idn_prohibited;
262 		goto ret;
263 	}
264 
265 	if (tolen < prefixlen) {
266 		r  = idn_buffer_overflow;
267 		goto ret;
268 	}
269 	memcpy(to, IDN_RACE_PREFIX, prefixlen);
270 	to += prefixlen;
271 	tolen -= prefixlen;
272 
273 	fromlen = idn_ucs4_strlen(from);
274 	buflen = fromlen * 2 + 2;
275 
276 	/*
277 	 * Convert to UTF-16.
278 	 * Preserve space for a character at the top of the buffer.
279 	 */
280 	for (;;) {
281 		unsigned short *new_buf;
282 
283 		new_buf = realloc(buf, sizeof(*buf) * buflen);
284 		if (new_buf == NULL) {
285 			r = idn_nomemory;
286 			goto ret;
287 		}
288 		buf = new_buf;
289 
290 		r = idn_ucs4_ucs4toutf16(from, buf + 1, buflen - 1);
291 		if (r == idn_success)
292 			break;
293 		else if (r != idn_buffer_overflow)
294 			goto ret;
295 
296 		buflen = fromlen * 2 + 2;
297 	}
298 	p = buf + 1;
299 
300 	/*
301 	 * Now 'p' contains UTF-16 encoded string.
302 	 */
303 
304 	/*
305 	 * Check U+0099.
306 	 * RACE doesn't permit U+0099 in an input string.
307 	 */
308 	for (p = buf + 1; *p != '\0'; p++) {
309 		if (*p == 0x0099) {
310 			r = idn_invalid_encoding;
311 			goto ret;
312 		}
313 	}
314 
315 	/*
316 	 * Compress, encode in base-32 and output.
317 	 */
318 	compress_mode = get_compress_mode(buf + 1);
319 	r = race_compress_encode(buf, compress_mode, to, tolen);
320 
321 ret:
322 	free(buf);
323 	if (r == idn_success) {
324 		TRACE(("idn__race_encode(): succcess (to=\"%s\")\n",
325 		       idn__debug_xstring(to_org, 50)));
326 	} else {
327 		TRACE(("idn__race_encode(): %s\n", idn_result_tostring(r)));
328 	}
329 	return (r);
330 }
331 
332 static idn_result_t
race_compress_encode(const unsigned short * p,int compress_mode,char * to,size_t tolen)333 race_compress_encode(const unsigned short *p, int compress_mode,
334 		     char *to, size_t tolen)
335 {
336 	unsigned long bitbuf = *p++;	/* bit stream buffer */
337 	int bitlen = 8;			/* # of bits in 'bitbuf' */
338 
339 	while (*p != '\0' || bitlen > 0) {
340 		unsigned int c = *p;
341 
342 		if (c == '\0') {
343 			/* End of data.  Flush. */
344 			bitbuf <<= (5 - bitlen);
345 			bitlen = 5;
346 		} else if (compress_mode == compress_none) {
347 			/* Push 16 bit data. */
348 			bitbuf = (bitbuf << 16) | c;
349 			bitlen += 16;
350 			p++;
351 		} else {/* compress_mode == compress_one/compress_two */
352 			/* Push 8 or 16 bit data. */
353 			if (compress_mode == compress_two &&
354 			    (c & 0xff00) == 0) {
355 				/* Upper octet is zero (and not U1). */
356 				bitbuf = (bitbuf << 16) | 0xff00 | c;
357 				bitlen += 16;
358 			} else if ((c & 0xff) == 0xff) {
359 				/* Lower octet is 0xff. */
360 				bitbuf = (bitbuf << 16) |
361 					(RACE_ESCAPE << 8) | RACE_ESCAPE_2ND;
362 				bitlen += 16;
363 			} else {
364 				/* Just output lower octet. */
365 				bitbuf = (bitbuf << 8) | (c & 0xff);
366 				bitlen += 8;
367 			}
368 			p++;
369 		}
370 
371 		/*
372 		 * Output bits in 'bitbuf' in 5-bit unit.
373 		 */
374 		while (bitlen >= 5) {
375 			int x;
376 
377 			/* Get top 5 bits. */
378 			x = (bitbuf >> (bitlen - 5)) & 0x1f;
379 			bitlen -= 5;
380 
381 			/* Encode. */
382 			if (x < 26)
383 				x += 'a';
384 			else
385 				x = (x - 26) + '2';
386 
387 			if (tolen < 1)
388 				return (idn_buffer_overflow);
389 
390 			*to++ = x;
391 			tolen--;
392 		}
393 	}
394 
395 	if (tolen <= 0)
396 		return (idn_buffer_overflow);
397 
398 	*to = '\0';
399 	return (idn_success);
400 }
401 
402 static int
get_compress_mode(unsigned short * p)403 get_compress_mode(unsigned short *p) {
404 	int zero = 0;
405 	unsigned int upper = 0;
406 	unsigned short *modepos = p - 1;
407 
408 	while (*p != '\0') {
409 		unsigned int hi = *p++ & 0xff00;
410 
411 		if (hi == 0) {
412 			zero++;
413 		} else if (hi == upper) {
414 			;
415 		} else if (upper == 0) {
416 			upper = hi;
417 		} else {
418 			*modepos = RACE_2OCTET_MODE;
419 			return (compress_none);
420 		}
421 	}
422 	*modepos = upper >> 8;
423 	if (upper > 0 && zero > 0)
424 		return (compress_two);
425 	else
426 		return (compress_one);
427 }
428