1 /*	$NetBSD: unormalize.c,v 1.4 2014/12/10 04:37:55 christos Exp $	*/
2 
3 #ifndef lint
4 static char *rcsid = "Id: unormalize.c,v 1.1 2003/06/04 00:26:43 marka Exp ";
5 #endif
6 
7 /*
8  * Copyright (c) 2000,2001,2002 Japan Network Information Center.
9  * All rights reserved.
10  *
11  * By using this file, you agree to the terms and conditions set forth bellow.
12  *
13  * 			LICENSE TERMS AND CONDITIONS
14  *
15  * The following License Terms and Conditions apply, unless a different
16  * license is obtained from Japan Network Information Center ("JPNIC"),
17  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18  * Chiyoda-ku, Tokyo 101-0047, Japan.
19  *
20  * 1. Use, Modification and Redistribution (including distribution of any
21  *    modified or derived work) in source and/or binary forms is permitted
22  *    under this License Terms and Conditions.
23  *
24  * 2. Redistribution of source code must retain the copyright notices as they
25  *    appear in each source code file, this License Terms and Conditions.
26  *
27  * 3. Redistribution in binary form must reproduce the Copyright Notice,
28  *    this License Terms and Conditions, in the documentation and/or other
29  *    materials provided with the distribution.  For the purposes of binary
30  *    distribution the "Copyright Notice" refers to the following language:
31  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
32  *
33  * 4. The name of JPNIC may not be used to endorse or promote products
34  *    derived from this Software without specific prior written approval of
35  *    JPNIC.
36  *
37  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
41  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48  */
49 
50 #include <config.h>
51 
52 #include <stddef.h>
53 #include <stdlib.h>
54 #include <string.h>
55 
56 #include <idn/result.h>
57 #include <idn/assert.h>
58 #include <idn/logmacro.h>
59 #include <idn/ucs4.h>
60 #include <idn/unicode.h>
61 #include <idn/unormalize.h>
62 #include <idn/debug.h>
63 
64 #if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY)
65 #define memmove(a,b,c)	bcopy((char *)(b),(char *)(a),(int)(c))
66 #endif
67 
68 #define WORKBUF_SIZE		128
69 #define WORKBUF_SIZE_MAX	10000
70 
71 typedef struct {
72 	idn__unicode_version_t version; /* Unicode version */
73 	int cur;		/* pointing now processing character */
74 	int last;		/* pointing just after the last character */
75 	int size;		/* size of UCS and CLASS array */
76 	unsigned long *ucs4;	/* UCS-4 characters */
77 	int *class;		/* and their canonical classes */
78 	unsigned long ucs4_buf[WORKBUF_SIZE];	/* local buffer */
79 	int class_buf[WORKBUF_SIZE];		/* ditto */
80 } workbuf_t;
81 
82 static idn_result_t	normalize(idn__unicode_version_t version,
83 				  int do_composition, int compat,
84 				  const unsigned long *from,
85 				  unsigned long *to, size_t tolen);
86 static idn_result_t	decompose(workbuf_t *wb, unsigned long c, int compat);
87 static void		get_class(workbuf_t *wb);
88 static void		reorder(workbuf_t *wb);
89 static void		compose(workbuf_t *wb);
90 static idn_result_t	flush_before_cur(workbuf_t *wb,
91 					 unsigned long **top, size_t *tolenp);
92 static void		workbuf_init(workbuf_t *wb);
93 static void		workbuf_free(workbuf_t *wb);
94 static idn_result_t	workbuf_extend(workbuf_t *wb);
95 static idn_result_t	workbuf_append(workbuf_t *wb, unsigned long c);
96 static void		workbuf_shift(workbuf_t *wb, int shift);
97 static void		workbuf_removevoid(workbuf_t *wb);
98 
99 idn_result_t
100 idn__unormalize_formkc(idn__unicode_version_t version,
101 		       const unsigned long *from, unsigned long *to,
102 		       size_t tolen) {
103 	assert(version != NULL && from != NULL && to != NULL && tolen >= 0);
104 	TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n",
105 	       idn__debug_ucs4xstring(from, 50), tolen));
106 	return (normalize(version, 1, 1, from, to, tolen));
107 }
108 
109 static idn_result_t
110 normalize(idn__unicode_version_t version, int do_composition, int compat,
111 	  const unsigned long *from, unsigned long *to, size_t tolen) {
112 	workbuf_t wb;
113 	idn_result_t r = idn_success;
114 
115 	/*
116 	 * Initialize working buffer.
117 	 */
118 	workbuf_init(&wb);
119 	wb.version = version;
120 
121 	while (*from != '\0') {
122 		unsigned long c;
123 
124 		assert(wb.cur == wb.last);
125 
126 		/*
127 		 * Get one character from 'from'.
128 		 */
129 		c = *from++;
130 
131 		/*
132 		 * Decompose it.
133 		 */
134 		if ((r = decompose(&wb, c, compat)) != idn_success)
135 			goto ret;
136 
137 		/*
138 		 * Get canonical class.
139 		 */
140 		get_class(&wb);
141 
142 		/*
143 		 * Reorder & compose.
144 		 */
145 		for (; wb.cur < wb.last; wb.cur++) {
146 			if (wb.cur == 0) {
147 				continue;
148 			} else if (wb.class[wb.cur] > 0) {
149 				/*
150 				 * This is not a starter. Try reordering.
151 				 * Note that characters up to it are
152 				 * already in canonical order.
153 				 */
154 				reorder(&wb);
155 				continue;
156 			}
157 
158 			/*
159 			 * This is a starter character, and there are
160 			 * some characters before it.  Those characters
161 			 * have been reordered properly, and
162 			 * ready for composition.
163 			 */
164 			if (do_composition && wb.class[0] == 0)
165 				compose(&wb);
166 
167 			/*
168 			 * If CUR points to a starter character,
169 			 * then process of characters before CUR are
170 			 * already finished, because any further
171 			 * reordering/composition for them are blocked
172 			 * by the starter CUR points.
173 			 */
174 			if (wb.cur > 0 && wb.class[wb.cur] == 0) {
175 				/* Flush everything before CUR. */
176 				r = flush_before_cur(&wb, &to, &tolen);
177 				if (r != idn_success)
178 					goto ret;
179 			}
180 		}
181 	}
182 
183 	if (r == idn_success) {
184 		if (do_composition && wb.cur > 0 && wb.class[0] == 0) {
185 			/*
186 			 * There is some characters left in WB.
187 			 * They are ordered, but not composed yet.
188 			 * Now CUR points just after the last character in WB,
189 			 * and since compose() tries to compose characters
190 			 * between top and CUR inclusive, we must make CUR
191 			 * one character back during compose().
192 			 */
193 			wb.cur--;
194 			compose(&wb);
195 			wb.cur++;
196 		}
197 		/*
198 		 * Call this even when WB.CUR == 0, to make TO
199 		 * NUL-terminated.
200 		 */
201 		r = flush_before_cur(&wb, &to, &tolen);
202 		if (r != idn_success)
203 			goto ret;
204 	}
205 
206 	if (tolen <= 0) {
207 		r = idn_buffer_overflow;
208 		goto ret;
209 	}
210 	*to = '\0';
211 
212 ret:
213 	workbuf_free(&wb);
214 	return (r);
215 }
216 
217 static idn_result_t
218 decompose(workbuf_t *wb, unsigned long c, int compat) {
219 	idn_result_t r;
220 	int dec_len;
221 
222 again:
223 	r = idn__unicode_decompose(wb->version, compat, wb->ucs4 + wb->last,
224 				   wb->size - wb->last, c, &dec_len);
225 	switch (r) {
226 	case idn_success:
227 		wb->last += dec_len;
228 		return (idn_success);
229 	case idn_notfound:
230 		return (workbuf_append(wb, c));
231 	case idn_buffer_overflow:
232 		if ((r = workbuf_extend(wb)) != idn_success)
233 			return (r);
234 		if (wb->size > WORKBUF_SIZE_MAX) {
235 			WARNING(("idn__unormalize_form*: "
236 				"working buffer too large\n"));
237 			return (idn_nomemory);
238 		}
239 		goto again;
240 	default:
241 		return (r);
242 	}
243 	/* NOTREACHED */
244 }
245 
246 static void
247 get_class(workbuf_t *wb) {
248 	int i;
249 
250 	for (i = wb->cur; i < wb->last; i++)
251 		wb->class[i] = idn__unicode_canonicalclass(wb->version,
252 							   wb->ucs4[i]);
253 }
254 
255 static void
256 reorder(workbuf_t *wb) {
257 	unsigned long c;
258 	int i;
259 	int class;
260 
261 	assert(wb != NULL);
262 
263 	i = wb->cur;
264 	c = wb->ucs4[i];
265 	class = wb->class[i];
266 
267 	while (i > 0 && wb->class[i - 1] > class) {
268 		wb->ucs4[i] = wb->ucs4[i - 1];
269 		wb->class[i] =wb->class[i - 1];
270 		i--;
271 		wb->ucs4[i] = c;
272 		wb->class[i] = class;
273 	}
274 }
275 
276 static void
277 compose(workbuf_t *wb) {
278 	int cur;
279 	unsigned long *ucs4;
280 	int *class;
281 	int last_class;
282 	int nvoids;
283 	int i;
284 	idn__unicode_version_t ver;
285 
286 	assert(wb != NULL && wb->class[0] == 0);
287 
288 	cur = wb->cur;
289 	ucs4 = wb->ucs4;
290 	class = wb->class;
291 	ver = wb->version;
292 
293 	/*
294 	 * If there are no decomposition sequence that begins with
295 	 * the top character, composition is impossible.
296 	 */
297 	if (!idn__unicode_iscompositecandidate(ver, ucs4[0]))
298 		return;
299 
300 	last_class = 0;
301 	nvoids = 0;
302 	for (i = 1; i <= cur; i++) {
303 		unsigned long c;
304 		int cl = class[i];
305 
306 		if ((last_class < cl || cl == 0) &&
307 		    idn__unicode_compose(ver, ucs4[0], ucs4[i],
308 					 &c) == idn_success) {
309 			/*
310 			 * Replace the top character with the composed one.
311 			 */
312 			ucs4[0] = c;
313 			class[0] = idn__unicode_canonicalclass(ver, c);
314 
315 			class[i] = -1;	/* void this character */
316 			nvoids++;
317 		} else {
318 			last_class = cl;
319 		}
320 	}
321 
322 	/* Purge void characters, if any. */
323 	if (nvoids > 0)
324 		workbuf_removevoid(wb);
325 }
326 
327 static idn_result_t
328 flush_before_cur(workbuf_t *wb, unsigned long **top, size_t *tolenp) {
329 	if (*tolenp < wb->cur)
330 		return (idn_buffer_overflow);
331 
332 	memcpy(*top, wb->ucs4, sizeof(**top) * wb->cur);
333 	*top += wb->cur;
334 	*tolenp -= wb->cur;
335 	workbuf_shift(wb, wb->cur);
336 
337 	return (idn_success);
338 }
339 
340 static void
341 workbuf_init(workbuf_t *wb) {
342 	wb->cur = 0;
343 	wb->last = 0;
344 	wb->size = WORKBUF_SIZE;
345 	wb->ucs4 = wb->ucs4_buf;
346 	wb->class = wb->class_buf;
347 }
348 
349 static void
350 workbuf_free(workbuf_t *wb) {
351 	if (wb->ucs4 != wb->ucs4_buf) {
352 		free(wb->ucs4);
353 		free(wb->class);
354 	}
355 }
356 
357 static idn_result_t
358 workbuf_extend(workbuf_t *wb) {
359 	int newsize = wb->size * 3;
360 
361 	if (wb->ucs4 == wb->ucs4_buf) {
362 		wb->ucs4 = malloc(sizeof(wb->ucs4[0]) * newsize);
363 		wb->class = malloc(sizeof(wb->class[0]) * newsize);
364 	} else {
365 		wb->ucs4 = realloc(wb->ucs4, sizeof(wb->ucs4[0]) * newsize);
366 		wb->class = realloc(wb->class, sizeof(wb->class[0]) * newsize);
367 	}
368 	if (wb->ucs4 == NULL || wb->class == NULL)
369 		return (idn_nomemory);
370 	else
371 		return (idn_success);
372 }
373 
374 static idn_result_t
375 workbuf_append(workbuf_t *wb, unsigned long c) {
376 	idn_result_t r;
377 
378 	if (wb->last >= wb->size && (r = workbuf_extend(wb)) != idn_success)
379 		return (r);
380 	wb->ucs4[wb->last++] = c;
381 	return (idn_success);
382 }
383 
384 static void
385 workbuf_shift(workbuf_t *wb, int shift) {
386 	int nmove;
387 
388 	assert(wb != NULL && wb->cur >= shift);
389 
390 	nmove = wb->last - shift;
391 	(void)memmove(&wb->ucs4[0], &wb->ucs4[shift],
392 		      nmove * sizeof(wb->ucs4[0]));
393 	(void)memmove(&wb->class[0], &wb->class[shift],
394 		      nmove * sizeof(wb->class[0]));
395 	wb->cur -= shift;
396 	wb->last -= shift;
397 }
398 
399 static void
400 workbuf_removevoid(workbuf_t *wb) {
401 	int i, j;
402 	int last = wb->last;
403 
404 	for (i = j = 0; i < last; i++) {
405 		if (wb->class[i] >= 0) {
406 			if (j < i) {
407 				wb->ucs4[j] = wb->ucs4[i];
408 				wb->class[j] = wb->class[i];
409 			}
410 			j++;
411 		}
412 	}
413 	wb->cur -= last - j;
414 	wb->last = j;
415 }
416