1 #ifndef lint
2 static char *rcsid = "$Id: ucsset.c,v 1.7 2002/11/29 09:08:05 ishisone Exp $";
3 #endif
4 
5 /*
6  * Copyright (c) 2001 Japan Network Information Center.  All rights reserved.
7  *
8  * By using this file, you agree to the terms and conditions set forth bellow.
9  *
10  * 			LICENSE TERMS AND CONDITIONS
11  *
12  * The following License Terms and Conditions apply, unless a different
13  * license is obtained from Japan Network Information Center ("JPNIC"),
14  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
15  * Chiyoda-ku, Tokyo 101-0047, Japan.
16  *
17  * 1. Use, Modification and Redistribution (including distribution of any
18  *    modified or derived work) in source and/or binary forms is permitted
19  *    under this License Terms and Conditions.
20  *
21  * 2. Redistribution of source code must retain the copyright notices as they
22  *    appear in each source code file, this License Terms and Conditions.
23  *
24  * 3. Redistribution in binary form must reproduce the Copyright Notice,
25  *    this License Terms and Conditions, in the documentation and/or other
26  *    materials provided with the distribution.  For the purposes of binary
27  *    distribution the "Copyright Notice" refers to the following language:
28  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
29  *
30  * 4. The name of JPNIC may not be used to endorse or promote products
31  *    derived from this Software without specific prior written approval of
32  *    JPNIC.
33  *
34  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
35  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
37  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
38  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
39  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
40  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
41  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
42  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
43  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
44  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
45  */
46 
47 #include <config.h>
48 
49 #include <stdlib.h>
50 #include <stdio.h>
51 #include <string.h>
52 
53 #include <idn/result.h>
54 #include <idn/assert.h>
55 #include <idn/logmacro.h>
56 #include <idn/ucsset.h>
57 
58 #define UCS_MAX		0x80000000UL
59 
60 #define INIT_SIZE	50
61 
62 /*
63  * Code point range.
64  *
65  * The set of code points is represented by an array of code point ranges.
66  * In the building phase, specified ranges by 'idn_ucsset_add' or
67  * 'idn_ucsset_addrange' are simply appended to the array.
68  * And 'idn_ucsset_fix' sorts the array by the code point value, and also
69  * merges any intersecting ranges.  Since the array is sorted, a binary
70  * search can be used for looking up.
71  */
72 typedef struct {
73 	unsigned long from;
74 	unsigned long to;
75 } range_t;
76 
77 /*
78  * Code point segment.
79  *
80  * To speed up searching further, the entire region of UCS-4 code points
81  * (U+0000 - U+7FFFFFFF) are divided into segments. For each segment,
82  * the first and last element of the range array corresponding to the
83  * segment are computed by 'idn_ucsset_fix'.  This narrows down the
84  * (initial) search range.
85  */
86 typedef struct {
87 	int range_start;	/* index of ucsset.ranges */
88 	int range_end;		/* ditto */
89 } segment_t;
90 
91 /*
92  * Code point to segment index conversion.
93  *
94  * Below is the function that maps a code point to the corresponding segment.
95  * The mapping is non-uniform, so that BMP, the following 16 planes that
96  * comprise Unicode code points together with BMP, and other planes
97  * have different granularity.
98  */
99 #define SEG_THLD1	0x10000		/* BMP */
100 #define SEG_THLD2	0x110000	/* Unicode (BMP+16planes) */
101 #define SEG_SFT1	10		/* BMP: 1K code points/segment */
102 #define SEG_SFT2	14		/* following 16 planes: 16K cp/seg */
103 #define SEG_SFT3	24		/* rest: 16M cp/seg */
104 #define SEG_OFF1	(SEG_THLD1 >> SEG_SFT1)
105 #define SEG_OFF2	(((SEG_THLD2 - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1)
106 #define SEG_INDEX(v) \
107 	(((v) < SEG_THLD1) ? ((v) >> SEG_SFT1) : \
108 	 ((v) < SEG_THLD2) ? ((((v) - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1) : \
109 	 ((((v) - SEG_THLD2) >> SEG_SFT3) + SEG_OFF2))
110 #define SEG_LEN	(SEG_INDEX(UCS_MAX - 1) + 1)
111 
112 /*
113  * Representation of set of UCS code points.
114  */
115 typedef struct idn_ucsset {
116 	segment_t segments[SEG_LEN];
117 	int fixed;
118 	int size;			/* allocated size of 'ranges' */
119 	int nranges;			/* num of ranges */
120 	range_t *ranges;
121 	int refcnt;			/* reference count */
122 } ucsset;
123 
124 static idn_result_t	addrange(idn_ucsset_t ctx, unsigned long from,
125 				 unsigned long to, char *func_name);
126 static int		comp_range(const void *v1, const void *v2);
127 
128 idn_result_t
idn_ucsset_create(idn_ucsset_t * ctx)129 idn_ucsset_create(idn_ucsset_t *ctx) {
130 	idn_ucsset_t bm;
131 
132 	assert(ctx != NULL);
133 
134 	TRACE(("idn_ucsset_create()\n"));
135 
136 	if ((bm = malloc(sizeof(ucsset))) == NULL) {
137 		WARNING(("idn_ucsset_create: malloc failed\n"));
138 		return idn_nomemory;
139 	}
140 	bm->size = bm->nranges = 0;
141 	bm->ranges = NULL;
142 	bm->fixed = 0;
143 	bm->refcnt = 1;
144 	*ctx = bm;
145 	return (idn_success);
146 }
147 
148 void
idn_ucsset_destroy(idn_ucsset_t ctx)149 idn_ucsset_destroy(idn_ucsset_t ctx) {
150 	assert(ctx != NULL && ctx->refcnt > 0);
151 
152 	TRACE(("idn_ucsset_destroy()\n"));
153 
154 	if (--ctx->refcnt == 0) {
155 		if (ctx->ranges != NULL)
156 			free(ctx->ranges);
157 		free(ctx);
158 	}
159 }
160 
161 void
idn_ucsset_incrref(idn_ucsset_t ctx)162 idn_ucsset_incrref(idn_ucsset_t ctx) {
163 	assert(ctx != NULL && ctx->refcnt > 0);
164 
165 	TRACE(("idn_ucsset_incrref()\n"));
166 
167 	ctx->refcnt++;
168 }
169 
170 idn_result_t
idn_ucsset_add(idn_ucsset_t ctx,unsigned long v)171 idn_ucsset_add(idn_ucsset_t ctx, unsigned long v) {
172 	assert(ctx != NULL && ctx->refcnt > 0);
173 
174 	TRACE(("idn_ucsset_add(v=U+%lX)\n", v));
175 
176 	return (addrange(ctx, v, v, "idn_ucsset_add"));
177 }
178 
179 idn_result_t
idn_ucsset_addrange(idn_ucsset_t ctx,unsigned long from,unsigned long to)180 idn_ucsset_addrange(idn_ucsset_t ctx, unsigned long from,
181 			 unsigned long to)
182 {
183 	assert(ctx != NULL && ctx->refcnt > 0);
184 
185 	TRACE(("idn_ucsset_addrange(from=U+%lX, to=U+%lX)\n",
186 	       from, to));
187 
188 	return (addrange(ctx, from, to, "idn_ucsset_addrange"));
189 }
190 
191 void
idn_ucsset_fix(idn_ucsset_t ctx)192 idn_ucsset_fix(idn_ucsset_t ctx) {
193 	int nranges;
194 	range_t *ranges;
195 	segment_t *segments;
196 	int i, j;
197 
198 	assert(ctx != NULL && ctx->refcnt > 0);
199 
200 	TRACE(("idn_ucsset_fix()\n"));
201 
202 	nranges = ctx->nranges;
203 	ranges = ctx->ranges;
204 	segments = ctx->segments;
205 
206 	if (ctx->fixed)
207 		return;
208 
209 	ctx->fixed = 1;
210 
211 	/* Initialize segment array */
212 	for (i = 0; i < SEG_LEN; i++) {
213 		segments[i].range_start = -1;
214 		segments[i].range_end = -1;
215 	}
216 
217 	/* If the set is empty, there's nothing to be done. */
218 	if (nranges == 0)
219 		return;
220 
221 	/* Sort ranges. */
222 	qsort(ranges, nranges, sizeof(range_t), comp_range);
223 
224 	/* Merge overlapped/continuous ranges. */
225 	for (i = 0, j = 1; j < nranges; j++) {
226 		if (ranges[i].to + 1 >= ranges[j].from) {
227 			/* can be merged */
228 			if (ranges[i].to < ranges[j].to) {
229 				ranges[i].to = ranges[j].to;
230 			}
231 		} else {
232 			i++;
233 			if (i < j)
234 				ranges[i] = ranges[j];
235 		}
236 	}
237 	/* 'i' points the last range in the array. */
238 	ctx->nranges = nranges = ++i;
239 
240 	/* Create segment array. */
241 	for (i = 0; i < nranges; i++) {
242 		int fidx = SEG_INDEX(ranges[i].from);
243 		int tidx = SEG_INDEX(ranges[i].to);
244 
245 		for (j = fidx; j <= tidx; j++) {
246 			if (segments[j].range_start < 0)
247 				segments[j].range_start = i;
248 			segments[j].range_end = i;
249 		}
250 	}
251 
252 #if 0
253 	/*
254 	 * Does the standard guarantee realloc() always succeeds
255 	 * when shrinking?
256 	 */
257 	/* Shrink malloc'ed space if possible. */
258 	ctx->ranges = realloc(ctx->ranges, ctx->nranges * sizeof(range_t));
259 #endif
260 }
261 
262 idn_result_t
idn_ucsset_lookup(idn_ucsset_t ctx,unsigned long v,int * found)263 idn_ucsset_lookup(idn_ucsset_t ctx, unsigned long v, int *found) {
264 	int idx;
265 	segment_t *segments;
266 
267 	assert(ctx != NULL && ctx->refcnt > 0 && found != NULL);
268 
269 	TRACE(("idn_ucsset_lookup(v=U+%lX)\n", v));
270 
271 	/* Make sure it is fixed. */
272 	if (!ctx->fixed) {
273 		WARNING(("idn_ucsset_lookup: not fixed yet\n"));
274 		return (idn_failure);
275 	}
276 
277 	/* Check the given code point. */
278 	if (v >= UCS_MAX)
279 		return (idn_invalid_codepoint);
280 
281 	/* Get the segment 'v' belongs to. */
282 	segments = ctx->segments;
283 	idx = SEG_INDEX(v);
284 
285 	/* Do binary search. */
286 	*found = 0;
287 	if (segments[idx].range_start >= 0) {
288 		int lo = segments[idx].range_start;
289 		int hi = segments[idx].range_end;
290 		range_t *ranges = ctx->ranges;
291 
292 		while (lo <= hi) {
293 			int mid = (lo + hi) / 2;
294 			if (v < ranges[mid].from) {
295 				hi = mid - 1;
296 			} else if (v > ranges[mid].to) {
297 				lo = mid + 1;
298 			} else {
299 				*found = 1;
300 				break;
301 			}
302 		}
303 	}
304 	return (idn_success);
305 }
306 
307 static idn_result_t
addrange(idn_ucsset_t ctx,unsigned long from,unsigned long to,char * func_name)308 addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to,
309 	 char *func_name)
310 {
311 	range_t *newbuf;
312 
313 	/* Check the given code points. */
314 	if (from > UCS_MAX) {
315 		WARNING(("%s: code point out of range (U+%lX)\n",
316 			 func_name, from));
317 		return (idn_invalid_codepoint);
318 	} else if (to > UCS_MAX) {
319 		WARNING(("%s: code point out of range (U+%lX)\n",
320 			 func_name, to));
321 		return (idn_invalid_codepoint);
322 	} else if (from > to) {
323 		WARNING(("%s: invalid range spec (U+%lX-U+%lX)\n",
324 			 func_name, from, to));
325 		return (idn_invalid_codepoint);
326 	}
327 
328 	/* Make sure it is not fixed yet. */
329 	if (ctx->fixed) {
330 		WARNING(("%s: attempt to add to already fixed object\n",
331 			 func_name));
332 		return (idn_failure);
333 	}
334 
335 	/* Append the specified range to the 'ranges' array. */
336 	if (ctx->nranges >= ctx->size) {
337 		/* Make it bigger. */
338 		if (ctx->size == 0)
339 			ctx->size = INIT_SIZE;
340 		else
341 			ctx->size *= 2;
342 		newbuf = realloc(ctx->ranges, ctx->size * sizeof(range_t));
343 		if (newbuf == NULL)
344 			return (idn_nomemory);
345 		ctx->ranges = newbuf;
346 	}
347 	ctx->ranges[ctx->nranges].from = from;
348 	ctx->ranges[ctx->nranges].to = to;
349 	ctx->nranges++;
350 
351 	return (idn_success);
352 }
353 
354 static int
comp_range(const void * v1,const void * v2)355 comp_range(const void *v1, const void *v2) {
356 	/*
357 	 * Range comparation function suitable for qsort().
358 	 */
359 	const range_t *r1 = v1;
360 	const range_t *r2 = v2;
361 
362 	if (r1->from < r2->from)
363 		return (-1);
364 	else if (r1->from > r2->from)
365 		return (1);
366 	else
367 		return (0);
368 }
369