1 #ifndef lint
2 static char *rcsid = "$Id: unicode.c,v 1.22 2003/01/16 06:09:13 m-kasahr Exp $";
3 #endif
4 
5 /*
6  * Copyright (c) 2000,2001,2002 Japan Network Information Center.
7  * All rights reserved.
8  *
9  * By using this file, you agree to the terms and conditions set forth bellow.
10  *
11  * 			LICENSE TERMS AND CONDITIONS
12  *
13  * The following License Terms and Conditions apply, unless a different
14  * license is obtained from Japan Network Information Center ("JPNIC"),
15  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
16  * Chiyoda-ku, Tokyo 101-0047, Japan.
17  *
18  * 1. Use, Modification and Redistribution (including distribution of any
19  *    modified or derived work) in source and/or binary forms is permitted
20  *    under this License Terms and Conditions.
21  *
22  * 2. Redistribution of source code must retain the copyright notices as they
23  *    appear in each source code file, this License Terms and Conditions.
24  *
25  * 3. Redistribution in binary form must reproduce the Copyright Notice,
26  *    this License Terms and Conditions, in the documentation and/or other
27  *    materials provided with the distribution.  For the purposes of binary
28  *    distribution the "Copyright Notice" refers to the following language:
29  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
30  *
31  * 4. The name of JPNIC may not be used to endorse or promote products
32  *    derived from this Software without specific prior written approval of
33  *    JPNIC.
34  *
35  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
36  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
38  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
39  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
40  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
41  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
42  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
43  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
44  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
45  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
46  */
47 
48 #include <config.h>
49 
50 #include <stddef.h>
51 #include <stdlib.h>
52 #include <string.h>
53 
54 #include <idn/result.h>
55 #include <idn/logmacro.h>
56 #include <idn/assert.h>
57 #include <idn/unicode.h>
58 
/* Unicode version selected when a caller passes no version string. */
#define UNICODE_CURRENT	"3.2.0"

/* Largest code point the lookup functions in this file accept. */
#define UCS_MAX		0x10ffff

/* Bit set on the final element of a decomposition sequence. */
#define END_BIT		0x80000000

/*
 * Constants for arithmetic Hangul decomposition/composition.
 *
 *   SBase .. SLast-1	precomposed syllables (SLast is one past the end)
 *   LBase		first leading consonant; LCount of them
 *   VBase		first vowel; VCount of them
 *   TBase		base for trailing consonants; offset 0 means
 *			"no trailing consonant", so TCount includes that case
 */
#define SBase		0xac00
#define LBase		0x1100
#define VBase		0x1161
#define TBase		0x11a7
#define LCount		19
#define VCount		21
#define TCount		28
#define SLast		(SBase + LCount * VCount * TCount)

/*
 * Token pasting helper.  The extra level of indirection lets macro
 * arguments be expanded before they are pasted together.
 */
#define compose_sym(a, b)		compose_symX(a, b)
#define compose_symX(a, b)		a ## b
81 
/*
 * One entry of a composition table.  The first character of the pair
 * is implied by which table the entry belongs to; the entry records
 * the second character and the character the pair composes to.
 */
struct composition {
	unsigned long c2;	/* second character of the pair */
	unsigned long comp;	/* result of composing the pair */
};
86 
87 #include "unicodedata_320.c"
88 #define VERSION v320
89 #include "unicode_template.c"
90 #undef VERSION
91 
/*
 * Signatures of the per-version lookup routines stored in
 * 'struct idn__unicode_ops'.
 */

/* Returns the canonical class value for the character 'v'. */
typedef int (*unicode_canonclassproc)(unsigned long v);

/*
 * Looks up the decomposition of 'c'.  Callers in this file treat a
 * return value of 0 as "no decomposition" and test the DECOMP_COMPAT
 * flag in a non-zero result; '*seqp' receives the decomposition sequence.
 */
typedef int (*unicode_decomposeproc)(unsigned long c,
				     const unsigned long **seqp);

/*
 * Looks up the compositions whose first character is 'c'.  Callers in
 * this file treat the return value as the number of entries stored
 * through '*compp' (0 meaning none).
 */
typedef int (*unicode_composeproc)(unsigned long c,
				   const struct composition **compp);
97 
98 static struct idn__unicode_ops {
99 	char *version;
100 	unicode_canonclassproc canonclass_proc;
101 	unicode_decomposeproc decompose_proc;
102 	unicode_composeproc compose_proc;
103 } unicode_versions[] = {
104 #define MAKE_UNICODE_HANDLE(version, suffix) \
105 	{ version, \
106 	  compose_sym(canonclass_, suffix), \
107 	  compose_sym(decompose_, suffix), \
108 	  compose_sym(compose_, suffix) }
109 	MAKE_UNICODE_HANDLE("3.2.0", v320),
110 	{ NULL },
111 #undef MAKE_UNICODE_HANDLE
112 };
113 
114 idn_result_t
idn__unicode_create(const char * version,idn__unicode_version_t * versionp)115 idn__unicode_create(const char *version,
116 		    idn__unicode_version_t *versionp) {
117 	idn__unicode_version_t v;
118 
119 	assert(versionp != NULL);
120 	TRACE(("idn__unicode_create(version=%-.50s)\n",
121 	       version == NULL ? "<NULL>" : version));
122 
123 	if (version == NULL)
124 		version = UNICODE_CURRENT;
125 
126 	for (v = unicode_versions; v->version != NULL; v++) {
127 		if (strcmp(v->version, version) == 0) {
128 			*versionp = v;
129 			return (idn_success);
130 		}
131 	}
132 	return (idn_notfound);
133 }
134 
135 void
idn__unicode_destroy(idn__unicode_version_t version)136 idn__unicode_destroy(idn__unicode_version_t version) {
137 	assert(version != NULL);
138 	TRACE(("idn__unicode_destroy()\n"));
139 	/* Nothing to do */
140 }
141 
142 int
idn__unicode_canonicalclass(idn__unicode_version_t version,unsigned long c)143 idn__unicode_canonicalclass(idn__unicode_version_t version, unsigned long c) {
144 	if (c > UCS_MAX)
145 		return (0);
146 
147 	return (*version->canonclass_proc)(c);
148 }
149 
150 idn_result_t
idn__unicode_decompose(idn__unicode_version_t version,int compat,unsigned long * v,size_t vlen,unsigned long c,int * decomp_lenp)151 idn__unicode_decompose(idn__unicode_version_t version,
152 		       int compat, unsigned long *v, size_t vlen,
153 		       unsigned long c, int *decomp_lenp) {
154 	unsigned long *vorg = v;
155 	int seqidx;
156 	const unsigned long *seq;
157 
158 	assert(v != NULL && vlen >= 0 && decomp_lenp != NULL);
159 
160 	if (c > UCS_MAX)
161 		return (idn_notfound);
162 
163 	/*
164 	 * First, check for Hangul.
165 	 */
166 	if (SBase <= c && c < SLast) {
167 		int idx, t_offset, v_offset, l_offset;
168 
169 		idx = c - SBase;
170 		t_offset = idx % TCount;
171 		idx /= TCount;
172 		v_offset = idx % VCount;
173 		l_offset = idx / VCount;
174 		if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
175 			return (idn_buffer_overflow);
176 		*v++ = LBase + l_offset;
177 		*v++ = VBase + v_offset;
178 		if (t_offset > 0)
179 			*v++ = TBase + t_offset;
180 		*decomp_lenp = v - vorg;
181 		return (idn_success);
182 	}
183 
184 	/*
185 	 * Look up decomposition table.  If no decomposition is defined
186 	 * or if it is a compatibility decomosition when canonical
187 	 * decomposition requested, return 'idn_notfound'.
188 	 */
189 	seqidx = (*version->decompose_proc)(c, &seq);
190 	if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
191 		return (idn_notfound);
192 
193 	/*
194 	 * Copy the decomposed sequence.  The end of the sequence are
195 	 * marked with END_BIT.
196 	 */
197 	do {
198 		unsigned long c;
199 		int dlen;
200 		idn_result_t r;
201 
202 		c = *seq & ~END_BIT;
203 
204 		/* Decompose recursively. */
205 		r = idn__unicode_decompose(version, compat, v, vlen, c, &dlen);
206 		if (r == idn_success) {
207 			v += dlen;
208 			vlen -= dlen;
209 		} else if (r == idn_notfound) {
210 			if (vlen < 1)
211 				return (idn_buffer_overflow);
212 			*v++ = c;
213 			vlen--;
214 		} else {
215 			return (r);
216 		}
217 
218 	} while ((*seq++ & END_BIT) == 0);
219 
220 	*decomp_lenp = v - vorg;
221 
222 	return (idn_success);
223 }
224 
225 int
idn__unicode_iscompositecandidate(idn__unicode_version_t version,unsigned long c)226 idn__unicode_iscompositecandidate(idn__unicode_version_t version,
227 				  unsigned long c) {
228 	const struct composition *dummy;
229 
230 	if (c > UCS_MAX)
231 		return (0);
232 
233 	/* Check for Hangul */
234 	if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
235 		return (1);
236 
237 	/*
238 	 * Look up composition table.  If there are no composition
239 	 * that begins with the given character, it is not a
240 	 * composition candidate.
241 	 */
242 	if ((*version->compose_proc)(c, &dummy) == 0)
243 		return (0);
244 	else
245 		return (1);
246 }
247 
248 idn_result_t
idn__unicode_compose(idn__unicode_version_t version,unsigned long c1,unsigned long c2,unsigned long * compp)249 idn__unicode_compose(idn__unicode_version_t version, unsigned long c1,
250 		     unsigned long c2, unsigned long *compp) {
251 	int n;
252 	int lo, hi;
253 	const struct composition *cseq;
254 
255 	assert(compp != NULL);
256 
257 	if (c1 > UCS_MAX || c2 > UCS_MAX)
258 		return (idn_notfound);
259 
260 	/*
261 	 * Check for Hangul.
262 	 */
263 	if (LBase <= c1 && c1 < LBase + LCount &&
264 	    VBase <= c2 && c2 < VBase + VCount) {
265 		/*
266 		 * Hangul L and V.
267 		 */
268 		*compp = SBase +
269 			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
270 		return (idn_success);
271 	} else if (SBase <= c1 && c1 < SLast &&
272 		   TBase <= c2 && c2 < TBase + TCount &&
273 		   (c1 - SBase) % TCount == 0) {
274 		/*
275 		 * Hangul LV and T.
276 		 */
277 		*compp = c1 + (c2 - TBase);
278 		return (idn_success);
279 	}
280 
281 	/*
282 	 * Look up composition table.  If the result is 0, no composition
283 	 * is defined.  Otherwise, upper 16bits of the result contains
284 	 * the number of composition that begins with 'c1', and the lower
285 	 * 16bits is the offset in 'compose_seq'.
286 	 */
287 	if ((n = (*version->compose_proc)(c1, &cseq)) == 0)
288 		return (idn_notfound);
289 
290 	/*
291 	 * The composite sequences are sorted by the 2nd character 'c2'.
292 	 * So we can use binary search.
293 	 */
294 	lo = 0;
295 	hi = n - 1;
296 	while (lo <= hi) {
297 		int mid = (lo + hi) / 2;
298 
299 		if (cseq[mid].c2 < c2) {
300 			lo = mid + 1;
301 		} else if (cseq[mid].c2 > c2) {
302 			hi = mid - 1;
303 		} else {
304 			*compp = cseq[mid].comp;
305 			return (idn_success);
306 		}
307 	}
308 	return (idn_notfound);
309 }
310