1 /*	$NetBSD: unicode.c,v 1.4 2014/12/10 04:37:55 christos Exp $	*/
2 
#ifndef lint
/*
 * RCS revision string for this file; compiled out when 'lint' is defined.
 * It points at a string literal, so the pointee is declared const.
 */
static const char *rcsid = "Id: unicode.c,v 1.1 2003/06/04 00:26:16 marka Exp ";
#endif
6 
7 /*
8  * Copyright (c) 2000,2001,2002 Japan Network Information Center.
9  * All rights reserved.
10  *
11  * By using this file, you agree to the terms and conditions set forth bellow.
12  *
13  * 			LICENSE TERMS AND CONDITIONS
14  *
15  * The following License Terms and Conditions apply, unless a different
16  * license is obtained from Japan Network Information Center ("JPNIC"),
17  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18  * Chiyoda-ku, Tokyo 101-0047, Japan.
19  *
20  * 1. Use, Modification and Redistribution (including distribution of any
21  *    modified or derived work) in source and/or binary forms is permitted
22  *    under this License Terms and Conditions.
23  *
24  * 2. Redistribution of source code must retain the copyright notices as they
25  *    appear in each source code file, this License Terms and Conditions.
26  *
27  * 3. Redistribution in binary form must reproduce the Copyright Notice,
28  *    this License Terms and Conditions, in the documentation and/or other
29  *    materials provided with the distribution.  For the purposes of binary
30  *    distribution the "Copyright Notice" refers to the following language:
31  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
32  *
33  * 4. The name of JPNIC may not be used to endorse or promote products
34  *    derived from this Software without specific prior written approval of
35  *    JPNIC.
36  *
37  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
41  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48  */
49 
50 #include <config.h>
51 
52 #include <stddef.h>
53 #include <stdlib.h>
54 #include <string.h>
55 
56 #include <idn/result.h>
57 #include <idn/logmacro.h>
58 #include <idn/assert.h>
59 #include <idn/unicode.h>
60 
/*
 * Unicode version selected by idn__unicode_create() when the caller
 * passes a NULL version string.
 */
#define UNICODE_CURRENT	"3.2.0"

/* Largest code point accepted by the functions in this file. */
#define UCS_MAX		0x10ffff
/* Flag bit set on the last element of a decomposition sequence. */
#define END_BIT		0x80000000

/*
 * Constants for arithmetic Hangul decomposition/composition.
 *
 *   SBase   first precomposed Hangul syllable
 *   LBase   first leading jamo (L)
 *   VBase   first vowel jamo (V)
 *   TBase   base for trailing jamo (T); a T offset of 0 means "no
 *           trailing jamo", so the first real T is TBase + 1
 *   LCount, VCount, TCount
 *           number of L, V and T offsets (TCount includes the "no T"
 *           offset 0)
 *   SLast   one past the last precomposed Hangul syllable
 */
#define SBase		0xac00
#define LBase		0x1100
#define VBase		0x1161
#define TBase		0x11a7
#define LCount		19
#define VCount		21
#define TCount		28
#define SLast		(SBase + LCount * VCount * TCount)

/*
 * Symbol composition macro: paste two tokens into one identifier.
 * The extra level of indirection (compose_symX) lets macro arguments
 * be expanded before '##' is applied to them.
 */
#define compose_sym(a, b)		compose_symX(a, b)
#define compose_symX(a, b)		a ## b
83 
/*
 * One entry of a composition list.  idn__unicode_compose() obtains the
 * list for a given first character and searches it by 'c2'.
 */
struct composition {
	unsigned long c2;	/* 2nd character */
	unsigned long comp;	/* composed character */
};
88 
/*
 * Bring in the Unicode 3.2.0 data, then the template code with VERSION
 * set to v320 for the duration of the include.
 * NOTE(review): the canonclass_v320/decompose_v320/compose_v320 routines
 * referenced by the version table in this file are not defined here, so
 * they are presumably generated by the template from VERSION -- confirm
 * in unicode_template.c.
 */
#include "unicodedata_320.c"
#define VERSION v320
#include "unicode_template.c"
#undef VERSION
93 
/*
 * Per-version lookup procedures, described as this file uses them.
 */

/* Returns the canonical class of code point 'v'. */
typedef int	(*unicode_canonclassproc)(unsigned long v);
/*
 * Looks up the decomposition of 'c'.  Returns 0 if there is none;
 * otherwise the result may carry the DECOMP_COMPAT flag and '*seqp' is
 * set to the decomposition sequence, whose last element has END_BIT set.
 */
typedef int	(*unicode_decomposeproc)(unsigned long c,
					 const unsigned long **seqp);
/*
 * Looks up the compositions whose first character is 'c'.  Returns 0 if
 * there are none; otherwise the result is used as the number of entries
 * in the list stored in '*compp'.
 */
typedef int	(*unicode_composeproc)(unsigned long c,
				       const struct composition **compp);
99 
100 static struct idn__unicode_ops {
101 	char *version;
102 	unicode_canonclassproc canonclass_proc;
103 	unicode_decomposeproc decompose_proc;
104 	unicode_composeproc compose_proc;
105 } unicode_versions[] = {
106 #define MAKE_UNICODE_HANDLE(version, suffix) \
107 	{ version, \
108 	  compose_sym(canonclass_, suffix), \
109 	  compose_sym(decompose_, suffix), \
110 	  compose_sym(compose_, suffix) }
111 	MAKE_UNICODE_HANDLE("3.2.0", v320),
112 	{ NULL },
113 #undef MAKE_UNICODE_HANDLE
114 };
115 
116 idn_result_t
117 idn__unicode_create(const char *version,
118 		    idn__unicode_version_t *versionp) {
119 	idn__unicode_version_t v;
120 
121 	assert(versionp != NULL);
122 	TRACE(("idn__unicode_create(version=%-.50s)\n",
123 	       version == NULL ? "<NULL>" : version));
124 
125 	if (version == NULL)
126 		version = UNICODE_CURRENT;
127 
128 	for (v = unicode_versions; v->version != NULL; v++) {
129 		if (strcmp(v->version, version) == 0) {
130 			*versionp = v;
131 			return (idn_success);
132 		}
133 	}
134 	return (idn_notfound);
135 }
136 
/*
 * Release a version handle obtained from idn__unicode_create().
 * Handles point into the static version table, so there is nothing to
 * free; the function only checks its argument and emits a trace line.
 */
void
idn__unicode_destroy(idn__unicode_version_t version) {
	assert(version != NULL);
	TRACE(("idn__unicode_destroy()\n"));
	/* Nothing to do */
}
143 
144 int
145 idn__unicode_canonicalclass(idn__unicode_version_t version, unsigned long c) {
146 	if (c > UCS_MAX)
147 		return (0);
148 
149 	return (*version->canonclass_proc)(c);
150 }
151 
152 idn_result_t
153 idn__unicode_decompose(idn__unicode_version_t version,
154 		       int compat, unsigned long *v, size_t vlen,
155 		       unsigned long c, int *decomp_lenp) {
156 	unsigned long *vorg = v;
157 	int seqidx;
158 	const unsigned long *seq;
159 
160 	assert(v != NULL && vlen >= 0 && decomp_lenp != NULL);
161 
162 	if (c > UCS_MAX)
163 		return (idn_notfound);
164 
165 	/*
166 	 * First, check for Hangul.
167 	 */
168 	if (SBase <= c && c < SLast) {
169 		int idx, t_offset, v_offset, l_offset;
170 
171 		idx = c - SBase;
172 		t_offset = idx % TCount;
173 		idx /= TCount;
174 		v_offset = idx % VCount;
175 		l_offset = idx / VCount;
176 		if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
177 			return (idn_buffer_overflow);
178 		*v++ = LBase + l_offset;
179 		*v++ = VBase + v_offset;
180 		if (t_offset > 0)
181 			*v++ = TBase + t_offset;
182 		*decomp_lenp = v - vorg;
183 		return (idn_success);
184 	}
185 
186 	/*
187 	 * Look up decomposition table.  If no decomposition is defined
188 	 * or if it is a compatibility decomosition when canonical
189 	 * decomposition requested, return 'idn_notfound'.
190 	 */
191 	seqidx = (*version->decompose_proc)(c, &seq);
192 	if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
193 		return (idn_notfound);
194 
195 	/*
196 	 * Copy the decomposed sequence.  The end of the sequence are
197 	 * marked with END_BIT.
198 	 */
199 	do {
200 		unsigned long c;
201 		int dlen;
202 		idn_result_t r;
203 
204 		c = *seq & ~END_BIT;
205 
206 		/* Decompose recursively. */
207 		r = idn__unicode_decompose(version, compat, v, vlen, c, &dlen);
208 		if (r == idn_success) {
209 			v += dlen;
210 			vlen -= dlen;
211 		} else if (r == idn_notfound) {
212 			if (vlen < 1)
213 				return (idn_buffer_overflow);
214 			*v++ = c;
215 			vlen--;
216 		} else {
217 			return (r);
218 		}
219 
220 	} while ((*seq++ & END_BIT) == 0);
221 
222 	*decomp_lenp = v - vorg;
223 
224 	return (idn_success);
225 }
226 
227 int
228 idn__unicode_iscompositecandidate(idn__unicode_version_t version,
229 				  unsigned long c) {
230 	const struct composition *dummy;
231 
232 	if (c > UCS_MAX)
233 		return (0);
234 
235 	/* Check for Hangul */
236 	if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
237 		return (1);
238 
239 	/*
240 	 * Look up composition table.  If there are no composition
241 	 * that begins with the given character, it is not a
242 	 * composition candidate.
243 	 */
244 	if ((*version->compose_proc)(c, &dummy) == 0)
245 		return (0);
246 	else
247 		return (1);
248 }
249 
250 idn_result_t
251 idn__unicode_compose(idn__unicode_version_t version, unsigned long c1,
252 		     unsigned long c2, unsigned long *compp) {
253 	int n;
254 	int lo, hi;
255 	const struct composition *cseq;
256 
257 	assert(compp != NULL);
258 
259 	if (c1 > UCS_MAX || c2 > UCS_MAX)
260 		return (idn_notfound);
261 
262 	/*
263 	 * Check for Hangul.
264 	 */
265 	if (LBase <= c1 && c1 < LBase + LCount &&
266 	    VBase <= c2 && c2 < VBase + VCount) {
267 		/*
268 		 * Hangul L and V.
269 		 */
270 		*compp = SBase +
271 			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
272 		return (idn_success);
273 	} else if (SBase <= c1 && c1 < SLast &&
274 		   TBase <= c2 && c2 < TBase + TCount &&
275 		   (c1 - SBase) % TCount == 0) {
276 		/*
277 		 * Hangul LV and T.
278 		 */
279 		*compp = c1 + (c2 - TBase);
280 		return (idn_success);
281 	}
282 
283 	/*
284 	 * Look up composition table.  If the result is 0, no composition
285 	 * is defined.  Otherwise, upper 16bits of the result contains
286 	 * the number of composition that begins with 'c1', and the lower
287 	 * 16bits is the offset in 'compose_seq'.
288 	 */
289 	if ((n = (*version->compose_proc)(c1, &cseq)) == 0)
290 		return (idn_notfound);
291 
292 	/*
293 	 * The composite sequences are sorted by the 2nd character 'c2'.
294 	 * So we can use binary search.
295 	 */
296 	lo = 0;
297 	hi = n - 1;
298 	while (lo <= hi) {
299 		int mid = (lo + hi) / 2;
300 
301 		if (cseq[mid].c2 < c2) {
302 			lo = mid + 1;
303 		} else if (cseq[mid].c2 > c2) {
304 			hi = mid - 1;
305 		} else {
306 			*compp = cseq[mid].comp;
307 			return (idn_success);
308 		}
309 	}
310 	return (idn_notfound);
311 }
312