1 /*	$NetBSD: normalize.c,v 1.1.1.2 2014/04/24 12:45:56 pettai Exp $	*/
2 
3 /*
4  * Copyright (c) 2004 Kungliga Tekniska Högskolan
5  * (Royal Institute of Technology, Stockholm, Sweden).
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * 3. Neither the name of the Institute nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39 #include "windlocl.h"
40 
41 #include <assert.h>
42 #include <stdlib.h>
43 #include <errno.h>
44 #include <stdio.h>
45 
46 #include <krb5/roken.h>
47 
48 #include "normalize_table.h"
49 
50 static int
translation_cmp(const void * key,const void * data)51 translation_cmp(const void *key, const void *data)
52 {
53     const struct translation *t1 = (const struct translation *)key;
54     const struct translation *t2 = (const struct translation *)data;
55 
56     return t1->key - t2->key;
57 }
58 
59 enum { s_base  = 0xAC00};
60 enum { s_count = 11172};
61 enum { l_base  = 0x1100};
62 enum { l_count = 19};
63 enum { v_base  = 0x1161};
64 enum { v_count = 21};
65 enum { t_base  = 0x11A7};
66 enum { t_count = 28};
67 enum { n_count = v_count * t_count};
68 
69 static int
hangul_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)70 hangul_decomp(const uint32_t *in, size_t in_len,
71 	      uint32_t *out, size_t *out_len)
72 {
73     uint32_t u = *in;
74     unsigned s_index;
75     unsigned l, v, t;
76     unsigned o;
77 
78     if (u < s_base || u >= s_base + s_count)
79 	return 0;
80     s_index = u - s_base;
81     l = l_base + s_index / n_count;
82     v = v_base + (s_index % n_count) / t_count;
83     t = t_base + s_index % t_count;
84     o = 2;
85     if (t != t_base)
86 	++o;
87     if (*out_len < o)
88 	return WIND_ERR_OVERRUN;
89     out[0] = l;
90     out[1] = v;
91     if (t != t_base)
92 	out[2] = t;
93     *out_len = o;
94     return 1;
95 }
96 
97 static uint32_t
hangul_composition(const uint32_t * in,size_t in_len)98 hangul_composition(const uint32_t *in, size_t in_len)
99 {
100     if (in_len < 2)
101 	return 0;
102     if (in[0] >= l_base && in[0] < l_base + l_count) {
103 	unsigned l_index = in[0] - l_base;
104 	unsigned v_index;
105 
106 	if (in[1] < v_base || in[1] >= v_base + v_count)
107 	    return 0;
108 	v_index = in[1] - v_base;
109 	return (l_index * v_count + v_index) * t_count + s_base;
110     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
111 	unsigned s_index = in[0] - s_base;
112 	unsigned t_index;
113 
114 	if (s_index % t_count != 0)
115 	    return 0;
116 	if (in[1] < t_base || in[1] >= t_base + t_count)
117 	    return 0;
118 	t_index = in[1] - t_base;
119 	return in[0] + t_index;
120     }
121     return 0;
122 }
123 
124 static int
compat_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)125 compat_decomp(const uint32_t *in, size_t in_len,
126 	      uint32_t *out, size_t *out_len)
127 {
128     unsigned i;
129     unsigned o = 0;
130 
131     for (i = 0; i < in_len; ++i) {
132 	struct translation ts = {in[i]};
133 	size_t sub_len = *out_len - o;
134 	int ret;
135 
136 	ret = hangul_decomp(in + i, in_len - i,
137 			    out + o, &sub_len);
138 	if (ret) {
139 	    if (ret == WIND_ERR_OVERRUN)
140 		return ret;
141 	    o += sub_len;
142 	} else {
143 	    void *s = bsearch(&ts,
144 			      _wind_normalize_table,
145 			      _wind_normalize_table_size,
146 			      sizeof(_wind_normalize_table[0]),
147 			      translation_cmp);
148 	    if (s != NULL) {
149 		const struct translation *t = (const struct translation *)s;
150 
151 		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
152 				    t->val_len,
153 				    out + o, &sub_len);
154 		if (ret)
155 		    return ret;
156 		o += sub_len;
157 	    } else {
158 		if (o >= *out_len)
159 		    return WIND_ERR_OVERRUN;
160 		out[o++] = in[i];
161 
162 	    }
163 	}
164     }
165     *out_len = o;
166     return 0;
167 }
168 
169 static void
swap_char(uint32_t * a,uint32_t * b)170 swap_char(uint32_t * a, uint32_t * b)
171 {
172     uint32_t t;
173     t = *a;
174     *a = *b;
175     *b = t;
176 }
177 
178 /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
179  * that all have Canonical_Combining_Class > 0 */
180 static void
canonical_reorder_sequence(uint32_t * a,size_t len)181 canonical_reorder_sequence(uint32_t * a, size_t len)
182 {
183     size_t i, j;
184 
185     if (len <= 1)
186 	return;
187 
188     for (i = 1; i < len; i++) {
189 	for (j = i;
190 	     j > 0 &&
191 		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
192 	     j--)
193 	    swap_char(&a[j], &a[j-1]);
194     }
195 }
196 
197 static void
canonical_reorder(uint32_t * tmp,size_t tmp_len)198 canonical_reorder(uint32_t *tmp, size_t tmp_len)
199 {
200     size_t i;
201 
202     for (i = 0; i < tmp_len; ++i) {
203 	int cc = _wind_combining_class(tmp[i]);
204 	if (cc) {
205 	    size_t j;
206 	    for (j = i + 1;
207 		 j < tmp_len && _wind_combining_class(tmp[j]);
208 		 ++j)
209 		;
210 	    canonical_reorder_sequence(&tmp[i], j - i);
211 	    i = j;
212 	}
213     }
214 }
215 
216 static uint32_t
find_composition(const uint32_t * in,unsigned in_len)217 find_composition(const uint32_t *in, unsigned in_len)
218 {
219     unsigned short canon_index = 0;
220     uint32_t cur;
221     unsigned n = 0;
222 
223     cur = hangul_composition(in, in_len);
224     if (cur)
225 	return cur;
226 
227     do {
228 	const struct canon_node *c = &_wind_canon_table[canon_index];
229 	unsigned i;
230 
231 	if (n % 5 == 0) {
232 	    cur = *in++;
233 	    if (in_len-- == 0)
234 		return c->val;
235 	}
236 
237 	i = cur >> 16;
238 	if (i < c->next_start || i >= c->next_end)
239 	    canon_index = 0;
240 	else
241 	    canon_index =
242 		_wind_canon_next_table[c->next_offset + i - c->next_start];
243 	if (canon_index != 0) {
244 	    cur = (cur << 4) & 0xFFFFF;
245 	    ++n;
246 	}
247     } while (canon_index != 0);
248     return 0;
249 }
250 
251 static int
combine(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)252 combine(const uint32_t *in, size_t in_len,
253 	uint32_t *out, size_t *out_len)
254 {
255     unsigned i;
256     int ostarter;
257     unsigned o = 0;
258     int old_cc;
259 
260     for (i = 0; i < in_len;) {
261 	while (i < in_len && _wind_combining_class(in[i]) != 0) {
262 	    out[o++] = in[i++];
263 	}
264 	if (i < in_len) {
265 	    if (o >= *out_len)
266 		return WIND_ERR_OVERRUN;
267 	    ostarter = o;
268 	    out[o++] = in[i++];
269 	    old_cc   = -1;
270 
271 	    while (i < in_len) {
272 		uint32_t comb;
273 		uint32_t v[2];
274 		int cc;
275 
276 		v[0] = out[ostarter];
277 		v[1] = in[i];
278 
279 		cc = _wind_combining_class(in[i]);
280 		if (old_cc != cc && (comb = find_composition(v, 2))) {
281 		    out[ostarter] = comb;
282 		} else if (cc == 0) {
283 		    break;
284 		} else {
285 		    if (o >= *out_len)
286 			return WIND_ERR_OVERRUN;
287 		    out[o++] = in[i];
288 		    old_cc   = cc;
289 		}
290 		++i;
291 	    }
292 	}
293     }
294     *out_len = o;
295     return 0;
296 }
297 
298 int
_wind_stringprep_normalize(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)299 _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
300 			   uint32_t *out, size_t *out_len)
301 {
302     size_t tmp_len;
303     uint32_t *tmp;
304     int ret;
305 
306     if (in_len == 0) {
307 	*out_len = 0;
308 	return 0;
309     }
310 
311     tmp_len = in_len * 4;
312     if (tmp_len < MAX_LENGTH_CANON)
313 	tmp_len = MAX_LENGTH_CANON;
314     tmp = malloc(tmp_len * sizeof(uint32_t));
315     if (tmp == NULL)
316 	return ENOMEM;
317 
318     ret = compat_decomp(in, in_len, tmp, &tmp_len);
319     if (ret) {
320 	free(tmp);
321 	return ret;
322     }
323     canonical_reorder(tmp, tmp_len);
324     ret = combine(tmp, tmp_len, out, out_len);
325     free(tmp);
326     return ret;
327 }
328