1 /*	$NetBSD: utf8.c,v 1.4 2014/12/10 04:37:55 christos Exp $	*/
2 
3 #ifndef lint
4 static char *rcsid = "Id: utf8.c,v 1.1 2003/06/04 00:26:44 marka Exp ";
5 #endif
6 
7 /*
8  * Copyright (c) 2000 Japan Network Information Center.  All rights reserved.
9  *
10  * By using this file, you agree to the terms and conditions set forth bellow.
11  *
12  * 			LICENSE TERMS AND CONDITIONS
13  *
14  * The following License Terms and Conditions apply, unless a different
15  * license is obtained from Japan Network Information Center ("JPNIC"),
16  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
17  * Chiyoda-ku, Tokyo 101-0047, Japan.
18  *
19  * 1. Use, Modification and Redistribution (including distribution of any
20  *    modified or derived work) in source and/or binary forms is permitted
21  *    under this License Terms and Conditions.
22  *
23  * 2. Redistribution of source code must retain the copyright notices as they
24  *    appear in each source code file, this License Terms and Conditions.
25  *
26  * 3. Redistribution in binary form must reproduce the Copyright Notice,
27  *    this License Terms and Conditions, in the documentation and/or other
28  *    materials provided with the distribution.  For the purposes of binary
29  *    distribution the "Copyright Notice" refers to the following language:
30  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
31  *
32  * 4. The name of JPNIC may not be used to endorse or promote products
33  *    derived from this Software without specific prior written approval of
34  *    JPNIC.
35  *
36  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
37  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
38  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
39  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
40  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
41  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
42  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
45  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
46  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
47  */
48 
49 #include <config.h>
50 
51 #include <stddef.h>
52 
53 #include <idn/assert.h>
54 #include <idn/logmacro.h>
55 #include <idn/utf8.h>
56 #include <idn/debug.h>
57 
/*
 * UTF8_WIDTH(c) -- number of bytes in the UTF-8 sequence introduced by
 * the lead byte value 'c', using the 1- to 6-byte encoding forms.
 * Yields 0 when 'c' cannot begin a sequence: values 0x80-0xbf and
 * values of 0xfe and above.
 *
 * 'c' is expanded more than once, so it must be free of side effects.
 */
#define UTF8_WIDTH(c) \
	(((c) >= 0xfe) ? 0 : \
	 ((c) >= 0xfc) ? 6 : \
	 ((c) >= 0xf8) ? 5 : \
	 ((c) >= 0xf0) ? 4 : \
	 ((c) >= 0xe0) ? 3 : \
	 ((c) >= 0xc0) ? 2 : \
	 ((c) >= 0x80) ? 0 : 1)
66 
/*
 * VALID_CONT_BYTE(c) -- true when 'c' lies in 0x80..0xbf, the range this
 * file accepts for the second and later bytes of a multibyte sequence.
 * 'c' may be expanded twice, so it must be free of side effects.
 */
#define VALID_CONT_BYTE(c)	((c) >= 0x80 && (c) <= 0xbf)
68 
/*
 * Return the length in bytes of the UTF-8 sequence whose lead byte is the
 * byte at 's', as given by UTF8_WIDTH.  Returns 0 when that byte cannot
 * start a sequence.
 *
 * Only the first byte is examined; the bytes that follow are neither read
 * nor validated.  's' must not be NULL.
 */
int
idn_utf8_mblen(const char *s) {
	int c;

	/* Validate the pointer before it is dereferenced, not after. */
	assert(s != NULL);

#if 0
	TRACE(("idn_utf8_mblen(s=<%s>)\n", idn__debug_hexstring(s, 6)));
#endif

	/* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
	c = *(const unsigned char *)s;

	return UTF8_WIDTH(c);
}
81 
/*
 * Copy the single UTF-8 sequence that starts at 's' into 'buf'.
 *
 *   s    - input bytes; must not be NULL.
 *   len  - number of bytes available at 's'.
 *   buf  - destination.  The original contract asks for at least 7 bytes;
 *          this function stores at most 6 bytes and does not append a
 *          terminating NUL.
 *
 * Returns the sequence length (1-6) on success.  Returns 0 when 'len' is
 * 0, when the lead byte cannot start a sequence, when fewer than the
 * required number of bytes are available, or when a continuation byte is
 * outside 0x80..0xbf.  In the last case the leading part of the sequence
 * may already have been stored in 'buf'.
 */
int
idn_utf8_getmb(const char *s, size_t len, char *buf) {
	/* buf must be at least 7-bytes long */
	const unsigned char *p = (const unsigned char *)s;
	unsigned char *q = (unsigned char *)buf;
	int width;
	int w;

	/* Validate the pointer before the lead byte is fetched. */
	assert(s != NULL);

#if 0
	TRACE(("idn_utf8_getmb(s=<%s>,len=%d)\n",
	      idn__debug_hexstring(s, 6), len));
#endif

	/* With no input available even the lead byte must not be read. */
	if (len == 0)
		return (0);

	width = UTF8_WIDTH(*p);
	if (width == 0 || len < (size_t)width)
		return (0);

	/* Copy the first byte. */
	*q++ = *p++;

	/* .. and the rest, checking each continuation byte on the way. */
	for (w = width - 1; w > 0; w--) {
		if (!VALID_CONT_BYTE(*p))
			return (0);
		*q++ = *p++;
	}
	return (width);
}
112 
/*
 * Decode the UTF-8 sequence that starts at 's' into a single code value.
 *
 *   s    - input bytes; must not be NULL.
 *   len  - number of bytes available at 's'.
 *   vp   - receives the decoded value; written only on success.
 *
 * Returns the number of bytes consumed (1-6) on success.  Returns 0 when
 * 'len' is 0, when the lead byte cannot start a sequence, when fewer than
 * the required number of bytes are available, when a continuation byte is
 * outside 0x80..0xbf, or when the value is encoded in more bytes than it
 * needs (decoded value below the minimum for that length).
 */
int
idn_utf8_getwc(const char *s, size_t len, unsigned long *vp) {
	unsigned long v;
	unsigned long min;
	const unsigned char *p = (const unsigned char *)s;
	int c;
	int width;
	int rest;

	assert(s != NULL);

#if 0
	TRACE(("idn_utf8_getwc(s=<%s>,len=%d)\n",
	      idn__debug_hexstring(s, 10), len));
#endif

	/* With no input available even the lead byte must not be read. */
	if (len == 0)
		return (0);

	c = *p++;
	width = UTF8_WIDTH(c);

	/*
	 * Per length: 'v' starts as the payload bits of the lead byte and
	 * 'min' is the smallest value that genuinely needs that many bytes.
	 */
	switch (width) {
	case 0:
		return (0);
	case 1:
		v = c;
		min = 0;
		break;
	case 2:
		v = c & 0x1f;
		min = 0x80;
		break;
	case 3:
		v = c & 0xf;
		min = 0x800;
		break;
	case 4:
		v = c & 0x7;
		min = 0x10000;
		break;
	case 5:
		v = c & 3;
		min = 0x200000;
		break;
	case 6:
		v = c & 1;
		min = 0x4000000;
		break;
	default:
		/* UTF8_WIDTH only yields 0..6, so this is not expected. */
		FATAL(("idn_utf8_getwc: internal error\n"));
		return (0);
	}

	if (len < (size_t)width)
		return (0);

	/* Fold in 6 payload bits from each continuation byte. */
	rest = width - 1;
	while (rest-- > 0) {
		if (!VALID_CONT_BYTE(*p))
			return (0);
		v = (v << 6) | (*p & 0x3f);
		p++;
	}

	/* Reject a value encoded in more bytes than it requires. */
	if (v < min)
		return (0);

	*vp = v;
	return (width);
}
181 
182 int
183 idn_utf8_putwc(char *s, size_t len, unsigned long v) {
184 	unsigned char *p = (unsigned char *)s;
185 	int mask;
186 	int off;
187 	int l;
188 
189 	assert(s != NULL);
190 
191 #if 0
192 	TRACE(("idn_utf8_putwc(v=%lx)\n", v));
193 #endif
194 
195 	if (v < 0x80) {
196 		mask = 0;
197 		l = 1;
198 	} else if (v < 0x800) {
199 		mask = 0xc0;
200 		l = 2;
201 	} else if (v < 0x10000) {
202 		mask = 0xe0;
203 		l = 3;
204 	} else if (v < 0x200000) {
205 		mask = 0xf0;
206 		l = 4;
207 	} else if (v < 0x4000000) {
208 		mask = 0xf8;
209 		l = 5;
210 	} else if (v < 0x80000000) {
211 		mask = 0xfc;
212 		l = 6;
213 	} else {
214 		return (0);
215 	}
216 
217 	if (len < l)
218 		return (0);
219 
220 	off = 6 * (l - 1);
221 	*p++ = (v >> off) | mask;
222 	mask = 0x80;
223 	while (off > 0) {
224 		off -= 6;
225 		*p++ = ((v >> off) & 0x3f) | mask;
226 	}
227 	return l;
228 }
229 
/*
 * Report whether the bytes at 's' begin with one well-formed UTF-8
 * sequence, as judged by idn_utf8_getwc().  Returns 1 if so, 0 otherwise.
 *
 * A fixed length of 6 (the longest sequence) is passed to
 * idn_utf8_getwc().  's' must not be NULL.
 */
int
idn_utf8_isvalidchar(const char *s) {
	unsigned long dummy;

	/* Check up front, as the other functions in this file do. */
	assert(s != NULL);

	TRACE(("idn_utf8_isvalidchar(s=<%s>)\n",
	      idn__debug_hexstring(s, 6)));

	return (idn_utf8_getwc(s, 6, &dummy) > 0);
}
239 
/*
 * Report whether the NUL-terminated string 's' consists entirely of
 * sequences that idn_utf8_getwc() accepts.  Returns 1 if every sequence
 * decodes (an empty string qualifies), 0 at the first one that does not.
 * 's' must not be NULL.
 */
int
idn_utf8_isvalidstring(const char *s) {
	unsigned long ignored;	/* decoded value; only the length matters */
	int n;

	assert(s != NULL);

	TRACE(("idn_utf8_isvalidstring(s=<%s>)\n",
	      idn__debug_hexstring(s, 20)));

	/* Advance one decoded sequence at a time up to the terminator. */
	for (; *s != '\0'; s += n) {
		n = idn_utf8_getwc(s, 6, &ignored);
		if (n == 0)
			return (0);
	}
	return (1);
}
258 
/*
 * Starting at 's', step backwards over continuation bytes (0x80..0xbf) to
 * locate the lead byte of the sequence that 's' points into.
 *
 *   s          - position inside the buffer; must not be NULL.
 *   known_top  - lowest address that may be examined; must not be NULL
 *                and must not be above 's'.
 *
 * Returns a pointer to the lead byte.  Returns NULL when every byte from
 * 's' back to 'known_top' is a continuation byte, or when the byte the
 * scan stops on cannot start a sequence.
 */
char *
idn_utf8_findfirstbyte(const char *s, const char *known_top) {
	const unsigned char *p = (const unsigned char *)s;
	const unsigned char *t = (const unsigned char *)known_top;

	assert(s != NULL && known_top != NULL && known_top <= s);

	TRACE(("idn_utf8_findfirstbyte(s=<%s>)\n",
	      idn__debug_hexstring(s, 8)));

	/* Reachable only if the assertion above is compiled out. */
	if (p < t)
		return (NULL);

	/*
	 * Test for the lower bound before stepping back, so that 'p' is
	 * never moved to an address below 'known_top'.
	 */
	while (VALID_CONT_BYTE(*p)) {
		if (p <= t)
			return (NULL);
		p--;
	}

	/* The scan stopped on a non-continuation byte; it must be a lead. */
	if (UTF8_WIDTH(*p) == 0)
		return (NULL);

	return ((char *)p);
}
279