1 /* $OpenLDAP$ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3  *
4  * Copyright 2002-2021 The OpenLDAP Foundation.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted only as authorized by the OpenLDAP
9  * Public License.
10  *
11  * A copy of this license is available in the file LICENSE in the
12  * top-level directory of the distribution or, alternatively, at
13  * <http://www.OpenLDAP.org/license.html>.
14  */
15 /* ACKNOWLEDGEMENTS:
16  * This work was initially developed by Howard Chu for inclusion in
17  * OpenLDAP Software.
18  */
19 
20 /*
21  * Basic T.61 <-> UTF-8 conversion
22  *
23  * These routines will perform a lossless translation from T.61 to UTF-8
24  * and a lossy translation from UTF-8 to T.61.
25  */
26 
27 #include "portable.h"
28 
29 #include <stdio.h>
30 
31 #include <ac/stdlib.h>
32 
33 #include <ac/socket.h>
34 #include <ac/string.h>
35 #include <ac/time.h>
36 
37 #include "ldap-int.h"
38 #include "ldap_utf8.h"
39 
40 #include "ldap_defaults.h"
41 
42 /*
43  * T.61 is somewhat braindead; even in the 7-bit space it is not
44  * completely equivalent to 7-bit US-ASCII. Our definition of the
45  * character set comes from RFC 1345 with a slightly more readable
46  * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
47  *
48  * Even though '#' and '$' are present in the 7-bit US-ASCII space,
49  * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
50  * xA6 and xA4.
51  *
52  * Also T.61 lacks
53  *	backslash 	\	(x5C)
54  *	caret		^	(x5E)
55  *	backquote	`	(x60)
56  *	left brace	{	(x7B)
57  *	right brace	}	(x7D)
58  *	tilde		~	(x7E)
59  *
60  * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
61  * accents of some form or another. There are predefined combinations
62  * for certain characters, but they can also be used arbitrarily. The
63  * table at dkuug.dk maps these accents to the E000 "private use" range
64  * of the Unicode space, but I believe they more properly belong in the
65  * 0300 range (non-spacing accents). The transformation is complicated
66  * slightly because Unicode wants the non-spacing character to follow
67  * the base character, while T.61 has the non-spacing character leading.
68  * Also, T.61 specifically recognizes certain combined pairs as "characters"
69  * but doesn't specify how to treat unrecognized pairs. This code will
70  * always attempt to combine pairs when a known Unicode composite exists.
71  */
72 
/* Forward map from a T.61 byte to its Unicode code point, indexed
 * directly by the byte value (0x00-0xff).
 *
 * A zero entry marks a byte that the conversion routines reject as
 * invalid T.61; entry 0 is itself zero, so NUL is rejected as well.
 * Entries 0xc1-0xcf hold the Unicode combining mark (0x03xx) that
 * corresponds to each T.61 non-spacing accent.
 */
static const wchar_t t61_tab[] = {
	0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
	0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
	0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
	0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
	0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
	0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
	0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
	0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
	0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
	0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
	0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
	0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
	0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
	0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
	0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
	0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
	0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
	0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
	0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
	0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
	0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
	0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
	0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
	0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
	0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
	0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
	0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
	0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
	0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};
107 
/* Fixed-size rows of wide characters used by the lookup tables in this
 * file: wvec16 for the spacing-accent table, wvec32 for the composite
 * (accent + base) tables, wvec64 for the Unicode -> T.61 tables.
 */
typedef wchar_t wvec16[16];
typedef wchar_t wvec32[32];
typedef wchar_t wvec64[64];
111 
/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 *
 * Indexed by the low nibble of the accent byte; each entry is the
 * Unicode spacing form of that accent. A zero entry means no spacing
 * substitute is defined for that index.
 */
static const wvec16 accents = {
	0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
	0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
116 
/* In the following tables, base characters commented in (parentheses)
 * are not defined by T.61 but are mapped anyway since their Unicode
 * composite exists.
 *
 * Lookup scheme, as used by ldap_t61s_to_utf8s():
 *   - cx_tab is indexed by the low nibble of the accent byte (0xc1-0xcf);
 *   - the per-accent pointer array it selects is indexed by (base >> 5),
 *     i.e. by 32-character band of the base byte: slot 2 covers
 *     0x40-0x5f, slot 3 covers 0x60-0x7f, slot 7 covers 0xe0-0xff;
 *   - the wvec32 that selects is indexed by (base & 0x1f) and yields the
 *     precomposed Unicode code point.
 * A NULL pointer or a zero element at any level means that no
 * precomposed character is known for the pair.
 */

/* Grave accented chars AEIOU (NWY) */
static const wvec32 c1_vec1 = {
	/* Upper case */
	0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
	0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
static const wvec32 c1_vec2 = {
	/* Lower case */
	0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
	0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};

static const wvec32 *c1_grave[] = {
	NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
};

/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
static const wvec32 c2_vec1 = {
	/* Upper case */
	0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
	0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
	0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
	0, 0xdd, 0x179, 0, 0, 0, 0, 0};
static const wvec32 c2_vec2 = {
	/* Lower case */
	0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
	0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
	0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
	0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
static const wvec32 c2_vec3 = {
	/* (AE and ae) */
	0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

static const wvec32 *c2_acute[] = {
	NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
};

/* Circumflex AEIOUYCGHJSW (Z) */
static const wvec32 c3_vec1 = {
	/* Upper case */
	0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
	0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
	0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
	0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
static const wvec32 c3_vec2 = {
	/* Lower case */
	0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
	0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
	0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
	0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
static const wvec32 *c3_circumflex[] = {
	NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
};

/* Tilde AIOUN (EVY) */
static const wvec32 c4_vec1 = {
	/* Upper case */
	0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
	0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
static const wvec32 c4_vec2 = {
	/* Lower case */
	0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
	0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
static const wvec32 *c4_tilde[] = {
	NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
};

/* Macron AEIOU (YG) */
static const wvec32 c5_vec1 = {
	/* Upper case */
	0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
	0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec2 = {
	/* Lower case */
	0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
	0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec3 = {
	/* (AE and ae) */
	0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *c5_macron[] = {
	NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
};

/* Breve AUG (EIO) */
static const wvec32 c6_vec1 = {
	/* Upper case */
	0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
	0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 c6_vec2 = {
	/* Lower case */
	0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
	0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *c6_breve[] = {
	NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
};

/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
static const wvec32 c7_vec1 = {
	/* Upper case */
	0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
	0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
	0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
	0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
static const wvec32 c7_vec2 = {
	/* Lower case */
	0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
	0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
	0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
	0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
static const wvec32 *c7_dotabove[] = {
	NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
};

/* Diaeresis AEIOUY (HWXt) */
static const wvec32 c8_vec1 = {
	/* Upper case */
	0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
	0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
static const wvec32 c8_vec2 = {
	/* Lower case */
	0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
	0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
static const wvec32 *c8_diaeresis[] = {
	NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
};

/* Ring Above AU (wy) */
static const wvec32 ca_vec1 = {
	/* Upper case */
	0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ca_vec2 = {
	/* Lower case */
	0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
static const wvec32 *ca_ringabove[] = {
	NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
};

/* Cedilla CGKLNRST (EDH) */
static const wvec32 cb_vec1 = {
	/* Upper case */
	0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
	0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
	0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cb_vec2 = {
	/* Lower case */
	0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
	0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
	0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *cb_cedilla[] = {
	NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
};

/* Double Acute Accent OU */
static const wvec32 cd_vec1 = {
	/* Upper case */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
	0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cd_vec2 = {
	/* Lower case */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
	0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *cd_doubleacute[] = {
	NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
};

/* Ogonek AEIU (O) */
static const wvec32 ce_vec1 = {
	/* Upper case */
	0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
	0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ce_vec2 = {
	/* Lower case */
	0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
	0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *ce_ogonek[] = {
	NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
};

/* Caron CDELNRSTZ (AIOUGKjH) */
static const wvec32 cf_vec1 = {
	/* Upper case */
	0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
	0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
	0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
	0, 0, 0x17d, 0, 0, 0, 0, 0};
static const wvec32 cf_vec2 = {
	/* Lower case */
	0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
	0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
	0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
	0, 0, 0x17e, 0, 0, 0, 0, 0};
static const wvec32 *cf_caron[] = {
	NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
};

/* Top-level dispatch by accent: slot n belongs to T.61 accent byte
 * 0xc0+n. Slots 0, 9 and 0xc have no composite tables.
 */
static const wvec32 **cx_tab[] = {
	NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
	c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
	cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
323 
ldap_t61s_valid(struct berval * str)324 int ldap_t61s_valid( struct berval *str )
325 {
326 	unsigned char *c = (unsigned char *)str->bv_val;
327 	int i;
328 
329 	for (i=0; i < str->bv_len; c++,i++)
330 		if (!t61_tab[*c])
331 			return 0;
332 	return 1;
333 }
334 
335 /* Transform a T.61 string to UTF-8.
336  */
ldap_t61s_to_utf8s(struct berval * src,struct berval * dst)337 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
338 {
339 	unsigned char *c;
340 	char *d;
341 	int i, wlen = 0;
342 
343 	/* Just count the length of the UTF-8 result first */
344 	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
345 		/* Invalid T.61 characters? */
346 		if (!t61_tab[*c])
347 			return LDAP_INVALID_SYNTAX;
348 		if ((*c & 0xf0) == 0xc0) {
349 			int j = *c & 0x0f;
350 			/* If this is the end of the string, or if the base
351 			 * character is just a space, treat this as a regular
352 			 * spacing character.
353 			 */
354 			if ((!c[1] || c[1] == 0x20) && accents[j]) {
355 				wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
356 			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
357 			/* We have a composite mapping for this pair */
358 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
359 				wlen += ldap_x_wc_to_utf8( NULL,
360 					(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
361 			} else {
362 			/* No mapping, just swap it around so the base
363 			 * character comes first.
364 			 */
365 			 	wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
366 				wlen += ldap_x_wc_to_utf8(NULL,
367 					t61_tab[*c], 0);
368 			}
369 			c++; i++;
370 			continue;
371 		} else {
372 			wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
373 		}
374 	}
375 
376 	/* Now transform the string */
377 	dst->bv_len = wlen;
378 	dst->bv_val = LDAP_MALLOC( wlen+1 );
379 	d = dst->bv_val;
380 	if (!d)
381 		return LDAP_NO_MEMORY;
382 
383 	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
384 		if ((*c & 0xf0) == 0xc0) {
385 			int j = *c & 0x0f;
386 			/* If this is the end of the string, or if the base
387 			 * character is just a space, treat this as a regular
388 			 * spacing character.
389 			 */
390 			if ((!c[1] || c[1] == 0x20) && accents[j]) {
391 				d += ldap_x_wc_to_utf8(d, accents[j], 6);
392 			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
393 			/* We have a composite mapping for this pair */
394 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
395 				d += ldap_x_wc_to_utf8(d,
396 				(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
397 			} else {
398 			/* No mapping, just swap it around so the base
399 			 * character comes first.
400 			 */
401 				d += ldap_x_wc_to_utf8(d, c[1], 6);
402 				d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
403 			}
404 			c++; i++;
405 			continue;
406 		} else {
407 			d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
408 		}
409 	}
410 	*d = '\0';
411 	return LDAP_SUCCESS;
412 }
413 
414 /* For the reverse mapping, we just pay attention to the Latin-oriented
415  * code blocks. These are
416  *	0000 - 007f Basic Latin
417  *	0080 - 00ff Latin-1 Supplement
418  *	0100 - 017f Latin Extended-A
419  *	0180 - 024f Latin Extended-B
420  *	1e00 - 1eff Latin Extended Additional
421  *
422  * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
423  * unrecognized characters are replaced with '?' 0x3f.
424  */
425 
/* Reverse-map rows. Each wvec64 covers 64 consecutive code points and
 * is indexed by the low six bits of the code point. An entry with a
 * nonzero high byte encodes a two-byte T.61 sequence (high byte is
 * written first, low byte second); otherwise only the low byte is
 * written. 0x003f ('?') is the substitute for characters that have no
 * T.61 equivalent.
 */

/* U+0000 - U+003F */
static const wvec64 u000 = {
	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
	0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};

/* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
 * on their own, even though it provides them as combiners for other
 * letters. T.61 doesn't define these pairings either, so this may just
 * have to be replaced with '?' 0x3f if other software can't cope with it.
 */
/* U+0040 - U+007F */
static const wvec64 u001 = {
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
	0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};

/* U+0080 - U+00BF */
static const wvec64 u002 = {
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
	0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
	0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};

/* U+00C0 - U+00FF */
static const wvec64 u003 = {
	0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
	0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
	0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
	0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
	0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
	0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
	0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
	0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};

/* These codes are used here but not defined by T.61:
 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
 */
/* U+0100 - U+013F */
static const wvec64 u010 = {
	0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
	0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
	0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
	0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
	0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
	0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
	0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
	0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};

/* These codes are used here but not defined by T.61:
 * x14e = xc6/x4f, x14f = xc6/x6f
 */
/* U+0140 - U+017F */
static const wvec64 u011 = {
	0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
	0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
	0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
	0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
	0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
	0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
	0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
	0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};

/* All of the codes in this block are undefined in T.61.
 */
/* U+01C0 - U+01FF */
static const wvec64 u013 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
	0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
	0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
	0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};

/* All of the codes in this block are undefined in T.61.
 */
/* U+0200 - U+023F */
static const wvec64 u020 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
	0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
	0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};

/* U+02C0 - U+02FF: spacing accents, mapped to accent + space */
static const wvec64 u023 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};

/* These are the non-spacing characters by themselves. They should
 * never appear by themselves in actual text.
 */
/* U+0300 - U+033F; ldap_utf8s_to_t61s() places the resulting accent
 * byte in front of the previously written output byte.
 */
static const wvec64 u030 = {
	0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
	0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
	0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};

/* None of the following blocks are defined in T.61.
 */
/* U+1E00 - U+1E3F */
static const wvec64 u1e0 = {
	0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
	0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
	0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
	0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
};

/* U+1E40 - U+1E7F */
static const wvec64 u1e1 = {
	0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
	0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
};

/* U+1E80 - U+1EBF */
static const wvec64 u1e2 = {
	0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
	0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
	0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
	0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
};

/* U+1EC0 - U+1EFF */
static const wvec64 u1e3 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
};

/* Row pointers for U+0000 - U+03FF, indexed by (code point >> 6).
 * A NULL row means every character in that 64-code-point range is
 * replaced with '?'.
 */
static const wvec64 *wc00[] = {
	&u000, &u001, &u002, &u003,
	&u010, &u011, NULL, &u013,
	&u020, NULL, NULL, &u023,
	&u030, NULL, NULL, NULL};

/* Row pointers for U+1E00 - U+1EFF, indexed by ((code point >> 6) & 3) */
static const wvec64 *wc1e[] = {
	&u1e0, &u1e1, &u1e2, &u1e3};
599 
600 
ldap_utf8s_to_t61s(struct berval * src,struct berval * dst)601 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
602 {
603 	char *c, *d;
604 	wchar_t tmp;
605 	int i, j, tlen = 0;
606 
607 	/* Just count the length of the T.61 result first */
608 	for (i=0,c=src->bv_val; i < src->bv_len;) {
609 		j = ldap_x_utf8_to_wc( &tmp, c );
610 		if (j == -1)
611 			return LDAP_INVALID_SYNTAX;
612 		switch (tmp >> 8) {
613 		case 0x00:
614 		case 0x01:
615 		case 0x02:
616 		case 0x03:
617 			if (wc00[tmp >> 6] &&
618 				((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
619 				tlen++;
620 			}
621 			tlen++;
622 			break;
623 		case 0x1e:
624 			if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
625 				tlen++;
626 			}
627 		case 0x21:
628 		default:
629 			tlen ++;
630 			break;
631 		}
632 		i += j;
633 		c += j;
634 	}
635 	dst->bv_len = tlen;
636 	dst->bv_val = LDAP_MALLOC( tlen+1 );
637 	if (!dst->bv_val)
638 		return LDAP_NO_MEMORY;
639 
640 	d = dst->bv_val;
641 	for (i=0,c=src->bv_val; i < src->bv_len;) {
642 		j = ldap_x_utf8_to_wc( &tmp, c );
643 		switch (tmp >> 8) {
644 		case 0x00:
645 		case 0x01:
646 		case 0x02:
647 			if (wc00[tmp >> 6]) {
648 				tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
649 				if (tmp & 0xff00)
650 					*d++ = (tmp >> 8);
651 				*d++ = tmp & 0xff;
652 			} else {
653 				*d++ = 0x3f;
654 			}
655 			break;
656 		case 0x03:
657 			/* swap order of non-spacing characters */
658 			if (wc00[tmp >> 6]) {
659 				wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
660 				if (t2 != 0x3f) {
661 					d[0] = d[-1];
662 					d[-1] = t2;
663 					d++;
664 				} else {
665 					*d++ = 0x3f;
666 				}
667 			} else {
668 				*d++ = 0x3f;
669 			}
670 			break;
671 		case 0x1e:
672 			tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
673 			if (tmp & 0xff00)
674 				*d++ = (tmp >> 8);
675 			*d++ = tmp & 0xff;
676 			break;
677 		case 0x21:
678 			if (tmp == 0x2126) {
679 				*d++ = 0xe0;
680 				break;
681 			}
682 			/* FALLTHRU */
683 		default:
684 			*d++ = 0x3f;
685 			break;
686 		}
687 		i += j;
688 		c += j;
689 	}
690 	*d = '\0';
691 	return LDAP_SUCCESS;
692 }
693