1 /* $OpenLDAP$ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3 *
4 * Copyright 1998-2021 The OpenLDAP Foundation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
9 * Public License.
10 *
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
14 */
15
16 #include "portable.h"
17
18 #include <ac/bytes.h>
19 #include <ac/ctype.h>
20 #include <ac/string.h>
21 #include <ac/stdlib.h>
22
23 #include <lber_pvt.h>
24
25 #include <ldap_utf8.h>
26 #include <ldap_pvt_uc.h>
27
/*
 * Map the standard allocator names used in this file onto the
 * ber_mem*_x() routines.  Every expansion references `ctx', so these
 * names can only be used where a variable called ctx is in scope.
 */
#define malloc(sz)		ber_memalloc_x((sz), ctx)
#define realloc(ptr, sz)	ber_memrealloc_x((ptr), (sz), ctx)
#define free(ptr)		ber_memfree_x((ptr), ctx)
31
/*
 * Compare at most n code units of u1 and u2.
 *
 * Returns -1 or +1 at the first position where the strings differ
 * (according to which unit is smaller), and 0 when the first n units
 * match or a zero unit is reached in both strings before that.
 */
int ucstrncmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		const ldap_unicode_t left = u1[idx];
		const ldap_unicode_t right = u2[idx];

		if ( left < right ) {
			return -1;
		}
		if ( left > right ) {
			return +1;
		}
		/* equal units: a shared terminator ends the comparison */
		if ( left == 0 ) {
			break;
		}
	}

	return 0;
}
47
/*
 * Compare at most n code units of u1 and u2, passing every unit
 * through uctolower() before comparing.
 *
 * Returns -1 or +1 at the first position where the lower-cased units
 * differ, and 0 when the first n units match or a zero unit is reached
 * in both strings before that.
 */
int ucstrncasecmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		const ldap_unicode_t left = uctolower( u1[idx] );
		const ldap_unicode_t right = uctolower( u2[idx] );

		if ( left < right ) {
			return -1;
		}
		if ( left > right ) {
			return +1;
		}
		/* equal units: a shared terminator ends the comparison */
		if ( left == 0 ) {
			break;
		}
	}

	return 0;
}
66
/*
 * Find the first occurrence of c among the first n code units of u.
 *
 * Returns a pointer to the matching unit, or NULL when none of the
 * n units equals c.  A zero unit does not stop the scan.
 */
ldap_unicode_t * ucstrnchr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		if ( u[idx] == c ) {
			return (ldap_unicode_t *) &u[idx];
		}
	}

	return NULL;
}
80
/*
 * Find the first of the first n code units of u whose uctolower()
 * value equals uctolower( c ).
 *
 * Returns a pointer to the matching unit, or NULL when there is none.
 * A zero unit does not stop the scan.
 */
ldap_unicode_t * ucstrncasechr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	const ldap_unicode_t wanted = uctolower( c );
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		if ( uctolower( u[idx] ) == wanted ) {
			return (ldap_unicode_t *) &u[idx];
		}
	}

	return NULL;
}
95
/*
 * Replace each of the first n code units of u, in place, with its
 * uctoupper() value.
 */
void ucstr2upper(
	ldap_unicode_t *u,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		u[idx] = uctoupper( u[idx] );
	}
}
104
UTF8bvnormalize(struct berval * bv,struct berval * newbv,unsigned flags,void * ctx)105 struct berval * UTF8bvnormalize(
106 struct berval *bv,
107 struct berval *newbv,
108 unsigned flags,
109 void *ctx )
110 {
111 int i, j, len, clen, outpos, ucsoutlen, outsize, last;
112 int didnewbv = 0;
113 char *out, *outtmp, *s;
114 ac_uint4 *ucs, *p, *ucsout;
115
116 static unsigned char mask[] = {
117 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
118
119 unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
120 unsigned approx = flags & LDAP_UTF8_APPROX;
121
122 if ( bv == NULL ) {
123 return NULL;
124 }
125
126 s = bv->bv_val;
127 len = bv->bv_len;
128
129 if ( len == 0 ) {
130 return ber_dupbv_x( newbv, bv, ctx );
131 }
132
133 if ( !newbv ) {
134 newbv = ber_memalloc_x( sizeof(struct berval), ctx );
135 if ( !newbv ) return NULL;
136 didnewbv = 1;
137 }
138
139 /* Should first check to see if string is already in proper
140 * normalized form. This is almost as time consuming as
141 * the normalization though.
142 */
143
144 /* finish off everything up to character before first non-ascii */
145 if ( LDAP_UTF8_ISASCII( s ) ) {
146 if ( casefold ) {
147 outsize = len + 7;
148 out = (char *) ber_memalloc_x( outsize, ctx );
149 if ( out == NULL ) {
150 fail:
151 if ( didnewbv )
152 ber_memfree_x( newbv, ctx );
153 return NULL;
154 }
155 outpos = 0;
156
157 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
158 out[outpos++] = TOLOWER( s[i-1] );
159 }
160 if ( i == len ) {
161 out[outpos++] = TOLOWER( s[len-1] );
162 out[outpos] = '\0';
163 newbv->bv_val = out;
164 newbv->bv_len = outpos;
165 return newbv;
166 }
167 } else {
168 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
169 /* empty */
170 }
171
172 if ( i == len ) {
173 return ber_str2bv_x( s, len, 1, newbv, ctx );
174 }
175
176 outsize = len + 7;
177 out = (char *) ber_memalloc_x( outsize, ctx );
178 if ( out == NULL ) {
179 goto fail;
180 }
181 outpos = i - 1;
182 memcpy(out, s, outpos);
183 }
184 } else {
185 outsize = len + 7;
186 out = (char *) ber_memalloc_x( outsize, ctx );
187 if ( out == NULL ) {
188 goto fail;
189 }
190 outpos = 0;
191 i = 0;
192 }
193
194 p = ucs = ber_memalloc_x( len * sizeof(*ucs), ctx );
195 if ( ucs == NULL ) {
196 ber_memfree_x(out, ctx);
197 goto fail;
198 }
199
200 /* convert character before first non-ascii to ucs-4 */
201 if ( i > 0 ) {
202 *p = casefold ? TOLOWER( s[i-1] ) : s[i-1];
203 p++;
204 }
205
206 /* s[i] is now first non-ascii character */
207 for (;;) {
208 /* s[i] is non-ascii */
209 /* convert everything up to next ascii to ucs-4 */
210 while ( i < len ) {
211 clen = LDAP_UTF8_CHARLEN2( s + i, clen );
212 if ( clen == 0 ) {
213 ber_memfree_x( ucs, ctx );
214 ber_memfree_x( out, ctx );
215 goto fail;
216 }
217 if ( clen == 1 ) {
218 /* ascii */
219 break;
220 }
221 *p = s[i] & mask[clen];
222 i++;
223 for( j = 1; j < clen; j++ ) {
224 if ( (s[i] & 0xc0) != 0x80 ) {
225 ber_memfree_x( ucs, ctx );
226 ber_memfree_x( out, ctx );
227 goto fail;
228 }
229 *p <<= 6;
230 *p |= s[i] & 0x3f;
231 i++;
232 }
233 if ( casefold ) {
234 *p = uctolower( *p );
235 }
236 p++;
237 }
238 /* normalize ucs of length p - ucs */
239 uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx );
240 if ( approx ) {
241 for ( j = 0; j < ucsoutlen; j++ ) {
242 if ( ucsout[j] < 0x80 ) {
243 out[outpos++] = ucsout[j];
244 }
245 }
246 } else {
247 ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
248 /* convert ucs to utf-8 and store in out */
249 for ( j = 0; j < ucsoutlen; j++ ) {
250 /* allocate more space if not enough room for
251 6 bytes and terminator */
252 if ( outsize - outpos < 7 ) {
253 outsize = ucsoutlen - j + outpos + 6;
254 outtmp = (char *) ber_memrealloc_x( out, outsize, ctx );
255 if ( outtmp == NULL ) {
256 ber_memfree_x( ucsout, ctx );
257 ber_memfree_x( ucs, ctx );
258 ber_memfree_x( out, ctx );
259 goto fail;
260 }
261 out = outtmp;
262 }
263 outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
264 }
265 }
266
267 ber_memfree_x( ucsout, ctx );
268 ucsout = NULL;
269
270 if ( i == len ) {
271 break;
272 }
273
274 last = i;
275
276 /* Allocate more space in out if necessary */
277 if (len - i >= outsize - outpos) {
278 outsize += 1 + ((len - i) - (outsize - outpos));
279 outtmp = (char *) ber_memrealloc_x(out, outsize, ctx);
280 if (outtmp == NULL) {
281 ber_memfree_x( ucs, ctx );
282 ber_memfree_x( out, ctx );
283 goto fail;
284 }
285 out = outtmp;
286 }
287
288 /* s[i] is ascii */
289 /* finish off everything up to char before next non-ascii */
290 for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
291 out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
292 }
293 if ( i == len ) {
294 out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1];
295 break;
296 }
297
298 /* convert character before next non-ascii to ucs-4 */
299 *ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1];
300 p = ucs + 1;
301 }
302
303 ber_memfree_x( ucs, ctx );
304 out[outpos] = '\0';
305 newbv->bv_val = out;
306 newbv->bv_len = outpos;
307 return newbv;
308 }
309
310 /* compare UTF8-strings, optionally ignore casing */
311 /* slow, should be optimized */
UTF8bvnormcmp(struct berval * bv1,struct berval * bv2,unsigned flags,void * ctx)312 int UTF8bvnormcmp(
313 struct berval *bv1,
314 struct berval *bv2,
315 unsigned flags,
316 void *ctx )
317 {
318 int i, l1, l2, len, ulen, res = 0;
319 char *s1, *s2, *done;
320 ac_uint4 *ucs, *ucsout1, *ucsout2;
321
322 unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
323 unsigned norm1 = flags & LDAP_UTF8_ARG1NFC;
324 unsigned norm2 = flags & LDAP_UTF8_ARG2NFC;
325
326 if (bv1 == NULL) {
327 return bv2 == NULL ? 0 : -1;
328
329 } else if (bv2 == NULL) {
330 return 1;
331 }
332
333 l1 = bv1->bv_len;
334 l2 = bv2->bv_len;
335
336 len = (l1 < l2) ? l1 : l2;
337 if (len == 0) {
338 return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
339 }
340
341 s1 = bv1->bv_val;
342 s2 = bv2->bv_val;
343 done = s1 + len;
344
345 while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
346 if (casefold) {
347 char c1 = TOLOWER(*s1);
348 char c2 = TOLOWER(*s2);
349 res = c1 - c2;
350 } else {
351 res = *s1 - *s2;
352 }
353 s1++;
354 s2++;
355 if (res) {
356 /* done unless next character in s1 or s2 is non-ascii */
357 if (s1 < done) {
358 if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
359 break;
360 }
361 } else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) ||
362 ((len < l2) && !LDAP_UTF8_ISASCII(s2)))
363 {
364 break;
365 }
366 return res;
367 }
368 }
369
370 /* We have encountered non-ascii or strings equal up to len */
371
372 /* set i to number of iterations */
373 i = s1 - done + len;
374 /* passed through loop at least once? */
375 if (i > 0) {
376 if (!res && (s1 == done) &&
377 ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
378 ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
379 /* all ascii and equal up to len */
380 return l1 - l2;
381 }
382
383 /* rewind one char, and do normalized compare from there */
384 s1--;
385 s2--;
386 l1 -= i - 1;
387 l2 -= i - 1;
388 }
389
390 /* Should first check to see if strings are already in
391 * proper normalized form.
392 */
393 ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) );
394 if ( ucs == NULL ) {
395 return l1 > l2 ? 1 : -1; /* what to do??? */
396 }
397
398 /*
399 * XXYYZ: we convert to ucs4 even though -llunicode
400 * expects ucs2 in an ac_uint4
401 */
402
403 /* convert and normalize 1st string */
404 for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
405 ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
406 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
407 free( ucs );
408 return -1; /* what to do??? */
409 }
410 len = LDAP_UTF8_CHARLEN( s1 + i );
411 }
412
413 if ( norm1 ) {
414 ucsout1 = ucs;
415 l1 = ulen;
416 ucs = malloc( l2 * sizeof(*ucs) );
417 if ( ucs == NULL ) {
418 free( ucsout1 );
419 return l1 > l2 ? 1 : -1; /* what to do??? */
420 }
421 } else {
422 uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx );
423 l1 = uccanoncomp( ucsout1, l1 );
424 }
425
426 /* convert and normalize 2nd string */
427 for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
428 ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
429 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
430 free( ucsout1 );
431 free( ucs );
432 return 1; /* what to do??? */
433 }
434 len = LDAP_UTF8_CHARLEN( s2 + i );
435 }
436
437 if ( norm2 ) {
438 ucsout2 = ucs;
439 l2 = ulen;
440 } else {
441 uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx );
442 l2 = uccanoncomp( ucsout2, l2 );
443 free( ucs );
444 }
445
446 res = casefold
447 ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
448 : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
449 free( ucsout1 );
450 free( ucsout2 );
451
452 if ( res != 0 ) {
453 return res;
454 }
455 if ( l1 == l2 ) {
456 return 0;
457 }
458 return l1 > l2 ? 1 : -1;
459 }
460