1 /* $OpenLDAP$ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3  *
4  * Copyright 1998-2021 The OpenLDAP Foundation.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted only as authorized by the OpenLDAP
9  * Public License.
10  *
11  * A copy of this license is available in file LICENSE in the
12  * top-level directory of the distribution or, alternatively, at
13  * <http://www.OpenLDAP.org/license.html>.
14  */
15 
16 #include "portable.h"
17 
18 #include <ac/bytes.h>
19 #include <ac/ctype.h>
20 #include <ac/string.h>
21 #include <ac/stdlib.h>
22 
23 #include <lber_pvt.h>
24 
25 #include <ldap_utf8.h>
26 #include <ldap_pvt_uc.h>
27 
28 #define	malloc(x)	ber_memalloc_x(x,ctx)
29 #define	realloc(x,y)	ber_memrealloc_x(x,y,ctx)
30 #define	free(x)		ber_memfree_x(x,ctx)
31 
/*
 * Compare at most n elements of two ldap_unicode_t strings.
 *
 * The scan stops at the first position where the elements differ, or at
 * the first position where both elements are 0.
 *
 * Returns -1 if the differing element of u1 is smaller, +1 if it is
 * larger, and 0 if no difference was found within the first n elements
 * (or before a shared 0 element).
 */
int ucstrncmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		const ldap_unicode_t lhs = u1[idx];
		const ldap_unicode_t rhs = u2[idx];

		if ( lhs < rhs ) {
			return -1;
		}
		if ( lhs > rhs ) {
			return +1;
		}
		/* elements are equal here; a 0 element ends both strings */
		if ( lhs == 0 ) {
			break;
		}
	}

	return 0;
}
47 
/*
 * Compare at most n elements of two ldap_unicode_t strings after mapping
 * each element through uctolower().
 *
 * The scan stops at the first position where the mapped elements differ,
 * or at the first position where both mapped elements are 0.
 *
 * Returns -1 / +1 according to the ordering of the first differing pair
 * of mapped elements, or 0 if none was found.
 */
int ucstrncasecmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t remaining = n;

	while ( remaining > 0 ) {
		const ldap_unicode_t lhs = uctolower( *u1 );
		const ldap_unicode_t rhs = uctolower( *u2 );

		if ( lhs != rhs ) {
			if ( lhs < rhs ) {
				return -1;
			}
			return +1;
		}
		/* equal and zero: both strings end here */
		if ( lhs == 0 ) {
			return 0;
		}

		u1++;
		u2++;
		remaining--;
	}

	return 0;
}
66 
/*
 * Find the first element equal to c among the first n elements of u.
 *
 * The scan is bounded only by n; a 0 element does not stop it.
 *
 * Returns a (non-const) pointer to the matching element, or NULL when
 * none of the n elements equals c.
 */
ldap_unicode_t * ucstrnchr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	ber_len_t idx = 0;

	while ( idx < n ) {
		if ( u[idx] == c ) {
			return (ldap_unicode_t *) &u[idx];
		}
		idx++;
	}

	return NULL;
}
80 
/*
 * Find the first of the first n elements of u whose uctolower() mapping
 * equals the uctolower() mapping of c.
 *
 * The scan is bounded only by n; a 0 element does not stop it.
 *
 * Returns a (non-const) pointer to the matching element, or NULL when
 * there is no match.
 */
ldap_unicode_t * ucstrncasechr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	/* map the needle once, outside the loop */
	const ldap_unicode_t wanted = uctolower( c );
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		if ( uctolower( u[idx] ) == wanted ) {
			return (ldap_unicode_t *) &u[idx];
		}
	}

	return NULL;
}
95 
/*
 * Replace, in place, each of the first n elements of u with the result
 * of uctoupper() applied to it.
 */
void ucstr2upper(
	ldap_unicode_t *u,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		u[idx] = uctoupper( u[idx] );
	}
}
104 
UTF8bvnormalize(struct berval * bv,struct berval * newbv,unsigned flags,void * ctx)105 struct berval * UTF8bvnormalize(
106 	struct berval *bv,
107 	struct berval *newbv,
108 	unsigned flags,
109 	void *ctx )
110 {
111 	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
112 	int didnewbv = 0;
113 	char *out, *outtmp, *s;
114 	ac_uint4 *ucs, *p, *ucsout;
115 
116 	static unsigned char mask[] = {
117 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
118 
119 	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
120 	unsigned approx = flags & LDAP_UTF8_APPROX;
121 
122 	if ( bv == NULL ) {
123 		return NULL;
124 	}
125 
126 	s = bv->bv_val;
127 	len = bv->bv_len;
128 
129 	if ( len == 0 ) {
130 		return ber_dupbv_x( newbv, bv, ctx );
131 	}
132 
133 	if ( !newbv ) {
134 		newbv = ber_memalloc_x( sizeof(struct berval), ctx );
135 		if ( !newbv ) return NULL;
136 		didnewbv = 1;
137 	}
138 
139 	/* Should first check to see if string is already in proper
140 	 * normalized form. This is almost as time consuming as
141 	 * the normalization though.
142 	 */
143 
144 	/* finish off everything up to character before first non-ascii */
145 	if ( LDAP_UTF8_ISASCII( s ) ) {
146 		if ( casefold ) {
147 			outsize = len + 7;
148 			out = (char *) ber_memalloc_x( outsize, ctx );
149 			if ( out == NULL ) {
150 fail:
151 				if ( didnewbv )
152 					ber_memfree_x( newbv, ctx );
153 				return NULL;
154 			}
155 			outpos = 0;
156 
157 			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
158 				out[outpos++] = TOLOWER( s[i-1] );
159 			}
160 			if ( i == len ) {
161 				out[outpos++] = TOLOWER( s[len-1] );
162 				out[outpos] = '\0';
163 				newbv->bv_val = out;
164 				newbv->bv_len = outpos;
165 				return newbv;
166 			}
167 		} else {
168 			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
169 				/* empty */
170 			}
171 
172 			if ( i == len ) {
173 				return ber_str2bv_x( s, len, 1, newbv, ctx );
174 			}
175 
176 			outsize = len + 7;
177 			out = (char *) ber_memalloc_x( outsize, ctx );
178 			if ( out == NULL ) {
179 				goto fail;
180 			}
181 			outpos = i - 1;
182 			memcpy(out, s, outpos);
183 		}
184 	} else {
185 		outsize = len + 7;
186 		out = (char *) ber_memalloc_x( outsize, ctx );
187 		if ( out == NULL ) {
188 			goto fail;
189 		}
190 		outpos = 0;
191 		i = 0;
192 	}
193 
194 	p = ucs = ber_memalloc_x( len * sizeof(*ucs), ctx );
195 	if ( ucs == NULL ) {
196 		ber_memfree_x(out, ctx);
197 		goto fail;
198 	}
199 
200 	/* convert character before first non-ascii to ucs-4 */
201 	if ( i > 0 ) {
202 		*p = casefold ? TOLOWER( s[i-1] ) : s[i-1];
203 		p++;
204 	}
205 
206 	/* s[i] is now first non-ascii character */
207 	for (;;) {
208 		/* s[i] is non-ascii */
209 		/* convert everything up to next ascii to ucs-4 */
210 		while ( i < len ) {
211 			clen = LDAP_UTF8_CHARLEN2( s + i, clen );
212 			if ( clen == 0 ) {
213 				ber_memfree_x( ucs, ctx );
214 				ber_memfree_x( out, ctx );
215 				goto fail;
216 			}
217 			if ( clen == 1 ) {
218 				/* ascii */
219 				break;
220 			}
221 			*p = s[i] & mask[clen];
222 			i++;
223 			for( j = 1; j < clen; j++ ) {
224 				if ( (s[i] & 0xc0) != 0x80 ) {
225 					ber_memfree_x( ucs, ctx );
226 					ber_memfree_x( out, ctx );
227 					goto fail;
228 				}
229 				*p <<= 6;
230 				*p |= s[i] & 0x3f;
231 				i++;
232 			}
233 			if ( casefold ) {
234 				*p = uctolower( *p );
235 			}
236 			p++;
237 		}
238 		/* normalize ucs of length p - ucs */
239 		uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx );
240 		if ( approx ) {
241 			for ( j = 0; j < ucsoutlen; j++ ) {
242 				if ( ucsout[j] < 0x80 ) {
243 					out[outpos++] = ucsout[j];
244 				}
245 			}
246 		} else {
247 			ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
248 			/* convert ucs to utf-8 and store in out */
249 			for ( j = 0; j < ucsoutlen; j++ ) {
250 				/* allocate more space if not enough room for
251 				   6 bytes and terminator */
252 				if ( outsize - outpos < 7 ) {
253 					outsize = ucsoutlen - j + outpos + 6;
254 					outtmp = (char *) ber_memrealloc_x( out, outsize, ctx );
255 					if ( outtmp == NULL ) {
256 						ber_memfree_x( ucsout, ctx );
257 						ber_memfree_x( ucs, ctx );
258 						ber_memfree_x( out, ctx );
259 						goto fail;
260 					}
261 					out = outtmp;
262 				}
263 				outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
264 			}
265 		}
266 
267 		ber_memfree_x( ucsout, ctx );
268 		ucsout = NULL;
269 
270 		if ( i == len ) {
271 			break;
272 		}
273 
274 		last = i;
275 
276 		/* Allocate more space in out if necessary */
277 		if (len - i >= outsize - outpos) {
278 			outsize += 1 + ((len - i) - (outsize - outpos));
279 			outtmp = (char *) ber_memrealloc_x(out, outsize, ctx);
280 			if (outtmp == NULL) {
281 				ber_memfree_x( ucs, ctx );
282 				ber_memfree_x( out, ctx );
283 				goto fail;
284 			}
285 			out = outtmp;
286 		}
287 
288 		/* s[i] is ascii */
289 		/* finish off everything up to char before next non-ascii */
290 		for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
291 			out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
292 		}
293 		if ( i == len ) {
294 			out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1];
295 			break;
296 		}
297 
298 		/* convert character before next non-ascii to ucs-4 */
299 		*ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1];
300 		p = ucs + 1;
301 	}
302 
303 	ber_memfree_x( ucs, ctx );
304 	out[outpos] = '\0';
305 	newbv->bv_val = out;
306 	newbv->bv_len = outpos;
307 	return newbv;
308 }
309 
310 /* compare UTF8-strings, optionally ignore casing */
311 /* slow, should be optimized */
UTF8bvnormcmp(struct berval * bv1,struct berval * bv2,unsigned flags,void * ctx)312 int UTF8bvnormcmp(
313 	struct berval *bv1,
314 	struct berval *bv2,
315 	unsigned flags,
316 	void *ctx )
317 {
318 	int i, l1, l2, len, ulen, res = 0;
319 	char *s1, *s2, *done;
320 	ac_uint4 *ucs, *ucsout1, *ucsout2;
321 
322 	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
323 	unsigned norm1 = flags & LDAP_UTF8_ARG1NFC;
324 	unsigned norm2 = flags & LDAP_UTF8_ARG2NFC;
325 
326 	if (bv1 == NULL) {
327 		return bv2 == NULL ? 0 : -1;
328 
329 	} else if (bv2 == NULL) {
330 		return 1;
331 	}
332 
333 	l1 = bv1->bv_len;
334 	l2 = bv2->bv_len;
335 
336 	len = (l1 < l2) ? l1 : l2;
337 	if (len == 0) {
338 		return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
339 	}
340 
341 	s1 = bv1->bv_val;
342 	s2 = bv2->bv_val;
343 	done = s1 + len;
344 
345 	while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
346 		if (casefold) {
347 			char c1 = TOLOWER(*s1);
348 			char c2 = TOLOWER(*s2);
349 			res = c1 - c2;
350 		} else {
351 			res = *s1 - *s2;
352 		}
353 		s1++;
354 		s2++;
355 		if (res) {
356 			/* done unless next character in s1 or s2 is non-ascii */
357 			if (s1 < done) {
358 				if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
359 					break;
360 				}
361 			} else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) ||
362 				((len < l2) && !LDAP_UTF8_ISASCII(s2)))
363 			{
364 				break;
365 			}
366 			return res;
367 		}
368 	}
369 
370 	/* We have encountered non-ascii or strings equal up to len */
371 
372 	/* set i to number of iterations */
373 	i = s1 - done + len;
374 	/* passed through loop at least once? */
375 	if (i > 0) {
376 		if (!res && (s1 == done) &&
377 		    ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
378 		    ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
379 			/* all ascii and equal up to len */
380 			return l1 - l2;
381 		}
382 
383 		/* rewind one char, and do normalized compare from there */
384 		s1--;
385 		s2--;
386 		l1 -= i - 1;
387 		l2 -= i - 1;
388 	}
389 
390 	/* Should first check to see if strings are already in
391 	 * proper normalized form.
392 	 */
393 	ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) );
394 	if ( ucs == NULL ) {
395 		return l1 > l2 ? 1 : -1; /* what to do??? */
396 	}
397 
398 	/*
399 	 * XXYYZ: we convert to ucs4 even though -llunicode
400 	 * expects ucs2 in an ac_uint4
401 	 */
402 
403 	/* convert and normalize 1st string */
404 	for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
405 		ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
406 		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
407 			free( ucs );
408 			return -1; /* what to do??? */
409 		}
410 		len = LDAP_UTF8_CHARLEN( s1 + i );
411 	}
412 
413 	if ( norm1 ) {
414 		ucsout1 = ucs;
415 		l1 = ulen;
416 		ucs = malloc( l2 * sizeof(*ucs) );
417 		if ( ucs == NULL ) {
418 			free( ucsout1 );
419 			return l1 > l2 ? 1 : -1; /* what to do??? */
420 		}
421 	} else {
422 		uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx );
423 		l1 = uccanoncomp( ucsout1, l1 );
424 	}
425 
426 	/* convert and normalize 2nd string */
427 	for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
428 		ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
429 		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
430 			free( ucsout1 );
431 			free( ucs );
432 			return 1; /* what to do??? */
433 		}
434 		len = LDAP_UTF8_CHARLEN( s2 + i );
435 	}
436 
437 	if ( norm2 ) {
438 		ucsout2 = ucs;
439 		l2 = ulen;
440 	} else {
441 		uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx );
442 		l2 = uccanoncomp( ucsout2, l2 );
443 		free( ucs );
444 	}
445 
446 	res = casefold
447 		? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
448 		: ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
449 	free( ucsout1 );
450 	free( ucsout2 );
451 
452 	if ( res != 0 ) {
453 		return res;
454 	}
455 	if ( l1 == l2 ) {
456 		return 0;
457 	}
458 	return l1 > l2 ? 1 : -1;
459 }
460