1 /* utf-8.c -- Basic UTF-8 routines */
2 /* $OpenLDAP$ */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4  *
5  * Copyright 1998-2021 The OpenLDAP Foundation.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted only as authorized by the OpenLDAP
10  * Public License.
11  *
12  * A copy of this license is available in the file LICENSE in the
13  * top-level directory of the distribution or, alternatively, at
14  * <http://www.OpenLDAP.org/license.html>.
15  */
16 /* Basic UTF-8 routines
17  *
18  * These routines are "dumb".  Though they understand UTF-8,
19  * they don't grok Unicode.  That is, they can push bits,
20  * but don't have a clue what the bits represent.  That's
21  * good enough for use with the LDAP Client SDK.
22  *
23  * These routines are not optimized.
24  */
25 
26 #include "portable.h"
27 
28 #include <stdio.h>
29 
30 #include <ac/stdlib.h>
31 
32 #include <ac/socket.h>
33 #include <ac/string.h>
34 #include <ac/time.h>
35 
36 #include "ldap_utf8.h"
37 
38 #include "ldap-int.h"
39 #include "ldap_defaults.h"
40 
41 /*
42  * return the number of bytes required to hold the
43  * NULL-terminated UTF-8 string NOT INCLUDING the
44  * termination.
45  */
ldap_utf8_bytes(const char * p)46 ber_len_t ldap_utf8_bytes( const char * p )
47 {
48 	ber_len_t bytes;
49 
50 	for( bytes=0; p[bytes]; bytes++ ) {
51 		/* EMPTY */ ;
52 	}
53 
54 	return bytes;
55 }
56 
ldap_utf8_chars(const char * p)57 ber_len_t ldap_utf8_chars( const char * p )
58 {
59 	/* could be optimized and could check for invalid sequences */
60 	ber_len_t chars=0;
61 
62 	for( ; *p ; LDAP_UTF8_INCR(p) ) {
63 		chars++;
64 	}
65 
66 	return chars;
67 }
68 
/*
 * Return the offset, in bytes, from p to the start of the
 * next character as located by LDAP_UTF8_NEXT().
 */
int ldap_utf8_offset( const char * p )
{
	const char *next = LDAP_UTF8_NEXT(p);

	return next - p;
}
74 
/*
 * Sequence length indicated by a first octet that has its high
 * bit set, indexed by ( octet ^ 0x80 ).  An entry of 0 means no
 * length is defined for that octet.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80-0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90-0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xa0-0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xb0-0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0-0xcf */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xd0-0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0-0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0-0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/*
 * Return the length, in octets, indicated by the first octet
 * of the character at p: 1 when the high bit is clear, otherwise
 * the ldap_utf8_lentab[] entry for that octet (possibly 0).
 * Only the first octet is examined.
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char first = *(const unsigned char *) p;

	/* high bit clear: single-octet character */
	if ( ( first & 0x80 ) == 0 ) {
		return 1;
	}

	return ldap_utf8_lentab[first ^ 0x80];
}
95 
96 /*
97  * Make sure the UTF-8 char used the shortest possible encoding
98  * returns charlen if valid, 0 if not.
99  *
100  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
101  * The table is slightly modified from that of the RFC.
102  *
103  * UCS-4 range (hex)      UTF-8 sequence (binary)
104  * 0000 0000-0000 007F   0.......
105  * 0000 0080-0000 07FF   110++++. 10......
106  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
107  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
108  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
109  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
110  *
111  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
112  * at least one of the '+' bits must be set, otherwise the character
113  * should have been encoded in fewer octets. Note that in the two-octet
114  * case, only the first octet needs to be validated, and this is done
115  * in the ldap_utf8_lentab[] above.
116  */
117 
/*
 * Mask of required bits in the second octet, indexed by the low
 * five bits of the first octet.  At least one masked bit must be
 * set in the second octet for the sequence to be accepted.
 */
#undef c
const char ldap_utf8_mintab[] = {
	/* 0x00-0x07 */
	(char)0x20, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	/* 0x08-0x0f */
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	/* 0x10-0x17 */
	(char)0x30, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	/* 0x18-0x1f */
	(char)0x38, (char)0x80, (char)0x80, (char)0x80,
	(char)0x3c, (char)0x80, (char)0x00, (char)0x00
};
127 
ldap_utf8_charlen2(const char * p)128 int ldap_utf8_charlen2( const char * p )
129 {
130 	int i = LDAP_UTF8_CHARLEN( p );
131 
132 	if ( i > 2 ) {
133 		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
134 			i = 0;
135 	}
136 	return i;
137 }
138 
139 /* conv UTF-8 to UCS-4, useful for comparisons */
ldap_x_utf8_to_ucs4(const char * p)140 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
141 {
142     const unsigned char *c = (const unsigned char *) p;
143     ldap_ucs4_t ch;
144 	int len, i;
145 	static unsigned char mask[] = {
146 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147 
148 	len = LDAP_UTF8_CHARLEN2(p, len);
149 
150 	if( len == 0 ) return LDAP_UCS4_INVALID;
151 
152 	ch = c[0] & mask[len];
153 
154 	for(i=1; i < len; i++) {
155 		if ((c[i] & 0xc0) != 0x80) {
156 			return LDAP_UCS4_INVALID;
157 		}
158 
159 		ch <<= 6;
160 		ch |= c[i] & 0x3f;
161 	}
162 
163 	return ch;
164 }
165 
166 /* conv UCS-4 to UTF-8, not used */
ldap_x_ucs4_to_utf8(ldap_ucs4_t c,char * buf)167 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
168 {
169 	int len=0;
170 	unsigned char* p = (unsigned char *) buf;
171 
172 	/* not a valid Unicode character */
173 	if ( c < 0 ) return 0;
174 
175 	/* Just return length, don't convert */
176 	if(buf == NULL) {
177 		if( c < 0x80 ) return 1;
178 		else if( c < 0x800 ) return 2;
179 		else if( c < 0x10000 ) return 3;
180 		else if( c < 0x200000 ) return 4;
181 		else if( c < 0x4000000 ) return 5;
182 		else return 6;
183 	}
184 
185 	if( c < 0x80 ) {
186 		p[len++] = c;
187 
188 	} else if( c < 0x800 ) {
189 		p[len++] = 0xc0 | ( c >> 6 );
190 		p[len++] = 0x80 | ( c & 0x3f );
191 
192 	} else if( c < 0x10000 ) {
193 		p[len++] = 0xe0 | ( c >> 12 );
194 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
195 		p[len++] = 0x80 | ( c & 0x3f );
196 
197 	} else if( c < 0x200000 ) {
198 		p[len++] = 0xf0 | ( c >> 18 );
199 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
200 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
201 		p[len++] = 0x80 | ( c & 0x3f );
202 
203 	} else if( c < 0x4000000 ) {
204 		p[len++] = 0xf8 | ( c >> 24 );
205 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
206 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
207 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
208 		p[len++] = 0x80 | ( c & 0x3f );
209 
210 	} else /* if( c < 0x80000000 ) */ {
211 		p[len++] = 0xfc | ( c >> 30 );
212 		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
213 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
214 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
215 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
216 		p[len++] = 0x80 | ( c & 0x3f );
217 	}
218 
219 	return len;
220 }
221 
/*
 * Number of octets needed to encode the UCS-4 value c in UTF-8,
 * or 0 if c is negative.  The argument and the whole expansion
 * are parenthesized so the macro can be used inside larger
 * expressions.  The argument is evaluated more than once, so it
 * must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
225 
226 /* Convert a string to UTF-8 format. The input string is expected to
227  * have characters of 1, 2, or 4 octets (in network byte order)
228  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
229  * types respectively. (Here T61STRING just means that there is one
230  * octet per character and characters may use the high bit of the octet.
231  * The characters are assumed to use ISO mappings, no provision is made
232  * for converting from T.61 coding rules to Unicode.)
233  */
234 
235 int
ldap_ucs_to_utf8s(struct berval * ucs,int csize,struct berval * utf8s)236 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
237 {
238 	unsigned char *in, *end;
239 	char *ptr;
240 	ldap_ucs4_t u;
241 	int i, l = 0;
242 
243 	utf8s->bv_val = NULL;
244 	utf8s->bv_len = 0;
245 
246 	in = (unsigned char *)ucs->bv_val;
247 
248 	/* Make sure we stop at an even multiple of csize */
249 	end = in + ( ucs->bv_len & ~(csize-1) );
250 
251 	for (; in < end; ) {
252 		u = *in++;
253 		if (csize > 1) {
254 			u <<= 8;
255 			u |= *in++;
256 		}
257 		if (csize > 2) {
258 			u <<= 8;
259 			u |= *in++;
260 			u <<= 8;
261 			u |= *in++;
262 		}
263 		i = LDAP_UCS_UTF8LEN(u);
264 		if (i == 0)
265 			return LDAP_INVALID_SYNTAX;
266 		l += i;
267 	}
268 
269 	utf8s->bv_val = LDAP_MALLOC( l+1 );
270 	if (utf8s->bv_val == NULL)
271 		return LDAP_NO_MEMORY;
272 	utf8s->bv_len = l;
273 
274 	ptr = utf8s->bv_val;
275 	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
276 		u = *in++;
277 		if (csize > 1) {
278 			u <<= 8;
279 			u |= *in++;
280 		}
281 		if (csize > 2) {
282 			u <<= 8;
283 			u |= *in++;
284 			u <<= 8;
285 			u |= *in++;
286 		}
287 		ptr += ldap_x_ucs4_to_utf8(u, ptr);
288 	}
289 	*ptr = '\0';
290 	return LDAP_SUCCESS;
291 }
292 
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length indicated by the first octet and instead
 * relies on continuation markers to find the start of the next
 * character.  This allows "resyncing" when invalid characters
 * are provided, as long as the start of the next character
 * appears within the 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int off = 1;

	if ( !LDAP_UTF8_ISASCII(u) ) {
		/* skip at most five continuation octets (10xxxxxx) */
		while ( off < 6 && ( u[off] & 0xc0 ) == 0x80 ) {
			off++;
		}
	}

	return (char *) p + off;
}
319 
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of the multibyte character and instead
 * relies on continuation markers: walks backwards from p over
 * at most five continuation octets and returns the position of
 * the first octet that is not one, or p - 6 if all five are
 * continuation octets.  The caller must ensure the octets
 * before p are readable.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int back = 1;

	while ( back < 6 && ( u[-back] & 0xc0 ) == 0x80 ) {
		back++;
	}

	return (char *) p - back;
}
342 
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores the length indicated by the first octet and instead
 * relies on continuation markers to find the end of the
 * character: the first octet is always copied, followed by at
 * most five continuation octets.  No terminating NUL is written.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *u = (const unsigned char *) src;
	int n = 1;

	dst[0] = src[0];

	if ( !LDAP_UTF8_ISASCII(u) ) {
		/* copy continuation octets (10xxxxxx) until one is missing */
		for ( ; n < 6 && ( u[n] & 0xc0 ) == 0x80; n++ ) {
			dst[n] = src[n];
		}
	}

	return n;
}
373 
374 #ifndef UTF8_ALPHA_CTYPE
375 /*
376  * UTF-8 ctype routines
377  * Only deals with characters < 0x80 (ie: US-ASCII)
378  */
379 
/* Return the result of LDAP_ASCII() on the first octet at p. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	unsigned first = u[0];

	return LDAP_ASCII(first);
}
385 
/*
 * Return LDAP_DIGIT() of the first octet at p, or 0 if that
 * octet does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_DIGIT( first );
	}

	return 0;
}
394 
/*
 * Return LDAP_HEX() of the first octet at p, or 0 if that
 * octet does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_HEX(first);
	}

	return 0;
}
403 
/*
 * Return 1 if the first octet at p satisfies LDAP_ASCII() and is
 * a space, tab, newline, carriage return, vertical tab or form
 * feed; otherwise return 0.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( !LDAP_ASCII(first) ) {
		return 0;
	}

	return first == ' ' || first == '\t' || first == '\n' ||
		first == '\r' || first == '\v' || first == '\f';
}
422 
423 /*
424  * These are not needed by the C SDK and are
425  * not "good enough" for general use.
426  */
/*
 * Return LDAP_ALPHA() of the first octet at p, or 0 if that
 * octet does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_ALPHA(first);
	}

	return 0;
}
435 
/*
 * Return LDAP_ALNUM() of the first octet at p, or 0 if that
 * octet does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_ALNUM(first);
	}

	return 0;
}
444 
/*
 * Return LDAP_LOWER() of the first octet at p, or 0 if that
 * octet does not satisfy LDAP_ASCII().
 */
int ldap_utf8_islower( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_LOWER(first);
	}

	return 0;
}
453 
/*
 * Return LDAP_UPPER() of the first octet at p, or 0 if that
 * octet does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isupper( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_UPPER(first);
	}

	return 0;
}
462 #endif
463 
464 
465 /*
466  * UTF-8 string routines
467  */
468 
469 /* like strchr() */
470 char * (ldap_utf8_strchr)( const char *str, const char *chr )
471 {
472 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
473 		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
474 			return (char *) str;
475 		}
476 	}
477 
478 	return NULL;
479 }
480 
481 /* like strcspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strcspn)482 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
483 {
484 	const char *cstr;
485 	const char *cset;
486 
487 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
488 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
489 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
490 				return cstr - str;
491 			}
492 		}
493 	}
494 
495 	return cstr - str;
496 }
497 
498 /* like strspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strspn)499 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
500 {
501 	const char *cstr;
502 	const char *cset;
503 
504 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
505 		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
506 			if( *cset == '\0' ) {
507 				return cstr - str;
508 			}
509 
510 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
511 				break;
512 			}
513 		}
514 	}
515 
516 	return cstr - str;
517 }
518 
519 /* like strpbrk(), replaces strchr() as well */
520 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
521 {
522 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
523 		const char *cset;
524 
525 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
526 			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
527 				return (char *) str;
528 			}
529 		}
530 	}
531 
532 	return NULL;
533 }
534 
/*
 * Like strtok_r(), not strtok().
 *
 * str   string to tokenize on the first call, NULL to continue
 *       from the position saved in *last
 * sep   set of separator characters
 * last  caller-supplied storage for the scan position
 *
 * Returns the next token (the input is modified: the separator
 * that ends the token is overwritten with NUL), or NULL when last
 * is NULL or no token remains.  Once the tokens are exhausted
 * *last is set to NULL, and further calls with str == NULL keep
 * returning NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* an earlier call ran out of tokens and cleared *last */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the resume point before the separator is overwritten */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
563