1 /* utf-8.c -- Basic UTF-8 routines */
2 /* $OpenLDAP$ */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 *
5 * Copyright 1998-2021 The OpenLDAP Foundation.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
10 * Public License.
11 *
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>.
15 */
16 /* Basic UTF-8 routines
17 *
18 * These routines are "dumb". Though they understand UTF-8,
19 * they don't grok Unicode. That is, they can push bits,
20 * but don't have a clue what the bits represent. That's
21 * good enough for use with the LDAP Client SDK.
22 *
23 * These routines are not optimized.
24 */
25
26 #include "portable.h"
27
28 #include <stdio.h>
29
30 #include <ac/stdlib.h>
31
32 #include <ac/socket.h>
33 #include <ac/string.h>
34 #include <ac/time.h>
35
36 #include "ldap_utf8.h"
37
38 #include "ldap-int.h"
39 #include "ldap_defaults.h"
40
41 /*
42 * return the number of bytes required to hold the
43 * NULL-terminated UTF-8 string NOT INCLUDING the
44 * termination.
45 */
ldap_utf8_bytes(const char * p)46 ber_len_t ldap_utf8_bytes( const char * p )
47 {
48 ber_len_t bytes;
49
50 for( bytes=0; p[bytes]; bytes++ ) {
51 /* EMPTY */ ;
52 }
53
54 return bytes;
55 }
56
ldap_utf8_chars(const char * p)57 ber_len_t ldap_utf8_chars( const char * p )
58 {
59 /* could be optimized and could check for invalid sequences */
60 ber_len_t chars=0;
61
62 for( ; *p ; LDAP_UTF8_INCR(p) ) {
63 chars++;
64 }
65
66 return chars;
67 }
68
/* return the byte offset from p to the next character */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
74
/*
 * Sequence length indicated by the first byte of a character.
 *
 * The table is indexed by (first byte ^ 0x80), i.e. it only covers
 * first bytes 0x80..0xFF.  A 0 entry marks a byte that cannot start
 * a character.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xDF */
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 - 0xEF */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0 - 0xFF */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Return the length in bytes of the character starting at p, judged
 * from its first byte only: 1 for a byte below 0x80, otherwise the
 * ldap_utf8_lentab[] entry (0 if the byte cannot start a character).
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char lead = *(const unsigned char *) p;

	if ( lead < 0x80 )
		return 1;

	return ldap_utf8_lentab[lead ^ 0x80];
}
95
96 /*
97 * Make sure the UTF-8 char used the shortest possible encoding
98 * returns charlen if valid, 0 if not.
99 *
100 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
101 * The table is slightly modified from that of the RFC.
102 *
103 * UCS-4 range (hex) UTF-8 sequence (binary)
104 * 0000 0000-0000 007F 0.......
105 * 0000 0080-0000 07FF 110++++. 10......
106 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
107 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
108 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
109 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
110 *
111 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
112 * at least one of the '+' bits must be set, otherwise the character
113 * should have been encoded in fewer octets. Note that in the two-octet
114 * case, only the first octet needs to be validated, and this is done
115 * in the ldap_utf8_lentab[] above.
116 */
117
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet.  At least one masked bit must be set in the
 * second octet, otherwise the character is not shortest-form.
 * The explicit casts keep values above 0x7f quiet where char is signed.
 */
const char ldap_utf8_mintab[] = {
	(const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00 };
127
ldap_utf8_charlen2(const char * p)128 int ldap_utf8_charlen2( const char * p )
129 {
130 int i = LDAP_UTF8_CHARLEN( p );
131
132 if ( i > 2 ) {
133 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
134 i = 0;
135 }
136 return i;
137 }
138
139 /* conv UTF-8 to UCS-4, useful for comparisons */
ldap_x_utf8_to_ucs4(const char * p)140 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
141 {
142 const unsigned char *c = (const unsigned char *) p;
143 ldap_ucs4_t ch;
144 int len, i;
145 static unsigned char mask[] = {
146 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147
148 len = LDAP_UTF8_CHARLEN2(p, len);
149
150 if( len == 0 ) return LDAP_UCS4_INVALID;
151
152 ch = c[0] & mask[len];
153
154 for(i=1; i < len; i++) {
155 if ((c[i] & 0xc0) != 0x80) {
156 return LDAP_UCS4_INVALID;
157 }
158
159 ch <<= 6;
160 ch |= c[i] & 0x3f;
161 }
162
163 return ch;
164 }
165
166 /* conv UCS-4 to UTF-8, not used */
ldap_x_ucs4_to_utf8(ldap_ucs4_t c,char * buf)167 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
168 {
169 int len=0;
170 unsigned char* p = (unsigned char *) buf;
171
172 /* not a valid Unicode character */
173 if ( c < 0 ) return 0;
174
175 /* Just return length, don't convert */
176 if(buf == NULL) {
177 if( c < 0x80 ) return 1;
178 else if( c < 0x800 ) return 2;
179 else if( c < 0x10000 ) return 3;
180 else if( c < 0x200000 ) return 4;
181 else if( c < 0x4000000 ) return 5;
182 else return 6;
183 }
184
185 if( c < 0x80 ) {
186 p[len++] = c;
187
188 } else if( c < 0x800 ) {
189 p[len++] = 0xc0 | ( c >> 6 );
190 p[len++] = 0x80 | ( c & 0x3f );
191
192 } else if( c < 0x10000 ) {
193 p[len++] = 0xe0 | ( c >> 12 );
194 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
195 p[len++] = 0x80 | ( c & 0x3f );
196
197 } else if( c < 0x200000 ) {
198 p[len++] = 0xf0 | ( c >> 18 );
199 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
200 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
201 p[len++] = 0x80 | ( c & 0x3f );
202
203 } else if( c < 0x4000000 ) {
204 p[len++] = 0xf8 | ( c >> 24 );
205 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
206 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
207 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
208 p[len++] = 0x80 | ( c & 0x3f );
209
210 } else /* if( c < 0x80000000 ) */ {
211 p[len++] = 0xfc | ( c >> 30 );
212 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
213 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
214 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
215 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
216 p[len++] = 0x80 | ( c & 0x3f );
217 }
218
219 return len;
220 }
221
/*
 * Number of octets needed to encode the UCS-4 value c in UTF-8,
 * or 0 if c is negative.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with expression arguments.
 * c is evaluated more than once and must be free of side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
225
226 /* Convert a string to UTF-8 format. The input string is expected to
227 * have characters of 1, 2, or 4 octets (in network byte order)
228 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
229 * types respectively. (Here T61STRING just means that there is one
230 * octet per character and characters may use the high bit of the octet.
231 * The characters are assumed to use ISO mappings, no provision is made
232 * for converting from T.61 coding rules to Unicode.)
233 */
234
235 int
ldap_ucs_to_utf8s(struct berval * ucs,int csize,struct berval * utf8s)236 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
237 {
238 unsigned char *in, *end;
239 char *ptr;
240 ldap_ucs4_t u;
241 int i, l = 0;
242
243 utf8s->bv_val = NULL;
244 utf8s->bv_len = 0;
245
246 in = (unsigned char *)ucs->bv_val;
247
248 /* Make sure we stop at an even multiple of csize */
249 end = in + ( ucs->bv_len & ~(csize-1) );
250
251 for (; in < end; ) {
252 u = *in++;
253 if (csize > 1) {
254 u <<= 8;
255 u |= *in++;
256 }
257 if (csize > 2) {
258 u <<= 8;
259 u |= *in++;
260 u <<= 8;
261 u |= *in++;
262 }
263 i = LDAP_UCS_UTF8LEN(u);
264 if (i == 0)
265 return LDAP_INVALID_SYNTAX;
266 l += i;
267 }
268
269 utf8s->bv_val = LDAP_MALLOC( l+1 );
270 if (utf8s->bv_val == NULL)
271 return LDAP_NO_MEMORY;
272 utf8s->bv_len = l;
273
274 ptr = utf8s->bv_val;
275 for (in = (unsigned char *)ucs->bv_val; in < end; ) {
276 u = *in++;
277 if (csize > 1) {
278 u <<= 8;
279 u |= *in++;
280 }
281 if (csize > 2) {
282 u <<= 8;
283 u |= *in++;
284 u <<= 8;
285 u |= *in++;
286 }
287 ptr += ldap_x_ucs4_to_utf8(u, ptr);
288 }
289 *ptr = '\0';
290 return LDAP_SUCCESS;
291 }
292
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length indicated by the first byte and instead relies
 * on continuation markers to find the start of the next character.
 * This allows "resyncing" when invalid characters are provided,
 * as long as the start of the next character appears within the
 * 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int step;

	if( LDAP_UTF8_ISASCII(bytes) ) {
		return (char *) ( p + 1 );
	}

	/* skip at most five continuation (10xxxxxx) bytes */
	step = 1;
	while( step < 6 && ( bytes[step] & 0xc0 ) == 0x80 ) {
		step++;
	}

	return (char *) ( p + step );
}
319
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of the multibyte character and instead scans
 * backwards over continuation markers to find where the previous
 * character starts.  This allows "resyncing" when invalid characters
 * are provided, as long as that start appears within the 6 bytes
 * examined.  The bytes before p must be readable.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int back = 1;

	/* skip at most five continuation (10xxxxxx) bytes */
	while( back < 6 && ( bytes[-back] & 0xc0 ) == 0x80 ) {
		back++;
	}

	return (char *) ( p - back );
}
342
/*
 * Copy one UTF-8 character from src to dst, returning the
 * number of bytes copied.
 *
 * Ignores the length indicated by the first byte and instead relies
 * on continuation markers to find the end of the character.
 * This allows "resyncing" when invalid characters are provided,
 * as long as the start of the next character appears within the
 * 6 bytes examined.  No terminating NUL is written to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *in = (const unsigned char *) src;
	int n;

	/* the first byte is always copied */
	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(in) ) {
		return 1;
	}

	/* then at most five continuation (10xxxxxx) bytes */
	n = 1;
	while( n < 6 && ( in[n] & 0xc0 ) == 0x80 ) {
		dst[n] = src[n];
		n++;
	}

	return n;
}
373
374 #ifndef UTF8_ALPHA_CTYPE
375 /*
376 * UTF-8 ctype routines
377 * Only deals with characters < 0x80 (ie: US-ASCII)
378 */
379
/* Apply LDAP_ASCII() to the first byte at p and return its result. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first);
}
385
/* Return LDAP_DIGIT() of the first byte at p, or 0 if it fails LDAP_ASCII(). */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first) ? ( LDAP_DIGIT( first ) ) : 0;
}
394
/* Return LDAP_HEX() of the first byte at p, or 0 if it fails LDAP_ASCII(). */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first) ? ( LDAP_HEX(first) ) : 0;
}
403
/*
 * Return 1 if the first byte at p passes LDAP_ASCII() and is one of
 * space, \t, \n, \r, \v or \f; otherwise return 0.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if( !LDAP_ASCII(first) ) {
		return 0;
	}

	return first == ' ' || first == '\t' || first == '\n'
		|| first == '\r' || first == '\v' || first == '\f';
}
422
423 /*
424 * These are not needed by the C SDK and are
425 * not "good enough" for general use.
426 */
/* Return LDAP_ALPHA() of the first byte at p, or 0 if it fails LDAP_ASCII(). */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first) ? ( LDAP_ALPHA(first) ) : 0;
}
435
/* Return LDAP_ALNUM() of the first byte at p, or 0 if it fails LDAP_ASCII(). */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first) ? ( LDAP_ALNUM(first) ) : 0;
}
444
/* Return LDAP_LOWER() of the first byte at p, or 0 if it fails LDAP_ASCII(). */
int ldap_utf8_islower( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first) ? ( LDAP_LOWER(first) ) : 0;
}
453
/* Return LDAP_UPPER() of the first byte at p, or 0 if it fails LDAP_ASCII(). */
int ldap_utf8_isupper( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first) ? ( LDAP_UPPER(first) ) : 0;
}
462 #endif
463
464
465 /*
466 * UTF-8 string routines
467 */
468
469 /* like strchr() */
470 char * (ldap_utf8_strchr)( const char *str, const char *chr )
471 {
472 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
473 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
474 return (char *) str;
475 }
476 }
477
478 return NULL;
479 }
480
481 /* like strcspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strcspn)482 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
483 {
484 const char *cstr;
485 const char *cset;
486
487 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
488 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
489 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
490 return cstr - str;
491 }
492 }
493 }
494
495 return cstr - str;
496 }
497
498 /* like strspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strspn)499 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
500 {
501 const char *cstr;
502 const char *cset;
503
504 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
505 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
506 if( *cset == '\0' ) {
507 return cstr - str;
508 }
509
510 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
511 break;
512 }
513 }
514 }
515
516 return cstr - str;
517 }
518
519 /* like strpbrk(), replaces strchr() as well */
520 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
521 {
522 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
523 const char *cset;
524
525 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
526 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
527 return (char *) str;
528 }
529 }
530 }
531
532 return NULL;
533 }
534
535 /* like strtok_r(), not strtok() */
536 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
537 {
538 char *begin;
539 char *end;
540
541 if( last == NULL ) return NULL;
542
543 begin = str ? str : *last;
544
545 begin += ldap_utf8_strspn( begin, sep );
546
547 if( *begin == '\0' ) {
548 *last = NULL;
549 return NULL;
550 }
551
552 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
553
554 if( *end != '\0' ) {
555 char *next = LDAP_UTF8_NEXT( end );
556 *end = '\0';
557 end = next;
558 }
559
560 *last = end;
561 return begin;
562 }
563