xref: /freebsd/lib/libc/locale/collate.c (revision 069ac184)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7  *		at Electronni Visti IA, Kiev, Ukraine.
8  *			All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  *
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Adapted to xlocale by John Marino <draco@marino.st>
37  */
38 
39 #include "namespace.h"
40 
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/mman.h>
44 
45 #include <assert.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <wchar.h>
50 #include <errno.h>
51 #include <unistd.h>
52 #include <fcntl.h>
53 #include "un-namespace.h"
54 
55 #include "collate.h"
56 #include "setlocale.h"
57 #include "ldpart.h"
58 #include "libc_private.h"
59 
60 struct xlocale_collate __xlocale_global_collate = {
61 	{{0}, "C"}, 1, 0, 0, 0
62 };
63 
64 struct xlocale_collate __xlocale_C_collate = {
65 	{{0}, "C"}, 1, 0, 0, 0
66 };
67 
68 struct xlocale_collate __xlocale_POSIX_collate = {
69 	{{0}, "POSIX"}, 1, 0, 0, 0
70 };
71 
72 struct xlocale_collate __xlocale_CUTF8_collate = {
73 	{{0}, "C.UTF-8"}, 1, 0, 0, 0
74 };
75 
76 static int
77 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
78 
79 static void
80 destruct_collate(void *t)
81 {
82 	struct xlocale_collate *table = t;
83 	if (table->map && (table->maplen > 0)) {
84 		(void) munmap(table->map, table->maplen);
85 	}
86 	free(t);
87 }
88 
89 void *
90 __collate_load(const char *encoding, __unused locale_t unused)
91 {
92 	if (strcmp(encoding, "C") == 0)
93 		return (&__xlocale_C_collate);
94 	else if (strcmp(encoding, "POSIX") == 0)
95 		return (&__xlocale_POSIX_collate);
96 	else if (strcmp(encoding, "C.UTF-8") == 0)
97 		return (&__xlocale_CUTF8_collate);
98 
99 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate),
100 	    1);
101 	if (table == NULL)
102 		return (NULL);
103 	table->header.header.destructor = destruct_collate;
104 
105 	/*
106 	 * FIXME: Make sure that _LDP_CACHE is never returned.  We
107 	 * should be doing the caching outside of this section.
108 	 */
109 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
110 		xlocale_release(table);
111 		return (NULL);
112 	}
113 	return (table);
114 }
115 
116 /**
117  * Load the collation tables for the specified encoding into the global table.
118  */
119 int
120 __collate_load_tables(const char *encoding)
121 {
122 
123 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
124 }
125 
126 static int
127 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
128 {
129 	int i, chains, z;
130 	char *buf;
131 	char *TMP;
132 	char *map;
133 	collate_info_t *info;
134 	struct stat sbuf;
135 	int fd;
136 
137 	table->__collate_load_error = 1;
138 
139 	/* 'encoding' must be already checked. */
140 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
141 	    strncmp(encoding, "C.", 2) == 0) {
142 		return (_LDP_CACHE);
143 	}
144 
145 	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
146 		return (_LDP_ERROR);
147 
148 	if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) {
149 		free(buf);
150 		return (_LDP_ERROR);
151 	}
152 	free(buf);
153 	if (_fstat(fd, &sbuf) < 0) {
154 		(void) _close(fd);
155 		return (_LDP_ERROR);
156 	}
157 	if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN +
158 			    XLOCALE_DEF_VERSION_LEN +
159 			    sizeof (*info))) {
160 		(void) _close(fd);
161 		errno = EINVAL;
162 		return (_LDP_ERROR);
163 	}
164 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
165 	(void) _close(fd);
166 	if ((TMP = map) == MAP_FAILED) {
167 		return (_LDP_ERROR);
168 	}
169 
170 	if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) {
171 		(void) munmap(map, sbuf.st_size);
172 		errno = EINVAL;
173 		return (_LDP_ERROR);
174 	}
175 	TMP += COLLATE_FMT_VERSION_LEN;
176 	strlcat(table->header.version, TMP, sizeof (table->header.version));
177 	TMP += XLOCALE_DEF_VERSION_LEN;
178 
179 	info = (void *)TMP;
180 	TMP += sizeof (*info);
181 
182 	if ((info->directive_count < 1) ||
183 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
184 	    ((chains = info->chain_count) < 0)) {
185 		(void) munmap(map, sbuf.st_size);
186 		errno = EINVAL;
187 		return (_LDP_ERROR);
188 	}
189 
190 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
191 	    (sizeof (collate_chain_t) * chains) +
192 	    (sizeof (collate_large_t) * info->large_count);
193 	for (z = 0; z < info->directive_count; z++) {
194 		i += sizeof (collate_subst_t) * info->subst_count[z];
195 	}
196 	if (i != (sbuf.st_size - (TMP - map))) {
197 		(void) munmap(map, sbuf.st_size);
198 		errno = EINVAL;
199 		return (_LDP_ERROR);
200 	}
201 
202 	if (table->map && (table->maplen > 0)) {
203 		(void) munmap(table->map, table->maplen);
204 	}
205 	table->map = map;
206 	table->maplen = sbuf.st_size;
207 	table->info = info;
208 	table->char_pri_table = (void *)TMP;
209 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
210 
211 	for (z = 0; z < info->directive_count; z++) {
212 		if (info->subst_count[z] > 0) {
213 			table->subst_table[z] = (void *)TMP;
214 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
215 		} else {
216 			table->subst_table[z] = NULL;
217 		}
218 	}
219 
220 	if (chains > 0) {
221 		table->chain_pri_table = (void *)TMP;
222 		TMP += chains * sizeof (collate_chain_t);
223 	} else
224 		table->chain_pri_table = NULL;
225 	if (info->large_count > 0)
226 		table->large_pri_table = (void *)TMP;
227 	else
228 		table->large_pri_table = NULL;
229 
230 	table->__collate_load_error = 0;
231 	return (_LDP_LOADED);
232 }
233 
234 static const int32_t *
235 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
236 {
237 	const collate_subst_t *p;
238 	int n = table->info->subst_count[pass];
239 
240 	if (n == 0)
241 		return (NULL);
242 
243 	if (pass >= table->info->directive_count)
244 		return (NULL);
245 
246 	if (!(key & COLLATE_SUBST_PRIORITY))
247 		return (NULL);
248 
249 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
250 	assert(p->key == key);
251 
252 	return (p->pri);
253 }
254 
255 static collate_chain_t *
256 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
257 {
258 	int low = 0;
259 	int high = table->info->chain_count - 1;
260 	int next, compar, l;
261 	collate_chain_t *p;
262 	collate_chain_t *tab = table->chain_pri_table;
263 
264 	if (high < 0)
265 		return (NULL);
266 
267 	while (low <= high) {
268 		next = (low + high) / 2;
269 		p = tab + next;
270 		compar = *key - *p->str;
271 		if (compar == 0) {
272 			l = wcsnlen(p->str, COLLATE_STR_LEN);
273 			compar = wcsncmp(key, p->str, l);
274 			if (compar == 0) {
275 				*len = l;
276 				return (p);
277 			}
278 		}
279 		if (compar > 0)
280 			low = next + 1;
281 		else
282 			high = next - 1;
283 	}
284 	return (NULL);
285 }
286 
287 static collate_large_t *
288 largesearch(struct xlocale_collate *table, const wchar_t key)
289 {
290 	int low = 0;
291 	int high = table->info->large_count - 1;
292 	int next, compar;
293 	collate_large_t *p;
294 	collate_large_t *tab = table->large_pri_table;
295 
296 	if (high < 0)
297 		return (NULL);
298 
299 	while (low <= high) {
300 		next = (low + high) / 2;
301 		p = tab + next;
302 		compar = key - p->val;
303 		if (compar == 0)
304 			return (p);
305 		if (compar > 0)
306 			low = next + 1;
307 		else
308 			high = next - 1;
309 	}
310 	return (NULL);
311 }
312 
313 void
314 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
315     int *pri, int which, const int **state)
316 {
317 	collate_chain_t *p2;
318 	collate_large_t *match;
319 	int p, l;
320 	const int *sptr;
321 
322 	/*
323 	 * If this is the "last" pass for the UNDEFINED, then
324 	 * we just return the priority itself.
325 	 */
326 	if (which >= table->info->directive_count) {
327 		*pri = *t;
328 		*len = 1;
329 		*state = NULL;
330 		return;
331 	}
332 
333 	/*
334 	 * If we have remaining substitution data from a previous
335 	 * call, consume it first.
336 	 */
337 	if ((sptr = *state) != NULL) {
338 		*pri = *sptr;
339 		sptr++;
340 		if ((sptr == *state) || (sptr == NULL))
341 			*state = NULL;
342 		else
343 			*state = sptr;
344 		*len = 0;
345 		return;
346 	}
347 
348 	/* No active substitutions */
349 	*len = 1;
350 
351 	/*
352 	 * Check for composites such as diphthongs that collate as a
353 	 * single element (aka chains or collating-elements).
354 	 */
355 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
356 	    ((p = p2->pri[which]) >= 0)) {
357 
358 		*len = l;
359 		*pri = p;
360 
361 	} else if (*t <= UCHAR_MAX) {
362 
363 		/*
364 		 * Character is a small (8-bit) character.
365 		 * We just look these up directly for speed.
366 		 */
367 		*pri = table->char_pri_table[*t].pri[which];
368 
369 	} else if ((table->info->large_count > 0) &&
370 	    ((match = largesearch(table, *t)) != NULL)) {
371 
372 		/*
373 		 * Character was found in the extended table.
374 		 */
375 		*pri = match->pri.pri[which];
376 
377 	} else {
378 		/*
379 		 * Character lacks a specific definition.
380 		 */
381 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
382 			/* Mask off sign bit to prevent ordering confusion. */
383 			*pri = (*t & COLLATE_MAX_PRIORITY);
384 		} else {
385 			*pri = table->info->undef_pri[which];
386 		}
387 		/* No substitutions for undefined characters! */
388 		return;
389 	}
390 
391 	/*
392 	 * Try substituting (expanding) the character.  We are
393 	 * currently doing this *after* the chain compression.  I
394 	 * think it should not matter, but this way might be slightly
395 	 * faster.
396 	 *
397 	 * We do this after the priority search, as this will help us
398 	 * to identify a single key value.  In order for this to work,
399 	 * its important that the priority assigned to a given element
400 	 * to be substituted be unique for that level.  The localedef
401 	 * code ensures this for us.
402 	 */
403 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
404 		if ((*pri = *sptr) > 0) {
405 			sptr++;
406 			*state = *sptr ? sptr : NULL;
407 		}
408 	}
409 
410 }
411 
412 /*
413  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
414  * NOT NULL terminate.  That is left to the caller.
415  */
416 size_t
417 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
418     size_t room)
419 {
420 	int		pri;
421 	int		len;
422 	const wchar_t	*t;
423 	wchar_t		*tr = NULL;
424 	int		direc;
425 	int		pass;
426 	const int32_t 	*state;
427 	size_t		want = 0;
428 	size_t		need = 0;
429 	int		ndir = table->info->directive_count;
430 
431 	assert(src);
432 
433 	for (pass = 0; pass <= ndir; pass++) {
434 
435 		state = NULL;
436 
437 		if (pass != 0) {
438 			/* insert level separator from the previous pass */
439 			if (room) {
440 				*xf++ = 1;
441 				room--;
442 			}
443 			want++;
444 		}
445 
446 		/* special pass for undefined */
447 		if (pass == ndir) {
448 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
449 		} else {
450 			direc = table->info->directive[pass];
451 		}
452 
453 		t = src;
454 
455 		if (direc & DIRECTIVE_BACKWARD) {
456 			wchar_t *bp, *fp, c;
457 			free(tr);
458 			if ((tr = wcsdup(t)) == NULL) {
459 				errno = ENOMEM;
460 				goto fail;
461 			}
462 			bp = tr;
463 			fp = tr + wcslen(tr) - 1;
464 			while (bp < fp) {
465 				c = *bp;
466 				*bp++ = *fp;
467 				*fp-- = c;
468 			}
469 			t = (const wchar_t *)tr;
470 		}
471 
472 		if (direc & DIRECTIVE_POSITION) {
473 			while (*t || state) {
474 				_collate_lookup(table, t, &len, &pri, pass, &state);
475 				t += len;
476 				if (pri <= 0) {
477 					if (pri < 0) {
478 						errno = EINVAL;
479 						goto fail;
480 					}
481 					state = NULL;
482 					pri = COLLATE_MAX_PRIORITY;
483 				}
484 				if (room) {
485 					*xf++ = pri;
486 					room--;
487 				}
488 				want++;
489 				need = want;
490 			}
491 		} else {
492 			while (*t || state) {
493 				_collate_lookup(table, t, &len, &pri, pass, &state);
494 				t += len;
495 				if (pri <= 0) {
496 					if (pri < 0) {
497 						errno = EINVAL;
498 						goto fail;
499 					}
500 					state = NULL;
501 					continue;
502 				}
503 				if (room) {
504 					*xf++ = pri;
505 					room--;
506 				}
507 				want++;
508 				need = want;
509 			}
510 		}
511 	}
512 	free(tr);
513 	return (need);
514 
515 fail:
516 	free(tr);
517 	return ((size_t)(-1));
518 }
519 
520 /*
521  * In the non-POSIX case, we transform each character into a string of
522  * characters representing the character's priority.  Since char is usually
523  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
524  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
525  * bits per byte.
526  *
527  * It turns out that we sometimes have real priorities that are
528  * 31-bits wide.  (But: be careful using priorities where the high
529  * order bit is set -- i.e. the priority is negative.  The sort order
530  * may be surprising!)
531  *
532  * TODO: This would be a good area to optimize somewhat.  It turns out
533  * that real prioririties *except for the last UNDEFINED pass* are generally
534  * very small.  We need the localedef code to precalculate the max
535  * priority for us, and ideally also give us a mask, and then we could
536  * severely limit what we expand to.
537  */
538 #define	XFRM_BYTES	6
539 #define	XFRM_OFFSET	('0')	/* make all printable characters */
540 #define	XFRM_SHIFT	6
541 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
542 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
543 
544 static int
545 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
546 {
547 	/* we use unsigned to ensure zero fill on right shift */
548 	uint32_t val = (uint32_t)table->info->pri_count[pass];
549 	int nc = 0;
550 
551 	while (val) {
552 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
553 		pri >>= XFRM_SHIFT;
554 		val >>= XFRM_SHIFT;
555 		p++;
556 		nc++;
557 	}
558 	return (nc);
559 }
560 
561 size_t
562 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
563     size_t room)
564 {
565 	int		pri;
566 	int		len;
567 	const wchar_t	*t;
568 	wchar_t		*tr = NULL;
569 	int		direc;
570 	int		pass;
571 	const int32_t 	*state;
572 	size_t		want = 0;
573 	size_t		need = 0;
574 	int		b;
575 	uint8_t		buf[XFRM_BYTES];
576 	int		ndir = table->info->directive_count;
577 
578 	assert(src);
579 
580 	for (pass = 0; pass <= ndir; pass++) {
581 
582 		state = NULL;
583 
584 		if (pass != 0) {
585 			/* insert level separator from the previous pass */
586 			if (room) {
587 				*xf++ = XFRM_SEP;
588 				room--;
589 			}
590 			want++;
591 		}
592 
593 		/* special pass for undefined */
594 		if (pass == ndir) {
595 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
596 		} else {
597 			direc = table->info->directive[pass];
598 		}
599 
600 		t = src;
601 
602 		if (direc & DIRECTIVE_BACKWARD) {
603 			wchar_t *bp, *fp, c;
604 			free(tr);
605 			if ((tr = wcsdup(t)) == NULL) {
606 				errno = ENOMEM;
607 				goto fail;
608 			}
609 			bp = tr;
610 			fp = tr + wcslen(tr) - 1;
611 			while (bp < fp) {
612 				c = *bp;
613 				*bp++ = *fp;
614 				*fp-- = c;
615 			}
616 			t = (const wchar_t *)tr;
617 		}
618 
619 		if (direc & DIRECTIVE_POSITION) {
620 			while (*t || state) {
621 
622 				_collate_lookup(table, t, &len, &pri, pass, &state);
623 				t += len;
624 				if (pri <= 0) {
625 					if (pri < 0) {
626 						errno = EINVAL;
627 						goto fail;
628 					}
629 					state = NULL;
630 					pri = COLLATE_MAX_PRIORITY;
631 				}
632 
633 				b = xfrm(table, buf, pri, pass);
634 				want += b;
635 				if (room) {
636 					while (b) {
637 						b--;
638 						if (room) {
639 							*xf++ = buf[b];
640 							room--;
641 						}
642 					}
643 				}
644 				need = want;
645 			}
646 		} else {
647 			while (*t || state) {
648 				_collate_lookup(table, t, &len, &pri, pass, &state);
649 				t += len;
650 				if (pri <= 0) {
651 					if (pri < 0) {
652 						errno = EINVAL;
653 						goto fail;
654 					}
655 					state = NULL;
656 					continue;
657 				}
658 
659 				b = xfrm(table, buf, pri, pass);
660 				want += b;
661 				if (room) {
662 
663 					while (b) {
664 						b--;
665 						if (room) {
666 							*xf++ = buf[b];
667 							room--;
668 						}
669 					}
670 				}
671 				need = want;
672 			}
673 		}
674 	}
675 	free(tr);
676 	return (need);
677 
678 fail:
679 	free(tr);
680 	return ((size_t)(-1));
681 }
682 
683 /*
684  * __collate_equiv_value returns the primary collation value for the given
685  * collating symbol specified by str and len.  Zero or negative is returned
686  * if the collating symbol was not found.  This function is used by bracket
687  * code in the TRE regex library.
688  */
689 int
690 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
691 {
692 	int32_t e;
693 
694 	if (len < 1 || len >= COLLATE_STR_LEN)
695 		return (-1);
696 
697 	FIX_LOCALE(locale);
698 	struct xlocale_collate *table =
699 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
700 
701 	if (table->__collate_load_error)
702 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
703 
704 	if (len == 1) {
705 		e = -1;
706 		if (*str <= UCHAR_MAX)
707 			e = table->char_pri_table[*str].pri[0];
708 		else if (table->info->large_count > 0) {
709 			collate_large_t *match_large;
710 			match_large = largesearch(table, *str);
711 			if (match_large)
712 				e = match_large->pri.pri[0];
713 		}
714 		if (e == 0)
715 			return (1);
716 		return (e > 0 ? e : 0);
717 	}
718 	if (table->info->chain_count > 0) {
719 		wchar_t name[COLLATE_STR_LEN];
720 		collate_chain_t *match_chain;
721 		int clen;
722 
723 		wcsncpy (name, str, len);
724 		name[len] = 0;
725 		match_chain = chainsearch(table, name, &clen);
726 		if (match_chain) {
727 			e = match_chain->pri[0];
728 			if (e == 0)
729 				return (1);
730 			return (e < 0 ? -e : e);
731 		}
732 	}
733 	return (0);
734 }
735