xref: /freebsd/lib/libc/locale/collate.c (revision a0ee8cc6)
1 /*-
2  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5  *		at Electronni Visti IA, Kiev, Ukraine.
6  *			All rights reserved.
7  *
8  * Copyright (c) 2011 The FreeBSD Foundation
9  * All rights reserved.
10  * Portions of this software were developed by David Chisnall
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Adapted to xlocale by John Marino <draco@marino.st>
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "namespace.h"
41 
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include <errno.h>
52 #include <unistd.h>
53 #include <fcntl.h>
54 #include "un-namespace.h"
55 
56 #include "collate.h"
57 #include "setlocale.h"
58 #include "ldpart.h"
59 #include "libc_private.h"
60 
61 struct xlocale_collate __xlocale_global_collate = {
62 	{{0}, "C"}, 1, 0, 0, 0
63 };
64 
65 struct xlocale_collate __xlocale_C_collate = {
66 	{{0}, "C"}, 1, 0, 0, 0
67 };
68 
69 static int
70 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
71 
72 static void
73 destruct_collate(void *t)
74 {
75 	struct xlocale_collate *table = t;
76 	if (table->map && (table->maplen > 0)) {
77 		(void) munmap(table->map, table->maplen);
78 	}
79 	free(t);
80 }
81 
82 void *
83 __collate_load(const char *encoding, __unused locale_t unused)
84 {
85 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
86 		return &__xlocale_C_collate;
87 	}
88 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
89 	table->header.header.destructor = destruct_collate;
90 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
91 	// the caching outside of this section
92 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
93 		xlocale_release(table);
94 		return NULL;
95 	}
96 	return table;
97 }
98 
99 /**
100  * Load the collation tables for the specified encoding into the global table.
101  */
102 int
103 __collate_load_tables(const char *encoding)
104 {
105 
106 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
107 }
108 
109 int
110 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
111 {
112 	int i, chains, z;
113 	char *buf;
114 	char *TMP;
115 	char *map;
116 	collate_info_t *info;
117 	struct stat sbuf;
118 	int fd;
119 
120 	table->__collate_load_error = 1;
121 
122 	/* 'encoding' must be already checked. */
123 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
124 		return (_LDP_CACHE);
125 	}
126 
127 	asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding);
128 	if (buf == NULL)
129 		return (_LDP_ERROR);
130 
131 	if ((fd = _open(buf, O_RDONLY)) < 0) {
132 		free(buf);
133 		return (_LDP_ERROR);
134 	}
135 	free(buf);
136 	if (_fstat(fd, &sbuf) < 0) {
137 		(void) _close(fd);
138 		return (_LDP_ERROR);
139 	}
140 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
141 		(void) _close(fd);
142 		errno = EINVAL;
143 		return (_LDP_ERROR);
144 	}
145 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
146 	(void) _close(fd);
147 	if ((TMP = map) == NULL) {
148 		return (_LDP_ERROR);
149 	}
150 
151 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
152 		(void) munmap(map, sbuf.st_size);
153 		errno = EINVAL;
154 		return (_LDP_ERROR);
155 	}
156 	TMP += COLLATE_STR_LEN;
157 
158 	info = (void *)TMP;
159 	TMP += sizeof (*info);
160 
161 	if ((info->directive_count < 1) ||
162 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
163 	    ((chains = info->chain_count) < 0)) {
164 		(void) munmap(map, sbuf.st_size);
165 		errno = EINVAL;
166 		return (_LDP_ERROR);
167 	}
168 
169 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
170 	    (sizeof (collate_chain_t) * chains) +
171 	    (sizeof (collate_large_t) * info->large_count);
172 	for (z = 0; z < info->directive_count; z++) {
173 		i += sizeof (collate_subst_t) * info->subst_count[z];
174 	}
175 	if (i != (sbuf.st_size - (TMP - map))) {
176 		(void) munmap(map, sbuf.st_size);
177 		errno = EINVAL;
178 		return (_LDP_ERROR);
179 	}
180 
181 	table->info = info;
182 	table->char_pri_table = (void *)TMP;
183 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
184 
185 	for (z = 0; z < info->directive_count; z++) {
186 		if (info->subst_count[z] > 0) {
187 			table->subst_table[z] = (void *)TMP;
188 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
189 		} else {
190 			table->subst_table[z] = NULL;
191 		}
192 	}
193 
194 	if (chains > 0) {
195 		table->chain_pri_table = (void *)TMP;
196 		TMP += chains * sizeof (collate_chain_t);
197 	} else
198 		table->chain_pri_table = NULL;
199 	if (info->large_count > 0)
200 		table->large_pri_table = (void *)TMP;
201 	else
202 		table->large_pri_table = NULL;
203 
204 	table->__collate_load_error = 0;
205 	return (_LDP_LOADED);
206 }
207 
208 static const int32_t *
209 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
210 {
211 	const collate_subst_t *p;
212 	int n = table->info->subst_count[pass];
213 
214 	if (n == 0)
215 		return (NULL);
216 
217 	if (pass >= table->info->directive_count)
218 		return (NULL);
219 
220 	if (!(key & COLLATE_SUBST_PRIORITY))
221 		return (NULL);
222 
223 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
224 	assert(p->key == key);
225 	return (p->pri);
226 }
227 
228 static collate_chain_t *
229 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
230 {
231 	int low = 0;
232 	int high = table->info->chain_count - 1;;
233 	int next, compar, l;
234 	collate_chain_t *p;
235 	collate_chain_t *tab = table->chain_pri_table;
236 
237 	if (high < 0)
238 		return (NULL);
239 
240 	while (low <= high) {
241 		next = (low + high) / 2;
242 		p = tab + next;
243 		compar = *key - *p->str;
244 		if (compar == 0) {
245 			l = wcsnlen(p->str, COLLATE_STR_LEN);
246 			compar = wcsncmp(key, p->str, l);
247 			if (compar == 0) {
248 				*len = l;
249 				return (p);
250 			}
251 		}
252 		if (compar > 0)
253 			low = next + 1;
254 		else
255 			high = next - 1;
256 	}
257 	return (NULL);
258 }
259 
260 static collate_large_t *
261 largesearch(struct xlocale_collate *table, const wchar_t key)
262 {
263 	int low = 0;
264 	int high = table->info->large_count - 1;
265 	int next, compar;
266 	collate_large_t *p;
267 	collate_large_t *tab = table->large_pri_table;
268 
269 	if (high < 0)
270 		return (NULL);
271 
272 	while (low <= high) {
273 		next = (low + high) / 2;
274 		p = tab + next;
275 		compar = key - p->val;
276 		if (compar == 0)
277 			return (p);
278 		if (compar > 0)
279 			low = next + 1;
280 		else
281 			high = next - 1;
282 	}
283 	return (NULL);
284 }
285 
286 void
287 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
288     int *pri, int which, const int **state)
289 {
290 	collate_chain_t *p2;
291 	collate_large_t *match;
292 	int p, l;
293 	const int *sptr;
294 
295 	/*
296 	 * If this is the "last" pass for the UNDEFINED, then
297 	 * we just return the priority itself.
298 	 */
299 	if (which >= table->info->directive_count) {
300 		*pri = *t;
301 		*len = 1;
302 		*state = NULL;
303 		return;
304 	}
305 
306 	/*
307 	 * If we have remaining substitution data from a previous
308 	 * call, consume it first.
309 	 */
310 	if ((sptr = *state) != NULL) {
311 		*pri = *sptr;
312 		sptr++;
313 		if ((sptr == *state) || (sptr == NULL))
314 			*state = NULL;
315 		else
316 			*state = sptr;
317 		*len = 0;
318 		return;
319 	}
320 
321 	/* No active substitutions */
322 	*len = 1;
323 
324 	/*
325 	 * Check for composites such as dipthongs that collate as a
326 	 * single element (aka chains or collating-elements).
327 	 */
328 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
329 	    ((p = p2->pri[which]) >= 0)) {
330 
331 		*len = l;
332 		*pri = p;
333 
334 	} else if (*t <= UCHAR_MAX) {
335 
336 		/*
337 		 * Character is a small (8-bit) character.
338 		 * We just look these up directly for speed.
339 		 */
340 		*pri = table->char_pri_table[*t].pri[which];
341 
342 	} else if ((table->info->large_count > 0) &&
343 	    ((match = largesearch(table, *t)) != NULL)) {
344 
345 		/*
346 		 * Character was found in the extended table.
347 		 */
348 		*pri = match->pri.pri[which];
349 
350 	} else {
351 		/*
352 		 * Character lacks a specific definition.
353 		 */
354 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
355 			/* Mask off sign bit to prevent ordering confusion. */
356 			*pri = (*t & COLLATE_MAX_PRIORITY);
357 		} else {
358 			*pri = table->info->undef_pri[which];
359 		}
360 		/* No substitutions for undefined characters! */
361 		return;
362 	}
363 
364 	/*
365 	 * Try substituting (expanding) the character.  We are
366 	 * currently doing this *after* the chain compression.  I
367 	 * think it should not matter, but this way might be slightly
368 	 * faster.
369 	 *
370 	 * We do this after the priority search, as this will help us
371 	 * to identify a single key value.  In order for this to work,
372 	 * its important that the priority assigned to a given element
373 	 * to be substituted be unique for that level.  The localedef
374 	 * code ensures this for us.
375 	 */
376 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
377 		if ((*pri = *sptr) > 0) {
378 			sptr++;
379 			*state = *sptr ? sptr : NULL;
380 		}
381 	}
382 
383 }
384 
385 /*
386  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
387  * NOT NULL terminate.  That is left to the caller.
388  */
389 size_t
390 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
391     size_t room)
392 {
393 	int		pri;
394 	int		len;
395 	const wchar_t	*t;
396 	wchar_t		*tr = NULL;
397 	int		direc;
398 	int		pass;
399 	const int32_t 	*state;
400 	size_t		want = 0;
401 	size_t		need = 0;
402 	int		ndir = table->info->directive_count;
403 
404 	assert(src);
405 
406 	for (pass = 0; pass <= ndir; pass++) {
407 
408 		state = NULL;
409 
410 		if (pass != 0) {
411 			/* insert level separator from the previous pass */
412 			if (room) {
413 				*xf++ = 1;
414 				room--;
415 			}
416 			want++;
417 		}
418 
419 		/* special pass for undefined */
420 		if (pass == ndir) {
421 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
422 		} else {
423 			direc = table->info->directive[pass];
424 		}
425 
426 		t = src;
427 
428 		if (direc & DIRECTIVE_BACKWARD) {
429 			wchar_t *bp, *fp, c;
430 			free(tr);
431 			if ((tr = wcsdup(t)) == NULL) {
432 				errno = ENOMEM;
433 				goto fail;
434 			}
435 			bp = tr;
436 			fp = tr + wcslen(tr) - 1;
437 			while (bp < fp) {
438 				c = *bp;
439 				*bp++ = *fp;
440 				*fp-- = c;
441 			}
442 			t = (const wchar_t *)tr;
443 		}
444 
445 		if (direc & DIRECTIVE_POSITION) {
446 			while (*t || state) {
447 				_collate_lookup(table, t, &len, &pri, pass, &state);
448 				t += len;
449 				if (pri <= 0) {
450 					if (pri < 0) {
451 						errno = EINVAL;
452 						goto fail;
453 					}
454 					pri = COLLATE_MAX_PRIORITY;
455 				}
456 				if (room) {
457 					*xf++ = pri;
458 					room--;
459 				}
460 				want++;
461 				need = want;
462 			}
463 		} else {
464 			while (*t || state) {
465 				_collate_lookup(table, t, &len, &pri, pass, &state);
466 				t += len;
467 				if (pri <= 0) {
468 					if (pri < 0) {
469 						errno = EINVAL;
470 						goto fail;
471 					}
472 					continue;
473 				}
474 				if (room) {
475 					*xf++ = pri;
476 					room--;
477 				}
478 				want++;
479 				need = want;
480 			}
481 		}
482 	}
483 	free(tr);
484 	return (need);
485 
486 fail:
487 	free(tr);
488 	return ((size_t)(-1));
489 }
490 
491 /*
492  * In the non-POSIX case, we transform each character into a string of
493  * characters representing the character's priority.  Since char is usually
494  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
495  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
496  * bits per byte.
497  *
498  * It turns out that we sometimes have real priorities that are
499  * 31-bits wide.  (But: be careful using priorities where the high
500  * order bit is set -- i.e. the priority is negative.  The sort order
501  * may be surprising!)
502  *
503  * TODO: This would be a good area to optimize somewhat.  It turns out
504  * that real prioririties *except for the last UNDEFINED pass* are generally
505  * very small.  We need the localedef code to precalculate the max
506  * priority for us, and ideally also give us a mask, and then we could
507  * severely limit what we expand to.
508  */
509 #define	XFRM_BYTES	6
510 #define	XFRM_OFFSET	('0')	/* make all printable characters */
511 #define	XFRM_SHIFT	6
512 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
513 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
514 
515 static int
516 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
517 {
518 	/* we use unsigned to ensure zero fill on right shift */
519 	uint32_t val = (uint32_t)table->info->pri_count[pass];
520 	int nc = 0;
521 
522 	while (val) {
523 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
524 		pri >>= XFRM_SHIFT;
525 		val >>= XFRM_SHIFT;
526 		p++;
527 		nc++;
528 	}
529 	return (nc);
530 }
531 
532 size_t
533 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
534     size_t room)
535 {
536 	int		pri;
537 	int		len;
538 	const wchar_t	*t;
539 	wchar_t		*tr = NULL;
540 	int		direc;
541 	int		pass;
542 	const int32_t 	*state;
543 	size_t		want = 0;
544 	size_t		need = 0;
545 	int		b;
546 	uint8_t		buf[XFRM_BYTES];
547 	int		ndir = table->info->directive_count;
548 
549 	assert(src);
550 
551 	for (pass = 0; pass <= ndir; pass++) {
552 
553 		state = NULL;
554 
555 		if (pass != 0) {
556 			/* insert level separator from the previous pass */
557 			if (room) {
558 				*xf++ = XFRM_SEP;
559 				room--;
560 			}
561 			want++;
562 		}
563 
564 		/* special pass for undefined */
565 		if (pass == ndir) {
566 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
567 		} else {
568 			direc = table->info->directive[pass];
569 		}
570 
571 		t = src;
572 
573 		if (direc & DIRECTIVE_BACKWARD) {
574 			wchar_t *bp, *fp, c;
575 			free(tr);
576 			if ((tr = wcsdup(t)) == NULL) {
577 				errno = ENOMEM;
578 				goto fail;
579 			}
580 			bp = tr;
581 			fp = tr + wcslen(tr) - 1;
582 			while (bp < fp) {
583 				c = *bp;
584 				*bp++ = *fp;
585 				*fp-- = c;
586 			}
587 			t = (const wchar_t *)tr;
588 		}
589 
590 		if (direc & DIRECTIVE_POSITION) {
591 			while (*t || state) {
592 
593 				_collate_lookup(table, t, &len, &pri, pass, &state);
594 				t += len;
595 				if (pri <= 0) {
596 					if (pri < 0) {
597 						errno = EINVAL;
598 						goto fail;
599 					}
600 					pri = COLLATE_MAX_PRIORITY;
601 				}
602 
603 				b = xfrm(table, buf, pri, pass);
604 				want += b;
605 				if (room) {
606 					while (b) {
607 						b--;
608 						if (room) {
609 							*xf++ = buf[b];
610 							room--;
611 						}
612 					}
613 				}
614 				need = want;
615 			}
616 		} else {
617 			while (*t || state) {
618 				_collate_lookup(table, t, &len, &pri, pass, &state);
619 				t += len;
620 				if (pri <= 0) {
621 					if (pri < 0) {
622 						errno = EINVAL;
623 						goto fail;
624 					}
625 					continue;
626 				}
627 
628 				b = xfrm(table, buf, pri, pass);
629 				want += b;
630 				if (room) {
631 
632 					while (b) {
633 						b--;
634 						if (room) {
635 							*xf++ = buf[b];
636 							room--;
637 						}
638 					}
639 				}
640 				need = want;
641 			}
642 		}
643 	}
644 	free(tr);
645 	return (need);
646 
647 fail:
648 	free(tr);
649 	return ((size_t)(-1));
650 }
651 
652 /*
653  * __collate_equiv_value returns the primary collation value for the given
654  * collating symbol specified by str and len.  Zero or negative is returned
655  * if the collating symbol was not found.  This function is used by bracket
656  * code in the TRE regex library.
657  */
658 int
659 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
660 {
661 	int32_t e;
662 
663 	if (len < 1 || len >= COLLATE_STR_LEN)
664 		return (-1);
665 
666 	FIX_LOCALE(locale);
667 	struct xlocale_collate *table =
668 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
669 
670 	if (table->__collate_load_error)
671 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
672 
673 	if (len == 1) {
674 		e = -1;
675 		if (*str <= UCHAR_MAX)
676 			e = table->char_pri_table[*str].pri[0];
677 		else if (table->info->large_count > 0) {
678 			collate_large_t *match_large;
679 			match_large = largesearch(table, *str);
680 			if (match_large)
681 				e = match_large->pri.pri[0];
682 		}
683 		if (e == 0)
684 			return (1);
685 		return (e > 0 ? e : 0);
686 	}
687 	if (table->info->chain_count > 0) {
688 		wchar_t name[COLLATE_STR_LEN];
689 		collate_chain_t *match_chain;
690 		int clen;
691 
692 		wcsncpy (name, str, len);
693 		name[len] = 0;
694 		match_chain = chainsearch(table, name, &clen);
695 		if (match_chain) {
696 			e = match_chain->pri[0];
697 			if (e == 0)
698 				return (1);
699 			return (e < 0 ? -e : e);
700 		}
701 	}
702 	return (0);
703 }
704