xref: /dragonfly/lib/libc/locale/collate.c (revision 50b09fda)
1 /*
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *		at Electronni Visti IA, Kiev, Ukraine.
5  *			All rights reserved.
6  *
7  * Copyright (c) 2011 The FreeBSD Foundation
8  * All rights reserved.
9  * Portions of this software were developed by David Chisnall
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * Adapted to xlocale by John Marino <draco@marino.st>
34  */
35 
36 #include "namespace.h"
37 #include <sys/types.h>
38 #include <sys/stat.h>
39 #include <sys/mman.h>
40 #include <assert.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <wchar.h>
45 #include <errno.h>
46 #include <unistd.h>
47 #include <fcntl.h>
48 #include "un-namespace.h"
49 
50 #include "collate.h"
51 #include "setlocale.h"
52 #include "ldpart.h"
53 #include "libc_private.h"
54 
55 struct xlocale_collate __xlocale_global_collate = {
56 	{{0}, "C"}, 1, 0, 0, 0
57 };
58 
59 struct xlocale_collate __xlocale_C_collate = {
60 	{{0}, "C"}, 1, 0, 0, 0
61 };
62 
63 static int
64 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
65 
66 static void
67 destruct_collate(void *t)
68 {
69 	struct xlocale_collate *table = t;
70 	if (table->map && (table->maplen > 0)) {
71 		(void) munmap(table->map, table->maplen);
72 	}
73 	free(t);
74 }
75 
76 void *
77 __collate_load(const char *encoding, __unused locale_t unused)
78 {
79 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
80 		return &__xlocale_C_collate;
81 	}
82 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
83 	table->header.header.destructor = destruct_collate;
84 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
85 	// the caching outside of this section
86 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
87 		xlocale_release(table);
88 		return NULL;
89 	}
90 	return table;
91 }
92 
93 /**
94  * Load the collation tables for the specified encoding into the global table.
95  */
96 int
97 __collate_load_tables(const char *encoding)
98 {
99 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
100 }
101 
102 int
103 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
104 {
105 	int i, chains, z;
106 	char buf[PATH_MAX];
107 	char *TMP;
108 	char *map;
109 	collate_info_t *info;
110 	struct stat sbuf;
111 	int fd;
112 
113 	table->__collate_load_error = 1;
114 
115 	/* 'encoding' must be already checked. */
116 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
117 		return (_LDP_CACHE);
118 	}
119 
120 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE",
121 	    _PathLocale, encoding);
122 
123 	if ((fd = _open(buf, O_RDONLY)) < 0)
124 		return (_LDP_ERROR);
125 	if (_fstat(fd, &sbuf) < 0) {
126 		(void) _close(fd);
127 		return (_LDP_ERROR);
128 	}
129 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
130 		(void) _close(fd);
131 		errno = EINVAL;
132 		return (_LDP_ERROR);
133 	}
134 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
135 	(void) _close(fd);
136 	if ((TMP = map) == NULL) {
137 		return (_LDP_ERROR);
138 	}
139 
140 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
141 		(void) munmap(map, sbuf.st_size);
142 		errno = EINVAL;
143 		return (_LDP_ERROR);
144 	}
145 	TMP += COLLATE_STR_LEN;
146 
147 	info = (void *)TMP;
148 	TMP += sizeof (*info);
149 
150 	if ((info->directive_count < 1) ||
151 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
152 	    ((chains = info->chain_count) < 0)) {
153 		(void) munmap(map, sbuf.st_size);
154 		errno = EINVAL;
155 		return (_LDP_ERROR);
156 	}
157 
158 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
159 	    (sizeof (collate_chain_t) * chains) +
160 	    (sizeof (collate_large_t) * info->large_count);
161 	for (z = 0; z < info->directive_count; z++) {
162 		i += sizeof (collate_subst_t) * info->subst_count[z];
163 	}
164 	if (i != (sbuf.st_size - (TMP - map))) {
165 		(void) munmap(map, sbuf.st_size);
166 		errno = EINVAL;
167 		return (_LDP_ERROR);
168 	}
169 
170 	table->info = info;
171 	table->char_pri_table = (void *)TMP;
172 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
173 
174 	for (z = 0; z < info->directive_count; z++) {
175 		if (info->subst_count[z] > 0) {
176 			table->subst_table[z] = (void *)TMP;
177 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
178 		} else {
179 			table->subst_table[z] = NULL;
180 		}
181 	}
182 
183 	if (chains > 0) {
184 		table->chain_pri_table = (void *)TMP;
185 		TMP += chains * sizeof (collate_chain_t);
186 	} else
187 		table->chain_pri_table = NULL;
188 	if (info->large_count > 0)
189 		table->large_pri_table = (void *)TMP;
190 	else
191 		table->large_pri_table = NULL;
192 
193 	table->__collate_load_error = 0;
194 	return (_LDP_LOADED);
195 }
196 
197 static const int32_t *
198 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
199 {
200 	const collate_subst_t *p;
201 	int n = table->info->subst_count[pass];
202 
203 	if (n == 0)
204 		return (NULL);
205 
206 	if (pass >= table->info->directive_count)
207 		return (NULL);
208 
209 	if (!(key & COLLATE_SUBST_PRIORITY))
210 		return (NULL);
211 
212 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
213 	assert(p->key == key);
214 	return (p->pri);
215 }
216 
217 static collate_chain_t *
218 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
219 {
220 	int low = 0;
221 	int high = table->info->chain_count - 1;;
222 	int next, compar, l;
223 	collate_chain_t *p;
224 	collate_chain_t *tab = table->chain_pri_table;
225 
226 	if (high < 0)
227 		return (NULL);
228 
229 	while (low <= high) {
230 		next = (low + high) / 2;
231 		p = tab + next;
232 		compar = *key - *p->str;
233 		if (compar == 0) {
234 			l = wcsnlen(p->str, COLLATE_STR_LEN);
235 			compar = wcsncmp(key, p->str, l);
236 			if (compar == 0) {
237 				*len = l;
238 				return (p);
239 			}
240 		}
241 		if (compar > 0)
242 			low = next + 1;
243 		else
244 			high = next - 1;
245 	}
246 	return (NULL);
247 }
248 
249 static collate_large_t *
250 largesearch(struct xlocale_collate *table, const wchar_t key)
251 {
252 	int low = 0;
253 	int high = table->info->large_count - 1;
254 	int next, compar;
255 	collate_large_t *p;
256 	collate_large_t *tab = table->large_pri_table;
257 
258 	if (high < 0)
259 		return (NULL);
260 
261 	while (low <= high) {
262 		next = (low + high) / 2;
263 		p = tab + next;
264 		compar = key - p->val;
265 		if (compar == 0)
266 			return (p);
267 		if (compar > 0)
268 			low = next + 1;
269 		else
270 			high = next - 1;
271 	}
272 	return (NULL);
273 }
274 
275 void
276 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
277     int *pri, int which, const int **state)
278 {
279 	collate_chain_t *p2;
280 	collate_large_t *match;
281 	int p, l;
282 	const int *sptr;
283 
284 	/*
285 	 * If this is the "last" pass for the UNDEFINED, then
286 	 * we just return the priority itself.
287 	 */
288 	if (which >= table->info->directive_count) {
289 		*pri = *t;
290 		*len = 1;
291 		*state = NULL;
292 		return;
293 	}
294 
295 	/*
296 	 * If we have remaining substitution data from a previous
297 	 * call, consume it first.
298 	 */
299 	if ((sptr = *state) != NULL) {
300 		*pri = *sptr;
301 		sptr++;
302 		if ((sptr == *state) || (sptr == NULL))
303 			*state = NULL;
304 		else
305 			*state = sptr;
306 		*len = 0;
307 		return;
308 	}
309 
310 	/* No active substitutions */
311 	*len = 1;
312 
313 	/*
314 	 * Check for composites such as diphthongs that collate as a
315 	 * single element (aka chains or collating-elements).
316 	 */
317 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
318 	    ((p = p2->pri[which]) >= 0)) {
319 
320 		*len = l;
321 		*pri = p;
322 
323 	} else if (*t <= UCHAR_MAX) {
324 
325 		/*
326 		 * Character is a small (8-bit) character.
327 		 * We just look these up directly for speed.
328 		 */
329 		*pri = table->char_pri_table[*t].pri[which];
330 
331 	} else if ((table->info->large_count > 0) &&
332 	    ((match = largesearch(table, *t)) != NULL)) {
333 
334 		/*
335 		 * Character was found in the extended table.
336 		 */
337 		*pri = match->pri.pri[which];
338 
339 	} else {
340 		/*
341 		 * Character lacks a specific definition.
342 		 */
343 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
344 			/* Mask off sign bit to prevent ordering confusion. */
345 			*pri = (*t & COLLATE_MAX_PRIORITY);
346 		} else {
347 			*pri = table->info->undef_pri[which];
348 		}
349 		/* No substitutions for undefined characters! */
350 		return;
351 	}
352 
353 	/*
354 	 * Try substituting (expanding) the character.  We are
355 	 * currently doing this *after* the chain compression.  I
356 	 * think it should not matter, but this way might be slightly
357 	 * faster.
358 	 *
359 	 * We do this after the priority search, as this will help us
360 	 * to identify a single key value.  In order for this to work,
361 	 * its important that the priority assigned to a given element
362 	 * to be substituted be unique for that level.  The localedef
363 	 * code ensures this for us.
364 	 */
365 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
366 		if ((*pri = *sptr) > 0) {
367 			sptr++;
368 			*state = *sptr ? sptr : NULL;
369 		}
370 	}
371 
372 }
373 
374 /*
375  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
376  * NOT NULL terminate.  That is left to the caller.
377  */
378 size_t
379 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
380     size_t room)
381 {
382 	int		pri;
383 	int		len;
384 	const wchar_t	*t;
385 	wchar_t		*tr = NULL;
386 	int		direc;
387 	int		pass;
388 	const int32_t 	*state;
389 	size_t		want = 0;
390 	size_t		need = 0;
391 	int		ndir = table->info->directive_count;
392 
393 	assert(src);
394 
395 	for (pass = 0; pass <= ndir; pass++) {
396 
397 		state = NULL;
398 
399 		if (pass != 0) {
400 			/* insert level separator from the previous pass */
401 			if (room) {
402 				*xf++ = 1;
403 				room--;
404 			}
405 			want++;
406 		}
407 
408 		/* special pass for undefined */
409 		if (pass == ndir) {
410 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
411 		} else {
412 			direc = table->info->directive[pass];
413 		}
414 
415 		t = src;
416 
417 		if (direc & DIRECTIVE_BACKWARD) {
418 			wchar_t *bp, *fp, c;
419 			if (tr)
420 				free(tr);
421 			if ((tr = wcsdup(t)) == NULL) {
422 				errno = ENOMEM;
423 				goto fail;
424 			}
425 			bp = tr;
426 			fp = tr + wcslen(tr) - 1;
427 			while (bp < fp) {
428 				c = *bp;
429 				*bp++ = *fp;
430 				*fp-- = c;
431 			}
432 			t = (const wchar_t *)tr;
433 		}
434 
435 		if (direc & DIRECTIVE_POSITION) {
436 			while (*t || state) {
437 				_collate_lookup(table, t, &len, &pri, pass, &state);
438 				t += len;
439 				if (pri <= 0) {
440 					if (pri < 0) {
441 						errno = EINVAL;
442 						goto fail;
443 					}
444 					state = NULL;
445 					pri = COLLATE_MAX_PRIORITY;
446 				}
447 				if (room) {
448 					*xf++ = pri;
449 					room--;
450 				}
451 				want++;
452 				need = want;
453 			}
454 		} else {
455 			while (*t || state) {
456 				_collate_lookup(table, t, &len, &pri, pass, &state);
457 				t += len;
458 				if (pri <= 0) {
459 					if (pri < 0) {
460 						errno = EINVAL;
461 						goto fail;
462 					}
463 					state = NULL;
464 					continue;
465 				}
466 				if (room) {
467 					*xf++ = pri;
468 					room--;
469 				}
470 				want++;
471 				need = want;
472 			}
473 		}
474 	}
475 	free(tr);
476 	return (need);
477 
478 fail:
479 	free(tr);
480 	return ((size_t)(-1));
481 }
482 
483 /*
484  * In the non-POSIX case, we transform each character into a string of
485  * characters representing the character's priority.  Since char is usually
486  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
487  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
488  * bits per byte.
489  *
490  * It turns out that we sometimes have real priorities that are
491  * 31-bits wide.  (But: be careful using priorities where the high
492  * order bit is set -- i.e. the priority is negative.  The sort order
493  * may be surprising!)
494  *
495  * TODO: This would be a good area to optimize somewhat.  It turns out
496  * that real prioririties *except for the last UNDEFINED pass* are generally
497  * very small.  We need the localedef code to precalculate the max
498  * priority for us, and ideally also give us a mask, and then we could
499  * severely limit what we expand to.
500  */
501 #define	XFRM_BYTES	6
502 #define	XFRM_OFFSET	('0')	/* make all printable characters */
503 #define	XFRM_SHIFT	6
504 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
505 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
506 
507 static int
508 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
509 {
510 	/* we use unsigned to ensure zero fill on right shift */
511 	uint32_t val = (uint32_t)table->info->pri_count[pass];
512 	int nc = 0;
513 
514 	while (val) {
515 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
516 		pri >>= XFRM_SHIFT;
517 		val >>= XFRM_SHIFT;
518 		p++;
519 		nc++;
520 	}
521 	return (nc);
522 }
523 
524 size_t
525 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
526     size_t room)
527 {
528 	int		pri;
529 	int		len;
530 	const wchar_t	*t;
531 	wchar_t		*tr = NULL;
532 	int		direc;
533 	int		pass;
534 	const int32_t 	*state;
535 	size_t		want = 0;
536 	size_t		need = 0;
537 	int		b;
538 	uint8_t		buf[XFRM_BYTES];
539 	int		ndir = table->info->directive_count;
540 
541 	assert(src);
542 
543 	for (pass = 0; pass <= ndir; pass++) {
544 
545 		state = NULL;
546 
547 		if (pass != 0) {
548 			/* insert level separator from the previous pass */
549 			if (room) {
550 				*xf++ = XFRM_SEP;
551 				room--;
552 			}
553 			want++;
554 		}
555 
556 		/* special pass for undefined */
557 		if (pass == ndir) {
558 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
559 		} else {
560 			direc = table->info->directive[pass];
561 		}
562 
563 		t = src;
564 
565 		if (direc & DIRECTIVE_BACKWARD) {
566 			wchar_t *bp, *fp, c;
567 			if (tr)
568 				free(tr);
569 			if ((tr = wcsdup(t)) == NULL) {
570 				errno = ENOMEM;
571 				goto fail;
572 			}
573 			bp = tr;
574 			fp = tr + wcslen(tr) - 1;
575 			while (bp < fp) {
576 				c = *bp;
577 				*bp++ = *fp;
578 				*fp-- = c;
579 			}
580 			t = (const wchar_t *)tr;
581 		}
582 
583 		if (direc & DIRECTIVE_POSITION) {
584 			while (*t || state) {
585 
586 				_collate_lookup(table, t, &len, &pri, pass, &state);
587 				t += len;
588 				if (pri <= 0) {
589 					if (pri < 0) {
590 						errno = EINVAL;
591 						goto fail;
592 					}
593 					state = NULL;
594 					pri = COLLATE_MAX_PRIORITY;
595 				}
596 
597 				b = xfrm(table, buf, pri, pass);
598 				want += b;
599 				if (room) {
600 					while (b) {
601 						b--;
602 						if (room) {
603 							*xf++ = buf[b];
604 							room--;
605 						}
606 					}
607 				}
608 				need = want;
609 			}
610 		} else {
611 			while (*t || state) {
612 				_collate_lookup(table, t, &len, &pri, pass, &state);
613 				t += len;
614 				if (pri <= 0) {
615 					if (pri < 0) {
616 						errno = EINVAL;
617 						goto fail;
618 					}
619 					state = NULL;
620 					continue;
621 				}
622 
623 				b = xfrm(table, buf, pri, pass);
624 				want += b;
625 				if (room) {
626 
627 					while (b) {
628 						b--;
629 						if (room) {
630 							*xf++ = buf[b];
631 							room--;
632 						}
633 					}
634 				}
635 				need = want;
636 			}
637 		}
638 	}
639 	free(tr);
640 	return (need);
641 
642 fail:
643 	free(tr);
644 	return ((size_t)(-1));
645 }
646 
647 /*
648  * __collate_equiv_value returns the primary collation value for the given
649  * collating symbol specified by str and len.  Zero or negative is returned
650  * if the collating symbol was not found.  This function is used by bracket
651  * code in the TRE regex library.
652  */
653 int
654 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
655 {
656 	int32_t e;
657 
658 	if (len < 1 || len >= COLLATE_STR_LEN)
659 		return (-1);
660 
661 	FIX_LOCALE(locale);
662 	struct xlocale_collate *table =
663 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
664 
665 	if (table->__collate_load_error)
666 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
667 
668 	if (len == 1) {
669 		e = -1;
670 		if (*str <= UCHAR_MAX)
671 			e = table->char_pri_table[*str].pri[0];
672 		else if (table->info->large_count > 0) {
673 			collate_large_t *match_large;
674 			match_large = largesearch(table, *str);
675 			if (match_large)
676 				e = match_large->pri.pri[0];
677 		}
678 		if (e == 0)
679 			return (1);
680 		return (e > 0 ? e : 0);
681 	}
682 	if (table->info->chain_count > 0) {
683 		wchar_t name[COLLATE_STR_LEN];
684 		collate_chain_t *match_chain;
685 		int clen;
686 
687 		wcsncpy (name, str, len);
688 		name[len] = 0;
689 		match_chain = chainsearch(table, name, &clen);
690 		if (match_chain) {
691 			e = match_chain->pri[0];
692 			if (e == 0)
693 				return (1);
694 			return (e < 0 ? -e : e);
695 		}
696 	}
697 	return (0);
698 }
699