xref: /dragonfly/lib/libc/locale/collate.c (revision 02318f07)
1 /*
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *		at Electronni Visti IA, Kiev, Ukraine.
5  *			All rights reserved.
6  *
7  * Copyright (c) 2011 The FreeBSD Foundation
8  * All rights reserved.
9  * Portions of this software were developed by David Chisnall
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * Adapted to xlocale by John Marino <draco@marino.st>
34  */
35 
36 #include "namespace.h"
37 #include <sys/types.h>
38 #include <sys/stat.h>
39 #include <sys/mman.h>
40 #include <assert.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <wchar.h>
45 #include <errno.h>
46 #include <unistd.h>
47 #include <fcntl.h>
48 #include "un-namespace.h"
49 
50 #include "collate.h"
51 #include "setlocale.h"
52 #include "ldpart.h"
53 #include "libc_private.h"
54 
55 struct xlocale_collate __xlocale_global_collate = {
56 	{{0}, "C"}, 1, 0, 0, 0
57 };
58 
59 struct xlocale_collate __xlocale_C_collate = {
60 	{{0}, "C"}, 1, 0, 0, 0
61 };
62 
63 static int
64 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
65 
66 static void
67 destruct_collate(void *t)
68 {
69 	struct xlocale_collate *table = t;
70 	if (table->map && (table->maplen > 0)) {
71 		(void) munmap(table->map, table->maplen);
72 	}
73 	free(t);
74 }
75 
76 void *
77 __collate_load(const char *encoding, __unused locale_t unused)
78 {
79 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
80 		return &__xlocale_C_collate;
81 	}
82 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
83 	table->header.header.destructor = destruct_collate;
84 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
85 	// the caching outside of this section
86 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
87 		xlocale_release(table);
88 		return NULL;
89 	}
90 	return table;
91 }
92 
93 /**
94  * Load the collation tables for the specified encoding into the global table.
95  */
96 int
97 __collate_load_tables(const char *encoding)
98 {
99 	int ret = __collate_load_tables_l(encoding, &__xlocale_global_collate);
100 	return ret;
101 }
102 
103 int
104 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
105 {
106 	int i, chains, z;
107 	char buf[PATH_MAX];
108 	char *TMP;
109 	char *map;
110 	collate_info_t *info;
111 	struct stat sbuf;
112 	int fd;
113 
114 	table->__collate_load_error = 1;
115 
116 	/* 'encoding' must be already checked. */
117 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
118 		return (_LDP_CACHE);
119 	}
120 
121 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE",
122 	    _PathLocale, encoding);
123 
124 	if ((fd = _open(buf, O_RDONLY)) < 0)
125 		return (_LDP_ERROR);
126 	if (_fstat(fd, &sbuf) < 0) {
127 		(void) _close(fd);
128 		return (_LDP_ERROR);
129 	}
130 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
131 		(void) _close(fd);
132 		errno = EINVAL;
133 		return (_LDP_ERROR);
134 	}
135 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
136 	(void) _close(fd);
137 	if ((TMP = map) == NULL) {
138 		return (_LDP_ERROR);
139 	}
140 
141 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
142 		(void) munmap(map, sbuf.st_size);
143 		errno = EINVAL;
144 		return (_LDP_ERROR);
145 	}
146 	TMP += COLLATE_STR_LEN;
147 
148 	info = (void *)TMP;
149 	TMP += sizeof (*info);
150 
151 	if ((info->directive_count < 1) ||
152 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
153 	    ((chains = info->chain_count) < 0)) {
154 		(void) munmap(map, sbuf.st_size);
155 		errno = EINVAL;
156 		return (_LDP_ERROR);
157 	}
158 
159 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
160 	    (sizeof (collate_chain_t) * chains) +
161 	    (sizeof (collate_large_t) * info->large_count);
162 	for (z = 0; z < info->directive_count; z++) {
163 		i += sizeof (collate_subst_t) * info->subst_count[z];
164 	}
165 	if (i != (sbuf.st_size - (TMP - map))) {
166 		(void) munmap(map, sbuf.st_size);
167 		errno = EINVAL;
168 		return (_LDP_ERROR);
169 	}
170 
171 	table->info = info;
172 	table->char_pri_table = (void *)TMP;
173 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
174 
175 	for (z = 0; z < info->directive_count; z++) {
176 		if (info->subst_count[z] > 0) {
177 			table->subst_table[z] = (void *)TMP;
178 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
179 		} else {
180 			table->subst_table[z] = NULL;
181 		}
182 	}
183 
184 	if (chains > 0) {
185 		table->chain_pri_table = (void *)TMP;
186 		TMP += chains * sizeof (collate_chain_t);
187 	} else
188 		table->chain_pri_table = NULL;
189 	if (info->large_count > 0)
190 		table->large_pri_table = (void *)TMP;
191 	else
192 		table->large_pri_table = NULL;
193 
194 	table->__collate_load_error = 0;
195 	return (_LDP_LOADED);
196 }
197 
198 static const int32_t *
199 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
200 {
201 	const collate_subst_t *p;
202 	int n = table->info->subst_count[pass];
203 
204 	if (n == 0)
205 		return (NULL);
206 
207 	if (pass >= table->info->directive_count)
208 		return (NULL);
209 
210 	if (!(key & COLLATE_SUBST_PRIORITY))
211 		return (NULL);
212 
213 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
214 	assert(p->key == key);
215 	return (p->pri);
216 }
217 
218 static collate_chain_t *
219 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
220 {
221 	int low = 0;
222 	int high = table->info->chain_count - 1;;
223 	int next, compar, l;
224 	collate_chain_t *p;
225 	collate_chain_t *tab = table->chain_pri_table;
226 
227 	if (high < 0)
228 		return (NULL);
229 
230 	while (low <= high) {
231 		next = (low + high) / 2;
232 		p = tab + next;
233 		compar = *key - *p->str;
234 		if (compar == 0) {
235 			l = wcsnlen(p->str, COLLATE_STR_LEN);
236 			compar = wcsncmp(key, p->str, l);
237 			if (compar == 0) {
238 				*len = l;
239 				return (p);
240 			}
241 		}
242 		if (compar > 0)
243 			low = next + 1;
244 		else
245 			high = next - 1;
246 	}
247 	return (NULL);
248 }
249 
250 static collate_large_t *
251 largesearch(struct xlocale_collate *table, const wchar_t key)
252 {
253 	int low = 0;
254 	int high = table->info->large_count - 1;
255 	int next, compar;
256 	collate_large_t *p;
257 	collate_large_t *tab = table->large_pri_table;
258 
259 	if (high < 0)
260 		return (NULL);
261 
262 	while (low <= high) {
263 		next = (low + high) / 2;
264 		p = tab + next;
265 		compar = key - p->val;
266 		if (compar == 0)
267 			return (p);
268 		if (compar > 0)
269 			low = next + 1;
270 		else
271 			high = next - 1;
272 	}
273 	return (NULL);
274 }
275 
276 void
277 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
278     int *pri, int which, const int **state)
279 {
280 	collate_chain_t *p2;
281 	collate_large_t *match;
282 	int p, l;
283 	const int *sptr;
284 
285 	/*
286 	 * If this is the "last" pass for the UNDEFINED, then
287 	 * we just return the priority itself.
288 	 */
289 	if (which >= table->info->directive_count) {
290 		*pri = *t;
291 		*len = 1;
292 		*state = NULL;
293 		return;
294 	}
295 
296 	/*
297 	 * If we have remaining substitution data from a previous
298 	 * call, consume it first.
299 	 */
300 	if ((sptr = *state) != NULL) {
301 		*pri = *sptr;
302 		sptr++;
303 		if ((sptr == *state) || (sptr == NULL))
304 			*state = NULL;
305 		else
306 			*state = sptr;
307 		*len = 0;
308 		return;
309 	}
310 
311 	/* No active substitutions */
312 	*len = 1;
313 
314 	/*
315 	 * Check for composites such as diphthongs that collate as a
316 	 * single element (aka chains or collating-elements).
317 	 */
318 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
319 	    ((p = p2->pri[which]) >= 0)) {
320 
321 		*len = l;
322 		*pri = p;
323 
324 	} else if (*t <= UCHAR_MAX) {
325 
326 		/*
327 		 * Character is a small (8-bit) character.
328 		 * We just look these up directly for speed.
329 		 */
330 		*pri = table->char_pri_table[*t].pri[which];
331 
332 	} else if ((table->info->large_count > 0) &&
333 	    ((match = largesearch(table, *t)) != NULL)) {
334 
335 		/*
336 		 * Character was found in the extended table.
337 		 */
338 		*pri = match->pri.pri[which];
339 
340 	} else {
341 		/*
342 		 * Character lacks a specific definition.
343 		 */
344 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
345 			/* Mask off sign bit to prevent ordering confusion. */
346 			*pri = (*t & COLLATE_MAX_PRIORITY);
347 		} else {
348 			*pri = table->info->undef_pri[which];
349 		}
350 		/* No substitutions for undefined characters! */
351 		return;
352 	}
353 
354 	/*
355 	 * Try substituting (expanding) the character.  We are
356 	 * currently doing this *after* the chain compression.  I
357 	 * think it should not matter, but this way might be slightly
358 	 * faster.
359 	 *
360 	 * We do this after the priority search, as this will help us
361 	 * to identify a single key value.  In order for this to work,
362 	 * its important that the priority assigned to a given element
363 	 * to be substituted be unique for that level.  The localedef
364 	 * code ensures this for us.
365 	 */
366 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
367 		if ((*pri = *sptr) > 0) {
368 			sptr++;
369 			*state = *sptr ? sptr : NULL;
370 		}
371 	}
372 
373 }
374 
375 /*
376  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
377  * NOT NULL terminate.  That is left to the caller.
378  */
379 size_t
380 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
381     size_t room)
382 {
383 	int		pri;
384 	int		len;
385 	const wchar_t	*t;
386 	wchar_t		*tr = NULL;
387 	int		direc;
388 	int		pass;
389 	const int32_t 	*state;
390 	size_t		want = 0;
391 	size_t		need = 0;
392 	int		ndir = table->info->directive_count;
393 
394 	assert(src);
395 
396 	for (pass = 0; pass <= ndir; pass++) {
397 
398 		state = NULL;
399 
400 		if (pass != 0) {
401 			/* insert level separator from the previous pass */
402 			if (room) {
403 				*xf++ = 1;
404 				room--;
405 			}
406 			want++;
407 		}
408 
409 		/* special pass for undefined */
410 		if (pass == ndir) {
411 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
412 		} else {
413 			direc = table->info->directive[pass];
414 		}
415 
416 		t = src;
417 
418 		if (direc & DIRECTIVE_BACKWARD) {
419 			wchar_t *bp, *fp, c;
420 			if (tr)
421 				free(tr);
422 			if ((tr = wcsdup(t)) == NULL) {
423 				errno = ENOMEM;
424 				goto fail;
425 			}
426 			bp = tr;
427 			fp = tr + wcslen(tr) - 1;
428 			while (bp < fp) {
429 				c = *bp;
430 				*bp++ = *fp;
431 				*fp-- = c;
432 			}
433 			t = (const wchar_t *)tr;
434 		}
435 
436 		if (direc & DIRECTIVE_POSITION) {
437 			while (*t || state) {
438 				_collate_lookup(table, t, &len, &pri, pass, &state);
439 				t += len;
440 				if (pri <= 0) {
441 					if (pri < 0) {
442 						errno = EINVAL;
443 						goto fail;
444 					}
445 					state = NULL;
446 					pri = COLLATE_MAX_PRIORITY;
447 				}
448 				if (room) {
449 					*xf++ = pri;
450 					room--;
451 				}
452 				want++;
453 				need = want;
454 			}
455 		} else {
456 			while (*t || state) {
457 				_collate_lookup(table, t, &len, &pri, pass, &state);
458 				t += len;
459 				if (pri <= 0) {
460 					if (pri < 0) {
461 						errno = EINVAL;
462 						goto fail;
463 					}
464 					state = NULL;
465 					continue;
466 				}
467 				if (room) {
468 					*xf++ = pri;
469 					room--;
470 				}
471 				want++;
472 				need = want;
473 			}
474 		}
475 	}
476 	free(tr);
477 	return (need);
478 
479 fail:
480 	free(tr);
481 	return ((size_t)(-1));
482 }
483 
484 /*
485  * In the non-POSIX case, we transform each character into a string of
486  * characters representing the character's priority.  Since char is usually
487  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
488  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
489  * bits per byte.
490  *
491  * It turns out that we sometimes have real priorities that are
492  * 31-bits wide.  (But: be careful using priorities where the high
493  * order bit is set -- i.e. the priority is negative.  The sort order
494  * may be surprising!)
495  *
496  * TODO: This would be a good area to optimize somewhat.  It turns out
497  * that real prioririties *except for the last UNDEFINED pass* are generally
498  * very small.  We need the localedef code to precalculate the max
499  * priority for us, and ideally also give us a mask, and then we could
500  * severely limit what we expand to.
501  */
502 #define	XFRM_BYTES	6
503 #define	XFRM_OFFSET	('0')	/* make all printable characters */
504 #define	XFRM_SHIFT	6
505 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
506 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
507 
508 static int
509 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
510 {
511 	/* we use unsigned to ensure zero fill on right shift */
512 	uint32_t val = (uint32_t)table->info->pri_count[pass];
513 	int nc = 0;
514 
515 	while (val) {
516 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
517 		pri >>= XFRM_SHIFT;
518 		val >>= XFRM_SHIFT;
519 		p++;
520 		nc++;
521 	}
522 	return (nc);
523 }
524 
525 size_t
526 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
527     size_t room)
528 {
529 	int		pri;
530 	int		len;
531 	const wchar_t	*t;
532 	wchar_t		*tr = NULL;
533 	int		direc;
534 	int		pass;
535 	const int32_t 	*state;
536 	size_t		want = 0;
537 	size_t		need = 0;
538 	int		b;
539 	uint8_t		buf[XFRM_BYTES];
540 	int		ndir = table->info->directive_count;
541 
542 	assert(src);
543 
544 	for (pass = 0; pass <= ndir; pass++) {
545 
546 		state = NULL;
547 
548 		if (pass != 0) {
549 			/* insert level separator from the previous pass */
550 			if (room) {
551 				*xf++ = XFRM_SEP;
552 				room--;
553 			}
554 			want++;
555 		}
556 
557 		/* special pass for undefined */
558 		if (pass == ndir) {
559 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
560 		} else {
561 			direc = table->info->directive[pass];
562 		}
563 
564 		t = src;
565 
566 		if (direc & DIRECTIVE_BACKWARD) {
567 			wchar_t *bp, *fp, c;
568 			if (tr)
569 				free(tr);
570 			if ((tr = wcsdup(t)) == NULL) {
571 				errno = ENOMEM;
572 				goto fail;
573 			}
574 			bp = tr;
575 			fp = tr + wcslen(tr) - 1;
576 			while (bp < fp) {
577 				c = *bp;
578 				*bp++ = *fp;
579 				*fp-- = c;
580 			}
581 			t = (const wchar_t *)tr;
582 		}
583 
584 		if (direc & DIRECTIVE_POSITION) {
585 			while (*t || state) {
586 
587 				_collate_lookup(table, t, &len, &pri, pass, &state);
588 				t += len;
589 				if (pri <= 0) {
590 					if (pri < 0) {
591 						errno = EINVAL;
592 						goto fail;
593 					}
594 					state = NULL;
595 					pri = COLLATE_MAX_PRIORITY;
596 				}
597 
598 				b = xfrm(table, buf, pri, pass);
599 				want += b;
600 				if (room) {
601 					while (b) {
602 						b--;
603 						if (room) {
604 							*xf++ = buf[b];
605 							room--;
606 						}
607 					}
608 				}
609 				need = want;
610 			}
611 		} else {
612 			while (*t || state) {
613 				_collate_lookup(table, t, &len, &pri, pass, &state);
614 				t += len;
615 				if (pri <= 0) {
616 					if (pri < 0) {
617 						errno = EINVAL;
618 						goto fail;
619 					}
620 					state = NULL;
621 					continue;
622 				}
623 
624 				b = xfrm(table, buf, pri, pass);
625 				want += b;
626 				if (room) {
627 
628 					while (b) {
629 						b--;
630 						if (room) {
631 							*xf++ = buf[b];
632 							room--;
633 						}
634 					}
635 				}
636 				need = want;
637 			}
638 		}
639 	}
640 	free(tr);
641 	return (need);
642 
643 fail:
644 	free(tr);
645 	return ((size_t)(-1));
646 }
647 
648 /*
649  * __collate_equiv_value returns the primary collation value for the given
650  * collating symbol specified by str and len.  Zero or negative is returned
651  * if the collating symbol was not found.  This function is used by bracket
652  * code in the TRE regex library.
653  */
654 int
655 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
656 {
657 	int32_t e;
658 
659 	if (len < 1 || len >= COLLATE_STR_LEN)
660 		return (-1);
661 
662 	FIX_LOCALE(locale);
663 	struct xlocale_collate *table =
664 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
665 
666 	if (table->__collate_load_error)
667 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
668 
669 	if (len == 1) {
670 		e = -1;
671 		if (*str <= UCHAR_MAX)
672 			e = table->char_pri_table[*str].pri[0];
673 		else if (table->info->large_count > 0) {
674 			collate_large_t *match_large;
675 			match_large = largesearch(table, *str);
676 			if (match_large)
677 				e = match_large->pri.pri[0];
678 		}
679 		if (e == 0)
680 			return (1);
681 		return (e > 0 ? e : 0);
682 	}
683 	if (table->info->chain_count > 0) {
684 		wchar_t name[COLLATE_STR_LEN];
685 		collate_chain_t *match_chain;
686 		int clen;
687 
688 		wcsncpy (name, str, len);
689 		name[len] = 0;
690 		match_chain = chainsearch(table, name, &clen);
691 		if (match_chain) {
692 			e = match_chain->pri[0];
693 			if (e == 0)
694 				return (1);
695 			return (e < 0 ? -e : e);
696 		}
697 	}
698 	return (0);
699 }
700