xref: /dragonfly/lib/libc/locale/collate.c (revision c880cbaf)
1 /*
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *		at Electronni Visti IA, Kiev, Ukraine.
5  *			All rights reserved.
6  *
7  * Copyright (c) 2011 The FreeBSD Foundation
8  * All rights reserved.
9  * Portions of this software were developed by David Chisnall
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * Adapted to xlocale by John Marino <draco@marino.st>
34  */
35 
36 #include "namespace.h"
37 #include <sys/types.h>
38 #include <sys/stat.h>
39 #include <sys/mman.h>
40 #include <assert.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <wchar.h>
45 #include <errno.h>
46 #include <unistd.h>
47 #include <fcntl.h>
48 #include "un-namespace.h"
49 
50 #include "collate.h"
51 #include "setlocale.h"
52 #include "ldpart.h"
53 #include "libc_private.h"
54 
55 struct xlocale_collate __xlocale_global_collate = {
56 	.header = {{0}, "C"},
57 	.__collate_load_error = 1
58 };
59 
60 struct xlocale_collate __xlocale_C_collate = {
61 	.header = {{0}, "C"},
62 	.__collate_load_error = 1
63 };
64 
65 static int
66 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
67 
68 static void
69 destruct_collate(void *t)
70 {
71 	struct xlocale_collate *table = t;
72 	if (table->map && (table->maplen > 0)) {
73 		(void) munmap(table->map, table->maplen);
74 	}
75 	free(t);
76 }
77 
78 void *
79 __collate_load(const char *encoding, __unused locale_t unused)
80 {
81 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
82 		return &__xlocale_C_collate;
83 	}
84 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
85 	table->header.header.destructor = destruct_collate;
86 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
87 	// the caching outside of this section
88 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
89 		xlocale_release(table);
90 		return NULL;
91 	}
92 	return table;
93 }
94 
95 /**
96  * Load the collation tables for the specified encoding into the global table.
97  */
98 int
99 __collate_load_tables(const char *encoding)
100 {
101 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
102 }
103 
104 int
105 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
106 {
107 	int i, chains, z;
108 	char buf[PATH_MAX];
109 	char *TMP;
110 	char *map;
111 	collate_info_t *info;
112 	struct stat sbuf;
113 	int fd;
114 
115 	table->__collate_load_error = 1;
116 
117 	/* 'encoding' must be already checked. */
118 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
119 		return (_LDP_CACHE);
120 	}
121 
122 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE",
123 	    _PathLocale, encoding);
124 
125 	if ((fd = _open(buf, O_RDONLY)) < 0)
126 		return (_LDP_ERROR);
127 	if (_fstat(fd, &sbuf) < 0) {
128 		(void) _close(fd);
129 		return (_LDP_ERROR);
130 	}
131 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
132 		(void) _close(fd);
133 		errno = EINVAL;
134 		return (_LDP_ERROR);
135 	}
136 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
137 	(void) _close(fd);
138 	if ((TMP = map) == NULL) {
139 		return (_LDP_ERROR);
140 	}
141 
142 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
143 		(void) munmap(map, sbuf.st_size);
144 		errno = EINVAL;
145 		return (_LDP_ERROR);
146 	}
147 	TMP += COLLATE_STR_LEN;
148 
149 	info = (void *)TMP;
150 	TMP += sizeof (*info);
151 
152 	if ((info->directive_count < 1) ||
153 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
154 	    ((chains = info->chain_count) < 0)) {
155 		(void) munmap(map, sbuf.st_size);
156 		errno = EINVAL;
157 		return (_LDP_ERROR);
158 	}
159 
160 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
161 	    (sizeof (collate_chain_t) * chains) +
162 	    (sizeof (collate_large_t) * info->large_count);
163 	for (z = 0; z < info->directive_count; z++) {
164 		i += sizeof (collate_subst_t) * info->subst_count[z];
165 	}
166 	if (i != (sbuf.st_size - (TMP - map))) {
167 		(void) munmap(map, sbuf.st_size);
168 		errno = EINVAL;
169 		return (_LDP_ERROR);
170 	}
171 
172 	table->info = info;
173 	table->char_pri_table = (void *)TMP;
174 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
175 
176 	for (z = 0; z < info->directive_count; z++) {
177 		if (info->subst_count[z] > 0) {
178 			table->subst_table[z] = (void *)TMP;
179 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
180 		} else {
181 			table->subst_table[z] = NULL;
182 		}
183 	}
184 
185 	if (chains > 0) {
186 		table->chain_pri_table = (void *)TMP;
187 		TMP += chains * sizeof (collate_chain_t);
188 	} else
189 		table->chain_pri_table = NULL;
190 	if (info->large_count > 0)
191 		table->large_pri_table = (void *)TMP;
192 	else
193 		table->large_pri_table = NULL;
194 
195 	table->__collate_load_error = 0;
196 	return (_LDP_LOADED);
197 }
198 
199 static const int32_t *
200 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
201 {
202 	const collate_subst_t *p;
203 	int n = table->info->subst_count[pass];
204 
205 	if (n == 0)
206 		return (NULL);
207 
208 	if (pass >= table->info->directive_count)
209 		return (NULL);
210 
211 	if (!(key & COLLATE_SUBST_PRIORITY))
212 		return (NULL);
213 
214 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
215 	assert(p->key == key);
216 	return (p->pri);
217 }
218 
219 static collate_chain_t *
220 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
221 {
222 	int low = 0;
223 	int high = table->info->chain_count - 1;;
224 	int next, compar, l;
225 	collate_chain_t *p;
226 	collate_chain_t *tab = table->chain_pri_table;
227 
228 	if (high < 0)
229 		return (NULL);
230 
231 	while (low <= high) {
232 		next = (low + high) / 2;
233 		p = tab + next;
234 		compar = *key - *p->str;
235 		if (compar == 0) {
236 			l = wcsnlen(p->str, COLLATE_STR_LEN);
237 			compar = wcsncmp(key, p->str, l);
238 			if (compar == 0) {
239 				*len = l;
240 				return (p);
241 			}
242 		}
243 		if (compar > 0)
244 			low = next + 1;
245 		else
246 			high = next - 1;
247 	}
248 	return (NULL);
249 }
250 
251 static collate_large_t *
252 largesearch(struct xlocale_collate *table, const wchar_t key)
253 {
254 	int low = 0;
255 	int high = table->info->large_count - 1;
256 	int next, compar;
257 	collate_large_t *p;
258 	collate_large_t *tab = table->large_pri_table;
259 
260 	if (high < 0)
261 		return (NULL);
262 
263 	while (low <= high) {
264 		next = (low + high) / 2;
265 		p = tab + next;
266 		compar = key - p->val;
267 		if (compar == 0)
268 			return (p);
269 		if (compar > 0)
270 			low = next + 1;
271 		else
272 			high = next - 1;
273 	}
274 	return (NULL);
275 }
276 
277 void
278 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
279     int *pri, int which, const int **state)
280 {
281 	collate_chain_t *p2;
282 	collate_large_t *match;
283 	int p, l;
284 	const int *sptr;
285 
286 	/*
287 	 * If this is the "last" pass for the UNDEFINED, then
288 	 * we just return the priority itself.
289 	 */
290 	if (which >= table->info->directive_count) {
291 		*pri = *t;
292 		*len = 1;
293 		*state = NULL;
294 		return;
295 	}
296 
297 	/*
298 	 * If we have remaining substitution data from a previous
299 	 * call, consume it first.
300 	 */
301 	if ((sptr = *state) != NULL) {
302 		*pri = *sptr;
303 		sptr++;
304 		if ((sptr == *state) || (sptr == NULL))
305 			*state = NULL;
306 		else
307 			*state = sptr;
308 		*len = 0;
309 		return;
310 	}
311 
312 	/* No active substitutions */
313 	*len = 1;
314 
315 	/*
316 	 * Check for composites such as diphthongs that collate as a
317 	 * single element (aka chains or collating-elements).
318 	 */
319 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
320 	    ((p = p2->pri[which]) >= 0)) {
321 
322 		*len = l;
323 		*pri = p;
324 
325 	} else if (*t <= UCHAR_MAX) {
326 
327 		/*
328 		 * Character is a small (8-bit) character.
329 		 * We just look these up directly for speed.
330 		 */
331 		*pri = table->char_pri_table[*t].pri[which];
332 
333 	} else if ((table->info->large_count > 0) &&
334 	    ((match = largesearch(table, *t)) != NULL)) {
335 
336 		/*
337 		 * Character was found in the extended table.
338 		 */
339 		*pri = match->pri.pri[which];
340 
341 	} else {
342 		/*
343 		 * Character lacks a specific definition.
344 		 */
345 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
346 			/* Mask off sign bit to prevent ordering confusion. */
347 			*pri = (*t & COLLATE_MAX_PRIORITY);
348 		} else {
349 			*pri = table->info->undef_pri[which];
350 		}
351 		/* No substitutions for undefined characters! */
352 		return;
353 	}
354 
355 	/*
356 	 * Try substituting (expanding) the character.  We are
357 	 * currently doing this *after* the chain compression.  I
358 	 * think it should not matter, but this way might be slightly
359 	 * faster.
360 	 *
361 	 * We do this after the priority search, as this will help us
362 	 * to identify a single key value.  In order for this to work,
363 	 * its important that the priority assigned to a given element
364 	 * to be substituted be unique for that level.  The localedef
365 	 * code ensures this for us.
366 	 */
367 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
368 		if ((*pri = *sptr) > 0) {
369 			sptr++;
370 			*state = *sptr ? sptr : NULL;
371 		}
372 	}
373 
374 }
375 
376 /*
377  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
378  * NOT NULL terminate.  That is left to the caller.
379  */
380 size_t
381 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
382     size_t room)
383 {
384 	int		pri;
385 	int		len;
386 	const wchar_t	*t;
387 	wchar_t		*tr = NULL;
388 	int		direc;
389 	int		pass;
390 	const int32_t 	*state;
391 	size_t		want = 0;
392 	size_t		need = 0;
393 	int		ndir = table->info->directive_count;
394 
395 	assert(src);
396 
397 	for (pass = 0; pass <= ndir; pass++) {
398 
399 		state = NULL;
400 
401 		if (pass != 0) {
402 			/* insert level separator from the previous pass */
403 			if (room) {
404 				*xf++ = 1;
405 				room--;
406 			}
407 			want++;
408 		}
409 
410 		/* special pass for undefined */
411 		if (pass == ndir) {
412 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
413 		} else {
414 			direc = table->info->directive[pass];
415 		}
416 
417 		t = src;
418 
419 		if (direc & DIRECTIVE_BACKWARD) {
420 			wchar_t *bp, *fp, c;
421 			if (tr)
422 				free(tr);
423 			if ((tr = wcsdup(t)) == NULL) {
424 				errno = ENOMEM;
425 				goto fail;
426 			}
427 			bp = tr;
428 			fp = tr + wcslen(tr) - 1;
429 			while (bp < fp) {
430 				c = *bp;
431 				*bp++ = *fp;
432 				*fp-- = c;
433 			}
434 			t = (const wchar_t *)tr;
435 		}
436 
437 		if (direc & DIRECTIVE_POSITION) {
438 			while (*t || state) {
439 				_collate_lookup(table, t, &len, &pri, pass, &state);
440 				t += len;
441 				if (pri <= 0) {
442 					if (pri < 0) {
443 						errno = EINVAL;
444 						goto fail;
445 					}
446 					state = NULL;
447 					pri = COLLATE_MAX_PRIORITY;
448 				}
449 				if (room) {
450 					*xf++ = pri;
451 					room--;
452 				}
453 				want++;
454 				need = want;
455 			}
456 		} else {
457 			while (*t || state) {
458 				_collate_lookup(table, t, &len, &pri, pass, &state);
459 				t += len;
460 				if (pri <= 0) {
461 					if (pri < 0) {
462 						errno = EINVAL;
463 						goto fail;
464 					}
465 					state = NULL;
466 					continue;
467 				}
468 				if (room) {
469 					*xf++ = pri;
470 					room--;
471 				}
472 				want++;
473 				need = want;
474 			}
475 		}
476 	}
477 	free(tr);
478 	return (need);
479 
480 fail:
481 	free(tr);
482 	return ((size_t)(-1));
483 }
484 
485 /*
486  * In the non-POSIX case, we transform each character into a string of
487  * characters representing the character's priority.  Since char is usually
488  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
489  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
490  * bits per byte.
491  *
492  * It turns out that we sometimes have real priorities that are
493  * 31-bits wide.  (But: be careful using priorities where the high
494  * order bit is set -- i.e. the priority is negative.  The sort order
495  * may be surprising!)
496  *
497  * TODO: This would be a good area to optimize somewhat.  It turns out
498  * that real prioririties *except for the last UNDEFINED pass* are generally
499  * very small.  We need the localedef code to precalculate the max
500  * priority for us, and ideally also give us a mask, and then we could
501  * severely limit what we expand to.
502  */
503 #define	XFRM_BYTES	6
504 #define	XFRM_OFFSET	('0')	/* make all printable characters */
505 #define	XFRM_SHIFT	6
506 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
507 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
508 
509 static int
510 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
511 {
512 	/* we use unsigned to ensure zero fill on right shift */
513 	uint32_t val = (uint32_t)table->info->pri_count[pass];
514 	int nc = 0;
515 
516 	while (val) {
517 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
518 		pri >>= XFRM_SHIFT;
519 		val >>= XFRM_SHIFT;
520 		p++;
521 		nc++;
522 	}
523 	return (nc);
524 }
525 
526 size_t
527 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
528     size_t room)
529 {
530 	int		pri;
531 	int		len;
532 	const wchar_t	*t;
533 	wchar_t		*tr = NULL;
534 	int		direc;
535 	int		pass;
536 	const int32_t 	*state;
537 	size_t		want = 0;
538 	size_t		need = 0;
539 	int		b;
540 	uint8_t		buf[XFRM_BYTES];
541 	int		ndir = table->info->directive_count;
542 
543 	assert(src);
544 
545 	for (pass = 0; pass <= ndir; pass++) {
546 
547 		state = NULL;
548 
549 		if (pass != 0) {
550 			/* insert level separator from the previous pass */
551 			if (room) {
552 				*xf++ = XFRM_SEP;
553 				room--;
554 			}
555 			want++;
556 		}
557 
558 		/* special pass for undefined */
559 		if (pass == ndir) {
560 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
561 		} else {
562 			direc = table->info->directive[pass];
563 		}
564 
565 		t = src;
566 
567 		if (direc & DIRECTIVE_BACKWARD) {
568 			wchar_t *bp, *fp, c;
569 			if (tr)
570 				free(tr);
571 			if ((tr = wcsdup(t)) == NULL) {
572 				errno = ENOMEM;
573 				goto fail;
574 			}
575 			bp = tr;
576 			fp = tr + wcslen(tr) - 1;
577 			while (bp < fp) {
578 				c = *bp;
579 				*bp++ = *fp;
580 				*fp-- = c;
581 			}
582 			t = (const wchar_t *)tr;
583 		}
584 
585 		if (direc & DIRECTIVE_POSITION) {
586 			while (*t || state) {
587 
588 				_collate_lookup(table, t, &len, &pri, pass, &state);
589 				t += len;
590 				if (pri <= 0) {
591 					if (pri < 0) {
592 						errno = EINVAL;
593 						goto fail;
594 					}
595 					state = NULL;
596 					pri = COLLATE_MAX_PRIORITY;
597 				}
598 
599 				b = xfrm(table, buf, pri, pass);
600 				want += b;
601 				if (room) {
602 					while (b) {
603 						b--;
604 						if (room) {
605 							*xf++ = buf[b];
606 							room--;
607 						}
608 					}
609 				}
610 				need = want;
611 			}
612 		} else {
613 			while (*t || state) {
614 				_collate_lookup(table, t, &len, &pri, pass, &state);
615 				t += len;
616 				if (pri <= 0) {
617 					if (pri < 0) {
618 						errno = EINVAL;
619 						goto fail;
620 					}
621 					state = NULL;
622 					continue;
623 				}
624 
625 				b = xfrm(table, buf, pri, pass);
626 				want += b;
627 				if (room) {
628 
629 					while (b) {
630 						b--;
631 						if (room) {
632 							*xf++ = buf[b];
633 							room--;
634 						}
635 					}
636 				}
637 				need = want;
638 			}
639 		}
640 	}
641 	free(tr);
642 	return (need);
643 
644 fail:
645 	free(tr);
646 	return ((size_t)(-1));
647 }
648 
649 /*
650  * __collate_equiv_value returns the primary collation value for the given
651  * collating symbol specified by str and len.  Zero or negative is returned
652  * if the collating symbol was not found.  This function is used by bracket
653  * code in the TRE regex library.
654  */
655 int
656 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
657 {
658 	int32_t e;
659 
660 	if (len < 1 || len >= COLLATE_STR_LEN)
661 		return (-1);
662 
663 	FIX_LOCALE(locale);
664 	struct xlocale_collate *table =
665 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
666 
667 	if (table->__collate_load_error)
668 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
669 
670 	if (len == 1) {
671 		e = -1;
672 		if (*str <= UCHAR_MAX)
673 			e = table->char_pri_table[*str].pri[0];
674 		else if (table->info->large_count > 0) {
675 			collate_large_t *match_large;
676 			match_large = largesearch(table, *str);
677 			if (match_large)
678 				e = match_large->pri.pri[0];
679 		}
680 		if (e == 0)
681 			return (1);
682 		return (e > 0 ? e : 0);
683 	}
684 	if (table->info->chain_count > 0) {
685 		wchar_t name[COLLATE_STR_LEN];
686 		collate_chain_t *match_chain;
687 		int clen;
688 
689 		wcsncpy (name, str, len);
690 		name[len] = 0;
691 		match_chain = chainsearch(table, name, &clen);
692 		if (match_chain) {
693 			e = match_chain->pri[0];
694 			if (e == 0)
695 				return (1);
696 			return (e < 0 ? -e : e);
697 		}
698 	}
699 	return (0);
700 }
701