xref: /dragonfly/lib/libc/locale/collate.c (revision e98bdfd3)
1 /*
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *		at Electronni Visti IA, Kiev, Ukraine.
5  *			All rights reserved.
6  *
7  * Copyright (c) 2011 The FreeBSD Foundation
8  * All rights reserved.
9  * Portions of this software were developed by David Chisnall
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * Adapted to xlocale by John Marino <draco@marino.st>
34  */
35 
36 #include "namespace.h"
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include <errno.h>
42 #include <unistd.h>
43 #include <fcntl.h>
44 #include <assert.h>
45 #include <sys/types.h>
46 #include <sys/stat.h>
47 #include <sys/mman.h>
48 #include "un-namespace.h"
49 
50 #include "collate.h"
51 #include "setlocale.h"
52 #include "ldpart.h"
53 
54 struct xlocale_collate __xlocale_global_collate = {
55 	{{0}, "C"}, 1, 0, 0, 0
56 };
57 
58 struct xlocale_collate __xlocale_C_collate = {
59 	{{0}, "C"}, 1, 0, 0, 0
60 };
61 
62 #include "libc_private.h"
63 
64 int
65 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
66 
67 static void
68 destruct_collate(void *t)
69 {
70 	struct xlocale_collate *table = t;
71 	if (table->map && (table->maplen > 0)) {
72 		(void) munmap(table->map, table->maplen);
73 	}
74 	free(t);
75 }
76 
77 void *
78 __collate_load(const char *encoding, __unused locale_t unused)
79 {
80 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
81 		return &__xlocale_C_collate;
82 	}
83 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
84 	table->header.header.destructor = destruct_collate;
85 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
86 	// the caching outside of this section
87 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
88 		xlocale_release(table);
89 		return NULL;
90 	}
91 	return table;
92 }
93 
94 /**
95  * Load the collation tables for the specified encoding into the global table.
96  */
97 int
98 __collate_load_tables(const char *encoding)
99 {
100 	int ret = __collate_load_tables_l(encoding, &__xlocale_global_collate);
101 	return ret;
102 }
103 
104 int
105 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
106 {
107 	int i, chains, z;
108 	char buf[PATH_MAX];
109 	char *TMP;
110 	char *map;
111 	collate_info_t *info;
112 	struct stat sbuf;
113 	int fd;
114 
115 	table->__collate_load_error = 1;
116 
117 	/* 'encoding' must be already checked. */
118 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
119 		return (_LDP_CACHE);
120 	}
121 
122 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE",
123 	    _PathLocale, encoding);
124 
125 	if ((fd = _open(buf, O_RDONLY)) < 0)
126 		return (_LDP_ERROR);
127 	if (_fstat(fd, &sbuf) < 0) {
128 		(void) _close(fd);
129 		return (_LDP_ERROR);
130 	}
131 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
132 		(void) _close(fd);
133 		errno = EINVAL;
134 		return (_LDP_ERROR);
135 	}
136 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
137 	(void) _close(fd);
138 	if ((TMP = map) == NULL) {
139 		return (_LDP_ERROR);
140 	}
141 
142 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
143 		(void) munmap(map, sbuf.st_size);
144 		errno = EINVAL;
145 		return (_LDP_ERROR);
146 	}
147 	TMP += COLLATE_STR_LEN;
148 
149 	info = (void *)TMP;
150 	TMP += sizeof (*info);
151 
152 	if ((info->directive_count < 1) ||
153 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
154 	    ((chains = info->chain_count) < 0)) {
155 		(void) munmap(map, sbuf.st_size);
156 		errno = EINVAL;
157 		return (_LDP_ERROR);
158 	}
159 
160 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
161 	    (sizeof (collate_chain_t) * chains) +
162 	    (sizeof (collate_large_t) * info->large_count);
163 	for (z = 0; z < info->directive_count; z++) {
164 		i += sizeof (collate_subst_t) * info->subst_count[z];
165 	}
166 	if (i != (sbuf.st_size - (TMP - map))) {
167 		(void) munmap(map, sbuf.st_size);
168 		errno = EINVAL;
169 		return (_LDP_ERROR);
170 	}
171 
172 	table->info = info;
173 	table->char_pri_table = (void *)TMP;
174 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
175 
176 	for (z = 0; z < info->directive_count; z++) {
177 		if (info->subst_count[z] > 0) {
178 			table->subst_table[z] = (void *)TMP;
179 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
180 		} else {
181 			table->subst_table[z] = NULL;
182 		}
183 	}
184 
185 	if (chains > 0) {
186 		table->chain_pri_table = (void *)TMP;
187 		TMP += chains * sizeof (collate_chain_t);
188 	} else
189 		table->chain_pri_table = NULL;
190 	if (info->large_count > 0)
191 		table->large_pri_table = (void *)TMP;
192 	else
193 		table->large_pri_table = NULL;
194 
195 	table->__collate_load_error = 0;
196 	return (_LDP_LOADED);
197 }
198 
199 static const int32_t *
200 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
201 {
202 	const collate_subst_t *p;
203 	int n = table->info->subst_count[pass];
204 
205 	if (n == 0)
206 		return (NULL);
207 
208 	if (pass >= table->info->directive_count)
209 		return (NULL);
210 
211 	if (!(key & COLLATE_SUBST_PRIORITY))
212 		return (NULL);
213 
214 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
215 	assert(p->key == key);
216 	return (p->pri);
217 }
218 
219 static collate_chain_t *
220 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
221 {
222 	int low = 0;
223 	int high = table->info->chain_count - 1;;
224 	int next, compar, l;
225 	collate_chain_t *p;
226 	collate_chain_t *tab = table->chain_pri_table;
227 
228 	if (high < 0)
229 		return (NULL);
230 
231 	while (low <= high) {
232 		next = (low + high) / 2;
233 		p = tab + next;
234 		compar = *key - *p->str;
235 		if (compar == 0) {
236 			l = wcsnlen(p->str, COLLATE_STR_LEN);
237 			compar = wcsncmp(key, p->str, l);
238 			if (compar == 0) {
239 				*len = l;
240 				return (p);
241 			}
242 		}
243 		if (compar > 0)
244 			low = next + 1;
245 		else
246 			high = next - 1;
247 	}
248 	return (NULL);
249 }
250 
251 static collate_large_t *
252 largesearch(struct xlocale_collate *table, const wchar_t key)
253 {
254 	int low = 0;
255 	int high = table->info->large_count - 1;
256 	int next, compar;
257 	collate_large_t *p;
258 	collate_large_t *tab = table->large_pri_table;
259 
260 	if (high < 0)
261 		return (NULL);
262 
263 	while (low <= high) {
264 		next = (low + high) / 2;
265 		p = tab + next;
266 		compar = key - p->val;
267 		if (compar == 0)
268 			return (p);
269 		if (compar > 0)
270 			low = next + 1;
271 		else
272 			high = next - 1;
273 	}
274 	return (NULL);
275 }
276 
277 void
278 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
279     int *pri, int which, const int **state)
280 {
281 	collate_chain_t *p2;
282 	collate_large_t *match;
283 	int p, l;
284 	const int *sptr;
285 
286 	/*
287 	 * If this is the "last" pass for the UNDEFINED, then
288 	 * we just return the priority itself.
289 	 */
290 	if (which >= table->info->directive_count) {
291 		*pri = *t;
292 		*len = 1;
293 		*state = NULL;
294 		return;
295 	}
296 
297 	/*
298 	 * If we have remaining substitution data from a previous
299 	 * call, consume it first.
300 	 */
301 	if ((sptr = *state) != NULL) {
302 		*pri = *sptr;
303 		sptr++;
304 		if ((sptr == *state) || (sptr == NULL))
305 			*state = NULL;
306 		else
307 			*state = sptr;
308 		*len = 0;
309 		return;
310 	}
311 
312 	/* No active substitutions */
313 	*len = 1;
314 
315 	/*
316 	 * Check for composites such as dipthongs that collate as a
317 	 * single element (aka chains or collating-elements).
318 	 */
319 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
320 	    ((p = p2->pri[which]) >= 0)) {
321 
322 		*len = l;
323 		*pri = p;
324 
325 	} else if (*t <= UCHAR_MAX) {
326 
327 		/*
328 		 * Character is a small (8-bit) character.
329 		 * We just look these up directly for speed.
330 		 */
331 		*pri = table->char_pri_table[*t].pri[which];
332 
333 	} else if ((table->info->large_count > 0) &&
334 	    ((match = largesearch(table, *t)) != NULL)) {
335 
336 		/*
337 		 * Character was found in the extended table.
338 		 */
339 		*pri = match->pri.pri[which];
340 
341 	} else {
342 		/*
343 		 * Character lacks a specific definition.
344 		 */
345 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
346 			/* Mask off sign bit to prevent ordering confusion. */
347 			*pri = (*t & COLLATE_MAX_PRIORITY);
348 		} else {
349 			*pri = table->info->undef_pri[which];
350 		}
351 		/* No substitutions for undefined characters! */
352 		return;
353 	}
354 
355 	/*
356 	 * Try substituting (expanding) the character.  We are
357 	 * currently doing this *after* the chain compression.  I
358 	 * think it should not matter, but this way might be slightly
359 	 * faster.
360 	 *
361 	 * We do this after the priority search, as this will help us
362 	 * to identify a single key value.  In order for this to work,
363 	 * its important that the priority assigned to a given element
364 	 * to be substituted be unique for that level.  The localedef
365 	 * code ensures this for us.
366 	 */
367 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
368 		if ((*pri = *sptr) > 0) {
369 			sptr++;
370 			*state = *sptr ? sptr : NULL;
371 		}
372 	}
373 
374 }
375 
376 /*
377  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
378  * NOT NULL terminate.  That is left to the caller.
379  */
380 size_t
381 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
382     size_t room)
383 {
384 	int		pri;
385 	int		len;
386 	const wchar_t	*t;
387 	wchar_t		*tr = NULL;
388 	int		direc;
389 	int		pass;
390 	const int32_t 	*state;
391 	size_t		want = 0;
392 	size_t		need = 0;
393 	int		ndir = table->info->directive_count;
394 
395 	assert(src);
396 
397 	for (pass = 0; pass <= ndir; pass++) {
398 
399 		state = NULL;
400 
401 		if (pass != 0) {
402 			/* insert level separator from the previous pass */
403 			if (room) {
404 				*xf++ = 1;
405 				room--;
406 			}
407 			want++;
408 		}
409 
410 		/* special pass for undefined */
411 		if (pass == ndir) {
412 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
413 		} else {
414 			direc = table->info->directive[pass];
415 		}
416 
417 		t = src;
418 
419 		if (direc & DIRECTIVE_BACKWARD) {
420 			wchar_t *bp, *fp, c;
421 			if (tr)
422 				free(tr);
423 			if ((tr = wcsdup(t)) == NULL) {
424 				errno = ENOMEM;
425 				goto fail;
426 			}
427 			bp = tr;
428 			fp = tr + wcslen(tr) - 1;
429 			while (bp < fp) {
430 				c = *bp;
431 				*bp++ = *fp;
432 				*fp-- = c;
433 			}
434 			t = (const wchar_t *)tr;
435 		}
436 
437 		if (direc & DIRECTIVE_POSITION) {
438 			while (*t || state) {
439 				_collate_lookup(table, t, &len, &pri, pass, &state);
440 				t += len;
441 				if (pri <= 0) {
442 					if (pri < 0) {
443 						errno = EINVAL;
444 						goto fail;
445 					}
446 					pri = COLLATE_MAX_PRIORITY;
447 				}
448 				if (room) {
449 					*xf++ = pri;
450 					room--;
451 				}
452 				want++;
453 				need = want;
454 			}
455 		} else {
456 			while (*t || state) {
457 				_collate_lookup(table, t, &len, &pri, pass, &state);
458 				t += len;
459 				if (pri <= 0) {
460 					if (pri < 0) {
461 						errno = EINVAL;
462 						goto fail;
463 					}
464 					continue;
465 				}
466 				if (room) {
467 					*xf++ = pri;
468 					room--;
469 				}
470 				want++;
471 				need = want;
472 			}
473 		}
474 	}
475 	free(tr);
476 	return (need);
477 
478 fail:
479 	free(tr);
480 	return ((size_t)(-1));
481 }
482 
483 /*
484  * In the non-POSIX case, we transform each character into a string of
485  * characters representing the character's priority.  Since char is usually
486  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
487  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
488  * bits per byte.
489  *
490  * It turns out that we sometimes have real priorities that are
491  * 31-bits wide.  (But: be careful using priorities where the high
492  * order bit is set -- i.e. the priority is negative.  The sort order
493  * may be surprising!)
494  *
495  * TODO: This would be a good area to optimize somewhat.  It turns out
496  * that real prioririties *except for the last UNDEFINED pass* are generally
497  * very small.  We need the localedef code to precalculate the max
498  * priority for us, and ideally also give us a mask, and then we could
499  * severely limit what we expand to.
500  */
501 #define	XFRM_BYTES	6
502 #define	XFRM_OFFSET	('0')	/* make all printable characters */
503 #define	XFRM_SHIFT	6
504 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
505 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
506 
507 static int
508 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
509 {
510 	/* we use unsigned to ensure zero fill on right shift */
511 	uint32_t val = (uint32_t)table->info->pri_count[pass];
512 	int nc = 0;
513 
514 	while (val) {
515 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
516 		pri >>= XFRM_SHIFT;
517 		val >>= XFRM_SHIFT;
518 		p++;
519 		nc++;
520 	}
521 	return (nc);
522 }
523 
524 size_t
525 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
526     size_t room)
527 {
528 	int		pri;
529 	int		len;
530 	const wchar_t	*t;
531 	wchar_t		*tr = NULL;
532 	int		direc;
533 	int		pass;
534 	const int32_t 	*state;
535 	size_t		want = 0;
536 	size_t		need = 0;
537 	int		b;
538 	uint8_t		buf[XFRM_BYTES];
539 	int		ndir = table->info->directive_count;
540 
541 	assert(src);
542 
543 	for (pass = 0; pass <= ndir; pass++) {
544 
545 		state = NULL;
546 
547 		if (pass != 0) {
548 			/* insert level separator from the previous pass */
549 			if (room) {
550 				*xf++ = XFRM_SEP;
551 				room--;
552 			}
553 			want++;
554 		}
555 
556 		/* special pass for undefined */
557 		if (pass == ndir) {
558 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
559 		} else {
560 			direc = table->info->directive[pass];
561 		}
562 
563 		t = src;
564 
565 		if (direc & DIRECTIVE_BACKWARD) {
566 			wchar_t *bp, *fp, c;
567 			if (tr)
568 				free(tr);
569 			if ((tr = wcsdup(t)) == NULL) {
570 				errno = ENOMEM;
571 				goto fail;
572 			}
573 			bp = tr;
574 			fp = tr + wcslen(tr) - 1;
575 			while (bp < fp) {
576 				c = *bp;
577 				*bp++ = *fp;
578 				*fp-- = c;
579 			}
580 			t = (const wchar_t *)tr;
581 		}
582 
583 		if (direc & DIRECTIVE_POSITION) {
584 			while (*t || state) {
585 
586 				_collate_lookup(table, t, &len, &pri, pass, &state);
587 				t += len;
588 				if (pri <= 0) {
589 					if (pri < 0) {
590 						errno = EINVAL;
591 						goto fail;
592 					}
593 					pri = COLLATE_MAX_PRIORITY;
594 				}
595 
596 				b = xfrm(table, buf, pri, pass);
597 				want += b;
598 				if (room) {
599 					while (b) {
600 						b--;
601 						if (room) {
602 							*xf++ = buf[b];
603 							room--;
604 						}
605 					}
606 				}
607 				need = want;
608 			}
609 		} else {
610 			while (*t || state) {
611 				_collate_lookup(table, t, &len, &pri, pass, &state);
612 				t += len;
613 				if (pri <= 0) {
614 					if (pri < 0) {
615 						errno = EINVAL;
616 						goto fail;
617 					}
618 					continue;
619 				}
620 
621 				b = xfrm(table, buf, pri, pass);
622 				want += b;
623 				if (room) {
624 
625 					while (b) {
626 						b--;
627 						if (room) {
628 							*xf++ = buf[b];
629 							room--;
630 						}
631 					}
632 				}
633 				need = want;
634 			}
635 		}
636 	}
637 	free(tr);
638 	return (need);
639 
640 fail:
641 	free(tr);
642 	return ((size_t)(-1));
643 }
644 
645 /*
646  * __collate_equiv_value returns the primary collation value for the given
647  * collating symbol specified by str and len.  Zero or negative is returned
648  * if the collating symbol was not found.  This function is used by bracket
649  * code in the TRE regex library.
650  */
651 int
652 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
653 {
654 	int32_t e;
655 
656 	if (len < 1 || len >= COLLATE_STR_LEN)
657 		return (-1);
658 
659 	FIX_LOCALE(locale);
660 	struct xlocale_collate *table =
661 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
662 
663 	if (table->__collate_load_error)
664 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
665 
666 	if (len == 1) {
667 		e = -1;
668 		if (*str <= UCHAR_MAX)
669 			e = table->char_pri_table[*str].pri[0];
670 		else if (table->info->large_count > 0) {
671 			collate_large_t *match_large;
672 			match_large = largesearch(table, *str);
673 			if (match_large)
674 				e = match_large->pri.pri[0];
675 		}
676 		if (e == 0)
677 			return (1);
678 		return (e > 0 ? e : 0);
679 	}
680 	if (table->info->chain_count > 0) {
681 		wchar_t name[COLLATE_STR_LEN];
682 		collate_chain_t *match_chain;
683 		int clen;
684 
685 		wcsncpy (name, str, len);
686 		name[len] = 0;
687 		match_chain = chainsearch(table, name, &clen);
688 		if (match_chain) {
689 			e = match_chain->pri[0];
690 			if (e == 0)
691 				return (1);
692 			return (e < 0 ? -e : e);
693 		}
694 	}
695 	return (0);
696 }
697