xref: /dragonfly/lib/libc/locale/collate.c (revision 8524225e)
1 /*
2  * Copright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *		at Electronni Visti IA, Kiev, Ukraine.
5  *			All rights reserved.
6  *
7  * Copyright (c) 2011 The FreeBSD Foundation
8  * All rights reserved.
9  * Portions of this software were developed by David Chisnall
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * Adapted to xlocale by John Marino <draco@marino.st>
34  */
35 
36 #include "namespace.h"
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include <errno.h>
42 #include <unistd.h>
43 #include <fcntl.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/mman.h>
47 #include "un-namespace.h"
48 
49 #include "collate.h"
50 #include "setlocale.h"
51 #include "ldpart.h"
52 
53 struct xlocale_collate __xlocale_global_collate = {
54 	{{0}, "C"}, 1, 0, 0, 0
55 };
56 
57 struct xlocale_collate __xlocale_C_collate = {
58 	{{0}, "C"}, 1, 0, 0, 0
59 };
60 
61 #include "libc_private.h"
62 
63 int
64 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
65 
66 static void
67 destruct_collate(void *t)
68 {
69 	struct xlocale_collate *table = t;
70 	if (table->map && (table->maplen > 0)) {
71 		(void) munmap(table->map, table->maplen);
72 	}
73 	free(t);
74 }
75 
76 void *
77 __collate_load(const char *encoding, __unused locale_t unused)
78 {
79 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
80 		return &__xlocale_C_collate;
81 	}
82 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
83 	table->header.header.destructor = destruct_collate;
84 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
85 	// the caching outside of this section
86 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
87 		xlocale_release(table);
88 		return NULL;
89 	}
90 	return table;
91 }
92 
93 /**
94  * Load the collation tables for the specified encoding into the global table.
95  */
96 int
97 __collate_load_tables(const char *encoding)
98 {
99 	int ret = __collate_load_tables_l(encoding, &__xlocale_global_collate);
100 	return ret;
101 }
102 
103 int
104 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
105 {
106 	int i, chains, z;
107 	char buf[PATH_MAX];
108 	char *TMP;
109 	char *map;
110 	collate_info_t *info;
111 	struct stat sbuf;
112 	int fd;
113 
114 	/* 'encoding' must be already checked. */
115 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
116 		table->__collate_load_error = 1;
117 		return (_LDP_CACHE);
118 	}
119 
120 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE",
121 	    _PathLocale, encoding);
122 
123 	if ((fd = _open(buf, O_RDONLY)) < 0)
124 		return (_LDP_ERROR);
125 	if (_fstat(fd, &sbuf) < 0) {
126 		(void) _close(fd);
127 		return (_LDP_ERROR);
128 	}
129 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
130 		(void) _close(fd);
131 		errno = EINVAL;
132 		return (_LDP_ERROR);
133 	}
134 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
135 	(void) _close(fd);
136 	if ((TMP = map) == NULL) {
137 		return (_LDP_ERROR);
138 	}
139 
140 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
141 		(void) munmap(map, sbuf.st_size);
142 		errno = EINVAL;
143 		return (_LDP_ERROR);
144 	}
145 	TMP += COLLATE_STR_LEN;
146 
147 	info = (void *)TMP;
148 	TMP += sizeof (*info);
149 
150 	if ((info->directive_count < 1) ||
151 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
152 	    ((chains = info->chain_count) < 0)) {
153 		(void) munmap(map, sbuf.st_size);
154 		errno = EINVAL;
155 		return (_LDP_ERROR);
156 	}
157 
158 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
159 	    (sizeof (collate_chain_t) * chains) +
160 	    (sizeof (collate_large_t) * info->large_count);
161 	for (z = 0; z < (info->directive_count); z++) {
162 		i += sizeof (collate_subst_t) * info->subst_count[z];
163 	}
164 	if (i != (sbuf.st_size - (TMP - map))) {
165 		(void) munmap(map, sbuf.st_size);
166 		errno = EINVAL;
167 		return (_LDP_ERROR);
168 	}
169 
170 	table->char_pri_table = (void *)TMP;
171 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
172 
173 	for (z = 0; z < info->directive_count; z++) {
174 		if (info->subst_count[z] > 0) {
175 			table->subst_table[z] = (void *)TMP;
176 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
177 		} else {
178 			table->subst_table[z] = NULL;
179 		}
180 	}
181 
182 	if (chains > 0) {
183 		table->chain_pri_table = (void *)TMP;
184 		TMP += chains * sizeof (collate_chain_t);
185 	} else
186 		table->chain_pri_table = NULL;
187 	if (info->large_count > 0)
188 		table->large_pri_table = (void *)TMP;
189 	else
190 		table->large_pri_table = NULL;
191 
192 	table->info = info;
193 	table->__collate_load_error = 0;
194 
195 	return (_LDP_LOADED);
196 }
197 
198 /*
199  * Note: for performance reasons, we have expanded bsearch here.  This avoids
200  * function call overhead with each comparison.
201  */
202 
203 static int32_t *
204 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
205 {
206 	collate_subst_t *p;
207 	int n = table->info->subst_count[pass];
208 
209 	if (n == 0)
210 		return (NULL);
211 
212 	if (pass >= table->info->directive_count)
213 		return (NULL);
214 
215 	if (!(key & COLLATE_SUBST_PRIORITY))
216 		return (NULL);
217 
218 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
219 	return (p->pri);
220 }
221 
222 static collate_chain_t *
223 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
224 {
225 	int low;
226 	int high;
227 	int next, compar, l;
228 	collate_chain_t *p;
229 	collate_chain_t *tab;
230 
231 	if (table->info->chain_count == 0)
232 		return (NULL);
233 
234 	low = 0;
235 	high = table->info->chain_count - 1;
236 	tab = table->chain_pri_table;
237 
238 	while (low <= high) {
239 		next = (low + high) / 2;
240 		p = tab + next;
241 		compar = *key - *p->str;
242 		if (compar == 0) {
243 			l = wcsnlen(p->str, COLLATE_STR_LEN);
244 			compar = wcsncmp(key, p->str, l);
245 			if (compar == 0) {
246 				*len = l;
247 				return (p);
248 			}
249 		}
250 		if (compar > 0)
251 			low = next + 1;
252 		else
253 			high = next - 1;
254 	}
255 	return (NULL);
256 }
257 
258 static collate_large_t *
259 largesearch(struct xlocale_collate *table, const wchar_t key)
260 {
261 	int low = 0;
262 	int high = table->info->large_count - 1;
263 	int next, compar;
264 	collate_large_t *p;
265 	collate_large_t *tab = table->large_pri_table;
266 
267 	if (table->info->large_count == 0)
268 		return (NULL);
269 
270 	while (low <= high) {
271 		next = (low + high) / 2;
272 		p = tab + next;
273 		compar = key - p->val;
274 		if (compar == 0)
275 			return (p);
276 		if (compar > 0)
277 			low = next + 1;
278 		else
279 			high = next - 1;
280 	}
281 	return (NULL);
282 }
283 
284 void
285 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
286     int *pri, int which, const int **state)
287 {
288 	collate_chain_t *p2;
289 	collate_large_t *match;
290 	int p, l;
291 	const int *sptr;
292 
293 	/*
294 	 * If this is the "last" pass for the UNDEFINED, then
295 	 * we just return the priority itself.
296 	 */
297 	if (which >= table->info->directive_count) {
298 		*pri = *t;
299 		*len = 1;
300 		*state = NULL;
301 		return;
302 	}
303 
304 	/*
305 	 * If we have remaining substitution data from a previous
306 	 * call, consume it first.
307 	 */
308 	if ((sptr = *state) != NULL) {
309 		*pri = *sptr;
310 		sptr++;
311 		*state = *sptr ? sptr : NULL;
312 		*len = 0;
313 		return;
314 	}
315 
316 	/* No active substitutions */
317 	*len = 1;
318 
319 	/*
320 	 * Check for composites such as dipthongs that collate as a
321 	 * single element (aka chains or collating-elements).
322 	 */
323 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
324 	    ((p = p2->pri[which]) >= 0)) {
325 
326 		*len = l;
327 		*pri = p;
328 
329 	} else if (*t <= UCHAR_MAX) {
330 
331 		/*
332 		 * Character is a small (8-bit) character.
333 		 * We just look these up directly for speed.
334 		 */
335 		*pri = table->char_pri_table[*t].pri[which];
336 
337 	} else if ((table->info->large_count > 0) &&
338 	    ((match = largesearch(table, *t)) != NULL)) {
339 
340 		/*
341 		 * Character was found in the extended table.
342 		 */
343 		*pri = match->pri.pri[which];
344 
345 	} else {
346 		/*
347 		 * Character lacks a specific definition.
348 		 */
349 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
350 			/* Mask off sign bit to prevent ordering confusion. */
351 			*pri = (*t & COLLATE_MAX_PRIORITY);
352 		} else {
353 			*pri = table->info->undef_pri[which];
354 		}
355 		/* No substitutions for undefined characters! */
356 		return;
357 	}
358 
359 	/*
360 	 * Try substituting (expanding) the character.  We are
361 	 * currently doing this *after* the chain compression.  I
362 	 * think it should not matter, but this way might be slightly
363 	 * faster.
364 	 *
365 	 * We do this after the priority search, as this will help us
366 	 * to identify a single key value.  In order for this to work,
367 	 * its important that the priority assigned to a given element
368 	 * to be substituted be unique for that level.  The localedef
369 	 * code ensures this for us.
370 	 */
371 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
372 		if ((*pri = *sptr) != 0) {
373 			sptr++;
374 			*state = *sptr ? sptr : NULL;
375 		}
376 	}
377 
378 }
379 
380 /*
381  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
382  * NOT NULL terminate.  That is left to the caller.
383  */
384 size_t
385 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
386     size_t room)
387 {
388 	int		pri;
389 	int		len;
390 	const wchar_t	*t;
391 	wchar_t		*tr = NULL;
392 	int		direc;
393 	int		pass;
394 	const int32_t 	*state;
395 	size_t		want = 0;
396 	size_t		need = 0;
397 
398 	for (pass = 0; pass <= table->info->directive_count; pass++) {
399 
400 		state = NULL;
401 
402 		if (pass != 0) {
403 			/* insert level separator from the previous pass */
404 			if (room) {
405 				*xf++ = 1;
406 				room--;
407 			}
408 			want++;
409 		}
410 
411 		/* special pass for undefined */
412 		if (pass == table->info->directive_count) {
413 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
414 		} else {
415 			direc = table->info->directive[pass];
416 		}
417 
418 		t = src;
419 
420 		if (direc & DIRECTIVE_BACKWARD) {
421 			wchar_t *bp, *fp, c;
422 			if (tr)
423 				free(tr);
424 			if ((tr = wcsdup(t)) == NULL) {
425 				errno = ENOMEM;
426 				goto fail;
427 			}
428 			bp = tr;
429 			fp = tr + wcslen(tr) - 1;
430 			while (bp < fp) {
431 				c = *bp;
432 				*bp++ = *fp;
433 				*fp-- = c;
434 			}
435 			t = (const wchar_t *)tr;
436 		}
437 
438 		if (direc & DIRECTIVE_POSITION) {
439 			while (*t || state) {
440 				_collate_lookup(table, t, &len, &pri, pass, &state);
441 				t += len;
442 				if (pri <= 0) {
443 					if (pri < 0) {
444 						errno = EINVAL;
445 						goto fail;
446 					}
447 					pri = COLLATE_MAX_PRIORITY;
448 				}
449 				if (room) {
450 					*xf++ = pri;
451 					room--;
452 				}
453 				want++;
454 				need = want;
455 			}
456 		} else {
457 			while (*t || state) {
458 				_collate_lookup(table, t, &len, &pri, pass, &state);
459 				t += len;
460 				if (pri <= 0) {
461 					if (pri < 0) {
462 						errno = EINVAL;
463 						goto fail;
464 					}
465 					continue;
466 				}
467 				if (room) {
468 					*xf++ = pri;
469 					room--;
470 				}
471 				want++;
472 				need = want;
473 			}
474 		}
475 	}
476 	if (tr)
477 		free(tr);
478 	return (need);
479 
480 fail:
481 	if (tr)
482 		free(tr);
483 	return ((size_t)(-1));
484 }
485 
486 /*
487  * In the non-POSIX case, we transform each character into a string of
488  * characters representing the character's priority.  Since char is usually
489  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
490  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
491  * bits per byte.
492  *
493  * It turns out that we sometimes have real priorities that are
494  * 31-bits wide.  (But: be careful using priorities where the high
495  * order bit is set -- i.e. the priority is negative.  The sort order
496  * may be surprising!)
497  *
498  * TODO: This would be a good area to optimize somewhat.  It turns out
499  * that real prioririties *except for the last UNDEFINED pass* are generally
500  * very small.  We need the localedef code to precalculate the max
501  * priority for us, and ideally also give us a mask, and then we could
502  * severely limit what we expand to.
503  */
504 #define	XFRM_BYTES	6
505 #define	XFRM_OFFSET	('0')	/* make all printable characters */
506 #define	XFRM_SHIFT	6
507 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
508 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
509 
510 static int
511 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
512 {
513 	/* we use unsigned to ensure zero fill on right shift */
514 	uint32_t val = (uint32_t)table->info->pri_count[pass];
515 	int nc = 0;
516 
517 	while (val) {
518 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
519 		pri >>= XFRM_SHIFT;
520 		val >>= XFRM_SHIFT;
521 		p++;
522 		nc++;
523 	}
524 	return (nc);
525 }
526 
527 size_t
528 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
529     size_t room)
530 {
531 	int		pri;
532 	int		len;
533 	const wchar_t	*t;
534 	wchar_t		*tr = NULL;
535 	int		direc;
536 	int		pass;
537 	const int32_t 	*state;
538 	size_t		want = 0;
539 	size_t		need = 0;
540 	int		b;
541 	uint8_t		buf[XFRM_BYTES];
542 
543 	for (pass = 0; pass <= table->info->directive_count; pass++) {
544 
545 		state = NULL;
546 
547 		if (pass != 0) {
548 			/* insert level separator from the previous pass */
549 			if (room) {
550 				*xf++ = XFRM_SEP;
551 				room--;
552 			}
553 			want++;
554 		}
555 
556 		/* special pass for undefined */
557 		if (pass == table->info->directive_count) {
558 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
559 		} else {
560 			direc = table->info->directive[pass];
561 		}
562 
563 		t = src;
564 
565 		if (direc & DIRECTIVE_BACKWARD) {
566 			wchar_t *bp, *fp, c;
567 			if (tr)
568 				free(tr);
569 			if ((tr = wcsdup(t)) == NULL) {
570 				errno = ENOMEM;
571 				goto fail;
572 			}
573 			bp = tr;
574 			fp = tr + wcslen(tr) - 1;
575 			while (bp < fp) {
576 				c = *bp;
577 				*bp++ = *fp;
578 				*fp-- = c;
579 			}
580 			t = (const wchar_t *)tr;
581 		}
582 
583 		if (direc & DIRECTIVE_POSITION) {
584 			while (*t || state) {
585 
586 				_collate_lookup(table, t, &len, &pri, pass, &state);
587 				t += len;
588 				if (pri <= 0) {
589 					if (pri < 0) {
590 						errno = EINVAL;
591 						goto fail;
592 					}
593 					pri = COLLATE_MAX_PRIORITY;
594 				}
595 
596 				b = xfrm(table, buf, pri, pass);
597 				want += b;
598 				if (room) {
599 					while (b) {
600 						b--;
601 						if (room) {
602 							*xf++ = buf[b];
603 							room--;
604 						}
605 					}
606 				}
607 				need = want;
608 			}
609 		} else {
610 			while (*t || state) {
611 				_collate_lookup(table, t, &len, &pri, pass, &state);
612 				t += len;
613 				if (pri <= 0) {
614 					if (pri < 0) {
615 						errno = EINVAL;
616 						goto fail;
617 					}
618 					continue;
619 				}
620 
621 				b = xfrm(table, buf, pri, pass);
622 				want += b;
623 				if (room) {
624 
625 					while (b) {
626 						b--;
627 						if (room) {
628 							*xf++ = buf[b];
629 							room--;
630 						}
631 					}
632 				}
633 				need = want;
634 			}
635 		}
636 	}
637 	if (tr)
638 		free(tr);
639 	return (need);
640 
641 fail:
642 	if (tr)
643 		free(tr);
644 	return ((size_t)(-1));
645 }
646