1 /*
2  * regc_locale.c --
3  *
4  *	This file contains locale-specific regexp routines.
5  *	This file is #included by regcomp.c.
6  *
7  * Copyright (c) 1998 by Scriptics Corporation.
8  *
9  * This software is copyrighted by the Regents of the University of
10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11  * Corporation and other parties.  The following terms apply to all files
12  * associated with the software unless explicitly disclaimed in
13  * individual files.
14  *
15  * The authors hereby grant permission to use, copy, modify, distribute,
16  * and license this software and its documentation for any purpose, provided
17  * that existing copyright notices are retained in all copies and that this
18  * notice is included verbatim in any distributions. No written agreement,
19  * license, or royalty fee is required for any of the authorized uses.
20  * Modifications to this software may be copyrighted by their authors
21  * and need not follow the licensing terms described here, provided that
22  * the new terms are clearly indicated on the first page of each file where
23  * they apply.
24  *
25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  *
31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36  * MODIFICATIONS.
37  *
38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
39  * U.S. government, the Government shall have only "Restricted Rights"
40  * in the software and related documentation as defined in the Federal
41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
42  * are acquiring the software on behalf of the Department of Defense, the
43  * software shall be classified as "Commercial Computer Software" and the
44  * Government shall have only "Restricted Rights" as defined in Clause
45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
46  * authors grant the U.S. Government and others acting in its behalf
47  * permission to use and distribute the software in accordance with the
48  * terms specified in this license.
49  *
50  * src/backend/regex/regc_locale.c
51  */
52 
53 /* ASCII character-name table */
54 
55 static const struct cname
56 {
57 	const char *name;
58 	const char	code;
59 }			cnames[] =
60 
61 {
62 	{
63 		"NUL", '\0'
64 	},
65 	{
66 		"SOH", '\001'
67 	},
68 	{
69 		"STX", '\002'
70 	},
71 	{
72 		"ETX", '\003'
73 	},
74 	{
75 		"EOT", '\004'
76 	},
77 	{
78 		"ENQ", '\005'
79 	},
80 	{
81 		"ACK", '\006'
82 	},
83 	{
84 		"BEL", '\007'
85 	},
86 	{
87 		"alert", '\007'
88 	},
89 	{
90 		"BS", '\010'
91 	},
92 	{
93 		"backspace", '\b'
94 	},
95 	{
96 		"HT", '\011'
97 	},
98 	{
99 		"tab", '\t'
100 	},
101 	{
102 		"LF", '\012'
103 	},
104 	{
105 		"newline", '\n'
106 	},
107 	{
108 		"VT", '\013'
109 	},
110 	{
111 		"vertical-tab", '\v'
112 	},
113 	{
114 		"FF", '\014'
115 	},
116 	{
117 		"form-feed", '\f'
118 	},
119 	{
120 		"CR", '\015'
121 	},
122 	{
123 		"carriage-return", '\r'
124 	},
125 	{
126 		"SO", '\016'
127 	},
128 	{
129 		"SI", '\017'
130 	},
131 	{
132 		"DLE", '\020'
133 	},
134 	{
135 		"DC1", '\021'
136 	},
137 	{
138 		"DC2", '\022'
139 	},
140 	{
141 		"DC3", '\023'
142 	},
143 	{
144 		"DC4", '\024'
145 	},
146 	{
147 		"NAK", '\025'
148 	},
149 	{
150 		"SYN", '\026'
151 	},
152 	{
153 		"ETB", '\027'
154 	},
155 	{
156 		"CAN", '\030'
157 	},
158 	{
159 		"EM", '\031'
160 	},
161 	{
162 		"SUB", '\032'
163 	},
164 	{
165 		"ESC", '\033'
166 	},
167 	{
168 		"IS4", '\034'
169 	},
170 	{
171 		"FS", '\034'
172 	},
173 	{
174 		"IS3", '\035'
175 	},
176 	{
177 		"GS", '\035'
178 	},
179 	{
180 		"IS2", '\036'
181 	},
182 	{
183 		"RS", '\036'
184 	},
185 	{
186 		"IS1", '\037'
187 	},
188 	{
189 		"US", '\037'
190 	},
191 	{
192 		"space", ' '
193 	},
194 	{
195 		"exclamation-mark", '!'
196 	},
197 	{
198 		"quotation-mark", '"'
199 	},
200 	{
201 		"number-sign", '#'
202 	},
203 	{
204 		"dollar-sign", '$'
205 	},
206 	{
207 		"percent-sign", '%'
208 	},
209 	{
210 		"ampersand", '&'
211 	},
212 	{
213 		"apostrophe", '\''
214 	},
215 	{
216 		"left-parenthesis", '('
217 	},
218 	{
219 		"right-parenthesis", ')'
220 	},
221 	{
222 		"asterisk", '*'
223 	},
224 	{
225 		"plus-sign", '+'
226 	},
227 	{
228 		"comma", ','
229 	},
230 	{
231 		"hyphen", '-'
232 	},
233 	{
234 		"hyphen-minus", '-'
235 	},
236 	{
237 		"period", '.'
238 	},
239 	{
240 		"full-stop", '.'
241 	},
242 	{
243 		"slash", '/'
244 	},
245 	{
246 		"solidus", '/'
247 	},
248 	{
249 		"zero", '0'
250 	},
251 	{
252 		"one", '1'
253 	},
254 	{
255 		"two", '2'
256 	},
257 	{
258 		"three", '3'
259 	},
260 	{
261 		"four", '4'
262 	},
263 	{
264 		"five", '5'
265 	},
266 	{
267 		"six", '6'
268 	},
269 	{
270 		"seven", '7'
271 	},
272 	{
273 		"eight", '8'
274 	},
275 	{
276 		"nine", '9'
277 	},
278 	{
279 		"colon", ':'
280 	},
281 	{
282 		"semicolon", ';'
283 	},
284 	{
285 		"less-than-sign", '<'
286 	},
287 	{
288 		"equals-sign", '='
289 	},
290 	{
291 		"greater-than-sign", '>'
292 	},
293 	{
294 		"question-mark", '?'
295 	},
296 	{
297 		"commercial-at", '@'
298 	},
299 	{
300 		"left-square-bracket", '['
301 	},
302 	{
303 		"backslash", '\\'
304 	},
305 	{
306 		"reverse-solidus", '\\'
307 	},
308 	{
309 		"right-square-bracket", ']'
310 	},
311 	{
312 		"circumflex", '^'
313 	},
314 	{
315 		"circumflex-accent", '^'
316 	},
317 	{
318 		"underscore", '_'
319 	},
320 	{
321 		"low-line", '_'
322 	},
323 	{
324 		"grave-accent", '`'
325 	},
326 	{
327 		"left-brace", '{'
328 	},
329 	{
330 		"left-curly-bracket", '{'
331 	},
332 	{
333 		"vertical-line", '|'
334 	},
335 	{
336 		"right-brace", '}'
337 	},
338 	{
339 		"right-curly-bracket", '}'
340 	},
341 	{
342 		"tilde", '~'
343 	},
344 	{
345 		"DEL", '\177'
346 	},
347 	{
348 		NULL, 0
349 	}
350 };
351 
352 /*
353  * The following arrays define the valid character class names.
354  */
355 static const char *const classNames[NUM_CCLASSES + 1] = {
356 	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
357 	"lower", "print", "punct", "space", "upper", "xdigit", NULL
358 };
359 
360 enum classes
361 {
362 	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
363 	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
364 };
365 
366 /*
367  * We do not use the hard-wired Unicode classification tables that Tcl does.
368  * This is because (a) we need to deal with other encodings besides Unicode,
369  * and (b) we want to track the behavior of the libc locale routines as
370  * closely as possible.  For example, it wouldn't be unreasonable for a
371  * locale to not consider every Unicode letter as a letter.  So we build
372  * character classification cvecs by asking libc, even for Unicode.
373  */
374 
375 
376 /*
377  * element - map collating-element name to chr
378  */
379 static chr
element(struct vars * v,const chr * startp,const chr * endp)380 element(struct vars *v,			/* context */
381 		const chr *startp,		/* points to start of name */
382 		const chr *endp)		/* points just past end of name */
383 {
384 	const struct cname *cn;
385 	size_t		len;
386 
387 	/* generic:  one-chr names stand for themselves */
388 	assert(startp < endp);
389 	len = endp - startp;
390 	if (len == 1)
391 		return *startp;
392 
393 	NOTE(REG_ULOCALE);
394 
395 	/* search table */
396 	for (cn = cnames; cn->name != NULL; cn++)
397 	{
398 		if (strlen(cn->name) == len &&
399 			pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
400 		{
401 			break;				/* NOTE BREAK OUT */
402 		}
403 	}
404 	if (cn->name != NULL)
405 		return CHR(cn->code);
406 
407 	/* couldn't find it */
408 	ERR(REG_ECOLLATE);
409 	return 0;
410 }
411 
412 /*
413  * range - supply cvec for a range, including legality check
414  */
415 static struct cvec *
range(struct vars * v,chr a,chr b,int cases)416 range(struct vars *v,			/* context */
417 	  chr a,					/* range start */
418 	  chr b,					/* range end, might equal a */
419 	  int cases)				/* case-independent? */
420 {
421 	int			nchrs;
422 	struct cvec *cv;
423 	chr			c,
424 				cc;
425 
426 	if (a != b && !before(a, b))
427 	{
428 		ERR(REG_ERANGE);
429 		return NULL;
430 	}
431 
432 	if (!cases)
433 	{							/* easy version */
434 		cv = getcvec(v, 0, 1);
435 		NOERRN();
436 		addrange(cv, a, b);
437 		return cv;
438 	}
439 
440 	/*
441 	 * When case-independent, it's hard to decide when cvec ranges are usable,
442 	 * so for now at least, we won't try.  We use a range for the originally
443 	 * specified chrs and then add on any case-equivalents that are outside
444 	 * that range as individual chrs.
445 	 *
446 	 * To ensure sane behavior if someone specifies a very large range, limit
447 	 * the allocation size to 100000 chrs (arbitrary) and check for overrun
448 	 * inside the loop below.
449 	 */
450 	nchrs = b - a + 1;
451 	if (nchrs <= 0 || nchrs > 100000)
452 		nchrs = 100000;
453 
454 	cv = getcvec(v, nchrs, 1);
455 	NOERRN();
456 	addrange(cv, a, b);
457 
458 	for (c = a; c <= b; c++)
459 	{
460 		cc = pg_wc_tolower(c);
461 		if (cc != c &&
462 			(before(cc, a) || before(b, cc)))
463 		{
464 			if (cv->nchrs >= cv->chrspace)
465 			{
466 				ERR(REG_ETOOBIG);
467 				return NULL;
468 			}
469 			addchr(cv, cc);
470 		}
471 		cc = pg_wc_toupper(c);
472 		if (cc != c &&
473 			(before(cc, a) || before(b, cc)))
474 		{
475 			if (cv->nchrs >= cv->chrspace)
476 			{
477 				ERR(REG_ETOOBIG);
478 				return NULL;
479 			}
480 			addchr(cv, cc);
481 		}
482 		if (CANCEL_REQUESTED(v->re))
483 		{
484 			ERR(REG_CANCEL);
485 			return NULL;
486 		}
487 	}
488 
489 	return cv;
490 }
491 
492 /*
493  * before - is chr x before chr y, for purposes of range legality?
494  */
495 static int						/* predicate */
before(chr x,chr y)496 before(chr x, chr y)
497 {
498 	if (x < y)
499 		return 1;
500 	return 0;
501 }
502 
503 /*
504  * eclass - supply cvec for an equivalence class
505  * Must include case counterparts on request.
506  */
507 static struct cvec *
eclass(struct vars * v,chr c,int cases)508 eclass(struct vars *v,			/* context */
509 	   chr c,					/* Collating element representing the
510 								 * equivalence class. */
511 	   int cases)				/* all cases? */
512 {
513 	struct cvec *cv;
514 
515 	/* crude fake equivalence class for testing */
516 	if ((v->cflags & REG_FAKE) && c == 'x')
517 	{
518 		cv = getcvec(v, 4, 0);
519 		addchr(cv, CHR('x'));
520 		addchr(cv, CHR('y'));
521 		if (cases)
522 		{
523 			addchr(cv, CHR('X'));
524 			addchr(cv, CHR('Y'));
525 		}
526 		return cv;
527 	}
528 
529 	/* otherwise, none */
530 	if (cases)
531 		return allcases(v, c);
532 	cv = getcvec(v, 1, 0);
533 	assert(cv != NULL);
534 	addchr(cv, c);
535 	return cv;
536 }
537 
538 /*
539  * cclass - supply cvec for a character class
540  *
541  * Must include case counterparts if "cases" is true.
542  *
543  * The returned cvec might be either a transient cvec gotten from getcvec(),
544  * or a permanently cached one from pg_ctype_get_cache().  This is okay
545  * because callers are not supposed to explicitly free the result either way.
546  */
547 static struct cvec *
cclass(struct vars * v,const chr * startp,const chr * endp,int cases)548 cclass(struct vars *v,			/* context */
549 	   const chr *startp,		/* where the name starts */
550 	   const chr *endp,			/* just past the end of the name */
551 	   int cases)				/* case-independent? */
552 {
553 	size_t		len;
554 	struct cvec *cv = NULL;
555 	const char *const *namePtr;
556 	int			i,
557 				index;
558 
559 	/*
560 	 * Map the name to the corresponding enumerated value.
561 	 */
562 	len = endp - startp;
563 	index = -1;
564 	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
565 	{
566 		if (strlen(*namePtr) == len &&
567 			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
568 		{
569 			index = i;
570 			break;
571 		}
572 	}
573 	if (index == -1)
574 	{
575 		ERR(REG_ECTYPE);
576 		return NULL;
577 	}
578 
579 	/*
580 	 * Remap lower and upper to alpha if the match is case insensitive.
581 	 */
582 
583 	if (cases &&
584 		((enum classes) index == CC_LOWER ||
585 		 (enum classes) index == CC_UPPER))
586 		index = (int) CC_ALPHA;
587 
588 	/*
589 	 * Now compute the character class contents.  For classes that are based
590 	 * on the behavior of a <wctype.h> or <ctype.h> function, we use
591 	 * pg_ctype_get_cache so that we can cache the results.  Other classes
592 	 * have definitions that are hard-wired here, and for those we just
593 	 * construct a transient cvec on the fly.
594 	 *
595 	 * NB: keep this code in sync with cclass_column_index(), below.
596 	 */
597 
598 	switch ((enum classes) index)
599 	{
600 		case CC_PRINT:
601 			cv = pg_ctype_get_cache(pg_wc_isprint, index);
602 			break;
603 		case CC_ALNUM:
604 			cv = pg_ctype_get_cache(pg_wc_isalnum, index);
605 			break;
606 		case CC_ALPHA:
607 			cv = pg_ctype_get_cache(pg_wc_isalpha, index);
608 			break;
609 		case CC_ASCII:
610 			/* hard-wired meaning */
611 			cv = getcvec(v, 0, 1);
612 			if (cv)
613 				addrange(cv, 0, 0x7f);
614 			break;
615 		case CC_BLANK:
616 			/* hard-wired meaning */
617 			cv = getcvec(v, 2, 0);
618 			addchr(cv, '\t');
619 			addchr(cv, ' ');
620 			break;
621 		case CC_CNTRL:
622 			/* hard-wired meaning */
623 			cv = getcvec(v, 0, 2);
624 			addrange(cv, 0x0, 0x1f);
625 			addrange(cv, 0x7f, 0x9f);
626 			break;
627 		case CC_DIGIT:
628 			cv = pg_ctype_get_cache(pg_wc_isdigit, index);
629 			break;
630 		case CC_PUNCT:
631 			cv = pg_ctype_get_cache(pg_wc_ispunct, index);
632 			break;
633 		case CC_XDIGIT:
634 
635 			/*
636 			 * It's not clear how to define this in non-western locales, and
637 			 * even less clear that there's any particular use in trying. So
638 			 * just hard-wire the meaning.
639 			 */
640 			cv = getcvec(v, 0, 3);
641 			if (cv)
642 			{
643 				addrange(cv, '0', '9');
644 				addrange(cv, 'a', 'f');
645 				addrange(cv, 'A', 'F');
646 			}
647 			break;
648 		case CC_SPACE:
649 			cv = pg_ctype_get_cache(pg_wc_isspace, index);
650 			break;
651 		case CC_LOWER:
652 			cv = pg_ctype_get_cache(pg_wc_islower, index);
653 			break;
654 		case CC_UPPER:
655 			cv = pg_ctype_get_cache(pg_wc_isupper, index);
656 			break;
657 		case CC_GRAPH:
658 			cv = pg_ctype_get_cache(pg_wc_isgraph, index);
659 			break;
660 	}
661 
662 	/* If cv is NULL now, the reason must be "out of memory" */
663 	if (cv == NULL)
664 		ERR(REG_ESPACE);
665 	return cv;
666 }
667 
668 /*
669  * cclass_column_index - get appropriate high colormap column index for chr
670  */
671 static int
cclass_column_index(struct colormap * cm,chr c)672 cclass_column_index(struct colormap *cm, chr c)
673 {
674 	int			colnum = 0;
675 
676 	/* Shouldn't go through all these pushups for simple chrs */
677 	assert(c > MAX_SIMPLE_CHR);
678 
679 	/*
680 	 * Note: we should not see requests to consider cclasses that are not
681 	 * treated as locale-specific by cclass(), above.
682 	 */
683 	if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
684 		colnum |= cm->classbits[CC_PRINT];
685 	if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
686 		colnum |= cm->classbits[CC_ALNUM];
687 	if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
688 		colnum |= cm->classbits[CC_ALPHA];
689 	assert(cm->classbits[CC_ASCII] == 0);
690 	assert(cm->classbits[CC_BLANK] == 0);
691 	assert(cm->classbits[CC_CNTRL] == 0);
692 	if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
693 		colnum |= cm->classbits[CC_DIGIT];
694 	if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
695 		colnum |= cm->classbits[CC_PUNCT];
696 	assert(cm->classbits[CC_XDIGIT] == 0);
697 	if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
698 		colnum |= cm->classbits[CC_SPACE];
699 	if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
700 		colnum |= cm->classbits[CC_LOWER];
701 	if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
702 		colnum |= cm->classbits[CC_UPPER];
703 	if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
704 		colnum |= cm->classbits[CC_GRAPH];
705 
706 	return colnum;
707 }
708 
709 /*
710  * allcases - supply cvec for all case counterparts of a chr (including itself)
711  *
712  * This is a shortcut, preferably an efficient one, for simple characters;
713  * messy cases are done via range().
714  */
715 static struct cvec *
allcases(struct vars * v,chr c)716 allcases(struct vars *v,		/* context */
717 		 chr c)					/* character to get case equivs of */
718 {
719 	struct cvec *cv;
720 	chr			lc,
721 				uc;
722 
723 	lc = pg_wc_tolower(c);
724 	uc = pg_wc_toupper(c);
725 
726 	cv = getcvec(v, 2, 0);
727 	addchr(cv, lc);
728 	if (lc != uc)
729 		addchr(cv, uc);
730 	return cv;
731 }
732 
733 /*
734  * cmp - chr-substring compare
735  *
736  * Backrefs need this.  It should preferably be efficient.
737  * Note that it does not need to report anything except equal/unequal.
738  * Note also that the length is exact, and the comparison should not
739  * stop at embedded NULs!
740  */
741 static int						/* 0 for equal, nonzero for unequal */
cmp(const chr * x,const chr * y,size_t len)742 cmp(const chr *x, const chr *y, /* strings to compare */
743 	size_t len)					/* exact length of comparison */
744 {
745 	return memcmp(VS(x), VS(y), len * sizeof(chr));
746 }
747 
748 /*
749  * casecmp - case-independent chr-substring compare
750  *
751  * REG_ICASE backrefs need this.  It should preferably be efficient.
752  * Note that it does not need to report anything except equal/unequal.
753  * Note also that the length is exact, and the comparison should not
754  * stop at embedded NULs!
755  */
756 static int						/* 0 for equal, nonzero for unequal */
casecmp(const chr * x,const chr * y,size_t len)757 casecmp(const chr *x, const chr *y, /* strings to compare */
758 		size_t len)				/* exact length of comparison */
759 {
760 	for (; len > 0; len--, x++, y++)
761 	{
762 		if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
763 			return 1;
764 	}
765 	return 0;
766 }
767