1 /*
2  * regc_locale.c --
3  *
4  *	This file contains locale-specific regexp routines.
5  *	This file is #included by regcomp.c.
6  *
7  * Copyright (c) 1998 by Scriptics Corporation.
8  *
9  * This software is copyrighted by the Regents of the University of
10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11  * Corporation and other parties.  The following terms apply to all files
12  * associated with the software unless explicitly disclaimed in
13  * individual files.
14  *
15  * The authors hereby grant permission to use, copy, modify, distribute,
16  * and license this software and its documentation for any purpose, provided
17  * that existing copyright notices are retained in all copies and that this
18  * notice is included verbatim in any distributions. No written agreement,
19  * license, or royalty fee is required for any of the authorized uses.
20  * Modifications to this software may be copyrighted by their authors
21  * and need not follow the licensing terms described here, provided that
22  * the new terms are clearly indicated on the first page of each file where
23  * they apply.
24  *
25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  *
31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36  * MODIFICATIONS.
37  *
38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
39  * U.S. government, the Government shall have only "Restricted Rights"
40  * in the software and related documentation as defined in the Federal
41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
42  * are acquiring the software on behalf of the Department of Defense, the
43  * software shall be classified as "Commercial Computer Software" and the
44  * Government shall have only "Restricted Rights" as defined in Clause
45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
46  * authors grant the U.S. Government and others acting in its behalf
47  * permission to use and distribute the software in accordance with the
48  * terms specified in this license.
49  *
50  * src/backend/regex/regc_locale.c
51  */
52 
53 /* ASCII character-name table */
54 
55 static const struct cname
56 {
57 	const char *name;
58 	const char	code;
59 }			cnames[] =
60 
61 {
62 	{
63 		"NUL", '\0'
64 	},
65 	{
66 		"SOH", '\001'
67 	},
68 	{
69 		"STX", '\002'
70 	},
71 	{
72 		"ETX", '\003'
73 	},
74 	{
75 		"EOT", '\004'
76 	},
77 	{
78 		"ENQ", '\005'
79 	},
80 	{
81 		"ACK", '\006'
82 	},
83 	{
84 		"BEL", '\007'
85 	},
86 	{
87 		"alert", '\007'
88 	},
89 	{
90 		"BS", '\010'
91 	},
92 	{
93 		"backspace", '\b'
94 	},
95 	{
96 		"HT", '\011'
97 	},
98 	{
99 		"tab", '\t'
100 	},
101 	{
102 		"LF", '\012'
103 	},
104 	{
105 		"newline", '\n'
106 	},
107 	{
108 		"VT", '\013'
109 	},
110 	{
111 		"vertical-tab", '\v'
112 	},
113 	{
114 		"FF", '\014'
115 	},
116 	{
117 		"form-feed", '\f'
118 	},
119 	{
120 		"CR", '\015'
121 	},
122 	{
123 		"carriage-return", '\r'
124 	},
125 	{
126 		"SO", '\016'
127 	},
128 	{
129 		"SI", '\017'
130 	},
131 	{
132 		"DLE", '\020'
133 	},
134 	{
135 		"DC1", '\021'
136 	},
137 	{
138 		"DC2", '\022'
139 	},
140 	{
141 		"DC3", '\023'
142 	},
143 	{
144 		"DC4", '\024'
145 	},
146 	{
147 		"NAK", '\025'
148 	},
149 	{
150 		"SYN", '\026'
151 	},
152 	{
153 		"ETB", '\027'
154 	},
155 	{
156 		"CAN", '\030'
157 	},
158 	{
159 		"EM", '\031'
160 	},
161 	{
162 		"SUB", '\032'
163 	},
164 	{
165 		"ESC", '\033'
166 	},
167 	{
168 		"IS4", '\034'
169 	},
170 	{
171 		"FS", '\034'
172 	},
173 	{
174 		"IS3", '\035'
175 	},
176 	{
177 		"GS", '\035'
178 	},
179 	{
180 		"IS2", '\036'
181 	},
182 	{
183 		"RS", '\036'
184 	},
185 	{
186 		"IS1", '\037'
187 	},
188 	{
189 		"US", '\037'
190 	},
191 	{
192 		"space", ' '
193 	},
194 	{
195 		"exclamation-mark", '!'
196 	},
197 	{
198 		"quotation-mark", '"'
199 	},
200 	{
201 		"number-sign", '#'
202 	},
203 	{
204 		"dollar-sign", '$'
205 	},
206 	{
207 		"percent-sign", '%'
208 	},
209 	{
210 		"ampersand", '&'
211 	},
212 	{
213 		"apostrophe", '\''
214 	},
215 	{
216 		"left-parenthesis", '('
217 	},
218 	{
219 		"right-parenthesis", ')'
220 	},
221 	{
222 		"asterisk", '*'
223 	},
224 	{
225 		"plus-sign", '+'
226 	},
227 	{
228 		"comma", ','
229 	},
230 	{
231 		"hyphen", '-'
232 	},
233 	{
234 		"hyphen-minus", '-'
235 	},
236 	{
237 		"period", '.'
238 	},
239 	{
240 		"full-stop", '.'
241 	},
242 	{
243 		"slash", '/'
244 	},
245 	{
246 		"solidus", '/'
247 	},
248 	{
249 		"zero", '0'
250 	},
251 	{
252 		"one", '1'
253 	},
254 	{
255 		"two", '2'
256 	},
257 	{
258 		"three", '3'
259 	},
260 	{
261 		"four", '4'
262 	},
263 	{
264 		"five", '5'
265 	},
266 	{
267 		"six", '6'
268 	},
269 	{
270 		"seven", '7'
271 	},
272 	{
273 		"eight", '8'
274 	},
275 	{
276 		"nine", '9'
277 	},
278 	{
279 		"colon", ':'
280 	},
281 	{
282 		"semicolon", ';'
283 	},
284 	{
285 		"less-than-sign", '<'
286 	},
287 	{
288 		"equals-sign", '='
289 	},
290 	{
291 		"greater-than-sign", '>'
292 	},
293 	{
294 		"question-mark", '?'
295 	},
296 	{
297 		"commercial-at", '@'
298 	},
299 	{
300 		"left-square-bracket", '['
301 	},
302 	{
303 		"backslash", '\\'
304 	},
305 	{
306 		"reverse-solidus", '\\'
307 	},
308 	{
309 		"right-square-bracket", ']'
310 	},
311 	{
312 		"circumflex", '^'
313 	},
314 	{
315 		"circumflex-accent", '^'
316 	},
317 	{
318 		"underscore", '_'
319 	},
320 	{
321 		"low-line", '_'
322 	},
323 	{
324 		"grave-accent", '`'
325 	},
326 	{
327 		"left-brace", '{'
328 	},
329 	{
330 		"left-curly-bracket", '{'
331 	},
332 	{
333 		"vertical-line", '|'
334 	},
335 	{
336 		"right-brace", '}'
337 	},
338 	{
339 		"right-curly-bracket", '}'
340 	},
341 	{
342 		"tilde", '~'
343 	},
344 	{
345 		"DEL", '\177'
346 	},
347 	{
348 		NULL, 0
349 	}
350 };
351 
352 /*
353  * The following array defines the valid character class names.
354  * The entries must match enum char_classes in regguts.h.
355  */
356 static const char *const classNames[NUM_CCLASSES + 1] = {
357 	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
358 	"lower", "print", "punct", "space", "upper", "xdigit", "word",
359 	NULL
360 };
361 
362 /*
363  * We do not use the hard-wired Unicode classification tables that Tcl does.
364  * This is because (a) we need to deal with other encodings besides Unicode,
365  * and (b) we want to track the behavior of the libc locale routines as
366  * closely as possible.  For example, it wouldn't be unreasonable for a
367  * locale to not consider every Unicode letter as a letter.  So we build
368  * character classification cvecs by asking libc, even for Unicode.
369  */
370 
371 
372 /*
373  * element - map collating-element name to chr
374  */
375 static chr
element(struct vars * v,const chr * startp,const chr * endp)376 element(struct vars *v,			/* context */
377 		const chr *startp,		/* points to start of name */
378 		const chr *endp)		/* points just past end of name */
379 {
380 	const struct cname *cn;
381 	size_t		len;
382 
383 	/* generic:  one-chr names stand for themselves */
384 	assert(startp < endp);
385 	len = endp - startp;
386 	if (len == 1)
387 		return *startp;
388 
389 	NOTE(REG_ULOCALE);
390 
391 	/* search table */
392 	for (cn = cnames; cn->name != NULL; cn++)
393 	{
394 		if (strlen(cn->name) == len &&
395 			pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
396 		{
397 			break;				/* NOTE BREAK OUT */
398 		}
399 	}
400 	if (cn->name != NULL)
401 		return CHR(cn->code);
402 
403 	/* couldn't find it */
404 	ERR(REG_ECOLLATE);
405 	return 0;
406 }
407 
408 /*
409  * range - supply cvec for a range, including legality check
410  */
411 static struct cvec *
range(struct vars * v,chr a,chr b,int cases)412 range(struct vars *v,			/* context */
413 	  chr a,					/* range start */
414 	  chr b,					/* range end, might equal a */
415 	  int cases)				/* case-independent? */
416 {
417 	int			nchrs;
418 	struct cvec *cv;
419 	chr			c,
420 				cc;
421 
422 	if (a != b && !before(a, b))
423 	{
424 		ERR(REG_ERANGE);
425 		return NULL;
426 	}
427 
428 	if (!cases)
429 	{							/* easy version */
430 		cv = getcvec(v, 0, 1);
431 		NOERRN();
432 		addrange(cv, a, b);
433 		return cv;
434 	}
435 
436 	/*
437 	 * When case-independent, it's hard to decide when cvec ranges are usable,
438 	 * so for now at least, we won't try.  We use a range for the originally
439 	 * specified chrs and then add on any case-equivalents that are outside
440 	 * that range as individual chrs.
441 	 *
442 	 * To ensure sane behavior if someone specifies a very large range, limit
443 	 * the allocation size to 100000 chrs (arbitrary) and check for overrun
444 	 * inside the loop below.
445 	 */
446 	nchrs = b - a + 1;
447 	if (nchrs <= 0 || nchrs > 100000)
448 		nchrs = 100000;
449 
450 	cv = getcvec(v, nchrs, 1);
451 	NOERRN();
452 	addrange(cv, a, b);
453 
454 	for (c = a; c <= b; c++)
455 	{
456 		cc = pg_wc_tolower(c);
457 		if (cc != c &&
458 			(before(cc, a) || before(b, cc)))
459 		{
460 			if (cv->nchrs >= cv->chrspace)
461 			{
462 				ERR(REG_ETOOBIG);
463 				return NULL;
464 			}
465 			addchr(cv, cc);
466 		}
467 		cc = pg_wc_toupper(c);
468 		if (cc != c &&
469 			(before(cc, a) || before(b, cc)))
470 		{
471 			if (cv->nchrs >= cv->chrspace)
472 			{
473 				ERR(REG_ETOOBIG);
474 				return NULL;
475 			}
476 			addchr(cv, cc);
477 		}
478 		if (CANCEL_REQUESTED(v->re))
479 		{
480 			ERR(REG_CANCEL);
481 			return NULL;
482 		}
483 	}
484 
485 	return cv;
486 }
487 
488 /*
489  * before - is chr x before chr y, for purposes of range legality?
490  */
491 static int						/* predicate */
before(chr x,chr y)492 before(chr x, chr y)
493 {
494 	if (x < y)
495 		return 1;
496 	return 0;
497 }
498 
499 /*
500  * eclass - supply cvec for an equivalence class
501  * Must include case counterparts on request.
502  */
503 static struct cvec *
eclass(struct vars * v,chr c,int cases)504 eclass(struct vars *v,			/* context */
505 	   chr c,					/* Collating element representing the
506 								 * equivalence class. */
507 	   int cases)				/* all cases? */
508 {
509 	struct cvec *cv;
510 
511 	/* crude fake equivalence class for testing */
512 	if ((v->cflags & REG_FAKE) && c == 'x')
513 	{
514 		cv = getcvec(v, 4, 0);
515 		addchr(cv, CHR('x'));
516 		addchr(cv, CHR('y'));
517 		if (cases)
518 		{
519 			addchr(cv, CHR('X'));
520 			addchr(cv, CHR('Y'));
521 		}
522 		return cv;
523 	}
524 
525 	/* otherwise, none */
526 	if (cases)
527 		return allcases(v, c);
528 	cv = getcvec(v, 1, 0);
529 	assert(cv != NULL);
530 	addchr(cv, c);
531 	return cv;
532 }
533 
534 /*
535  * lookupcclass - lookup a character class identified by name
536  *
537  * On failure, sets an error code in *v; the result is then garbage.
538  */
539 static enum char_classes
lookupcclass(struct vars * v,const chr * startp,const chr * endp)540 lookupcclass(struct vars *v,	/* context (for returning errors) */
541 			 const chr *startp, /* where the name starts */
542 			 const chr *endp)	/* just past the end of the name */
543 {
544 	size_t		len;
545 	const char *const *namePtr;
546 	int			i;
547 
548 	/*
549 	 * Map the name to the corresponding enumerated value.
550 	 */
551 	len = endp - startp;
552 	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
553 	{
554 		if (strlen(*namePtr) == len &&
555 			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
556 			return (enum char_classes) i;
557 	}
558 
559 	ERR(REG_ECTYPE);
560 	return (enum char_classes) 0;
561 }
562 
563 /*
564  * cclasscvec - supply cvec for a character class
565  *
566  * Must include case counterparts if "cases" is true.
567  *
568  * The returned cvec might be either a transient cvec gotten from getcvec(),
569  * or a permanently cached one from pg_ctype_get_cache().  This is okay
570  * because callers are not supposed to explicitly free the result either way.
571  */
572 static struct cvec *
cclasscvec(struct vars * v,enum char_classes cclasscode,int cases)573 cclasscvec(struct vars *v,		/* context */
574 		   enum char_classes cclasscode,	/* class to build a cvec for */
575 		   int cases)			/* case-independent? */
576 {
577 	struct cvec *cv = NULL;
578 
579 	/*
580 	 * Remap lower and upper to alpha if the match is case insensitive.
581 	 */
582 
583 	if (cases &&
584 		(cclasscode == CC_LOWER ||
585 		 cclasscode == CC_UPPER))
586 		cclasscode = CC_ALPHA;
587 
588 	/*
589 	 * Now compute the character class contents.  For classes that are based
590 	 * on the behavior of a <wctype.h> or <ctype.h> function, we use
591 	 * pg_ctype_get_cache so that we can cache the results.  Other classes
592 	 * have definitions that are hard-wired here, and for those we just
593 	 * construct a transient cvec on the fly.
594 	 *
595 	 * NB: keep this code in sync with cclass_column_index(), below.
596 	 */
597 
598 	switch (cclasscode)
599 	{
600 		case CC_PRINT:
601 			cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
602 			break;
603 		case CC_ALNUM:
604 			cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
605 			break;
606 		case CC_ALPHA:
607 			cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
608 			break;
609 		case CC_WORD:
610 			cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
611 			break;
612 		case CC_ASCII:
613 			/* hard-wired meaning */
614 			cv = getcvec(v, 0, 1);
615 			if (cv)
616 				addrange(cv, 0, 0x7f);
617 			break;
618 		case CC_BLANK:
619 			/* hard-wired meaning */
620 			cv = getcvec(v, 2, 0);
621 			addchr(cv, '\t');
622 			addchr(cv, ' ');
623 			break;
624 		case CC_CNTRL:
625 			/* hard-wired meaning */
626 			cv = getcvec(v, 0, 2);
627 			addrange(cv, 0x0, 0x1f);
628 			addrange(cv, 0x7f, 0x9f);
629 			break;
630 		case CC_DIGIT:
631 			cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
632 			break;
633 		case CC_PUNCT:
634 			cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
635 			break;
636 		case CC_XDIGIT:
637 
638 			/*
639 			 * It's not clear how to define this in non-western locales, and
640 			 * even less clear that there's any particular use in trying. So
641 			 * just hard-wire the meaning.
642 			 */
643 			cv = getcvec(v, 0, 3);
644 			if (cv)
645 			{
646 				addrange(cv, '0', '9');
647 				addrange(cv, 'a', 'f');
648 				addrange(cv, 'A', 'F');
649 			}
650 			break;
651 		case CC_SPACE:
652 			cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
653 			break;
654 		case CC_LOWER:
655 			cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
656 			break;
657 		case CC_UPPER:
658 			cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
659 			break;
660 		case CC_GRAPH:
661 			cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
662 			break;
663 	}
664 
665 	/* If cv is NULL now, the reason must be "out of memory" */
666 	if (cv == NULL)
667 		ERR(REG_ESPACE);
668 	return cv;
669 }
670 
671 /*
672  * cclass_column_index - get appropriate high colormap column index for chr
673  */
674 static int
cclass_column_index(struct colormap * cm,chr c)675 cclass_column_index(struct colormap *cm, chr c)
676 {
677 	int			colnum = 0;
678 
679 	/* Shouldn't go through all these pushups for simple chrs */
680 	assert(c > MAX_SIMPLE_CHR);
681 
682 	/*
683 	 * Note: we should not see requests to consider cclasses that are not
684 	 * treated as locale-specific by cclasscvec(), above.
685 	 */
686 	if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
687 		colnum |= cm->classbits[CC_PRINT];
688 	if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
689 		colnum |= cm->classbits[CC_ALNUM];
690 	if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
691 		colnum |= cm->classbits[CC_ALPHA];
692 	if (cm->classbits[CC_WORD] && pg_wc_isword(c))
693 		colnum |= cm->classbits[CC_WORD];
694 	assert(cm->classbits[CC_ASCII] == 0);
695 	assert(cm->classbits[CC_BLANK] == 0);
696 	assert(cm->classbits[CC_CNTRL] == 0);
697 	if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
698 		colnum |= cm->classbits[CC_DIGIT];
699 	if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
700 		colnum |= cm->classbits[CC_PUNCT];
701 	assert(cm->classbits[CC_XDIGIT] == 0);
702 	if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
703 		colnum |= cm->classbits[CC_SPACE];
704 	if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
705 		colnum |= cm->classbits[CC_LOWER];
706 	if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
707 		colnum |= cm->classbits[CC_UPPER];
708 	if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
709 		colnum |= cm->classbits[CC_GRAPH];
710 
711 	return colnum;
712 }
713 
714 /*
715  * allcases - supply cvec for all case counterparts of a chr (including itself)
716  *
717  * This is a shortcut, preferably an efficient one, for simple characters;
718  * messy cases are done via range().
719  */
720 static struct cvec *
allcases(struct vars * v,chr c)721 allcases(struct vars *v,		/* context */
722 		 chr c)					/* character to get case equivs of */
723 {
724 	struct cvec *cv;
725 	chr			lc,
726 				uc;
727 
728 	lc = pg_wc_tolower(c);
729 	uc = pg_wc_toupper(c);
730 
731 	cv = getcvec(v, 2, 0);
732 	addchr(cv, lc);
733 	if (lc != uc)
734 		addchr(cv, uc);
735 	return cv;
736 }
737 
738 /*
739  * cmp - chr-substring compare
740  *
741  * Backrefs need this.  It should preferably be efficient.
742  * Note that it does not need to report anything except equal/unequal.
743  * Note also that the length is exact, and the comparison should not
744  * stop at embedded NULs!
745  */
746 static int						/* 0 for equal, nonzero for unequal */
cmp(const chr * x,const chr * y,size_t len)747 cmp(const chr *x, const chr *y, /* strings to compare */
748 	size_t len)					/* exact length of comparison */
749 {
750 	return memcmp(VS(x), VS(y), len * sizeof(chr));
751 }
752 
753 /*
754  * casecmp - case-independent chr-substring compare
755  *
756  * REG_ICASE backrefs need this.  It should preferably be efficient.
757  * Note that it does not need to report anything except equal/unequal.
758  * Note also that the length is exact, and the comparison should not
759  * stop at embedded NULs!
760  */
761 static int						/* 0 for equal, nonzero for unequal */
casecmp(const chr * x,const chr * y,size_t len)762 casecmp(const chr *x, const chr *y, /* strings to compare */
763 		size_t len)				/* exact length of comparison */
764 {
765 	for (; len > 0; len--, x++, y++)
766 	{
767 		if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
768 			return 1;
769 	}
770 	return 0;
771 }
772