1 /* Test regular expressions
2    Copyright 1996-2001, 2003-2019 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16 
17 #include <config.h>
18 
19 #include "regex.h"
20 
21 #include <locale.h>
22 #include <limits.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #if HAVE_DECL_ALARM
26 # include <unistd.h>
27 # include <signal.h>
28 #endif
29 
30 #include "localcharset.h"
31 
32 int
main(void)33 main (void)
34 {
35   int result = 0;
36   static struct re_pattern_buffer regex;
37   unsigned char folded_chars[UCHAR_MAX + 1];
38   int i;
39   const char *s;
40   struct re_registers regs;
41 
42 #if HAVE_DECL_ALARM
43   /* Some builds of glibc go into an infinite loop on this test.  */
44   int alarm_value = 2;
45   signal (SIGALRM, SIG_DFL);
46   alarm (alarm_value);
47 #endif
48   if (setlocale (LC_ALL, "en_US.UTF-8"))
49     {
50       {
51         /* https://sourceware.org/ml/libc-hacker/2006-09/msg00008.html
52            This test needs valgrind to catch the bug on Debian
53            GNU/Linux 3.1 x86, but it might catch the bug better
54            on other platforms and it shouldn't hurt to try the
55            test here.  */
56         static char const pat[] = "insert into";
57         static char const data[] =
58           "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK";
59         re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE
60                        | RE_ICASE);
61         memset (&regex, 0, sizeof regex);
62         s = re_compile_pattern (pat, sizeof pat - 1, &regex);
63         if (s)
64           result |= 1;
65         else
66           {
67             memset (&regs, 0, sizeof regs);
68             if (re_search (&regex, data, sizeof data - 1,
69                            0, sizeof data - 1, &regs)
70                 != -1)
71               result |= 1;
72             regfree (&regex);
73             free (regs.start);
74             free (regs.end);
75           }
76       }
77 
78       /* Check whether it's really a UTF-8 locale.
79          On mingw, the setlocale call succeeds but returns
80          "English_United States.1252", with locale_charset() returning
81          "CP1252".  */
82       if (strcmp (locale_charset (), "UTF-8") == 0)
83         {
84           /* This test is from glibc bug 15078.
85              The test case is from Andreas Schwab in
86              <https://sourceware.org/ml/libc-alpha/2013-01/msg00967.html>.
87           */
88           static char const pat[] = "[^x]x";
89           static char const data[] =
90             /* <U1000><U103B><U103D><U1014><U103A><U102F><U1015><U103A> */
91             "\xe1\x80\x80"
92             "\xe1\x80\xbb"
93             "\xe1\x80\xbd"
94             "\xe1\x80\x94"
95             "\xe1\x80\xba"
96             "\xe1\x80\xaf"
97             "\xe1\x80\x95"
98             "\xe1\x80\xba"
99             "x";
100           re_set_syntax (0);
101           memset (&regex, 0, sizeof regex);
102           s = re_compile_pattern (pat, sizeof pat - 1, &regex);
103           if (s)
104             result |= 1;
105           else
106             {
107               memset (&regs, 0, sizeof regs);
108               i = re_search (&regex, data, sizeof data - 1,
109                              0, sizeof data - 1, 0);
110               if (i != 0 && i != 21)
111                 result |= 1;
112               regfree (&regex);
113               free (regs.start);
114               free (regs.end);
115             }
116         }
117 
118       if (! setlocale (LC_ALL, "C"))
119         return 1;
120     }
121 
122   /* This test is from glibc bug 3957, reported by Andrew Mackey.  */
123   re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE);
124   memset (&regex, 0, sizeof regex);
125   s = re_compile_pattern ("a[^x]b", 6, &regex);
126   if (s)
127     result |= 2;
128   /* This should fail, but succeeds for glibc-2.5.  */
129   else
130     {
131       memset (&regs, 0, sizeof regs);
132       if (re_search (&regex, "a\nb", 3, 0, 3, &regs) != -1)
133         result |= 2;
134       regfree (&regex);
135       free (regs.start);
136       free (regs.end);
137     }
138 
139   /* This regular expression is from Spencer ere test number 75
140      in grep-2.3.  */
141   re_set_syntax (RE_SYNTAX_POSIX_EGREP);
142   memset (&regex, 0, sizeof regex);
143   for (i = 0; i <= UCHAR_MAX; i++)
144     folded_chars[i] = i;
145   regex.translate = folded_chars;
146   s = re_compile_pattern ("a[[:@:>@:]]b\n", 11, &regex);
147   /* This should fail with _Invalid character class name_ error.  */
148   if (!s)
149     {
150       result |= 4;
151       regfree (&regex);
152     }
153 
154   /* Ensure that [b-a] is diagnosed as invalid, when
155      using RE_NO_EMPTY_RANGES. */
156   re_set_syntax (RE_SYNTAX_POSIX_EGREP | RE_NO_EMPTY_RANGES);
157   memset (&regex, 0, sizeof regex);
158   s = re_compile_pattern ("a[b-a]", 6, &regex);
159   if (s == 0)
160     {
161       result |= 8;
162       regfree (&regex);
163     }
164 
165   /* This should succeed, but does not for glibc-2.1.3.  */
166   memset (&regex, 0, sizeof regex);
167   s = re_compile_pattern ("{1", 2, &regex);
168   if (s)
169     result |= 8;
170   else
171     regfree (&regex);
172 
173   /* The following example is derived from a problem report
174      against gawk from Jorge Stolfi <stolfi@ic.unicamp.br>.  */
175   memset (&regex, 0, sizeof regex);
176   s = re_compile_pattern ("[an\371]*n", 7, &regex);
177   if (s)
178     result |= 8;
179   /* This should match, but does not for glibc-2.2.1.  */
180   else
181     {
182       memset (&regs, 0, sizeof regs);
183       if (re_match (&regex, "an", 2, 0, &regs) != 2)
184         result |= 8;
185       regfree (&regex);
186       free (regs.start);
187       free (regs.end);
188     }
189 
190   memset (&regex, 0, sizeof regex);
191   s = re_compile_pattern ("x", 1, &regex);
192   if (s)
193     result |= 8;
194   /* glibc-2.2.93 does not work with a negative RANGE argument.  */
195   else
196     {
197       memset (&regs, 0, sizeof regs);
198       if (re_search (&regex, "wxy", 3, 2, -2, &regs) != 1)
199         result |= 8;
200       regfree (&regex);
201       free (regs.start);
202       free (regs.end);
203     }
204 
205   /* The version of regex.c in older versions of gnulib
206      ignored RE_ICASE.  Detect that problem too.  */
207   re_set_syntax (RE_SYNTAX_EMACS | RE_ICASE);
208   memset (&regex, 0, sizeof regex);
209   s = re_compile_pattern ("x", 1, &regex);
210   if (s)
211     result |= 16;
212   else
213     {
214       memset (&regs, 0, sizeof regs);
215       if (re_search (&regex, "WXY", 3, 0, 3, &regs) < 0)
216         result |= 16;
217       regfree (&regex);
218       free (regs.start);
219       free (regs.end);
220     }
221 
222   /* Catch a bug reported by Vin Shelton in
223      https://lists.gnu.org/r/bug-coreutils/2007-06/msg00089.html
224      */
225   re_set_syntax (RE_SYNTAX_POSIX_BASIC
226                  & ~RE_CONTEXT_INVALID_DUP
227                  & ~RE_NO_EMPTY_RANGES);
228   memset (&regex, 0, sizeof regex);
229   s = re_compile_pattern ("[[:alnum:]_-]\\\\+$", 16, &regex);
230   if (s)
231     result |= 32;
232   else
233     regfree (&regex);
234 
235   /* REG_STARTEND was added to glibc on 2004-01-15.
236      Reject older versions.  */
237   if (! REG_STARTEND)
238     result |= 64;
239 
240   /* Matching with the compiled form of this regexp would provoke
241      an assertion failure prior to glibc-2.28:
242        regexec.c:1375: pop_fail_stack: Assertion 'num >= 0' failed
243      With glibc-2.28, compilation fails and reports the invalid
244      back reference.  */
245   re_set_syntax (RE_SYNTAX_POSIX_EGREP);
246   memset (&regex, 0, sizeof regex);
247   s = re_compile_pattern ("0|()0|\\1|0", 10, &regex);
248   if (!s || strcmp (s, "Invalid back reference"))
249     result |= 64;
250 
251 #if 0
252   /* It would be nice to reject hosts whose regoff_t values are too
253      narrow (including glibc on hosts with 64-bit ptrdiff_t and
254      32-bit int), but we should wait until glibc implements this
255      feature.  Otherwise, support for equivalence classes and
256      multibyte collation symbols would always be broken except
257      when compiling --without-included-regex.   */
258   if (sizeof (regoff_t) < sizeof (ptrdiff_t)
259       || sizeof (regoff_t) < sizeof (ssize_t))
260     result |= 64;
261 #endif
262 
263   return result;
264 }
265