1 /* regexprops.c -- document the properties of the regular expressions
2    understood by gnulib.
3 
4    Copyright (C) 2005-2021 Free Software Foundation, Inc.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation, either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 */
19 
20 
21 /*
22   The output of this program is included in the GNU findutils source
23   distribution.  The copying conditions for that file are generated
24   by the copying() function below.
25 */
26 
27 /* Written by James Youngman, <jay@gnu.org>. */
28 
29 /* config.h must be included first. */
30 #include <config.h>
31 
32 /* system headers */
33 #include <errno.h>
34 #include <regex.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <unistd.h>
38 
39 /* gnulib headers */
40 #include "progname.h"
41 
42 /* find headers */
43 #include "regextype.h"
44 
/* Write the string S to standard output exactly as given.
   ESCAPE is part of the interface (callers pass 1 for running text
   and 0 for markup) but it is not acted upon here.  */
static void
output (const char *s, int escape)
{
  FILE *const sink = stdout;

  (void) escape;                /* Accepted but deliberately unused.  */
  fputs (s, sink);
}
52 
53 
/* Emit a single newline character.  */
static void
newline (void)
{
  static const char eol[] = "\n";

  output (eol, 0);
}
59 
/* Emit S as running document text.  The text is handed to output()
   with the escape flag set to 1.  */
static void
content (const char *s)
{
  const int escape = 1;

  output (s, escape);
}
65 
/* Emit S as literal text.  The text is handed to output() with the
   escape flag cleared.  */
static void
literal (const char *s)
{
  const int escape = 0;

  output (s, escape);
}
71 
/* Emit the markup directive S (for example "@node ").  The text is
   handed to output() with the escape flag cleared.  */
static void
directive (const char *s)
{
  const int escape = 0;

  output (s, escape);
}
77 
/* Emit one "@c" comment line.  When S is non-empty it follows the
   "@c" after a single space; when S is empty only "@c" is written.
   The line is always terminated with a newline.  */
static void
comment (const char *s)
{
  const int has_text = (*s != '\0');

  directive ("@c");
  if (has_text)
    {
      literal (" ");
      literal (s);
    }
  newline ();
}
89 
/* Emit an "@item" line carrying the text S.  A newline is written
   both before the "@item" and after S.  */
static void
enum_item (const char *s)
{
  static const char item_keyword[] = "@item ";

  newline ();
  directive (item_keyword);
  literal (s);
  newline ();
}
98 
/* Start the subsection describing the regular expression type NAME.

   Two lines are produced after a leading newline:
     @node NAME regular expression syntax
     @subsection @samp{NAME} regular expression syntax

   NEXT, PREV and UP are accepted but not used.  */
static void
begin_subsection (const char *name,
                  const char *next,
                  const char *prev,
                  const char *up)
{
  static const char title_suffix[] = " regular expression syntax";

  (void) next;
  (void) prev;
  (void) up;

  newline ();

  /* The node line.  */
  directive ("@node ");
  content (name);
  content (title_suffix);
  newline ();

  /* The heading line; here the name is wrapped in @samp{}.  */
  directive ("@subsection ");
  literal ("@samp{");
  content (name);
  literal ("}");
  content (title_suffix);
  newline ();
}
123 
/* Open a table: emit a newline, then "@table " followed by MARKUP
   (for example "@samp"), then another newline.  */
static void
begintable_markup (char const *markup)
{
  static const char table_keyword[] = "@table ";

  newline ();
  directive (table_keyword);
  literal (markup);
  newline ();
}
132 
/* Close a table opened with begintable_markup(): emit "@end table"
   on a line of its own.  */
static void
endtable (void)
{
  static const char closing[] = "@end table";

  newline ();
  directive (closing);
  newline ();
}
140 
/* Open an enumerated list: emit "@enumerate" on a line of its own.  */
static void
beginenum (void)
{
  static const char opening[] = "@enumerate";

  newline ();
  directive (opening);
  newline ();
}
148 
/* Close an enumerated list: emit "@end enumerate" on a line of its
   own.  */
static void
endenum (void)
{
  static const char closing[] = "@end enumerate";

  newline ();
  directive (closing);
  newline ();
}
156 
/* Separate paragraphs by emitting two consecutive newlines.  */
static void
newpara (void)
{
  static const char paragraph_break[] = "\n\n";

  content (paragraph_break);
}
162 
163 
164 static void
describe_regex_syntax(int options)165 describe_regex_syntax (int options)
166 {
167   newpara ();
168   content ("The character @samp{.} matches any single character");
169   if ( (options & RE_DOT_NEWLINE)  == 0 )
170     {
171       content (" except newline");
172     }
173   if (options & RE_DOT_NOT_NULL)
174     {
175       if ( (options & RE_DOT_NEWLINE)  == 0 )
176 	content (" and");
177       else
178 	content (" except");
179 
180       content (" the null character");
181     }
182   content (".");
183   newpara ();
184 
185   if (!(options & RE_LIMITED_OPS))
186     {
187       begintable_markup ("@samp");
188       if (options & RE_BK_PLUS_QM)
189 	{
190 	  enum_item ("\\+");
191 	  content ("indicates that the regular expression should match one"
192 		   " or more occurrences of the previous atom or regexp.");
193 	  enum_item ("\\?");
194 	  content ("indicates that the regular expression should match zero"
195 		   " or one occurrence of the previous atom or regexp.");
196 	  enum_item ("+ and ?");
197 	  content ("match themselves.\n");
198 	}
199       else
200 	{
201 	  enum_item ("+");
202 	  content ("indicates that the regular expression should match one"
203 		   " or more occurrences of the previous atom or regexp.");
204 	  enum_item ("?");
205 	  content ("indicates that the regular expression should match zero"
206 		   " or one occurrence of the previous atom or regexp.");
207 	  enum_item ("\\+");
208 	  literal ("matches a @samp{+}");
209 	  enum_item ("\\?");
210 	  literal ("matches a @samp{?}.");
211 	}
212       endtable ();
213     }
214 
215   newpara ();
216 
217   content ("Bracket expressions are used to match ranges of characters.  ");
218   literal ("Bracket expressions where the range is backward, for example @samp{[z-a]}, are ");
219   if (options & RE_NO_EMPTY_RANGES)
220     content ("invalid");
221   else
222     content ("ignored");
223   content (".  ");
224 
225   if (options &  RE_BACKSLASH_ESCAPE_IN_LISTS)
226     literal ("Within square brackets, @samp{\\} can be used to quote "
227 	     "the following character.  ");
228   else
229     literal ("Within square brackets, @samp{\\} is taken literally.  ");
230 
231   if (options & RE_CHAR_CLASSES)
232     content ("Character classes are supported; for example "
233 	     "@samp{[[:digit:]]} will match a single decimal digit.\n");
234   else
235     literal ("Character classes are not supported, so for example "
236 	     "you would need to use @samp{[0-9]} "
237 	     "instead of @samp{[[:digit:]]}.\n");
238 
239   if (options & RE_HAT_LISTS_NOT_NEWLINE)
240     {
241       literal ("Non-matching lists @samp{[^@dots{}]} do not ever match newline.\n");
242     }
243   newpara ();
244   if (options & RE_NO_GNU_OPS)
245     {
246       content ("GNU extensions are not supported and so "
247 	       "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} "
248 	       "match "
249 	       "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively.\n");
250     }
251   else
252     {
253       content ("GNU extensions are supported:");
254       beginenum ();
255       enum_item ("@samp{\\w} matches a character within a word");
256       enum_item ("@samp{\\W} matches a character which is not within a word");
257       enum_item ("@samp{\\<} matches the beginning of a word");
258       enum_item ("@samp{\\>} matches the end of a word");
259       enum_item ("@samp{\\b} matches a word boundary");
260       enum_item ("@samp{\\B} matches characters which are not a word boundary");
261       enum_item ("@samp{\\`} matches the beginning of the whole input");
262       enum_item ("@samp{\\'} matches the end of the whole input");
263       endenum ();
264     }
265 
266   newpara ();
267 
268 
269   if (options & RE_NO_BK_PARENS)
270     {
271       literal ("Grouping is performed with parentheses @samp{()}.  ");
272 
273       if (options & RE_UNMATCHED_RIGHT_PAREN_ORD)
274 	literal ("An unmatched @samp{)} matches just itself.  ");
275     }
276   else
277     {
278       literal ("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}.  ");
279     }
280 
281   if (options & RE_NO_BK_REFS)
282     {
283       content ("A backslash followed by a digit matches that digit.");
284     }
285   else
286     {
287       literal ("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number.  For example @samp{\\2} matches the second group expression.  The order of group expressions is determined by the position of their opening parenthesis ");
288       if (options & RE_NO_BK_PARENS)
289 	literal ("@samp{(}");
290       else
291 	literal ("@samp{\\(}");
292       content (".");
293     }
294 
295 
296   newpara ();
297   if (!(options & RE_LIMITED_OPS))
298     {
299       if (options & RE_NO_BK_VBAR)
300 	literal ("The alternation operator is @samp{|}.");
301       else
302 	literal ("The alternation operator is @samp{\\|}.");
303     }
304   newpara ();
305 
306   if (options & RE_CONTEXT_INDEP_ANCHORS)
307     {
308       literal ("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets.  Within brackets, @samp{^} can be used to invert the membership of the character class being specified.\n");
309     }
310   else
311     {
312       literal ("The character @samp{^} only represents the beginning of a string when it appears:");
313       beginenum ();
314       enum_item ("At the beginning of a regular expression");
315       if (options & RE_NO_BK_PARENS)
316 	{
317 	  enum_item ("After an open-group, signified by @samp{(}");
318 	}
319       else
320 	{
321 	  enum_item ("After an open-group, signified by @samp{\\(}");
322 	}
323       newline ();
324       if (!(options & RE_LIMITED_OPS))
325 	{
326 	  if (options & RE_NEWLINE_ALT)
327 	    enum_item ("After a newline");
328 
329 	  if (options & RE_NO_BK_VBAR )
330 	    enum_item ("After the alternation operator @samp{|}");
331 	  else
332 	    enum_item ("After the alternation operator @samp{\\|}");
333 	}
334       endenum ();
335 
336       newpara ();
337       literal ("The character @samp{$} only represents the end of a string when it appears:");
338       beginenum ();
339       enum_item ("At the end of a regular expression");
340       if (options & RE_NO_BK_PARENS)
341 	{
342 	  enum_item ("Before a close-group, signified by @samp{)}");
343 	}
344       else
345 	{
346 	  enum_item ("Before a close-group, signified by @samp{\\)}");
347 	}
348       if (!(options & RE_LIMITED_OPS))
349 	{
350 	  if (options & RE_NEWLINE_ALT)
351 	    enum_item ("Before a newline");
352 
353 	  if (options & RE_NO_BK_VBAR)
354 	    enum_item ("Before the alternation operator @samp{|}");
355 	  else
356 	    enum_item ("Before the alternation operator @samp{\\|}");
357 	}
358       endenum ();
359     }
360   newpara ();
361   if (!(options & RE_LIMITED_OPS) )
362     {
363       if ((options & RE_CONTEXT_INDEP_OPS)
364 	  && !(options & RE_CONTEXT_INVALID_OPS))
365 	{
366 	  literal ("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression.\n");
367 	}
368       else
369 	{
370 	  if (options & RE_BK_PLUS_QM)
371 	    literal ("@samp{\\*}, @samp{\\+} and @samp{\\?} ");
372 	  else
373 	    literal ("@samp{*}, @samp{+} and @samp{?} ");
374 
375 	  if (options & RE_CONTEXT_INVALID_OPS)
376 	    {
377 	      content ("are special at any point in a regular expression except the following places, where they are not allowed:");
378 	    }
379 	  else
380 	    {
381 	      content ("are special at any point in a regular expression except:");
382 	    }
383 
384 	  beginenum ();
385 	  enum_item ("At the beginning of a regular expression");
386 	  if (options & RE_NO_BK_PARENS)
387 	    {
388 	      enum_item ("After an open-group, signified by @samp{(}");
389 	    }
390 	  else
391 	    {
392 	      enum_item ("After an open-group, signified by @samp{\\(}");
393 	    }
394 	  if (!(options & RE_LIMITED_OPS))
395 	    {
396 	      if (options & RE_NEWLINE_ALT)
397 		enum_item ("After a newline");
398 
399 	      if (options & RE_NO_BK_VBAR)
400 		enum_item ("After the alternation operator @samp{|}");
401 	      else
402 		enum_item ("After the alternation operator @samp{\\|}");
403 	    }
404 	  endenum ();
405 	}
406     }
407 
408 
409   newpara ();
410   if (options & RE_INTERVALS)
411     {
412       if (options & RE_NO_BK_BRACES)
413 	{
414 	  literal ("Intervals are specified by @samp{@{} and @samp{@}}.\n");
415 	  if (options & RE_INVALID_INTERVAL_ORD)
416 	    {
417 	      literal ("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}");
418 	    }
419 	  else
420 	    {
421 	      literal ("Invalid intervals such as @samp{a@{1z} are not accepted.\n");
422 	    }
423 	}
424       else
425 	{
426 	  literal ("Intervals are specified by @samp{\\@{} and @samp{\\@}}.\n");
427 	  if (options & RE_INVALID_INTERVAL_ORD)
428 	    {
429 	      literal ("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}");
430 	    }
431 	  else
432 	    {
433 	      literal ("Invalid intervals such as @samp{a\\@{1z} are not accepted.\n");
434 	    }
435 	}
436     }
437 
438   newpara ();
439   if (options & RE_NO_POSIX_BACKTRACKING)
440     {
441       content ("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match.");
442     }
443   else
444     {
445       content ("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups.");
446     }
447   newpara ();
448 }
449 
450 
/* Write the copying conditions for the generated document as a
   sequence of "@c" comment lines, one per array entry.  An empty
   entry produces a bare "@c" line.  */
static void
copying (void)
{
  static const char *const copy_para[] =
    {
      /* The copyright year number range is with "--" in Texinfo files.  */
      "Copyright (C) 1994--2021 Free Software Foundation, Inc.",
      "",
      "Permission is granted to copy, distribute and/or modify this document",
      "under the terms of the GNU Free Documentation License, Version 1.3 or",
      "any later version published by the Free Software Foundation; with no",
      "Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.",
      "A copy of the license is included in the ``GNU Free",
      "Documentation License'' file as part of this distribution.",
      /* This trailing empty entry must be separated from the previous
         string by a comma; without one the two adjacent literals are
         concatenated and the closing blank comment line is lost.  */
      "",
      NULL
    };
  const char *const *para;

  for (para = copy_para; *para != NULL; ++para)
    comment (*para);
}
472 
/* Return 1 if the regular expression type with index IX has none of
   the bits of CONTEXT set in its context mask (as reported by
   get_regex_type_context), and 0 otherwise.  */
static int
ignore (int ix, const unsigned int context)
{
  if (get_regex_type_context (ix) & context)
    return 0;

  return 1;
}
478 
/* Write an "@menu" block with one entry of the form
     * NAME regular expression syntax::
   for every regular expression type that is not ignored in CONTEXT.
   Types are visited by increasing index until get_regex_type_name()
   returns a null pointer.  */
static void
menu (unsigned int context)
{
  int ix = 0;
  const char *name;

  literal ("@menu\n");
  for (;;)
    {
      /* The flags are queried for every index, as before, although
         the value is not needed to build the menu.  */
      (void) get_regex_type_flags (ix);
      name = get_regex_type_name (ix);
      if (!name)
        break;

      if (!ignore (ix, context))
        {
          literal ("* ");
          literal (name);
          content (" regular expression syntax");
          literal ("::");
          newline ();
        }
      ++ix;
    }
  literal ("@end menu\n");
}
502 
503 
504 
/* Starting at index IX, find the first regular expression type that
   is not ignored in CONTEXT and return its name.  Return the empty
   string if the end of the list (a null name) is reached first.  A
   null pointer is never returned.  */
static const char *
get_next (unsigned int ix, unsigned int context)
{
  for (; get_regex_type_name (ix) != NULL; ++ix)
    {
      const char *candidate;

      if (ignore (ix, context))
        continue;

      candidate = get_regex_type_name (ix);
      return candidate ? candidate : "";
    }
  return "";
}
523 
524 
/* Write the complete document for one context.

   CONTEXTNAME is the human-readable name of the context; it appears
   in an "@c" comment near the top of the output and in the
   diagnostics written to stderr.  CONTEXT is the corresponding bit
   mask used to decide which regular expression types to include.  UP
   is passed through to begin_subsection().

   The output consists of the copying conditions, the comment naming
   the context, a menu, and then one subsection per included type.  A
   type that get_regex_type_synonym() reports as a synonym gets a
   one-line cross reference instead of a full syntax description.  */
static void
describe_all (const char *contextname,
              unsigned int context,
              const char *up)
{
  const char *name;
  const char *previous = "";
  int options;
  int i, parent;

  copying ();
  newline ();
  literal ("@c this regular expression description is for: ");
  literal (contextname);
  newline ();
  newline ();
  menu (context);

  for (i = 0; ; ++i)
    {
      options = get_regex_type_flags (i);
      name = get_regex_type_name (i);
      if (name == NULL)
        break;                  /* End of the list of types.  */

      if (ignore (i, context))
        {
          fprintf (stderr,
                   "Skipping regexp type %s for context %s\n",
                   name, contextname);
          continue;
        }

      /* get_next() yields "" rather than a null pointer when there is
         no following type, so its result can be used directly.  */
      begin_subsection (name, get_next (i + 1, context), previous, up);

      parent = get_regex_type_synonym (i, context);
      if (parent >= 0)
        {
          content ("This is a synonym for ");
          content (get_regex_type_name (parent));
          content (".");
        }
      else
        {
          describe_regex_syntax (options);
        }
      previous = name;
    }
}
576 
577 
578 
579 int
main(int argc,char * argv[])580 main (int argc, char *argv[])
581 {
582   const char *up = "";
583   unsigned int context = CONTEXT_ALL;
584   const char *contextname = "all";
585 
586   if (argc)
587     set_program_name (argv[0]);
588   else
589     set_program_name ("regexprops");
590 
591   if (argc > 1)
592     {
593       up = argv[1];
594     }
595   if (argc > 2)
596     {
597       contextname = argv[2];
598       if (0 == strcmp (contextname, "findutils"))
599 	context = CONTEXT_FINDUTILS;
600       else if (0 == strcmp (contextname, "generic"))
601 	context = CONTEXT_GENERIC;
602       else if (0 == strcmp (contextname, "all"))
603 	context = CONTEXT_ALL;
604       else
605 	{
606 	  fprintf (stderr, "Unexpected context %s",
607 		   contextname);
608 	  return 1;
609 	}
610     }
611 
612   describe_all (contextname, context, up);
613   return 0;
614 }
615