1 /* regexprops.c -- document the properties of the regular expressions
2 understood by gnulib.
3
4 Copyright (C) 2005-2021 Free Software Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>.
18 */
19
20
21 /*
22 The output of this program is included in the GNU findutils source
23 distribution. The copying conditions for that file are generated
24 by the copying() function below.
25 */
26
27 /* Written by James Youngman, <jay@gnu.org>. */
28
29 /* config.h must be included first. */
30 #include <config.h>
31
32 /* system headers */
33 #include <errno.h>
34 #include <regex.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <unistd.h>
38
39 /* gnulib headers */
40 #include "progname.h"
41
42 /* find headers */
43 #include "regextype.h"
44
/* Write the string S to standard output exactly as given.
   ESCAPE is accepted for the benefit of callers but is not consulted:
   no escaping of S is performed here.  */
static void
output (const char *s, int escape)
{
  FILE *const sink = stdout;

  (void) escape;                /* deliberately unused */
  fputs (s, sink);
}
52
53
/* Emit a single newline character.  */
static void
newline (void)
{
  static const char eol[] = "\n";

  output (eol, 0);
}
59
/* Emit S as document text: it is handed to output() with the escape
   flag set.  */
static void
content (const char *s)
{
  const int escape_flag = 1;

  output (s, escape_flag);
}
65
/* Emit S verbatim: it is handed to output() with the escape flag
   clear.  */
static void
literal (const char *s)
{
  const int escape_flag = 0;

  output (s, escape_flag);
}
71
/* Emit the markup directive S: it is handed to output() with the
   escape flag clear.  */
static void
directive (const char *s)
{
  const int escape_flag = 0;

  output (s, escape_flag);
}
77
/* Emit one comment line: "@c", followed by a space and S when S is
   not the empty string, followed by a newline.  */
static void
comment (const char *s)
{
  const int has_text = (*s != '\0');

  directive ("@c");
  if (has_text)
    {
      literal (" ");
      literal (s);
    }
  newline ();
}
89
/* Emit an "@item S" line, preceded and followed by a newline.  S is
   written without escaping.  */
static void
enum_item (const char *s)
{
  output ("\n", 0);
  output ("@item ", 0);
  output (s, 0);
  output ("\n", 0);
}
98
/* Start the subsection describing the regular expression type NAME.
   Emits a blank line, then an "@node" line and an "@subsection" line,
   both titled "NAME regular expression syntax" (NAME is wrapped in
   @samp{} on the @subsection line).
   NEXT, PREV and UP are accepted but not used.  */
static void
begin_subsection (const char *name,
                  const char *next,
                  const char *prev,
                  const char *up)
{
  static const char title_suffix[] = " regular expression syntax";

  (void) next;
  (void) prev;
  (void) up;

  newline ();

  /* @node NAME regular expression syntax */
  directive ("@node ");
  content (name);
  content (title_suffix);
  newline ();

  /* @subsection @samp{NAME} regular expression syntax */
  directive ("@subsection ");
  literal ("@samp{");
  content (name);
  literal ("}");
  content (title_suffix);
  newline ();
}
123
/* Open a table: emit a newline, then "@table MARKUP" on a line of its
   own.  MARKUP is written without escaping.  */
static void
begintable_markup (char const *markup)
{
  output ("\n", 0);
  output ("@table ", 0);
  output (markup, 0);
  output ("\n", 0);
}
132
/* Close a table: emit a newline, then "@end table" on a line of its
   own.  */
static void
endtable (void)
{
  output ("\n", 0);
  output ("@end table", 0);
  output ("\n", 0);
}
140
/* Open an enumeration: emit a newline, then "@enumerate" on a line of
   its own.  */
static void
beginenum (void)
{
  output ("\n", 0);
  output ("@enumerate", 0);
  output ("\n", 0);
}
148
/* Close an enumeration: emit a newline, then "@end enumerate" on a
   line of its own.  */
static void
endenum (void)
{
  output ("\n", 0);
  output ("@end enumerate", 0);
  output ("\n", 0);
}
156
/* Emit a paragraph break (two newline characters), passed to output()
   with the escape flag set.  */
static void
newpara (void)
{
  output ("\n\n", 1);
}
162
163
164 static void
describe_regex_syntax(int options)165 describe_regex_syntax (int options)
166 {
167 newpara ();
168 content ("The character @samp{.} matches any single character");
169 if ( (options & RE_DOT_NEWLINE) == 0 )
170 {
171 content (" except newline");
172 }
173 if (options & RE_DOT_NOT_NULL)
174 {
175 if ( (options & RE_DOT_NEWLINE) == 0 )
176 content (" and");
177 else
178 content (" except");
179
180 content (" the null character");
181 }
182 content (".");
183 newpara ();
184
185 if (!(options & RE_LIMITED_OPS))
186 {
187 begintable_markup ("@samp");
188 if (options & RE_BK_PLUS_QM)
189 {
190 enum_item ("\\+");
191 content ("indicates that the regular expression should match one"
192 " or more occurrences of the previous atom or regexp.");
193 enum_item ("\\?");
194 content ("indicates that the regular expression should match zero"
195 " or one occurrence of the previous atom or regexp.");
196 enum_item ("+ and ?");
197 content ("match themselves.\n");
198 }
199 else
200 {
201 enum_item ("+");
202 content ("indicates that the regular expression should match one"
203 " or more occurrences of the previous atom or regexp.");
204 enum_item ("?");
205 content ("indicates that the regular expression should match zero"
206 " or one occurrence of the previous atom or regexp.");
207 enum_item ("\\+");
208 literal ("matches a @samp{+}");
209 enum_item ("\\?");
210 literal ("matches a @samp{?}.");
211 }
212 endtable ();
213 }
214
215 newpara ();
216
217 content ("Bracket expressions are used to match ranges of characters. ");
218 literal ("Bracket expressions where the range is backward, for example @samp{[z-a]}, are ");
219 if (options & RE_NO_EMPTY_RANGES)
220 content ("invalid");
221 else
222 content ("ignored");
223 content (". ");
224
225 if (options & RE_BACKSLASH_ESCAPE_IN_LISTS)
226 literal ("Within square brackets, @samp{\\} can be used to quote "
227 "the following character. ");
228 else
229 literal ("Within square brackets, @samp{\\} is taken literally. ");
230
231 if (options & RE_CHAR_CLASSES)
232 content ("Character classes are supported; for example "
233 "@samp{[[:digit:]]} will match a single decimal digit.\n");
234 else
235 literal ("Character classes are not supported, so for example "
236 "you would need to use @samp{[0-9]} "
237 "instead of @samp{[[:digit:]]}.\n");
238
239 if (options & RE_HAT_LISTS_NOT_NEWLINE)
240 {
241 literal ("Non-matching lists @samp{[^@dots{}]} do not ever match newline.\n");
242 }
243 newpara ();
244 if (options & RE_NO_GNU_OPS)
245 {
246 content ("GNU extensions are not supported and so "
247 "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} "
248 "match "
249 "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively.\n");
250 }
251 else
252 {
253 content ("GNU extensions are supported:");
254 beginenum ();
255 enum_item ("@samp{\\w} matches a character within a word");
256 enum_item ("@samp{\\W} matches a character which is not within a word");
257 enum_item ("@samp{\\<} matches the beginning of a word");
258 enum_item ("@samp{\\>} matches the end of a word");
259 enum_item ("@samp{\\b} matches a word boundary");
260 enum_item ("@samp{\\B} matches characters which are not a word boundary");
261 enum_item ("@samp{\\`} matches the beginning of the whole input");
262 enum_item ("@samp{\\'} matches the end of the whole input");
263 endenum ();
264 }
265
266 newpara ();
267
268
269 if (options & RE_NO_BK_PARENS)
270 {
271 literal ("Grouping is performed with parentheses @samp{()}. ");
272
273 if (options & RE_UNMATCHED_RIGHT_PAREN_ORD)
274 literal ("An unmatched @samp{)} matches just itself. ");
275 }
276 else
277 {
278 literal ("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}. ");
279 }
280
281 if (options & RE_NO_BK_REFS)
282 {
283 content ("A backslash followed by a digit matches that digit.");
284 }
285 else
286 {
287 literal ("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number. For example @samp{\\2} matches the second group expression. The order of group expressions is determined by the position of their opening parenthesis ");
288 if (options & RE_NO_BK_PARENS)
289 literal ("@samp{(}");
290 else
291 literal ("@samp{\\(}");
292 content (".");
293 }
294
295
296 newpara ();
297 if (!(options & RE_LIMITED_OPS))
298 {
299 if (options & RE_NO_BK_VBAR)
300 literal ("The alternation operator is @samp{|}.");
301 else
302 literal ("The alternation operator is @samp{\\|}.");
303 }
304 newpara ();
305
306 if (options & RE_CONTEXT_INDEP_ANCHORS)
307 {
308 literal ("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets. Within brackets, @samp{^} can be used to invert the membership of the character class being specified.\n");
309 }
310 else
311 {
312 literal ("The character @samp{^} only represents the beginning of a string when it appears:");
313 beginenum ();
314 enum_item ("At the beginning of a regular expression");
315 if (options & RE_NO_BK_PARENS)
316 {
317 enum_item ("After an open-group, signified by @samp{(}");
318 }
319 else
320 {
321 enum_item ("After an open-group, signified by @samp{\\(}");
322 }
323 newline ();
324 if (!(options & RE_LIMITED_OPS))
325 {
326 if (options & RE_NEWLINE_ALT)
327 enum_item ("After a newline");
328
329 if (options & RE_NO_BK_VBAR )
330 enum_item ("After the alternation operator @samp{|}");
331 else
332 enum_item ("After the alternation operator @samp{\\|}");
333 }
334 endenum ();
335
336 newpara ();
337 literal ("The character @samp{$} only represents the end of a string when it appears:");
338 beginenum ();
339 enum_item ("At the end of a regular expression");
340 if (options & RE_NO_BK_PARENS)
341 {
342 enum_item ("Before a close-group, signified by @samp{)}");
343 }
344 else
345 {
346 enum_item ("Before a close-group, signified by @samp{\\)}");
347 }
348 if (!(options & RE_LIMITED_OPS))
349 {
350 if (options & RE_NEWLINE_ALT)
351 enum_item ("Before a newline");
352
353 if (options & RE_NO_BK_VBAR)
354 enum_item ("Before the alternation operator @samp{|}");
355 else
356 enum_item ("Before the alternation operator @samp{\\|}");
357 }
358 endenum ();
359 }
360 newpara ();
361 if (!(options & RE_LIMITED_OPS) )
362 {
363 if ((options & RE_CONTEXT_INDEP_OPS)
364 && !(options & RE_CONTEXT_INVALID_OPS))
365 {
366 literal ("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression.\n");
367 }
368 else
369 {
370 if (options & RE_BK_PLUS_QM)
371 literal ("@samp{\\*}, @samp{\\+} and @samp{\\?} ");
372 else
373 literal ("@samp{*}, @samp{+} and @samp{?} ");
374
375 if (options & RE_CONTEXT_INVALID_OPS)
376 {
377 content ("are special at any point in a regular expression except the following places, where they are not allowed:");
378 }
379 else
380 {
381 content ("are special at any point in a regular expression except:");
382 }
383
384 beginenum ();
385 enum_item ("At the beginning of a regular expression");
386 if (options & RE_NO_BK_PARENS)
387 {
388 enum_item ("After an open-group, signified by @samp{(}");
389 }
390 else
391 {
392 enum_item ("After an open-group, signified by @samp{\\(}");
393 }
394 if (!(options & RE_LIMITED_OPS))
395 {
396 if (options & RE_NEWLINE_ALT)
397 enum_item ("After a newline");
398
399 if (options & RE_NO_BK_VBAR)
400 enum_item ("After the alternation operator @samp{|}");
401 else
402 enum_item ("After the alternation operator @samp{\\|}");
403 }
404 endenum ();
405 }
406 }
407
408
409 newpara ();
410 if (options & RE_INTERVALS)
411 {
412 if (options & RE_NO_BK_BRACES)
413 {
414 literal ("Intervals are specified by @samp{@{} and @samp{@}}.\n");
415 if (options & RE_INVALID_INTERVAL_ORD)
416 {
417 literal ("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}");
418 }
419 else
420 {
421 literal ("Invalid intervals such as @samp{a@{1z} are not accepted.\n");
422 }
423 }
424 else
425 {
426 literal ("Intervals are specified by @samp{\\@{} and @samp{\\@}}.\n");
427 if (options & RE_INVALID_INTERVAL_ORD)
428 {
429 literal ("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}");
430 }
431 else
432 {
433 literal ("Invalid intervals such as @samp{a\\@{1z} are not accepted.\n");
434 }
435 }
436 }
437
438 newpara ();
439 if (options & RE_NO_POSIX_BACKTRACKING)
440 {
441 content ("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match.");
442 }
443 else
444 {
445 content ("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups.");
446 }
447 newpara ();
448 }
449
450
/* Emit the copying conditions for the generated file, one comment line
   per entry of the table below.  An empty entry produces a bare "@c"
   line.  */
static void
copying (void)
{
  static const char *const copy_para[] =
    {
      /* The copyright year number range is with "--" in Texinfo files.  */
      "Copyright (C) 1994--2021 Free Software Foundation, Inc.",
      "",
      "Permission is granted to copy, distribute and/or modify this document",
      "under the terms of the GNU Free Documentation License, Version 1.3 or",
      "any later version published by the Free Software Foundation; with no",
      "Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.",
      "A copy of the license is included in the ``GNU Free",
      "Documentation License'' file as part of this distribution.",
      /* This closing blank entry needs its own separating comma above;
         without one it is concatenated onto the preceding string and
         never emitted.  */
      "",
      NULL                      /* end-of-table marker */
    };
  const char *const *line;

  for (line = copy_para; *line != NULL; ++line)
    comment (*line);
}
472
/* Return nonzero if the regular expression type with index IX should
   be left out for CONTEXT, that is, when the value returned by
   get_regex_type_context (IX) has no bit in common with CONTEXT.  */
static int
ignore (int ix, const unsigned int context)
{
  return !(get_regex_type_context (ix) & context);
}
478
/* Emit a "@menu" ... "@end menu" block with one entry
   "* NAME regular expression syntax::" for each regular expression
   type that is not ignored for CONTEXT.  Types are visited by
   increasing index until get_regex_type_name() returns a null
   pointer.  */
static void
menu (unsigned int context)
{
  int idx = 0;

  literal ("@menu\n");
  for (;;)
    {
      const char *type_name;

      /* The flags value is not used when building the menu.  */
      (void) get_regex_type_flags (idx);
      type_name = get_regex_type_name (idx);
      if (type_name == NULL)
        break;

      if (!ignore (idx, context))
        {
          literal ("* ");
          literal (type_name);
          content (" regular expression syntax");
          literal ("::");
          newline ();
        }
      ++idx;
    }
  literal ("@end menu\n");
}
502
503
504
/* Starting at index IX, find the first regular expression type that is
   not ignored for CONTEXT and return its name.  Return the empty
   string if the end of the list (a null name) is reached first.  */
static const char *
get_next (unsigned int ix, unsigned int context)
{
  for (; get_regex_type_name (ix) != NULL; ++ix)
    {
      const char *candidate;

      if (ignore (ix, context))
        continue;

      candidate = get_regex_type_name (ix);
      return candidate ? candidate : "";
    }
  return "";
}
523
524
/* Generate the whole document for one context.

   Emits the copying conditions, a comment line naming CONTEXTNAME, the
   menu for CONTEXT, and then one subsection per regular expression
   type that is not ignored for CONTEXT.  A type for which
   get_regex_type_synonym() returns a non-negative index is described
   as a synonym of that type; any other type gets the full description
   of its syntax flags.  Ignored types are reported on standard error.
   UP is passed through to begin_subsection().  */
static void
describe_all (const char *contextname,
              unsigned int context,
              const char *up)
{
  const char *previous = "";
  int i;

  copying ();
  newline ();
  literal ("@c this regular expression description is for: ");
  literal (contextname);
  newline ();
  newline ();
  menu (context);

  for (i = 0; ; ++i)
    {
      const int options = get_regex_type_flags (i);
      const char *const name = get_regex_type_name (i);
      const char *next;
      int parent;

      if (name == NULL)
        break;                  /* end of the list of types */

      if (ignore (i, context))
        {
          fprintf (stderr,
                   "Skipping regexp type %s for context %s\n",
                   name, contextname);
          continue;
        }

      next = get_next (i+1, context);
      if (next == NULL)
        next = "";
      begin_subsection (name, next, previous, up);

      parent = get_regex_type_synonym (i, context);
      if (parent < 0)
        {
          describe_regex_syntax (options);
        }
      else
        {
          content ("This is a synonym for ");
          content (get_regex_type_name (parent));
          content (".");
        }
      previous = name;
    }
}
576
577
578
579 int
main(int argc,char * argv[])580 main (int argc, char *argv[])
581 {
582 const char *up = "";
583 unsigned int context = CONTEXT_ALL;
584 const char *contextname = "all";
585
586 if (argc)
587 set_program_name (argv[0]);
588 else
589 set_program_name ("regexprops");
590
591 if (argc > 1)
592 {
593 up = argv[1];
594 }
595 if (argc > 2)
596 {
597 contextname = argv[2];
598 if (0 == strcmp (contextname, "findutils"))
599 context = CONTEXT_FINDUTILS;
600 else if (0 == strcmp (contextname, "generic"))
601 context = CONTEXT_GENERIC;
602 else if (0 == strcmp (contextname, "all"))
603 context = CONTEXT_ALL;
604 else
605 {
606 fprintf (stderr, "Unexpected context %s",
607 contextname);
608 return 1;
609 }
610 }
611
612 describe_all (contextname, context, up);
613 return 0;
614 }
615