xref: /dragonfly/lib/libc/tre-regex/regex.3 (revision 9348a738)
1.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
2.\" Copyright (c) 1992, 1993, 1994
3.\"	The Regents of the University of California.  All rights reserved.
4.\"
5.\" This code is derived from software contributed to Berkeley by
6.\" Henry Spencer.
7.\"
8.\" Redistribution and use in source and binary forms, with or without
9.\" modification, are permitted provided that the following conditions
10.\" are met:
11.\" 1. Redistributions of source code must retain the above copyright
12.\"    notice, this list of conditions and the following disclaimer.
13.\" 2. Redistributions in binary form must reproduce the above copyright
14.\"    notice, this list of conditions and the following disclaimer in the
15.\"    documentation and/or other materials provided with the distribution.
16.\" 3. Neither the name of the University nor the names of its contributors
17.\"    may be used to endorse or promote products derived from this software
18.\"    without specific prior written permission.
19.\"
20.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30.\" SUCH DAMAGE.
31.\"
32.\"	@(#)regex.3	8.4 (Berkeley) 3/20/94
33.\" $FreeBSD: src/lib/libc/regex/regex.3,v 1.21 2007/01/09 00:28:04 imp Exp $
34.\"
35.Dd August 20, 2015
36.Dt REGEX 3
37.Os
38.Sh NAME
39.Nm regcomp ,
40.Nm regcomp_l ,
41.Nm regerror ,
42.Nm regexec ,
43.Nm regfree ,
44.Nm regncomp ,
45.Nm regncomp_l ,
46.Nm regnexec ,
47.Nm regwcomp ,
48.Nm regwcomp_l ,
49.Nm regwexec ,
50.Nm regwncomp ,
51.Nm regwncomp_l ,
52.Nm regwnexec
53.Nd regular-expression library
54.Sh LIBRARY
55.Lb libc
56.Sh SYNOPSIS
57.Sy (Standards-compliant APIs)
58.Pp
59.In regex.h
60.Ft int
61.Fo regcomp
62.Fa "regex_t *restrict preg"
63.Fa "const char *restrict pattern"
64.Fa "int cflags"
65.Fc
66.Ft size_t
67.Fo regerror
68.Fa "int errcode"
69.Fa "const regex_t *restrict preg"
70.Fa "char *restrict errbuf"
71.Fa "size_t errbuf_size"
72.Fc
73.Ft int
74.Fo regexec
75.Fa "const regex_t *restrict preg"
76.Fa "const char *restrict string"
77.Fa "size_t nmatch"
78.Fa "regmatch_t pmatch[restrict]"
79.Fa "int eflags"
80.Fc
81.Ft void
82.Fo regfree
83.Fa "regex_t *preg"
84.Fc
85.Pp
86.Sy (Non-portable extensions)
87.Ft int
88.Fo regncomp
89.Fa "regex_t *restrict preg"
90.Fa "const char *restrict pattern"
91.Fa "size_t len"
92.Fa "int cflags"
93.Fc
94.Ft int
95.Fo regnexec
96.Fa "const regex_t *restrict preg"
97.Fa "const char *restrict string"
98.Fa "size_t len"
99.Fa "size_t nmatch"
100.Fa "regmatch_t pmatch[restrict]"
101.Fa "int eflags"
102.Fc
103.Ft int
104.Fo regwcomp
105.Fa "regex_t *restrict preg"
106.Fa "const wchar_t *restrict widepat"
107.Fa "int cflags"
108.Fc
109.Ft int
110.Fo regwexec
111.Fa "const regex_t *restrict preg"
112.Fa "const wchar_t *restrict widestr"
113.Fa "size_t nmatch"
114.Fa "regmatch_t pmatch[restrict]"
115.Fa "int eflags"
116.Fc
117.Ft int
118.Fo regwncomp
119.Fa "regex_t *restrict preg"
120.Fa "const wchar_t *restrict widepat"
121.Fa "size_t len"
122.Fa "int cflags"
123.Fc
124.Ft int
125.Fo regwnexec
126.Fa "const regex_t *restrict preg"
127.Fa "const wchar_t *restrict widestr"
128.Fa "size_t len"
129.Fa "size_t nmatch"
130.Fa "regmatch_t pmatch[restrict]"
131.Fa "int eflags"
132.Fc
133.In regex.h
134.In xlocale.h
135.Ft int
136.Fo regcomp_l
137.Fa "regex_t *restrict preg"
138.Fa "const char *restrict pattern"
139.Fa "int cflags"
140.Fa "locale_t restrict"
141.Fc
142.Ft int
143.Fo regncomp_l
144.Fa "regex_t *restrict preg"
145.Fa "const char *restrict pattern"
146.Fa "size_t len"
147.Fa "int cflags"
148.Fa "locale_t restrict"
149.Fc
150.Ft int
151.Fo regwcomp_l
152.Fa "regex_t *restrict preg"
153.Fa "const wchar_t *restrict widepat"
154.Fa "int cflags"
155.Fa "locale_t restrict"
156.Fc
157.Ft int
158.Fo regwncomp_l
159.Fa "regex_t *restrict preg"
160.Fa "const wchar_t *restrict widepat"
161.Fa "size_t len"
162.Fa "int cflags"
163.Fa "locale_t restrict"
164.Fc
165.Sh DESCRIPTION
166These routines implement
167.St -p1003.2
168regular expressions
169.Pq Do RE Dc Ns s ;
170see
171.Xr re_format 7 .
172The
173.Fn regcomp
174function
175compiles an RE, written as a string, into an internal form.
176.Fn regexec
177matches that internal form against a string and reports results.
178.Fn regerror
179transforms error codes from either into human-readable messages.
180.Fn regfree
181frees any dynamically-allocated storage used by the internal form
182of an RE.
183.Pp
184The header
185.In regex.h
186declares two structure types,
187.Ft regex_t
188and
189.Ft regmatch_t ,
190the former for compiled internal forms and the latter for match reporting.
191It also declares the four functions,
192a type
193.Ft regoff_t ,
194and a number of constants with names starting with
195.Dq Dv REG_ .
196.Pp
197The
198.Fn regcomp
199function
200compiles the regular expression contained in the
201.Fa pattern
202string,
203subject to the flags in
204.Fa cflags ,
205and places the results in the
206.Ft regex_t
207structure pointed to by
208.Fa preg .
209The
210.Fa cflags
211argument
212is the bitwise OR of zero or more of the following flags:
213.Bl -tag -width REG_EXTENDED
214.It Dv REG_EXTENDED
215Compile modern
216.Pq Dq extended
217REs,
218rather than the obsolete
219.Pq Dq basic
220REs that
221are the default.
222.It Dv REG_BASIC
223This is a synonym for 0,
224provided as a counterpart to
225.Dv REG_EXTENDED
226to improve readability.
227.It Dv REG_NOSPEC
228Compile with recognition of all special characters turned off.
229All characters are thus considered ordinary,
230so the
231.Dq RE
232is a literal string.
233This is an extension,
234compatible with but not specified by
235.St -p1003.2 ,
236and should be used with
237caution in software intended to be portable to other systems.
238.Dv REG_EXTENDED
239and
240.Dv REG_NOSPEC
241may not be used
242in the same call to
243.Fn regcomp .
244.It Dv REG_LITERAL
245An alias of
246.Dv REG_NOSPEC .
247.It Dv REG_ICASE
248Compile for matching that ignores upper/lower case distinctions.
249See
250.Xr re_format 7 .
251.It Dv REG_NOSUB
252Compile for matching that need only report success or failure,
253not what was matched.
254.It Dv REG_NEWLINE
255Compile for newline-sensitive matching.
256By default, newline is a completely ordinary character with no special
257meaning in either REs or strings.
258With this flag,
259.Ql [^
260bracket expressions and
261.Ql .\&
262never match newline,
263a
264.Ql ^\&
265anchor matches the null string after any newline in the string
266in addition to its normal function,
267and the
268.Ql $\&
269anchor matches the null string before any newline in the
270string in addition to its normal function.
271.It Dv REG_PEND
272(Note that
273.Dv REG_PEND
274is not recognized by any of the wide character or
275.Dq Nm n
276variants.
277Besides, the
278.Dq Nm n
279variants can be used instead of
280.Dv REG_PEND ;
281see EXTENDED APIS below.)
282The regular expression ends,
283not at the first NUL,
284but just before the character pointed to by the
285.Va re_endp
286member of the structure pointed to by
287.Fa preg .
288The
289.Va re_endp
290member is of type
291.Ft "const char *" .
292This flag permits inclusion of NULs in the RE;
293they are considered ordinary characters.
294This is an extension,
295compatible with but not specified by
296.St -p1003.2 ,
297and should be used with
298caution in software intended to be portable to other systems.
299.It Dv REG_ENHANCED
300Recognize enhanced regular expression features; see
301.Xr re_format 7
302for details.
303This is an extension not specified by
304.St -p1003.2 ,
305and should be used with
306caution in software intended to be portable to other systems.
307.It Dv REG_MINIMAL
308Use minimal (non-greedy) repetitions instead of the normal greedy ones; see
309.Xr re_format 7
310for details.
311(This only applies when both
312.Dv REG_ENHANCED
313and
314.Dv REG_EXTENDED
315are also set.)
316This is an extension not specified by
317.St -p1003.2 ,
318and should be used with
319caution in software intended to be portable to other systems.
320.It Dv REG_UNGREEDY
321Alias of
322.Dv REG_MINIMAL .
323.El
324.Pp
325When successful,
326.Fn regcomp
327returns 0 and fills in the structure pointed to by
328.Fa preg .
329One member of that structure
330(other than
331.Va re_endp )
332is publicized:
333.Va re_nsub ,
334of type
335.Ft size_t ,
336contains the number of parenthesized subexpressions within the RE
337(except that the value of this member is undefined if the
338.Dv REG_NOSUB
339flag was used).
340If
341.Fn regcomp
342fails, it returns a non-zero error code;
343see
344.Sx DIAGNOSTICS .
345.Pp
346The
347.Fn regexec
348function
349matches the compiled RE pointed to by
350.Fa preg
351against the
352.Fa string ,
353subject to the flags in
354.Fa eflags ,
355and reports results using
356.Fa nmatch ,
357.Fa pmatch ,
358and the returned value.
359The RE must have been compiled by a previous invocation of
360.Fn regcomp .
361The compiled form is not altered during execution of
362.Fn regexec ,
363so a single compiled RE can be used simultaneously by multiple threads.
364.Pp
365By default,
366the NUL-terminated string pointed to by
367.Fa string
368is considered to be the text of an entire line, minus any terminating
369newline.
370The
371.Fa eflags
372argument is the bitwise OR of zero or more of the following flags:
373.Bl -tag -width REG_STARTEND
374.It Dv REG_NOTBOL
375The first character of
376the string
377is not the beginning of a line, so the
378.Ql ^\&
379anchor should not match before it.
380This does not affect the behavior of newlines under
381.Dv REG_NEWLINE .
382.It Dv REG_NOTEOL
383The NUL terminating
384the string
385does not end a line, so the
386.Ql $\&
387anchor should not match before it.
388This does not affect the behavior of newlines under
389.Dv REG_NEWLINE .
390.It Dv REG_STARTEND
391The string is considered to start at
392.Fa string
393+
394.Fa pmatch Ns [0]. Ns Va rm_so
395and to have a terminating NUL located at
396.Fa string
397+
398.Fa pmatch Ns [0]. Ns Va rm_eo
399(there need not actually be a NUL at that location),
400regardless of the value of
401.Fa nmatch .
402See below for the definition of
403.Fa pmatch
404and
405.Fa nmatch .
406This is an extension,
407compatible with but not specified by
408.St -p1003.2 ,
409and should be used with
410caution in software intended to be portable to other systems.
411Note that a non-zero
412.Va rm_so
413does not imply
414.Dv REG_NOTBOL ;
415.Dv REG_STARTEND
416affects only the location of the string,
417not how it is matched.
418.El
419.Pp
420See
421.Xr re_format 7
422for a discussion of what is matched in situations where an RE or a
423portion thereof could match any of several substrings of
424.Fa string .
425.Pp
426Normally,
427.Fn regexec
428returns 0 for success and the non-zero code
429.Dv REG_NOMATCH
430for failure.
431Other non-zero error codes may be returned in exceptional situations;
432see
433.Sx DIAGNOSTICS .
434.Pp
435If
436.Dv REG_NOSUB
437was specified in the compilation of the RE,
438or if
439.Fa nmatch
440is 0,
441.Fn regexec
442ignores the
443.Fa pmatch
444argument (but see below for the case where
445.Dv REG_STARTEND
446is specified).
447Otherwise,
448.Fa pmatch
449points to an array of
450.Fa nmatch
451structures of type
452.Ft regmatch_t .
453Such a structure has at least the members
454.Va rm_so
455and
456.Va rm_eo ,
457both of type
458.Ft regoff_t
459(a signed arithmetic type at least as large as an
460.Ft off_t
461and a
462.Ft ssize_t ) ,
463containing respectively the offset of the first character of a substring
464and the offset of the first character after the end of the substring.
465Offsets are measured from the beginning of the
466.Fa string
467argument given to
468.Fn regexec .
469An empty substring is denoted by equal offsets,
470both indicating the character following the empty substring.
471.Pp
472The 0th member of the
473.Fa pmatch
474array is filled in to indicate what substring of
475.Fa string
476was matched by the entire RE.
477Remaining members report what substring was matched by parenthesized
478subexpressions within the RE;
479member
480.Va i
481reports subexpression
482.Va i ,
483with subexpressions counted (starting at 1) by the order of their opening
484parentheses in the RE, left to right.
485Unused entries in the array (corresponding either to subexpressions that
486did not participate in the match at all, or to subexpressions that do not
487exist in the RE (that is,
488.Va i
489>
490.Fa preg Ns -> Ns Va re_nsub ) )
491have both
492.Va rm_so
493and
494.Va rm_eo
495set to -1.
496If a subexpression participated in the match several times,
497the reported substring is the last one it matched.
498(Note, as an example in particular, that when the RE
499.Ql "(b*)+"
500matches
501.Ql bbb ,
502the parenthesized subexpression matches each of the three
503.So Li b Sc Ns s
504and then
505an infinite number of empty strings following the last
506.Ql b ,
507so the reported substring is one of the empties.)
508.Pp
509If
510.Dv REG_STARTEND
511is specified,
512.Fa pmatch
513must point to at least one
514.Ft regmatch_t
515(even if
516.Fa nmatch
517is 0 or
518.Dv REG_NOSUB
519was specified),
520to hold the input offsets for
521.Dv REG_STARTEND .
522Use for output is still entirely controlled by
523.Fa nmatch ;
524if
525.Fa nmatch
526is 0 or
527.Dv REG_NOSUB
528was specified,
529the value of
530.Fa pmatch Ns [0]
531will not be changed by a successful
532.Fn regexec .
533.Pp
534The
535.Fn regerror
536function
537maps a non-zero
538.Fa errcode
539from either
540.Fn regcomp
541or
542.Fn regexec
543to a human-readable, printable message.
544If
545.Fa preg
546is
547.No non\- Ns Dv NULL ,
548the error code should have arisen from use of
549the
550.Ft regex_t
551pointed to by
552.Fa preg ,
553and if the error code came from
554.Fn regcomp ,
555it should have been the result from the most recent
556.Fn regcomp
557using that
558.Ft regex_t .
559(The
560.Fn regerror
561may be able to supply a more detailed message using information
562from the
563.Ft regex_t . )
564The
565.Fn regerror
566function
567places the NUL-terminated message into the buffer pointed to by
568.Fa errbuf ,
569limiting the length (including the NUL) to at most
570.Fa errbuf_size
571bytes.
572If the whole message will not fit,
573as much of it as will fit before the terminating NUL is supplied.
574In any case,
575the returned value is the size of buffer needed to hold the whole
576message (including terminating NUL).
577If
578.Fa errbuf_size
579is 0,
580.Fa errbuf
581is ignored but the return value is still correct.
582.Pp
583If the
584.Fa errcode
585given to
586.Fn regerror
587is first ORed with
588.Dv REG_ITOA ,
589the
590.Dq message
591that results is the printable name of the error code,
592e.g.\&
593.Dq Dv REG_NOMATCH ,
594rather than an explanation thereof.
595If
596.Fa errcode
597is
598.Dv REG_ATOI ,
599then
600.Fa preg
601shall be
602.No non\- Ns Dv NULL
603and the
604.Va re_endp
605member of the structure it points to
606must point to the printable name of an error code;
607in this case, the result in
608.Fa errbuf
609is the decimal digits of
610the numeric value of the error code
611(0 if the name is not recognized).
612.Dv REG_ITOA
613and
614.Dv REG_ATOI
615are intended primarily as debugging facilities;
616they are extensions,
617compatible with but not specified by
618.St -p1003.2 ,
619and should be used with
620caution in software intended to be portable to other systems.
621Be warned also that they are considered experimental and changes are possible.
622.Pp
623The
624.Fn regfree
625function
626frees any dynamically-allocated storage associated with the compiled RE
627pointed to by
628.Fa preg .
629The remaining
630.Ft regex_t
631is no longer a valid compiled RE
632and the effect of supplying it to
633.Fn regexec
634or
635.Fn regerror
636is undefined.
637.Pp
638None of these functions references global variables except for tables
639of constants;
640all are safe for use from multiple threads if the arguments are safe.
641.Sh EXTENDED APIS
642These extended APIs are available in Mac OS X 10.8 and beyond, when the
643deployment target is 10.8 or later.
644It should also be noted that any of the
645.Fn regcomp
646variants may be used to initialize a
647.Ft regex_t
648structure, that can then be passed to any of the
649.Fn regexec
650variants.
651So it is quite legal to compile a wide character RE and use it to match a
652multibyte character string, or vice versa.
653.Pp
654The
655.Fn regncomp
656routine compiles regular expressions like
657.Fn regcomp ,
658but the length of the regular expression string is specified, allowing a string
659that is not NUL terminated and/or contains NUL characters.
660This is a modern replacement for using
661.Fn regcomp
662with the
663.Dv REG_PEND
664option.
665.Pp
666Similarly, the
667.Fn regnexec
668routine is like
669.Fn regexec ,
670but the length of the string to match is specified, allowing a string
671that is not NUL terminated and/or contains NUL characters.
672.Pp
673The
674.Fn regwcomp
675and
676.Fn regwexec
677variants take a wide-character
678.Vt ( wchar_t )
679string for the regular expression and string to match.
680And
681.Fn regwncomp
682and
683.Fn regwnexec
684are variants that allow specifying the wide character string length, and
685so allow wide character strings that are not NUL terminated and/or
686contain NUL characters.
687.Sh INTERACTION WITH THE LOCALE
688When
689.Fn regcomp
690or one of its variants is run, the regular expression is compiled into an
691internal form, which may include specific information about the locale currently
692in effect, such as equivalence classes or multi-character collation symbols.
693So a reference to the current locale is also stored with the internal form,
694so that when
695.Fn regexec
696is run, it can use the same locale (even if the locale is changed in-between
697the calls to
698.Fn regcomp
699and
700.Fn regexec ) .
701.Pp
702To provide more direct control over which locale is used,
703routines with
704.Dq Nm _l
705appended to their names are provided that work just like the variants
706without the
707.Dq Nm _l ,
708except that a locale (via a
709.Vt locale_t
710variable type) is specified directly.
711Note that only variants of
712.Fn regcomp
713have
714.Dq Nm _l
715variants, since the
716.Fn regexec
717variants just use the reference to the locale stored in the internal form.
718.Sh IMPLEMENTATION CHOICES
719The
720.Nm regex
721implementation in Mac OS X 10.8 and later is based on a heavily modified subset
722of TRE (http://laurikari.net/tre/).
723This provides improved performance, better conformance and additional features.
724However, both API and binary compatibility have been maintained with previous
725releases, so binaries
726built on previous releases should work on 10.8 and later, and binaries built on
72710.8 and later should be able to run on previous releases (as long as none of
728the new variants or new features are used).
729.Pp
730There are a number of decisions that
731.St -p1003.2
732leaves up to the implementor,
733either by explicitly saying
734.Dq undefined
735or by virtue of them being
736forbidden by the RE grammar.
737This implementation treats them as follows.
738.Pp
739See
740.Xr re_format 7
741for a discussion of the definition of case-independent matching.
742.Pp
743There is no particular limit on the length of REs,
744except insofar as memory is limited.
745Memory usage is approximately linear in RE size, and largely insensitive
746to RE complexity, except for bounded repetitions.
747See
748.Sx BUGS
749for one short RE using them
750that will run almost any system out of memory.
751.Pp
752A backslashed character other than one specifically given a magic meaning
753by
754.St -p1003.2
755(such magic meanings occur only in obsolete
756.Bq Dq basic
757REs)
758is taken as an ordinary character.
759.Pp
760Any unmatched
761.Ql [\&
762is a
763.Dv REG_EBRACK
764error.
765.Pp
766Equivalence classes cannot begin or end bracket-expression ranges.
767The endpoint of one range cannot begin another.
768.Pp
769.Dv RE_DUP_MAX ,
770the limit on repetition counts in bounded repetitions, is 255.
771.Pp
772A repetition operator
773.Ql ( ?\& ,
774.Ql *\& ,
775.Ql +\& ,
776or bounds)
777cannot follow another
778repetition operator, except for the use of
779.Ql ?\&
780for minimal repetition (for enhanced extended REs; see
781.Xr re_format 7
782for details).
783A repetition operator cannot begin an expression or subexpression
784or follow
785.Ql ^\&
786or
787.Ql |\& .
788.Pp
789.Ql |\&
790cannot appear first or last in a (sub)expression or after another
791.Ql |\& ,
792i.e., an operand of
793.Ql |\&
794cannot be an empty subexpression.
795An empty parenthesized subexpression,
796.Ql "()" ,
797is legal and matches an
798empty (sub)string.
799An empty string is not a legal RE.
800.Pp
801A
802.Ql {\&
803followed by a digit is considered the beginning of bounds for a
804bounded repetition, which must then follow the syntax for bounds.
805A
806.Ql {\&
807.Em not
808followed by a digit is considered an ordinary character.
809.Pp
810.Ql ^\&
811and
812.Ql $\&
813beginning and ending subexpressions in obsolete
814.Pq Dq basic
815REs are anchors, not ordinary characters.
816.Sh DIAGNOSTICS
817Non-zero error codes from
818.Fn regcomp
819and
820.Fn regexec
821include the following:
822.Pp
823.Bl -tag -width REG_ECOLLATE -compact
824.It Dv REG_NOMATCH
825The
826.Fn regexec
827function
828failed to match
829.It Dv REG_BADPAT
830invalid regular expression
831.It Dv REG_ECOLLATE
832invalid collating element
833.It Dv REG_ECTYPE
834invalid character class
835.It Dv REG_EESCAPE
836.Ql \e
837applied to unescapable character
838.It Dv REG_ESUBREG
839invalid backreference number
840.It Dv REG_EBRACK
841brackets
842.Ql "[ ]"
843not balanced
844.It Dv REG_EPAREN
845parentheses
846.Ql "( )"
847not balanced
848.It Dv REG_EBRACE
849braces
850.Ql "{ }"
851not balanced
852.It Dv REG_BADBR
853invalid repetition count(s) in
854.Ql "{ }"
855.It Dv REG_ERANGE
856invalid character range in
857.Ql "[ ]"
858.It Dv REG_ESPACE
859ran out of memory
860.It Dv REG_BADRPT
861.Ql ?\& ,
862.Ql *\& ,
863or
864.Ql +\&
865operand invalid
866.It Dv REG_EMPTY
867empty (sub)expression
868.It Dv REG_ASSERT
869cannot happen - you found a bug
870.It Dv REG_INVARG
871invalid argument, e.g.\& negative-length string
872.It Dv REG_ILLSEQ
873illegal byte sequence (bad multibyte character)
874.El
875.Sh SEE ALSO
876.Xr grep 1 ,
877.Xr re_format 7
878.Pp
879.St -p1003.2 ,
880sections 2.8 (Regular Expression Notation)
881and
882B.5 (C Binding for Regular Expression Matching).
883.Sh HISTORY
884The
885.Nm regex
886implementation is based on a heavily modified subset of TRE
887(http://laurikari.net/tre/), originally written by Ville Laurikari.
888Previous releases used an implementation originally written by
889.An Henry Spencer ,
890and altered for inclusion in the
891.Bx 4.4
892distribution.
893.Sh BUGS
894The beginning-of-line and end-of-line anchors (
895.Dq ^\&
896and
897.Dq $\& )
898are currently implemented so that repetitions can not be applied to them.
899The standards are unclear about whether this is legal, but other
900.Nm regex
901packages do support this case.
902It is best to avoid this non-portable (and not really very useful) case.
903.Pp
904The back-reference code is subtle and doubts linger about its correctness
905in complex cases.
906.Pp
907The
908.Fn regexec
909variants use one of two internal matching engines.
910The normal one is linear worst-case time in the length of the text being
911searched, and quadratic worst-case time in the length of the used regular
912expression.
913When back-references are used, a slower, backtracking engine is used.
914While all backtracking matching engines suffer from extreme slowness for certain
915pathological cases, the normal engine doesn't suffer from these cases.
916It is advised to avoid back-references whenever possible.
917.Pp
918The
919.Fn regcomp
920variants
921implement bounded repetitions by macro expansion,
922which is costly in time and space if counts are large
923or bounded repetitions are nested.
924An RE like, say,
925.Ql "((((a{1,100}){1,100}){1,100}){1,100}){1,100}"
926will (eventually) run almost any existing machine out of swap space.
927.Pp
928Due to a mistake in
929.St -p1003.2 ,
930things like
931.Ql "a)b"
932are legal REs because
933.Ql )\&
934is
935a special character only in the presence of a previous unmatched
936.Ql (\& .
937This cannot be fixed until the spec is fixed.
938.Pp
939The standard's definition of back references is vague.
940For example, does
941.Ql "a\e(\e(b\e)*\e2\e)*d"
942match
943.Ql "abbbd" ?
944Until the standard is clarified,
945behavior in such cases should not be relied on.
946