1#lang scribble/doc
2@(require scribble/bnf
3          "mz.rkt"
4          "rx.rkt"
5          (for-syntax racket/base))
6
7@title[#:tag "regexp"]{Regular Expressions}
8
9@section-index{regexps}
10@section-index{pattern matching}
11@section-index["strings" "pattern matching"]
12@section-index["input ports" "pattern matching"]
13
14@(define-syntax (rx-examples stx)
15  (syntax-case stx ()
16   [(_ [num rx input] ...)
17    (with-syntax ([(ex ...)
18                   (map (lambda (num rx input)
19                          `(eval:alts #,(racket
20                                         (code:line
21                                          (regexp-match ,rx ,input)
22                                          (code:comment @#,t["ex"
23                                                             (let ([s (number->string ,num)])
24                                                               (elemtag `(rxex ,s)
25                                                                        (racketcommentfont s)))
26                                                             ,(if (pregexp? (syntax-e rx))
27                                                                  `(list ", uses " (racketmetafont "#px"))
28                                                                  "")])))
29                                      (regexp-match ,rx ,input)))
30                        (syntax->list #'(num ...))
31                        (syntax->list #'(rx ...))
32                        (syntax->list #'(input ...)))])
33      #`(examples ex ...))]))
34
35@guideintro["regexp"]{regular expressions}
36
37@deftech{Regular expressions} are specified as strings or byte
38strings, using the same pattern language as either the Unix utility
39@exec{egrep} or Perl. A string-specified pattern produces a character
40regexp matcher, and a byte-string pattern produces a byte regexp
41matcher. If a character regexp is used with a byte string or input
42port, it matches UTF-8 encodings (see @secref["encodings"]) of
43matching character streams; if a byte regexp is used with a character
44string, it matches bytes in the UTF-8 encoding of the string.
45
46A regular expression that is represented as a string or byte string
47can be compiled to a @deftech{regexp value}, which can be used more
48efficiently by functions such as @racket[regexp-match] compared to the
49string or byte string form. The @racket[regexp] and
50@racket[byte-regexp] procedures convert a string or byte string
51(respectively) into a regexp value using a syntax of regular
52expressions that is most compatible to @exec{egrep}. The
53@racket[pregexp] and @racket[byte-pregexp] procedures produce a regexp
54value using a slightly different syntax of regular expressions that is
55more compatible with Perl.
56
57Two @tech{regexp values} are @racket[equal?] if they have the same
58source, use the same pattern language, and are both character regexps
59or both byte regexps.
60
61A literal or printed @tech{regexp value} starts with @litchar{#rx} or
62@litchar{#px}. @see-read-print["regexp"]{regular expressions} Regexp
63values produced by the default reader are @tech{interned} in
64@racket[read-syntax] mode.
65
66On the @tech[#:doc '(lib "scribblings/guide/guide.scrbl")]{BC} variant of Racket,
67the internal size of a @tech{regexp value} is limited to 32 kilobytes; this
68limit roughly corresponds to a source string with 32,000 literal
69characters or 5,000 operators.
70
71@;------------------------------------------------------------------------
72@section[#:tag "regexp-syntax"]{Regexp Syntax}
73
74The following syntax specifications describe the content of a string
75that represents a regular expression. The syntax of the corresponding
76string may involve extra escape characters. For example, the regular
77expression @litchar{(.*)\1} can be represented with the string
78@racket["(.*)\\1"] or the regexp constant @racket[#rx"(.*)\\1"]; the
79@litchar{\} in the regular expression must be escaped to include it
80in a string or regexp constant.
81
82The @racket[regexp] and @racket[pregexp] syntaxes share a common core:
83
84@common-table
85
86The following completes the grammar for @racket[regexp], which treats
87@litchar["{"] and @litchar["}"] as literals, @litchar{\} as a
88literal within ranges, and @litchar{\} as a literal producer
89outside of ranges.
90
91@rx-table
92
93The following completes the grammar for @racket[pregexp], which uses
94@litchar["{"] and @litchar["}"] bounded repetition and uses
95@litchar{\} for meta-characters both inside and outside of ranges.
96
97@px-table
98
99In case-insensitive mode, a backreference of the form
100@litchar{\}@nonterm{n} matches case-insensitively only with respect to
101ASCII characters.
102
103The Unicode categories follow.
104
105@category-table
106
107@rx-examples[
108[1 #rx"a|b" "cat"]
109[2 #rx"[at]" "cat"]
110[3 #rx"ca*[at]" "caaat"]
111[4 #rx"ca+[at]" "caaat"]
112[5 #rx"ca?t?" "ct"]
113[6 #rx"ca*?[at]" "caaat"]
114[7 #px"ca{2}" "caaat"]
115[8 #px"ca{2,}t" "catcaat"]
116[9 #px"ca{,2}t" "caaatcat"]
117[10 #px"ca{1,2}t" "caaatcat"]
118[11 #rx"(c*)(a*)" "caat"]
119[12 #rx"[^ca]" "caat"]
120[13 #rx".(.)." "cat"]
121[14 #rx"^a|^c" "cat"]
122[15 #rx"a$|t$" "cat"]
123[16 #px"c(.)\\1t" "caat"]
124[17 #px".\\b." "cat in hat"]
125[18 #px".\\B." "cat in hat"]
126[19 #px"\\p{Ll}" "Cat"]
127[20 #px"\\P{Ll}" "cat!"]
128[21 #rx"\\|" "c|t"]
129[22 #rx"[a-f]*" "cat"]
130[23 #px"[a-f\\d]*" "1cat"]
131[24 #px" [\\w]" "cat hat"]
132[25 #px"t[\\s]" "cat\nhat"]
133[26 #px"[[:lower:]]+" "Cat"]
134[27 #rx"[]]" "c]t"]
135[28 #rx"[-]" "c-t"]
136[29 #rx"[]a[]+" "c[a]t"]
137[30 #rx"[a^]+" "ca^t"]
138[31 #rx".a(?=p)" "cat nap"]
139[32 #rx".a(?!t)" "cat nap"]
140[33 #rx"(?<=n)a." "cat nap"]
141[34 #rx"(?<!c)a." "cat nap"]
142[35 #rx"(?i:a)[tp]" "cAT nAp"]
143[36 #rx"(?(?<=c)a|b)+" "cabal"]
144]
145
146@;------------------------------------------------------------------------
147@section{Additional Syntactic Constraints}
148
149In addition to matching a grammar, regular expressions must meet two
150syntactic restrictions:
151
152@itemize[
153
154 @item{In a @nonterm{repeat} other than @nonterm{atom}@litchar{?},
155       the @nonterm{atom} must not match an empty sequence.}
156
157 @item{In a @litchar{(?<=}@nonterm{regexp}@litchar{)} or
158       @litchar{(?<!}@nonterm{regexp}@litchar{)},
159       the @nonterm{regexp} must match a bounded sequence only.}
160
161]
162
163These constraints are checked syntactically by the following type
164system. A type [@math{n}, @math{m}] corresponds to an expression that
165matches between @math{n} and @math{m} characters. In the rule for
166@litchar{(}@nonterm{Regexp}@litchar{)}, @math{N} means the number such
167that the opening parenthesis is the @math{N}th opening parenthesis for
168collecting match reports.  Non-emptiness is inferred for a
169backreference pattern, @litchar{\}@nonterm{N}, so that a
170backreference can be used for repetition patterns; in the case of
171mutual dependencies among backreferences, the inference chooses the
172fixpoint that maximizes non-emptiness.  Finiteness is not inferred for
173backreferences (i.e., a backreference is assumed to match an
174arbitrarily large sequence). No syntactic constraint prohibits a
175backreference within the group that it references, although such self
176references might create a pattern with no possible matches (as in the
177case of @litchar{(.\1)}, although @litchar{(^.|\1){2}} matches an
178input that starts with the same two characters).
179
180@type-table
181
182@;------------------------------------------------------------------------
183@section{Regexp Constructors}
184
185@defproc[(regexp? [v any/c]) boolean?]{
186
187Returns @racket[#t] if @racket[v] is a @tech{regexp value} created by
188@racket[regexp] or @racket[pregexp], @racket[#f] otherwise.}
189
190
191@defproc[(pregexp? [v any/c]) boolean?]{
192
193Returns @racket[#t] if @racket[v] is a @tech{regexp value} created by
194@racket[pregexp] (not @racket[regexp]), @racket[#f] otherwise.}
195
196
197@defproc[(byte-regexp? [v any/c]) boolean?]{
198
199Returns @racket[#t] if @racket[v] is a @tech{regexp value} created by
200@racket[byte-regexp] or @racket[byte-pregexp], @racket[#f] otherwise.}
201
202
203@defproc[(byte-pregexp? [v any/c]) boolean?]{
204
205Returns @racket[#t] if @racket[v] is a @tech{regexp value} created by
206@racket[byte-pregexp] (not @racket[byte-regexp]), @racket[#f]
207otherwise.}
208
209
210@defproc*[([(regexp [str string?]) regexp?]
211           [(regexp [str string?]
212                    [handler (or/c #f (string? -> any))])
213            any])]{
214
215Takes a string representation of a regular expression (using the
216syntax in @secref["regexp-syntax"]) and compiles it into a @tech{regexp
217value}. Other regular expression procedures accept either a string or a
218@tech{regexp value} as the matching pattern. If a regular expression string
219is used multiple times, it is faster to compile the string once to a
220@tech{regexp value} and use it for repeated matches instead of using the
221string each time.
222
223If @racket[handler] is provided and not @racket[#f], it is called and
224its result is returned when @racket[str] is not a valid representation
225of a regular expression; the argument to @racket[handler] is a string
226that describes the problem with @racket[str]. If @racket[handler] is
227@racket[#f] or not provided, then @exnraise[exn:fail:contract].
228
229The @racket[object-name] procedure returns
230the source string for a @tech{regexp value}.
231
232@examples[
233(regexp "ap*le")
234(object-name #rx"ap*le")
235(regexp "+" (λ (s) (list s)))
236]
237
238@history[#:changed "6.5.0.1" @elem{Added the @racket[handler] argument.}]}
239
240@defproc*[([(pregexp [str string?]) pregexp?]
241           [(pregexp [str string?]
242                     [handler (or/c #f (string? -> any))])
243            any])]{
244
245Like @racket[regexp], except that it uses a slightly different syntax
246(see @secref["regexp-syntax"]). The result can be used with
247@racket[regexp-match], etc., just like the result from
248@racket[regexp].
249
250@examples[
251(pregexp "ap*le")
252(regexp? #px"ap*le")
253(pregexp "+" (λ (s) (vector s)))
254]
255
256@history[#:changed "6.5.0.1" @elem{Added the @racket[handler] argument.}]}
257
258@defproc*[([(byte-regexp [bstr bytes?]) byte-regexp?]
259           [(byte-regexp [bstr bytes?]
260                         [handler (or/c #f (bytes? -> any))])
261            any])]{
262
263Takes a byte-string representation of a regular expression (using the
264syntax in @secref["regexp-syntax"]) and compiles it into a
265byte-@tech{regexp value}.
266
267If @racket[handler] is provided, it is called and its result is returned
268if @racket[str] is not a valid representation of a regular expression.
269
270The @racket[object-name] procedure
271returns the source byte string for a @tech{regexp value}.
272
273@examples[
274(byte-regexp #"ap*le")
275(object-name #rx#"ap*le")
276(eval:error (byte-regexp "ap*le"))
277(byte-regexp #"+" (λ (s) (list s)))
278]
279
280@history[#:changed "6.5.0.1" @elem{Added the @racket[handler] argument.}]}
281
282@defproc*[([(byte-pregexp [bstr bytes?]) byte-pregexp?]
283           [(byte-pregexp [bstr bytes?]
284                          [handler (or/c #f (bytes? -> any))])
285            any])]{
286
287Like @racket[byte-regexp], except that it uses a slightly different
288syntax (see @secref["regexp-syntax"]). The result can be used with
289@racket[regexp-match], etc., just like the result from
290@racket[byte-regexp].
291
292@examples[
293(byte-pregexp #"ap*le")
294(byte-pregexp #"+" (λ (s) (vector s)))
295]
296
297@history[#:changed "6.5.0.1" @elem{Added the @racket[handler] argument.}]}
298
299@defproc*[([(regexp-quote [str string?] [case-sensitive? any/c #t]) string?]
300           [(regexp-quote [bstr bytes?] [case-sensitive? any/c #t]) bytes?])]{
301
302Produces a string or byte string suitable for use with @racket[regexp]
303to match the literal sequence of characters in @racket[str] or
304sequence of bytes in @racket[bstr]. If @racket[case-sensitive?] is
305true (the default), the resulting regexp matches letters in
306@racket[str] or @racket[bytes] case-sensitively, otherwise it matches
307case-insensitively.
308
309@examples[
310(regexp-match "." "apple.scm")
311(regexp-match (regexp-quote ".") "apple.scm")
312]}
313
314@defproc[(regexp-max-lookbehind [pattern (or/c regexp? byte-regexp?)])
315         exact-nonnegative-integer?]{
316
317Returns the maximum number of bytes that @racket[pattern] may consult
318before the starting position of a match to determine the match. For
319example, the pattern @litchar{(?<=abc)d} consults three bytes
320preceding a matching @litchar{d}, while @litchar{e(?<=a..)d} consults
321two bytes before a matching @litchar{ed}. A @litchar{^} pattern may
322consult a preceding byte to determine whether the current position is
323the start of the input or of a line.}
324
325@;------------------------------------------------------------------------
326@section{Regexp Matching}
327
328@defproc[(regexp-match [pattern (or/c string? bytes? regexp? byte-regexp?)]
329                       [input (or/c string? bytes? path? input-port?)]
330                       [start-pos exact-nonnegative-integer? 0]
331                       [end-pos (or/c exact-nonnegative-integer? #f) #f]
332                       [output-port (or/c output-port? #f) #f]
333                       [input-prefix bytes? #""])
334         (if (and (or (string? pattern) (regexp? pattern))
335                  (or (string? input) (path? input)))
336             (or/c #f (cons/c string? (listof (or/c string? #f))))
337             (or/c #f (cons/c bytes?  (listof (or/c bytes?  #f)))))]{
338
339Attempts to match @racket[pattern] (a string, byte string,
340@tech{regexp value}, or byte-@tech{regexp value}) once to a portion of
341@racket[input].  The matcher finds a portion of @racket[input] that
342matches and is closest to the start of the input (after
343@racket[start-pos]).
344
345If @racket[input] is a path, it is converted to a byte string with
346@racket[path->bytes] if @racket[pattern] is a byte string or a
347byte-based regexp. Otherwise, @racket[input] is converted to a string
348with @racket[path->string].
349
350The optional @racket[start-pos] and @racket[end-pos] arguments select
351a portion of @racket[input] for matching; the default is the entire
352string or the stream up to an end-of-file. When @racket[input] is a
353string, @racket[start-pos] is a character position; when
354@racket[input] is a byte string, then @racket[start-pos] is a byte
355position; and when @racket[input] is an input port, @racket[start-pos]
356is the number of bytes to skip before starting to match. The
357@racket[end-pos] argument can be @racket[#f], which corresponds to the
358end of the string or an end-of-file in the stream; otherwise, it is a
359character or byte position, like @racket[start-pos]. If @racket[input]
360is an input port, and if an end-of-file is reached before
361@racket[start-pos] bytes are skipped, then the match fails.
362
363In @racket[pattern], a start-of-string @litchar{^} refers to the first
364position of @racket[input] after @racket[start-pos], assuming that
365@racket[input-prefix] is @racket[#""].  The end-of-input @litchar{$}
366refers to the @racket[end-pos]th position or (in the case of an input
367port) an end-of-file, whichever comes first.
368
369The @racket[input-prefix] specifies bytes that effectively precede
370@racket[input] for the purposes of @litchar{^} and other look-behind
371matching. For example, a @racket[#""] prefix means that @litchar{^}
372matches at the beginning of the stream, while a @racket[#"\n"]
373@racket[input-prefix] means that a start-of-line @litchar{^} can match
374the beginning of the input, while a start-of-file @litchar{^} cannot.
375
376If the match fails, @racket[#f] is returned. If the match succeeds, a
377list containing strings or byte string, and possibly @racket[#f], is
378returned. The list contains strings only if @racket[input] is a string
379and @racket[pattern] is not a byte regexp. Otherwise, the list
380contains byte strings (substrings of the UTF-8 encoding of
381@racket[input], if @racket[input] is a string).
382
383The first [byte] string in a result list is the portion of
384@racket[input] that matched @racket[pattern]. If two portions of
385@racket[input] can match @racket[pattern], then the match that starts
386earliest is found.
387
388Additional [byte] strings are returned in the list if @racket[pattern]
389contains parenthesized sub-expressions (but not when the opening
390parenthesis is followed by @litchar{?}). Matches for the
391sub-expressions are provided in the order of the opening parentheses
392in @racket[pattern]. When sub-expressions occur in branches of an
393@litchar{|} ``or'' pattern, in a @litchar{*} ``zero or more''
394pattern, or other places where the overall pattern can succeed without
395a match for the sub-expression, then a @racket[#f] is returned for the
396sub-expression if it did not contribute to the final match. When a
397single sub-expression occurs within a @litchar{*} ``zero or more''
398pattern or other multiple-match positions, then the rightmost match
399associated with the sub-expression is returned in the list.
400
401If the optional @racket[output-port] is provided as an output port,
402the part of @racket[input] from its beginning (not @racket[start-pos])
403that precedes the match is written to the port. All of @racket[input]
404up to @racket[end-pos] is written to the port if no match is
405found. This functionality is most useful when @racket[input] is an
406input port.
407
408When matching an input port, a match failure reads up to
409@racket[end-pos] bytes (or end-of-file), even if @racket[pattern]
410begins with a start-of-string @litchar{^}; see also
411@racket[regexp-try-match]. On success, all bytes up to and including
412the match are eventually read from the port, but matching proceeds by
413first peeking bytes from the port (using @racket[peek-bytes-avail!]),
414and then (re@-~-)reading matching bytes to discard them after the match
415result is determined. Non-matching bytes may be read and discarded
416before the match is determined. The matcher peeks in blocking mode
417only as far as necessary to determine a match, but it may peek extra
418bytes to fill an internal buffer if immediately available (i.e.,
419without blocking). Greedy repeat operators in @racket[pattern], such
420as @litchar{*} or @litchar{+}, tend to force reading the entire
421content of the port (up to @racket[end-pos]) to determine a match.
422
423If the input port is read simultaneously by another thread, or if the
424port is a custom port with inconsistent reading and peeking procedures
425(see @secref["customport"]), then the bytes that are peeked and
426used for matching may be different than the bytes read and discarded
427after the match completes; the matcher inspects only the peeked
428bytes. To avoid such interleaving, use @racket[regexp-match-peek]
429(with a @racket[progress-evt] argument) followed by
430@racket[port-commit-peeked].
431
432@examples[
433(regexp-match #rx"x." "12x4x6")
434(regexp-match #rx"y." "12x4x6")
435(regexp-match #rx"x." "12x4x6" 3)
436(regexp-match #rx"x." "12x4x6" 3 4)
437(regexp-match #rx#"x." "12x4x6")
438(regexp-match #rx"x." "12x4x6" 0 #f (current-output-port))
439(regexp-match #rx"(-[0-9]*)+" "a-12--345b")
440]}
441
442
443@defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)]
444                        [input (or/c string? bytes? path? input-port?)]
445                        [start-pos exact-nonnegative-integer? 0]
446                        [end-pos (or/c exact-nonnegative-integer? #f) #f]
447                        [input-prefix bytes? #""]
448                        [#:match-select match-select
449                         (or/c (list? . -> . (or/c any/c list?))
450                               #f)
451                         car]
452                        [#:gap-select? gap-select any/c #f])
453         (if (and (or (string? pattern) (regexp? pattern))
454                  (or (string? input) (path? input)))
455             (listof (or/c string? (listof (or/c #f string?))))
456             (listof (or/c bytes? (listof (or/c #f bytes?)))))]{
457
458Like @racket[regexp-match], but the result is a list of strings or
459byte strings corresponding to a sequence of matches of
460@racket[pattern] in @racket[input].
461
462The @racket[pattern] is used in order to find matches, where each
463match attempt starts at the end of the last match, and @litchar{^} is
464allowed to match the beginning of the input (if @racket[input-prefix]
465is @racket[#""]) only for the first match.  Empty matches are handled
466like other matches, returning a zero-length string or byte sequence
467(they are more useful in making this a complement of
468@racket[regexp-split]), but @racket[pattern] is restricted from
469matching an empty sequence immediately after an empty match.
470
471If @racket[input] contains no matches (in the range @racket[start-pos]
472to @racket[end-pos]), @racket[null] is returned. Otherwise, each item
473in the resulting list is a distinct substring or byte sequence from
474@racket[input] that matches @racket[pattern]. The @racket[end-pos]
475argument can be @racket[#f] to match to the end of @racket[input]
476(which corresponds to an end-of-file if @racket[input] is an input
477port).
478
479@examples[
480(regexp-match* #rx"x." "12x4x6")
481(regexp-match* #rx"x*" "12x4x6")
482]
483
484@racket[match-select] specifies the collected results.  The default of
485@racket[car] means that the result is the list of matches without
486returning parenthesized sub-patterns.  It can be given as a `selector'
487function which chooses an item from a list, or it can choose a list of
488items.  For example, you can use @racket[cdr] to get a list of lists
489of parenthesized sub-patterns matches, or @racket[values] (as an
490identity function) to get the full matches as well.  (Note that the
491selector must choose an element of its input list or a list of
492elements, but it must not inspect its input as they can be either a
493list of strings or a list of position pairs.  Furthermore, the
494selector must be consistent in its choice(s).)
495
496@examples[
497(regexp-match* #rx"x(.)" "12x4x6" #:match-select cadr)
498(regexp-match* #rx"x(.)" "12x4x6" #:match-select values)
499]
500
501In addition, specifying @racket[gap-select] as a non-@racket[#f] value
502will make the result an interleaved list of the matches as well as the
503separators between them matches, starting and ending with a separator.
504In this case, @racket[match-select] can be given as @racket[#f] to
505return @emph{only} the separators, making such uses equivalent to
506@racket[regexp-split].
507
508@examples[
509(regexp-match* #rx"x(.)" "12x4x6" #:match-select cadr #:gap-select? #t)
510(regexp-match* #rx"x(.)" "12x4x6" #:match-select #f #:gap-select? #t)
511]}
512
513
514@defproc[(regexp-try-match [pattern (or/c string? bytes? regexp? byte-regexp?)]
515                           [input input-port?]
516                           [start-pos exact-nonnegative-integer? 0]
517                           [end-pos (or/c exact-nonnegative-integer? #f) #f]
518                           [output-port (or/c output-port? #f) #f]
519                           [input-prefix bytes? #""])
520         (or/c #f (cons/c bytes? (listof (or/c bytes? #f))))]{
521
522Like @racket[regexp-match] on input ports, except that if the match
523fails, no characters are read and discarded from @racket[in].
524
525This procedure is especially useful with a @racket[pattern] that
526begins with a start-of-string @litchar{^} or with a non-@racket[#f]
527@racket[end-pos], since each limits the amount of peeking into the
528port. Otherwise, beware that a large portion of the stream may be
529peeked (and therefore pulled into memory) before the match succeeds or
530fails.}
531
532
533@defproc[(regexp-match-positions [pattern (or/c string? bytes? regexp? byte-regexp?)]
534                                 [input (or/c string? bytes? path? input-port?)]
535                                 [start-pos exact-nonnegative-integer? 0]
536                                 [end-pos (or/c exact-nonnegative-integer? #f) #f]
537                                 [output-port (or/c output-port? #f) #f]
538                                 [input-prefix bytes? #""])
539          (or/c (cons/c (cons/c exact-nonnegative-integer?
540                                exact-nonnegative-integer?)
541                        (listof (or/c (cons/c exact-integer?
542                                              exact-integer?)
543                                      #f)))
544                #f)]{
545
546Like @racket[regexp-match], but returns a list of number pairs (and
547@racket[#f]) instead of a list of strings. Each pair of numbers refers
548to a range of characters or bytes in @racket[input]. If the result for
549the same arguments with @racket[regexp-match] would be a list of byte
550strings, the resulting ranges correspond to byte ranges; in that case,
551if @racket[input] is a character string, the byte ranges correspond to
552bytes in the UTF-8 encoding of the string.
553
554Range results are returned in a @racket[substring]- and
555@racket[subbytes]-compatible manner, independent of
556@racket[start-pos]. In the case of an input port, the returned
557positions indicate the number of bytes that were read, including
558@racket[start-pos], before the first matching byte.
559
560@examples[
561(regexp-match-positions #rx"x." "12x4x6")
562(regexp-match-positions #rx"x." "12x4x6" 3)
563(regexp-match-positions #rx"(-[0-9]*)+" "a-12--345b")
564]
565
566Range results after the first one can include negative numbers if
567@racket[input-prefix] is non-empty and if @racket[pattern] includes a
568lookbehind pattern. Such ranges start in the @racket[input-prefix]
569instead of @racket[input]. More generally, when @racket[start-pos] is
570positive, then range results that are less than @racket[start-pos]
571start in @racket[input-prefix].
572
573@examples[
574(regexp-match-positions #rx"(?<=(.))." "a" 0 #f #f #"x")
575(regexp-match-positions #rx"(?<=(..))." "a" 0 #f #f #"x")
576(regexp-match-positions #rx"(?<=(..))." "_a" 1 #f #f #"x")
577]
578
579Although @racket[input-prefix] is always a byte string, when the
580returned positions are string indices and they refer to a portion of
581@racket[input-prefix], then they correspond to a UTF-8 decoding of
582a tail of @racket[input-prefix].
583
584@examples[
585(bytes-length (string->bytes/utf-8 "\u3BB"))
586(regexp-match-positions #rx"(?<=(.))." "a" 0 #f #f (string->bytes/utf-8 "\u3BB"))
587]}
588
589@defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
590                                  [input (or/c string? bytes? path? input-port?)]
591                                  [start-pos exact-nonnegative-integer? 0]
592                                  [end-pos (or/c exact-nonnegative-integer? #f) #f]
593                                  [input-prefix bytes? #""]
594                                  [#:match-select match-select
595                                   (list? . -> . (or/c any/c list?))
596                                   car])
597         (or/c (listof (cons/c exact-nonnegative-integer?
598                               exact-nonnegative-integer?))
599               (listof (listof (or/c #f (cons/c exact-nonnegative-integer?
600                                                exact-nonnegative-integer?)))))]{
601
602Like @racket[regexp-match-positions], but returns multiple matches
603like @racket[regexp-match*].
604
605@examples[
606(regexp-match-positions* #rx"x." "12x4x6")
607(regexp-match-positions* #rx"x(.)" "12x4x6" #:match-select cadr)
608]
609
610Note that unlike @racket[regexp-match*], there is no
611@racket[#:gap-select?] input keyword, as this information can be easily
612inferred from the resulting matches.
613}
614
615
616@defproc[(regexp-match? [pattern (or/c string? bytes? regexp? byte-regexp?)]
617                        [input (or/c string? bytes? path? input-port?)]
618                        [start-pos exact-nonnegative-integer? 0]
619                        [end-pos (or/c exact-nonnegative-integer? #f) #f]
620                        [output-port (or/c output-port? #f) #f]
621                        [input-prefix bytes? #""])
622           boolean?]{
623
624Like @racket[regexp-match], but returns merely @racket[#t] when the
625match succeeds, @racket[#f] otherwise.
626
627@examples[
628(regexp-match? #rx"x." "12x4x6")
629(regexp-match? #rx"y." "12x4x6")
630]}
631
632
633@defproc[(regexp-match-exact? [pattern (or/c string? bytes? regexp? byte-regexp?)]
634                              [input (or/c string? bytes? path?)])
635          boolean?]{
636
637Like @racket[regexp-match?], but @racket[#t] is only returned when the
638first found match is to the entire content of @racket[input].
639
640@examples[
641(regexp-match-exact? #rx"x." "12x4x6")
642(regexp-match-exact? #rx"1.*x." "12x4x6")
643]
644
645Beware that @racket[regexp-match-exact?] can return @racket[#f] if
646@racket[pattern] generates a partial match for @racket[input] first, even if
647@racket[pattern] could also generate a complete match. To check if there is any
648match of @racket[pattern] that covers all of @racket[input], use
649@racket[rexexp-match?] with @elem{@litchar{^(?:}@racket[pattern]@litchar{)$}}
650instead.
651
652@examples[
653(regexp-match-exact? #rx"a|ab" "ab")
654(regexp-match? #rx"^(?:a|ab)$" "ab")
655]
656
657The @litchar{(?:)} grouping is necessary because concatenation has
658lower precedence than alternation; the regular expression without it,
659@litchar{^a|ab$}, matches any input that either starts with
660@litchar{a} or ends with @litchar{ab}.
661
662@examples[
663(regexp-match? #rx"^a|ab$" "123ab")
664]}
665
666
667@defproc[(regexp-match-peek [pattern (or/c string? bytes? regexp? byte-regexp?)]
668                            [input input-port?]
669                            [start-pos exact-nonnegative-integer? 0]
670                            [end-pos (or/c exact-nonnegative-integer? #f) #f]
671                            [progress (or/c evt #f) #f]
672                            [input-prefix bytes? #""])
673          (or/c (cons/c bytes? (listof (or/c bytes? #f)))
674                #f)]{
675
676Like @racket[regexp-match] on input ports, but only peeks bytes from
677@racket[input] instead of reading them. Furthermore, instead of
678an output port, the last optional argument is a progress event for
679@racket[input] (see @racket[port-progress-evt]). If @racket[progress]
680becomes ready, then the match stops peeking from @racket[input]
681and returns @racket[#f]. The @racket[progress] argument can be
682@racket[#f], in which case the peek may continue with inconsistent
683information if another process meanwhile reads from
684@racket[input].
685
686@examples[
687(define p (open-input-string "a abcd"))
688(regexp-match-peek ".*bc" p)
689(regexp-match-peek ".*bc" p 2)
690(regexp-match ".*bc" p 2)
691(peek-char p)
692(regexp-match ".*bc" p)
693(peek-char p)
694]}
695
696
697@defproc[(regexp-match-peek-positions [pattern (or/c string? bytes? regexp? byte-regexp?)]
698                            [input input-port?]
699                            [start-pos exact-nonnegative-integer? 0]
700                            [end-pos (or/c exact-nonnegative-integer? #f) #f]
701                            [progress (or/c evt #f) #f]
702                            [input-prefix bytes? #""])
703          (or/c (cons/c (cons/c exact-nonnegative-integer?
704                                exact-nonnegative-integer?)
705                        (listof (or/c (cons/c exact-nonnegative-integer?
706                                              exact-nonnegative-integer?)
707                                      #f)))
708                #f)]{
709
710Like @racket[regexp-match-positions] on input ports, but only peeks
711bytes from @racket[input] instead of reading them, and with a
712@racket[progress] argument like @racket[regexp-match-peek].}
713
714
715@defproc[(regexp-match-peek-immediate [pattern (or/c string? bytes? regexp? byte-regexp?)]
716                            [input input-port?]
717                            [start-pos exact-nonnegative-integer? 0]
718                            [end-pos (or/c exact-nonnegative-integer? #f) #f]
719                            [progress (or/c evt #f) #f]
720                            [input-prefix bytes? #""])
721          (or/c (cons/c bytes? (listof (or/c bytes? #f)))
722                #f)]{
723
724Like @racket[regexp-match-peek], but it attempts to match only bytes
725that are available from @racket[input] without blocking.  The
726match fails if not-yet-available characters might be used to match
727@racket[pattern].}
728
729
730@defproc[(regexp-match-peek-positions-immediate [pattern (or/c string? bytes? regexp? byte-regexp?)]
731                            [input input-port?]
732                            [start-pos exact-nonnegative-integer? 0]
733                            [end-pos (or/c exact-nonnegative-integer? #f) #f]
734                            [progress (or/c evt #f) #f]
735                            [input-prefix bytes? #""])
736          (or/c (cons/c (cons/c exact-nonnegative-integer?
737                                exact-nonnegative-integer?)
738                        (listof (or/c (cons/c exact-nonnegative-integer?
739                                              exact-nonnegative-integer?)
740                                      #f)))
741                #f)]{
742
743Like @racket[regexp-match-peek-positions], but it attempts to match
744only bytes that are available from @racket[input] without
745blocking. The match fails if not-yet-available characters might be
746used to match @racket[pattern].}
747
748
749@defproc[(regexp-match-peek-positions*
750                            [pattern (or/c string? bytes? regexp? byte-regexp?)]
751                            [input input-port?]
752                            [start-pos exact-nonnegative-integer? 0]
753                            [end-pos (or/c exact-nonnegative-integer? #f) #f]
754                            [input-prefix bytes? #""]
755                            [#:match-select match-select
756                             (list? . -> . (or/c any/c list?))
757                             car])
758         (or/c (listof (cons/c exact-nonnegative-integer?
759                               exact-nonnegative-integer?))
760               (listof (listof (or/c #f (cons/c exact-nonnegative-integer?
761                                                exact-nonnegative-integer?)))))]{
762
763Like @racket[regexp-match-peek-positions], but returns multiple matches like
764@racket[regexp-match-positions*].}
765
766@defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
767                       [input (or/c string? bytes? path? input-port?)]
768                       [start-pos exact-nonnegative-integer? 0]
769                       [end-pos (or/c exact-nonnegative-integer? #f) #f]
770                       [output-port (or/c output-port? #f) #f]
771                       [input-prefix bytes? #""]
772                       [count exact-nonnegative-integer? 1])
773         (values
774          (if (and (or (string? pattern) (regexp? pattern))
775                   (or/c (string? input) (path? input)))
776              (or/c #f (cons/c string? (listof (or/c string? #f))))
777              (or/c #f (cons/c bytes?  (listof (or/c bytes?  #f)))))
778          (or/c #f bytes?))]{
779
780Like @racket[regexp-match], but with a second result: a byte
781string of up to @racket[count] bytes that correspond to the input
782(possibly including the @racket[input-prefix]) leading to the end of
783the match; the second result is @racket[#f] if no match is found.
784
785The second result can be useful as an @racket[input-prefix] for
786attempting a second match on @racket[input] starting from the end of
787the first match. In that case, use @racket[regexp-max-lookbehind]
788to determine an appropriate value for @racket[count].}
789
790@deftogether[(
791@defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
792                                  [input (or/c string? bytes? path? input-port?)]
793                                  [start-pos exact-nonnegative-integer? 0]
794                                  [end-pos (or/c exact-nonnegative-integer? #f) #f]
795                                  [input-prefix bytes? #""]
796                                  [count exact-nonnegative-integer? 1])
797         (values (listof (cons/c exact-nonnegative-integer?
798                                 exact-nonnegative-integer?))
799                 (or/c #f bytes?))]
800@defproc[(regexp-match-peek-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
801                            [input input-port?]
802                            [start-pos exact-nonnegative-integer? 0]
803                            [end-pos (or/c exact-nonnegative-integer? #f) #f]
804                            [progress (or/c evt #f) #f]
805                            [input-prefix bytes? #""]
806                            [count exact-nonnegative-integer? 1])
807         (values
808          (or/c (cons/c (cons/c exact-nonnegative-integer?
809                                exact-nonnegative-integer?)
810                        (listof (or/c (cons/c exact-nonnegative-integer?
811                                              exact-nonnegative-integer?)
812                                      #f)))
813                #f)
814          (or/c #f bytes?))]
815@defproc[(regexp-match-peek-positions-immediate/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
816                            [input input-port?]
817                            [start-pos exact-nonnegative-integer? 0]
818                            [end-pos (or/c exact-nonnegative-integer? #f) #f]
819                            [progress (or/c evt #f) #f]
820                            [input-prefix bytes? #""]
821                            [count exact-nonnegative-integer? 1])
822         (values
823          (or/c (cons/c (cons/c exact-nonnegative-integer?
824                                exact-nonnegative-integer?)
825                        (listof (or/c (cons/c exact-nonnegative-integer?
826                                              exact-nonnegative-integer?)
827                                      #f)))
828                #f)
829          (or/c #f bytes?))]
830)]{
831
832Like @racket[regexp-match-positions], etc., but with a second result
833like @racket[regexp-match/end].}
834
835@;------------------------------------------------------------------------
836@section{Regexp Splitting}
837
838@defproc[(regexp-split [pattern (or/c string? bytes? regexp? byte-regexp?)]
839                       [input (or/c string? bytes? input-port?)]
840                       [start-pos exact-nonnegative-integer? 0]
841                       [end-pos (or/c exact-nonnegative-integer? #f) #f]
842                       [input-prefix bytes? #""])
843         (if (and (or (string? pattern) (regexp? pattern))
844                  (string? input))
845             (cons/c string? (listof string?))
846             (cons/c bytes? (listof bytes?)))]{
847
848The complement of @racket[regexp-match*]: the result is a list of
849strings (if @racket[pattern] is a string or character regexp and
850@racket[input] is a string) or byte strings (otherwise) from
851@racket[input] that are separated by matches to
852@racket[pattern]. Adjacent matches are separated with @racket[""] or
853@racket[#""]. Zero-length matches are treated the same as for
854@racket[regexp-match*].
855
856If @racket[input] contains no matches (in the range @racket[start-pos]
857to @racket[end-pos]), the result is a list containing @racket[input]'s
858content (from @racket[start-pos] to @racket[end-pos]) as a single
859element. If a match occurs at the beginning of @racket[input] (at
860@racket[start-pos]), the resulting list will start with an empty
861string or byte string, and if a match occurs at the end (at
862@racket[end-pos]), the list will end with an empty string or byte
863string. The @racket[end-pos] argument can be @racket[#f], in which
864case splitting goes to the end of @racket[input] (which corresponds to
865an end-of-file if @racket[input] is an input port).
866
867@examples[
868(regexp-split #rx" +" "12  34")
869(regexp-split #rx"." "12  34")
870(regexp-split #rx"" "12  34")
871(regexp-split #rx" *" "12  34")
872(regexp-split #px"\\b" "12, 13 and 14.")
873(regexp-split #rx" +" "")
874]}
875
876@;------------------------------------------------------------------------
877@section{Regexp Substitution}
878
879@defproc[(regexp-replace [pattern (or/c string? bytes? regexp? byte-regexp?)]
880                         [input (or/c string? bytes?)]
881                         [insert (or/c string? bytes?
882                                       ((string?) () #:rest (listof string?) . ->* . string?)
883                                       ((bytes?) () #:rest (listof bytes?) . ->* . bytes?))]
884                         [input-prefix bytes? #""])
885         (if (and (or (string? pattern) (regexp? pattern))
886                  (string? input))
887             string?
888             bytes?)]{
889
890Performs a match using @racket[pattern] on @racket[input], and then
891returns a string or byte string in which the matching portion of
892@racket[input] is replaced with @racket[insert].  If @racket[pattern]
893matches no part of @racket[input], then @racket[input] is returned
894unmodified.
895
896The @racket[insert] argument can be either a (byte) string, or a
897function that returns a (byte) string. In the latter case, the
898function is applied on the list of values that @racket[regexp-match]
899would return (i.e., the first argument is the complete match, and then
900one argument for each parenthesized sub-expression) to obtain a
901replacement (byte) string.
902
903If @racket[pattern] is a string or character regexp and @racket[input]
904is a string, then @racket[insert] must be a string or a procedure that
905accept strings, and the result is a string. If @racket[pattern] is a
906byte string or byte regexp, or if @racket[input] is a byte string,
907then @racket[insert] as a string is converted to a byte string,
908@racket[insert] as a procedure is called with a byte string, and the
909result is a byte string.
910
911If @racket[insert] contains @litchar{&}, then @litchar{&}
912is replaced with the matching portion of @racket[input] before it is
913substituted into the match's place.  If @racket[insert] contains
914@litchar{\}@nonterm{n} for some integer @nonterm{n}, then it is
915replaced with the @nonterm{n}th matching sub-expression from
916@racket[input]. A @litchar{&} and @litchar{\0} are aliases. If
917the @nonterm{n}th sub-expression was not used in the match, or if
918@nonterm{n} is greater than the number of sub-expressions in
919@racket[pattern], then @litchar{\}@nonterm{n} is replaced with the
920empty string.
921
922To substitute a literal @litchar{&} or @litchar{\}, use
923@litchar{\&} and @litchar{\\}, respectively, in
924@racket[insert]. A @litchar{\$} in @racket[insert] is
925equivalent to an empty sequence; this can be used to terminate a
926number @nonterm{n} following @litchar{\}. If a @litchar{\} in
927@racket[insert] is followed by anything other than a digit,
928@litchar{&}, @litchar{\}, or @litchar{$}, then the @litchar{\}
929by itself is treated as @litchar{\0}.
930
931Note that the @litchar{\} described in the previous paragraphs is a
932character or byte of @racket[input]. To write such an @racket[input]
933as a Racket string literal, an escaping @litchar{\} is needed
934before the @litchar{\}. For example, the Racket constant
935@racket["\\1"] is @litchar{\1}.
936
937@examples[
938(regexp-replace #rx"mi" "mi casa" "su")
939(regexp-replace #rx"mi" "mi casa" string-upcase)
940(regexp-replace #rx"([Mm])i ([a-zA-Z]*)" "Mi Casa" "\\1y \\2")
941(regexp-replace #rx"([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi"
942                "\\1y \\2")
943(regexp-replace #rx"x" "12x4x6" "\\\\")
944(display (regexp-replace #rx"x" "12x4x6" "\\\\"))
945]}
946
947@defproc[(regexp-replace* [pattern (or/c string? bytes? regexp? byte-regexp?)]
948                          [input (or/c string? bytes?)]
949                          [insert (or/c string? bytes?
950                                        ((string?) () #:rest (listof string?) . ->* . string?)
951                                        ((bytes?) () #:rest (listof bytes?) . ->* . bytes?))]
952                          [start-pos exact-nonnegative-integer? 0]
953                          [end-pos (or/c exact-nonnegative-integer? #f) #f]
954                          [input-prefix bytes? #""])
955         (or/c string? bytes?)]{
956
957Like @racket[regexp-replace], except that every instance of
958@racket[pattern] in @racket[input] is replaced with @racket[insert],
959instead of just the first match. The result is @racket[input] only if
960there are no matches, @racket[start-pos] is @racket[0], and
961@racket[end-pos] is @racket[#f] or the length of @racket[input].
962Only non-overlapping instances of
963@racket[pattern] in @racket[input] are replaced, so instances of
964@racket[pattern] within inserted strings are @italic{not} replaced
965recursively. Zero-length matches are treated the same as in
966@racket[regexp-match*].
967
968The optional @racket[start-pos] and @racket[end-pos] arguments select
969a portion of @racket[input] for matching; the default is the entire
970string or the stream up to an end-of-file.
971
972@examples[
973(regexp-replace* #rx"([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi"
974                 "\\1y \\2")
975(regexp-replace* #rx"([Mm])i ([a-zA-Z]*)" "mi cerveza Mi Mi Mi"
976                 (lambda (all one two)
977                   (string-append (string-downcase one) "y"
978                                  (string-upcase two))))
979(regexp-replace* #px"\\w" "hello world" string-upcase 0 5)
980(display (regexp-replace* #rx"x" "12x4x6" "\\\\"))
981]
982
983@history[#:changed "8.1.0.7" @elem{Changed to return @racket[input] when no
984                                   replacements are performed.}]}
985
986@defproc[(regexp-replaces [input (or/c string? bytes?)]
987                          [replacements
988                           (listof
989                            (list/c (or/c string? bytes? regexp? byte-regexp?)
990                                    (or/c string? bytes?
991                                        ((string?) () #:rest (listof string?) . ->* . string?)
992                                        ((bytes?) () #:rest (listof bytes?) . ->* . bytes?))))])
993         (or/c string? bytes?)]{
994
995Performs a chain of @racket[regexp-replace*] operations, where each
996element in @racket[replacements] specifies a replacement as a
997@racket[(list pattern replacement)].  The replacements are done in
998order, so later replacements can apply to previous insertions.
999
1000@examples[
1001(regexp-replaces "zero-or-more?"
1002                 '([#rx"-" "_"] [#rx"(.*)\\?$" "is_\\1"]))
1003(regexp-replaces "zero-or-more?"
1004                 '([#rx"e" "o"] [#rx"o" "oo"]))
1005]}
1006
1007@defproc*[([(regexp-replace-quote [str string?]) string?]
1008           [(regexp-replace-quote [bstr bytes?]) bytes?])]{
1009
1010Produces a string suitable for use as the third argument to
1011@racket[regexp-replace] to insert the literal sequence of characters
1012in @racket[str] or bytes in @racket[bstr] as a replacement.
1013Concretely, every @litchar{\} and @litchar{&} in @racket[str] or
1014@racket[bstr] is protected by a quoting @litchar{\}.
1015
1016@examples[
1017(regexp-replace #rx"UT" "Go UT!" "A&M")
1018(regexp-replace #rx"UT" "Go UT!" (regexp-replace-quote "A&M"))
1019]}
1020