1#lang racket/base
2(require "regexp.rkt"
3         "lazy-bytes.rkt"
4         "port.rkt"
5         "compile.rkt"
6         "extract.rkt"
7         "search.rkt")
8
9;; Drives a regexp matcher on a byte string, character string, or port
10
11(provide drive-regexp-match
12
13         fast-drive-regexp-match?/bytes
14         fast-drive-regexp-match?/string
15         fast-drive-regexp-match-positions/bytes
16         fast-drive-regexp-match-positions/string
17         fast-drive-regexp-match/bytes
18         fast-drive-regexp-match/string
19
20         FAST-STRING-LEN)
21
22;; ----------------------------------------
23;; Start with some (repetative) functions for the most common cases to
24;; keep the overhead low for reaching these cases.
25
26(define FAST-STRING-LEN 64)
27
28(define (fast-drive-regexp-match?/bytes rx in start-pos end-pos)
29  (define state (and (rx:regexp-references? rx)
30                     (make-vector (rx:regexp-num-groups rx) #f)))
31  (define-values (ms-pos me-pos)
32    (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state))
33  (and ms-pos #t))
34
35(define (fast-drive-regexp-match?/string rx in-str start-offset end-offset)
36  (define state (and (rx:regexp-references? rx)
37                     (make-vector (rx:regexp-num-groups rx) #f)))
38  (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str))))
39  (define-values (ms-pos me-pos)
40    (search-match rx in 0 0 (bytes-length in) state))
41  (and ms-pos #t))
42
43(define (fast-drive-regexp-match-positions/bytes rx in start-pos end-pos)
44  (define state (let ([n (rx:regexp-num-groups rx)])
45                  (and (positive? n)
46                       (make-vector n #f))))
47  (define-values (ms-pos me-pos)
48    (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state))
49  (and ms-pos
50       (if state
51           (cons (cons ms-pos me-pos) (vector->list state))
52           (list (cons ms-pos me-pos)))))
53
54(define (fast-drive-regexp-match-positions/string rx in-str start-offset end-offset)
55  (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str))))
56  (define state (let ([n (rx:regexp-num-groups rx)])
57                  (and (positive? n)
58                       (make-vector n #f))))
59  (define-values (ms-pos me-pos)
60    (search-match rx in 0 0 (bytes-length in) state))
61  (define (string-offset pos)
62    (+ start-offset (bytes-utf-8-length in #\? 0 pos)))
63  (and ms-pos
64       (cons (cons (string-offset ms-pos) (string-offset me-pos))
65             (if state
66                 (for/list ([p (in-vector state)])
67                   (and p
68                        (cons (string-offset (car p))
69                              (string-offset (cdr p)))))
70                 null))))
71
72(define (fast-drive-regexp-match/bytes rx in start-pos end-pos)
73  (define state (let ([n (rx:regexp-num-groups rx)])
74                  (and (positive? n)
75                       (make-vector n #f))))
76  (define-values (ms-pos me-pos)
77    (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state))
78  (and ms-pos
79       (cons (subbytes in ms-pos me-pos)
80             (if state
81                 (for/list ([p (in-vector state)])
82                   (and p
83                        (subbytes in (car p) (cdr p))))
84                 null))))
85
86(define (fast-drive-regexp-match/string rx in-str start-offset end-offset)
87  (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str))))
88  (define state (let ([n (rx:regexp-num-groups rx)])
89                  (and (positive? n)
90                       (make-vector n #f))))
91  (define-values (ms-pos me-pos)
92    (search-match rx in 0 0 (bytes-length in) state))
93  (and ms-pos
94       (cons (bytes->string/utf-8 in #\? ms-pos me-pos)
95             (if state
96                 (for/list ([p (in-vector state)])
97                   (and p
98                        (bytes->string/utf-8 in #\? (car p) (cdr p))))
99                 null))))
100
101;; ----------------------------------------
102;; The general case
103
104;; An "offset" refers to a position in a byte string (in bytes) string
105;; (in characters), or port (in bytes). A "pos" always refers to a
106;; position in bytes --- so, a "pos" is normalized to UTF-8 bytes in
107;; the case of a string.
108
109(define (drive-regexp-match who orig-rx orig-in orig-start-offset orig-end-offset out prefix
110                            #:search-offset [search-offset orig-start-offset]
111                            #:mode mode
112                            #:in-port-ok? [in-port-ok? #t]
113                            #:in-path-ok? [in-path-ok? #t]
114                            #:peek? [peek? #f] #:immediate-only? [immediate-only? #f]
115                            #:progress-evt [progress-evt #f]
116                            #:end-bytes? [end-bytes? #f]
117                            #:end-bytes-count [end-bytes-count #f])
118
119  (define rx (cond
120              [(rx:regexp? orig-rx) orig-rx]
121              [(string? orig-rx) (make-regexp who orig-rx #f #f #f)]
122              [(bytes? orig-rx) (make-regexp who orig-rx #f #t #f)]
123              [else (raise-argument-error who "(or/c regexp? byte-regexp? string? bytes?)" orig-rx)]))
124  (define in (if (and in-path-ok? (path? orig-in))
125                 (if (rx:regexp-bytes? rx)
126                     (path->bytes orig-in)
127                     (path->string orig-in))
128                 orig-in))
129  (unless (or (and (bytes? in) (not peek?))
130              (and (string? in) (not peek?))
131              (and in-port-ok? (input-port? in)))
132    (raise-argument-error who
133                          (cond
134                           [peek? "input-port?"]
135                           [in-port-ok? "(or/c bytes? string? input-port? path?)"]
136                           [in-path-ok? "(or/c bytes? string? path?)"]
137                           [else "(or/c bytes? string?)"])
138                          orig-in))
139
140  (define start-offset (cond
141                        [orig-start-offset
142                         (unless (exact-nonnegative-integer? orig-start-offset)
143                           (raise-argument-error who "exact-nonnegative-integer?" orig-start-offset))
144                         (check-range who "starting index" in orig-start-offset 0)
145                         orig-start-offset]
146                        [else 0]))
147  (define end-offset (cond
148                      [orig-end-offset
149                       (unless (exact-nonnegative-integer? orig-end-offset)
150                         (raise-argument-error who "(or/c #f exact-nonnegative-integer?)" orig-end-offset))
151                       (check-range who "ending index" in orig-end-offset start-offset)
152                       orig-end-offset]
153                      [(bytes? in) (bytes-length in)]
154                      [(string? in) (string-length in)]
155                      [else 'eof]))
156
157  (unless (or (not out) (output-port? out))
158    (raise-argument-error who "(or/c #f output-port?)" out))
159
160  (unless (bytes? prefix)
161    (raise-argument-error who "bytes?" prefix))
162
163  (when end-bytes?
164    (unless (exact-nonnegative-integer? end-bytes-count)
165      (raise-argument-error who "exact-nonnegative-integer?" end-bytes-count)))
166
167  (define state (and (or (not (eq? mode '?))
168                         (rx:regexp-references? rx))
169                     (let ([n (rx:regexp-num-groups rx)])
170                       (and (positive? n)
171                            (make-vector n #f)))))
172
173  ;; Separate cases for bytes, strings, and port.
174  ;; There's an annoying level of duplication here, but
175  ;; there are lots of little differences in each case.
176  (cond
177
178   ;; Bytes input, no provided prefix: ----------------------------------------
179   [(and (bytes? in)
180         (not out)
181         (equal? #"" prefix))
182    (define start-pos start-offset)
183    (define search-pos search-offset)
184    (define end-pos end-offset)
185
186    ;; Search for a match:
187    (define-values (ms-pos me-pos) (search-match rx in search-pos start-pos end-pos state))
188
189    ;; Maybe write skipped bytes:
190    (when out
191      (write-bytes in out 0 (or ms-pos end-pos)))
192
193    ;; Return match results:
194    (case (and ms-pos mode)
195      [(#f) (add-end-bytes #f end-bytes-count #f #f)]
196      [(?) #t]
197      [(positions)
198       (define positions (byte-positions->byte-positions ms-pos me-pos state))
199       (add-end-bytes positions end-bytes-count in me-pos)]
200      [(strings)
201       (define bytess (byte-positions->bytess in ms-pos me-pos state))
202       (add-end-bytes bytess end-bytes-count in me-pos)])]
203
204   ;; Sufficiently small string input, no provided prefix: --------------------
205   [(and (string? in)
206         (not out)
207         (equal? #"" prefix)
208         ((- end-offset start-offset) . < . FAST-STRING-LEN))
209    ;; `bstr-in` includes only the characters fom `start-offset` to
210    ;; `end-offset`, so the starting offset (in characters)
211    ;; corresponds to a 0 position (in bytes):
212    (define bstr-in (string->bytes/utf-8 in 0 start-offset end-offset))
213    (define search-pos (if (= start-offset search-offset)
214                           0
215                           (string-utf-8-length in start-offset search-offset)))
216    (define end-pos (bytes-length bstr-in))
217
218    ;; Search for a match:
219    (define-values (ms-pos me-pos) (search-match rx bstr-in search-pos 0 end-pos state))
220
221    ;; Maybe write skipped bytes:
222    (when out
223      (write-string in out 0 start-offset)
224      (write-bytes bstr-in out 0 (or ms-pos end-pos)))
225
226    ;; Return match results:
227    (case (and ms-pos mode)
228      [(#f) (add-end-bytes #f end-bytes-count #f #f)]
229      [(?) #t]
230      [(positions)
231       ;; If pattern is bytes-based, then results will be bytes-based:
232       (define positions
233         (cond
234          [(rx:regexp-bytes? rx)
235           (define delta (string-utf-8-length in 0 start-offset))
236           (byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
237          [else
238           (byte-positions->string-positions bstr-in ms-pos me-pos state
239                                             #:result-offset start-offset)]))
240       (add-end-bytes positions end-bytes-count bstr-in me-pos)]
241      [(strings)
242       ;; If pattern is bytes-based, then results will be bytes instead of strings:
243       (define bytes/strings
244         (cond
245          [(rx:regexp-bytes? rx)
246           (byte-positions->bytess bstr-in ms-pos me-pos state)]
247          [else
248           (byte-positions->strings bstr-in ms-pos me-pos state)]))
249       (add-end-bytes bytes/strings end-bytes-count bstr-in me-pos)])]
250
251   ;; Port input, long string input, and/or provided prefix: --------------------
252   [else
253    (define prefix-len (bytes-length prefix))
254    ;; The lazy-bytes record will include the prefix,
255    ;; and it won't include bytes/characters before
256    ;; `start-offset`:
257    (define start-pos prefix-len)
258    (define search-pos (if (= start-offset search-offset)
259                           start-pos
260                           (+ start-pos
261                              (cond
262                               [(string? in) (string-utf-8-length in start-offset search-offset)]
263                               [else (- search-offset start-offset)]))))
264    (define port-in
265      (cond
266       [(bytes? in) (open-input-bytes/no-copy in start-offset end-offset)]
267       [(string? in) (open-input-string/lazy in start-offset end-offset)]
268       [else in]))
269    (define any-bytes-left?
270      (cond
271        [(and (input-port? in)
272              (positive? start-offset))
273         (cond
274           [peek?
275            ;; Make sure we can skip over `start-offset` bytes:
276            (not (eof-object? (peek-byte port-in (sub1 start-offset))))]
277           [else
278            ;; discard skipped bytes:
279            (copy-port-bytes port-in #f start-offset)])]
280        [else #t]))
281    ;; Create a lazy string from the port:
282    (define lb-in (make-lazy-bytes port-in (if peek? start-offset 0) prefix
283                                   peek? immediate-only? progress-evt
284                                   out (max (rx:regexp-max-lookbehind rx)
285                                            (or end-bytes-count 0))
286                                   (and (input-port? in)
287                                        (not (eq? 'eof end-offset))
288                                        (- end-offset start-offset))))
289    (define end-pos (if (or (eq? 'eof end-offset)
290                            (string? in))
291                        'eof
292                        (+ start-pos
293                           (- end-offset start-offset))))
294
295    ;; Search for a match:
296    (define-values (ms-pos me-pos)
297      (if any-bytes-left?
298          (search-match rx lb-in search-pos 0 end-pos state)
299          ;; Couldn't skip past `start-offset` bytes for an input port:
300          (values #f #f)))
301
302    ;; To write and consume skipped bytes, but we'll do this only
303    ;; after we've extracted match information from the lazy byte
304    ;; string:
305    (define (write/consume-skipped)
306      (when (not peek?)
307        (cond
308         [ms-pos
309          (when out
310            ;; Flush bytes before match:
311            (lazy-bytes-advance! lb-in ms-pos #t))
312          (when (input-port? in)
313            ;; Consume bytes that correspond to match:
314            (copy-port-bytes port-in #f (- me-pos prefix-len)))]
315         [(eq? end-pos 'eof)
316          ;; Copy all remaining bytes from input to output
317          (when (or out (input-port? in))
318            (copy-port-bytes port-in out #f))]
319         [else
320          (when out
321            ;; Copy all bytes to output
322            (lazy-bytes-advance! lb-in end-pos #t))
323          (when (input-port? in)
324            ;; Consume all bytes
325            (copy-port-bytes port-in #f (- end-pos start-pos)))])))
326
327    (begin0
328
329     ;; Return match results:
330     (case (and ms-pos
331                (not (lazy-bytes-failed? lb-in))
332                mode)
333       [(#f)
334        (add-end-bytes #f end-bytes-count #f #f)]
335       [(?) #t]
336       [(positions)
337        ;; Result positions correspond to the port after `start-offset`,
338        ;; but with the prefix bytes (= `start-pos`)
339        (define bstr (lazy-bytes-bstr lb-in))
340        (define positions
341          (cond
342           [(or (not (string? in))
343                (rx:regexp-bytes? rx))
344            (define delta (- start-offset start-pos))
345            (byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
346           [else
347            ;; Some bytes may have been discarded in `lb-in`, and we
348            ;; don't know how many characters those add up to. The
349            ;; starting position `ms-pos` must be on a code-point
350            ;; boundary, and everything from `ms-pos` to `ms-end` must
351            ;; still be in `lb-in`. So, find `ms-pos` in the original
352            ;; string, and take it from there.
353            (define ms-str-pos (byte-index->string-index in start-offset (- ms-pos start-pos)))
354            (define delta (lazy-bytes-discarded-count lb-in))
355            (byte-positions->string-positions bstr ms-pos me-pos state
356                                              #:start-index (- ms-pos delta)
357                                              #:delta delta
358                                              #:result-offset (+ ms-str-pos start-offset))]))
359        (add-end-bytes positions end-bytes-count bstr (- me-pos (lazy-bytes-discarded-count lb-in)))]
360       [(strings)
361        ;; The byte string may be shifted by discarded bytes, if not
362        ;; in `peek?` mode
363        (define bstr (lazy-bytes-bstr lb-in))
364        (define delta (lazy-bytes-discarded-count lb-in))
365        (define bytes/strings
366          (cond
367           [(or (not (string? in))
368                (rx:regexp-bytes? rx))
369            (byte-positions->bytess bstr ms-pos me-pos state #:delta delta)]
370           [else
371            (byte-positions->strings bstr ms-pos me-pos state #:delta delta)]))
372        (add-end-bytes bytes/strings end-bytes-count bstr (- me-pos delta))])
373
374     ;; Now, write and consume port content:
375     (write/consume-skipped))]))
376
377;; -------------------------------------------------------
378;; Range-checking arguments to `regexp-match` and company:
379
380(define (check-range who what in pos start-pos)
381  (define len (cond
382               [(bytes? in) (bytes-length in)]
383               [(string? in) (string-length in)]
384               [else +inf.0]))
385  (unless (pos . >= . start-pos)
386    (raise-arguments-error who
387                           (format "~a is smaller than starting index" what)
388                           what pos
389                           "starting index" start-pos))
390  (unless (pos . <= . len)
391    (raise-arguments-error who
392                           (format "~a is out of range" what)
393                           what pos)))
394
395