Lines Matching +refs:fast +refs:drive +refs:regexp +refs:match +refs:positions +refs:bytes

2 (require "regexp.rkt"
3 "lazy-bytes.rkt"
9 ;; Drives a regexp matcher on a byte string, character string, or port
11 (provide drive-regexp-match
13 fast-drive-regexp-match?/bytes
14 fast-drive-regexp-match?/string
15 fast-drive-regexp-match-positions/bytes
16 fast-drive-regexp-match-positions/string
17 fast-drive-regexp-match/bytes
18 fast-drive-regexp-match/string
28 (define (fast-drive-regexp-match?/bytes rx in start-pos end-pos)
29 (define state (and (rx:regexp-references? rx)
30 (make-vector (rx:regexp-num-groups rx) #f)))
32 (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state))
35 (define (fast-drive-regexp-match?/string rx in-str start-offset end-offset)
36 (define state (and (rx:regexp-references? rx)
37 (make-vector (rx:regexp-num-groups rx) #f)))
38 (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str))))
40 (search-match rx in 0 0 (bytes-length in) state))
43 (define (fast-drive-regexp-match-positions/bytes rx in start-pos end-pos)
44 (define state (let ([n (rx:regexp-num-groups rx)])
48 (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state))
54 (define (fast-drive-regexp-match-positions/string rx in-str start-offset end-offset)
55 (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str))))
56 (define state (let ([n (rx:regexp-num-groups rx)])
60 (search-match rx in 0 0 (bytes-length in) state))
62 (+ start-offset (bytes-utf-8-length in #\? 0 pos)))
72 (define (fast-drive-regexp-match/bytes rx in start-pos end-pos)
73 (define state (let ([n (rx:regexp-num-groups rx)])
77 (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state))
86 (define (fast-drive-regexp-match/string rx in-str start-offset end-offset)
87 (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str))))
88 (define state (let ([n (rx:regexp-num-groups rx)])
92 (search-match rx in 0 0 (bytes-length in) state))
94 (cons (bytes->string/utf-8 in #\? ms-pos me-pos)
98 (bytes->string/utf-8 in #\? (car p) (cdr p))))
104 ;; An "offset" refers to a position in a byte string (in bytes), string
105 ;; (in characters), or port (in bytes). A "pos" always refers to a
106 ;; position in bytes --- so, a "pos" is normalized to UTF-8 bytes in
109 (define (drive-regexp-match who orig-rx orig-in orig-start-offset orig-end-offset out prefix
116 #:end-bytes? [end-bytes? #f]
117 #:end-bytes-count [end-bytes-count #f])
120 [(rx:regexp? orig-rx) orig-rx]
121 [(string? orig-rx) (make-regexp who orig-rx #f #f #f)]
122 [(bytes? orig-rx) (make-regexp who orig-rx #f #t #f)]
123 … [else (raise-argument-error who "(or/c regexp? byte-regexp? string? bytes?)" orig-rx)]))
125 (if (rx:regexp-bytes? rx)
126 (path->bytes orig-in)
129 (unless (or (and (bytes? in) (not peek?))
135 [in-port-ok? "(or/c bytes? string? input-port? path?)"]
136 [in-path-ok? "(or/c bytes? string? path?)"]
137 [else "(or/c bytes? string?)"])
153 [(bytes? in) (bytes-length in)]
160 (unless (bytes? prefix)
161 (raise-argument-error who "bytes?" prefix))
163 (when end-bytes?
164 (unless (exact-nonnegative-integer? end-bytes-count)
165 (raise-argument-error who "exact-nonnegative-integer?" end-bytes-count)))
168 (rx:regexp-references? rx))
169 (let ([n (rx:regexp-num-groups rx)])
173 ;; Separate cases for bytes, strings, and port.
179 [(and (bytes? in)
186 ;; Search for a match:
187 (define-values (ms-pos me-pos) (search-match rx in search-pos start-pos end-pos state))
189 ;; Maybe write skipped bytes:
191 (write-bytes in out 0 (or ms-pos end-pos)))
193 ;; Return match results:
195 [(#f) (add-end-bytes #f end-bytes-count #f #f)]
197 [(positions)
198 (define positions (byte-positions->byte-positions ms-pos me-pos state))
199 (add-end-bytes positions end-bytes-count in me-pos)]
201 (define bytess (byte-positions->bytess in ms-pos me-pos state))
202 (add-end-bytes bytess end-bytes-count in me-pos)])]
211 ;; corresponds to a 0 position (in bytes):
212 (define bstr-in (string->bytes/utf-8 in 0 start-offset end-offset))
216 (define end-pos (bytes-length bstr-in))
218 ;; Search for a match:
219 (define-values (ms-pos me-pos) (search-match rx bstr-in search-pos 0 end-pos state))
221 ;; Maybe write skipped bytes:
224 (write-bytes bstr-in out 0 (or ms-pos end-pos)))
226 ;; Return match results:
228 [(#f) (add-end-bytes #f end-bytes-count #f #f)]
230 [(positions)
231 ;; If pattern is bytes-based, then results will be bytes-based:
232 (define positions
234 [(rx:regexp-bytes? rx)
236 (byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
238 (byte-positions->string-positions bstr-in ms-pos me-pos state
240 (add-end-bytes positions end-bytes-count bstr-in me-pos)]
242 ;; If pattern is bytes-based, then results will be bytes instead of strings:
243 (define bytes/strings
245 [(rx:regexp-bytes? rx)
246 (byte-positions->bytess bstr-in ms-pos me-pos state)]
248 (byte-positions->strings bstr-in ms-pos me-pos state)]))
249 (add-end-bytes bytes/strings end-bytes-count bstr-in me-pos)])]
253 (define prefix-len (bytes-length prefix))
254 ;; The lazy-bytes record will include the prefix,
255 ;; and it won't include bytes/characters before
266 [(bytes? in) (open-input-bytes/no-copy in start-offset end-offset)]
269 (define any-bytes-left?
275 ;; Make sure we can skip over `start-offset` bytes:
278 ;; discard skipped bytes:
279 (copy-port-bytes port-in #f start-offset)])]
282 (define lb-in (make-lazy-bytes port-in (if peek? start-offset 0) prefix
284 out (max (rx:regexp-max-lookbehind rx)
285 (or end-bytes-count 0))
295 ;; Search for a match:
297 (if any-bytes-left?
298 (search-match rx lb-in search-pos 0 end-pos state)
299 ;; Couldn't skip past `start-offset` bytes for an input port:
302 ;; To write and consume skipped bytes, but we'll do this only
303 ;; after we've extracted match information from the lazy byte
310 ;; Flush bytes before match:
311 (lazy-bytes-advance! lb-in ms-pos #t))
313 ;; Consume bytes that correspond to match:
314 (copy-port-bytes port-in #f (- me-pos prefix-len)))]
316 ;; Copy all remaining bytes from input to output
318 (copy-port-bytes port-in out #f))]
321 ;; Copy all bytes to output
322 (lazy-bytes-advance! lb-in end-pos #t))
324 ;; Consume all bytes
325 (copy-port-bytes port-in #f (- end-pos start-pos)))])))
329 ;; Return match results:
331 (not (lazy-bytes-failed? lb-in))
334 (add-end-bytes #f end-bytes-count #f #f)]
336 [(positions)
337 ;; Result positions correspond to the port after `start-offset`,
338 ;; but with the prefix bytes (= `start-pos`)
339 (define bstr (lazy-bytes-bstr lb-in))
340 (define positions
343 (rx:regexp-bytes? rx))
345 (byte-positions->byte-positions ms-pos me-pos state #:delta delta)]
347 ;; Some bytes may have been discarded in `lb-in`, and we
354 (define delta (lazy-bytes-discarded-count lb-in))
355 (byte-positions->string-positions bstr ms-pos me-pos state
359 … (add-end-bytes positions end-bytes-count bstr (- me-pos (lazy-bytes-discarded-count lb-in)))]
361 ;; The byte string may be shifted by discarded bytes, if not
363 (define bstr (lazy-bytes-bstr lb-in))
364 (define delta (lazy-bytes-discarded-count lb-in))
365 (define bytes/strings
368 (rx:regexp-bytes? rx))
369 (byte-positions->bytess bstr ms-pos me-pos state #:delta delta)]
371 (byte-positions->strings bstr ms-pos me-pos state #:delta delta)]))
372 (add-end-bytes bytes/strings end-bytes-count bstr (- me-pos delta))])
378 ;; Range-checking arguments to `regexp-match` and company:
382 [(bytes? in) (bytes-length in)]