1#lang racket/base 2(require "regexp.rkt" 3 "lazy-bytes.rkt" 4 "port.rkt" 5 "compile.rkt" 6 "extract.rkt" 7 "search.rkt") 8 9;; Drives a regexp matcher on a byte string, character string, or port 10 11(provide drive-regexp-match 12 13 fast-drive-regexp-match?/bytes 14 fast-drive-regexp-match?/string 15 fast-drive-regexp-match-positions/bytes 16 fast-drive-regexp-match-positions/string 17 fast-drive-regexp-match/bytes 18 fast-drive-regexp-match/string 19 20 FAST-STRING-LEN) 21 22;; ---------------------------------------- 23;; Start with some (repetative) functions for the most common cases to 24;; keep the overhead low for reaching these cases. 25 26(define FAST-STRING-LEN 64) 27 28(define (fast-drive-regexp-match?/bytes rx in start-pos end-pos) 29 (define state (and (rx:regexp-references? rx) 30 (make-vector (rx:regexp-num-groups rx) #f))) 31 (define-values (ms-pos me-pos) 32 (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state)) 33 (and ms-pos #t)) 34 35(define (fast-drive-regexp-match?/string rx in-str start-offset end-offset) 36 (define state (and (rx:regexp-references? rx) 37 (make-vector (rx:regexp-num-groups rx) #f))) 38 (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str)))) 39 (define-values (ms-pos me-pos) 40 (search-match rx in 0 0 (bytes-length in) state)) 41 (and ms-pos #t)) 42 43(define (fast-drive-regexp-match-positions/bytes rx in start-pos end-pos) 44 (define state (let ([n (rx:regexp-num-groups rx)]) 45 (and (positive? n) 46 (make-vector n #f)))) 47 (define-values (ms-pos me-pos) 48 (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state)) 49 (and ms-pos 50 (if state 51 (cons (cons ms-pos me-pos) (vector->list state)) 52 (list (cons ms-pos me-pos))))) 53 54(define (fast-drive-regexp-match-positions/string rx in-str start-offset end-offset) 55 (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str)))) 56 (define state (let ([n (rx:regexp-num-groups rx)]) 57 (and (positive? n) 58 (make-vector n #f)))) 59 (define-values (ms-pos me-pos) 60 (search-match rx in 0 0 (bytes-length in) state)) 61 (define (string-offset pos) 62 (+ start-offset (bytes-utf-8-length in #\? 0 pos))) 63 (and ms-pos 64 (cons (cons (string-offset ms-pos) (string-offset me-pos)) 65 (if state 66 (for/list ([p (in-vector state)]) 67 (and p 68 (cons (string-offset (car p)) 69 (string-offset (cdr p))))) 70 null)))) 71 72(define (fast-drive-regexp-match/bytes rx in start-pos end-pos) 73 (define state (let ([n (rx:regexp-num-groups rx)]) 74 (and (positive? n) 75 (make-vector n #f)))) 76 (define-values (ms-pos me-pos) 77 (search-match rx in start-pos start-pos (or end-pos (bytes-length in)) state)) 78 (and ms-pos 79 (cons (subbytes in ms-pos me-pos) 80 (if state 81 (for/list ([p (in-vector state)]) 82 (and p 83 (subbytes in (car p) (cdr p)))) 84 null)))) 85 86(define (fast-drive-regexp-match/string rx in-str start-offset end-offset) 87 (define in (string->bytes/utf-8 in-str 0 start-offset (or end-offset (string-length in-str)))) 88 (define state (let ([n (rx:regexp-num-groups rx)]) 89 (and (positive? n) 90 (make-vector n #f)))) 91 (define-values (ms-pos me-pos) 92 (search-match rx in 0 0 (bytes-length in) state)) 93 (and ms-pos 94 (cons (bytes->string/utf-8 in #\? ms-pos me-pos) 95 (if state 96 (for/list ([p (in-vector state)]) 97 (and p 98 (bytes->string/utf-8 in #\? (car p) (cdr p)))) 99 null)))) 100 101;; ---------------------------------------- 102;; The general case 103 104;; An "offset" refers to a position in a byte string (in bytes) string 105;; (in characters), or port (in bytes). A "pos" always refers to a 106;; position in bytes --- so, a "pos" is normalized to UTF-8 bytes in 107;; the case of a string. 108 109(define (drive-regexp-match who orig-rx orig-in orig-start-offset orig-end-offset out prefix 110 #:search-offset [search-offset orig-start-offset] 111 #:mode mode 112 #:in-port-ok? [in-port-ok? #t] 113 #:in-path-ok? [in-path-ok? #t] 114 #:peek? [peek? #f] #:immediate-only? [immediate-only? #f] 115 #:progress-evt [progress-evt #f] 116 #:end-bytes? [end-bytes? #f] 117 #:end-bytes-count [end-bytes-count #f]) 118 119 (define rx (cond 120 [(rx:regexp? orig-rx) orig-rx] 121 [(string? orig-rx) (make-regexp who orig-rx #f #f #f)] 122 [(bytes? orig-rx) (make-regexp who orig-rx #f #t #f)] 123 [else (raise-argument-error who "(or/c regexp? byte-regexp? string? bytes?)" orig-rx)])) 124 (define in (if (and in-path-ok? (path? orig-in)) 125 (if (rx:regexp-bytes? rx) 126 (path->bytes orig-in) 127 (path->string orig-in)) 128 orig-in)) 129 (unless (or (and (bytes? in) (not peek?)) 130 (and (string? in) (not peek?)) 131 (and in-port-ok? (input-port? in))) 132 (raise-argument-error who 133 (cond 134 [peek? "input-port?"] 135 [in-port-ok? "(or/c bytes? string? input-port? path?)"] 136 [in-path-ok? "(or/c bytes? string? path?)"] 137 [else "(or/c bytes? string?)"]) 138 orig-in)) 139 140 (define start-offset (cond 141 [orig-start-offset 142 (unless (exact-nonnegative-integer? orig-start-offset) 143 (raise-argument-error who "exact-nonnegative-integer?" orig-start-offset)) 144 (check-range who "starting index" in orig-start-offset 0) 145 orig-start-offset] 146 [else 0])) 147 (define end-offset (cond 148 [orig-end-offset 149 (unless (exact-nonnegative-integer? orig-end-offset) 150 (raise-argument-error who "(or/c #f exact-nonnegative-integer?)" orig-end-offset)) 151 (check-range who "ending index" in orig-end-offset start-offset) 152 orig-end-offset] 153 [(bytes? in) (bytes-length in)] 154 [(string? in) (string-length in)] 155 [else 'eof])) 156 157 (unless (or (not out) (output-port? out)) 158 (raise-argument-error who "(or/c #f output-port?)" out)) 159 160 (unless (bytes? prefix) 161 (raise-argument-error who "bytes?" prefix)) 162 163 (when end-bytes? 164 (unless (exact-nonnegative-integer? end-bytes-count) 165 (raise-argument-error who "exact-nonnegative-integer?" end-bytes-count))) 166 167 (define state (and (or (not (eq? mode '?)) 168 (rx:regexp-references? rx)) 169 (let ([n (rx:regexp-num-groups rx)]) 170 (and (positive? n) 171 (make-vector n #f))))) 172 173 ;; Separate cases for bytes, strings, and port. 174 ;; There's an annoying level of duplication here, but 175 ;; there are lots of little differences in each case. 176 (cond 177 178 ;; Bytes input, no provided prefix: ---------------------------------------- 179 [(and (bytes? in) 180 (not out) 181 (equal? #"" prefix)) 182 (define start-pos start-offset) 183 (define search-pos search-offset) 184 (define end-pos end-offset) 185 186 ;; Search for a match: 187 (define-values (ms-pos me-pos) (search-match rx in search-pos start-pos end-pos state)) 188 189 ;; Maybe write skipped bytes: 190 (when out 191 (write-bytes in out 0 (or ms-pos end-pos))) 192 193 ;; Return match results: 194 (case (and ms-pos mode) 195 [(#f) (add-end-bytes #f end-bytes-count #f #f)] 196 [(?) #t] 197 [(positions) 198 (define positions (byte-positions->byte-positions ms-pos me-pos state)) 199 (add-end-bytes positions end-bytes-count in me-pos)] 200 [(strings) 201 (define bytess (byte-positions->bytess in ms-pos me-pos state)) 202 (add-end-bytes bytess end-bytes-count in me-pos)])] 203 204 ;; Sufficiently small string input, no provided prefix: -------------------- 205 [(and (string? in) 206 (not out) 207 (equal? #"" prefix) 208 ((- end-offset start-offset) . < . FAST-STRING-LEN)) 209 ;; `bstr-in` includes only the characters fom `start-offset` to 210 ;; `end-offset`, so the starting offset (in characters) 211 ;; corresponds to a 0 position (in bytes): 212 (define bstr-in (string->bytes/utf-8 in 0 start-offset end-offset)) 213 (define search-pos (if (= start-offset search-offset) 214 0 215 (string-utf-8-length in start-offset search-offset))) 216 (define end-pos (bytes-length bstr-in)) 217 218 ;; Search for a match: 219 (define-values (ms-pos me-pos) (search-match rx bstr-in search-pos 0 end-pos state)) 220 221 ;; Maybe write skipped bytes: 222 (when out 223 (write-string in out 0 start-offset) 224 (write-bytes bstr-in out 0 (or ms-pos end-pos))) 225 226 ;; Return match results: 227 (case (and ms-pos mode) 228 [(#f) (add-end-bytes #f end-bytes-count #f #f)] 229 [(?) #t] 230 [(positions) 231 ;; If pattern is bytes-based, then results will be bytes-based: 232 (define positions 233 (cond 234 [(rx:regexp-bytes? rx) 235 (define delta (string-utf-8-length in 0 start-offset)) 236 (byte-positions->byte-positions ms-pos me-pos state #:delta delta)] 237 [else 238 (byte-positions->string-positions bstr-in ms-pos me-pos state 239 #:result-offset start-offset)])) 240 (add-end-bytes positions end-bytes-count bstr-in me-pos)] 241 [(strings) 242 ;; If pattern is bytes-based, then results will be bytes instead of strings: 243 (define bytes/strings 244 (cond 245 [(rx:regexp-bytes? rx) 246 (byte-positions->bytess bstr-in ms-pos me-pos state)] 247 [else 248 (byte-positions->strings bstr-in ms-pos me-pos state)])) 249 (add-end-bytes bytes/strings end-bytes-count bstr-in me-pos)])] 250 251 ;; Port input, long string input, and/or provided prefix: -------------------- 252 [else 253 (define prefix-len (bytes-length prefix)) 254 ;; The lazy-bytes record will include the prefix, 255 ;; and it won't include bytes/characters before 256 ;; `start-offset`: 257 (define start-pos prefix-len) 258 (define search-pos (if (= start-offset search-offset) 259 start-pos 260 (+ start-pos 261 (cond 262 [(string? in) (string-utf-8-length in start-offset search-offset)] 263 [else (- search-offset start-offset)])))) 264 (define port-in 265 (cond 266 [(bytes? in) (open-input-bytes/no-copy in start-offset end-offset)] 267 [(string? in) (open-input-string/lazy in start-offset end-offset)] 268 [else in])) 269 (define any-bytes-left? 270 (cond 271 [(and (input-port? in) 272 (positive? start-offset)) 273 (cond 274 [peek? 275 ;; Make sure we can skip over `start-offset` bytes: 276 (not (eof-object? (peek-byte port-in (sub1 start-offset))))] 277 [else 278 ;; discard skipped bytes: 279 (copy-port-bytes port-in #f start-offset)])] 280 [else #t])) 281 ;; Create a lazy string from the port: 282 (define lb-in (make-lazy-bytes port-in (if peek? start-offset 0) prefix 283 peek? immediate-only? progress-evt 284 out (max (rx:regexp-max-lookbehind rx) 285 (or end-bytes-count 0)) 286 (and (input-port? in) 287 (not (eq? 'eof end-offset)) 288 (- end-offset start-offset)))) 289 (define end-pos (if (or (eq? 'eof end-offset) 290 (string? in)) 291 'eof 292 (+ start-pos 293 (- end-offset start-offset)))) 294 295 ;; Search for a match: 296 (define-values (ms-pos me-pos) 297 (if any-bytes-left? 298 (search-match rx lb-in search-pos 0 end-pos state) 299 ;; Couldn't skip past `start-offset` bytes for an input port: 300 (values #f #f))) 301 302 ;; To write and consume skipped bytes, but we'll do this only 303 ;; after we've extracted match information from the lazy byte 304 ;; string: 305 (define (write/consume-skipped) 306 (when (not peek?) 307 (cond 308 [ms-pos 309 (when out 310 ;; Flush bytes before match: 311 (lazy-bytes-advance! lb-in ms-pos #t)) 312 (when (input-port? in) 313 ;; Consume bytes that correspond to match: 314 (copy-port-bytes port-in #f (- me-pos prefix-len)))] 315 [(eq? end-pos 'eof) 316 ;; Copy all remaining bytes from input to output 317 (when (or out (input-port? in)) 318 (copy-port-bytes port-in out #f))] 319 [else 320 (when out 321 ;; Copy all bytes to output 322 (lazy-bytes-advance! lb-in end-pos #t)) 323 (when (input-port? in) 324 ;; Consume all bytes 325 (copy-port-bytes port-in #f (- end-pos start-pos)))]))) 326 327 (begin0 328 329 ;; Return match results: 330 (case (and ms-pos 331 (not (lazy-bytes-failed? lb-in)) 332 mode) 333 [(#f) 334 (add-end-bytes #f end-bytes-count #f #f)] 335 [(?) #t] 336 [(positions) 337 ;; Result positions correspond to the port after `start-offset`, 338 ;; but with the prefix bytes (= `start-pos`) 339 (define bstr (lazy-bytes-bstr lb-in)) 340 (define positions 341 (cond 342 [(or (not (string? in)) 343 (rx:regexp-bytes? rx)) 344 (define delta (- start-offset start-pos)) 345 (byte-positions->byte-positions ms-pos me-pos state #:delta delta)] 346 [else 347 ;; Some bytes may have been discarded in `lb-in`, and we 348 ;; don't know how many characters those add up to. The 349 ;; starting position `ms-pos` must be on a code-point 350 ;; boundary, and everything from `ms-pos` to `ms-end` must 351 ;; still be in `lb-in`. So, find `ms-pos` in the original 352 ;; string, and take it from there. 353 (define ms-str-pos (byte-index->string-index in start-offset (- ms-pos start-pos))) 354 (define delta (lazy-bytes-discarded-count lb-in)) 355 (byte-positions->string-positions bstr ms-pos me-pos state 356 #:start-index (- ms-pos delta) 357 #:delta delta 358 #:result-offset (+ ms-str-pos start-offset))])) 359 (add-end-bytes positions end-bytes-count bstr (- me-pos (lazy-bytes-discarded-count lb-in)))] 360 [(strings) 361 ;; The byte string may be shifted by discarded bytes, if not 362 ;; in `peek?` mode 363 (define bstr (lazy-bytes-bstr lb-in)) 364 (define delta (lazy-bytes-discarded-count lb-in)) 365 (define bytes/strings 366 (cond 367 [(or (not (string? in)) 368 (rx:regexp-bytes? rx)) 369 (byte-positions->bytess bstr ms-pos me-pos state #:delta delta)] 370 [else 371 (byte-positions->strings bstr ms-pos me-pos state #:delta delta)])) 372 (add-end-bytes bytes/strings end-bytes-count bstr (- me-pos delta))]) 373 374 ;; Now, write and consume port content: 375 (write/consume-skipped))])) 376 377;; ------------------------------------------------------- 378;; Range-checking arguments to `regexp-match` and company: 379 380(define (check-range who what in pos start-pos) 381 (define len (cond 382 [(bytes? in) (bytes-length in)] 383 [(string? in) (string-length in)] 384 [else +inf.0])) 385 (unless (pos . >= . start-pos) 386 (raise-arguments-error who 387 (format "~a is smaller than starting index" what) 388 what pos 389 "starting index" start-pos)) 390 (unless (pos . <= . len) 391 (raise-arguments-error who 392 (format "~a is out of range" what) 393 what pos))) 394 395