1#lang racket/base 2(require racket/string 3 racket/contract/base 4 racket/list 5 "url-structs.rkt" 6 "url-exception.rkt" 7 "uri-codec.rkt") 8 9;; To do: 10;; Handle HTTP/file errors. 11;; Not throw away MIME headers. 12;; Determine file type. 13 14(define-logger net/url) 15 16;; ---------------------------------------------------------------------- 17 18;; Input ports have two statuses: 19;; "impure" = they have text waiting 20;; "pure" = the MIME headers have been read 21 22(define file-url-path-convention-type (make-parameter (system-path-convention-type))) 23 24(define (url-error fmt . args) 25 (raise (make-url-exception 26 (apply format fmt 27 (map (lambda (arg) (if (url? arg) (url->string arg) arg)) 28 args)) 29 (current-continuation-marks)))) 30 31(define (url->string url) 32 (let ([scheme (url-scheme url)] 33 [user (url-user url)] 34 [host (url-host url)] 35 [port (url-port url)] 36 [path (url-path url)] 37 [query (url-query url)] 38 [fragment (url-fragment url)] 39 [sa list] 40 [sa* (lambda (l) 41 (apply string-append 42 (let loop ([l l]) 43 (cond 44 [(null? l) l] 45 [(pair? (car l)) 46 (append (loop (car l)) 47 (loop (cdr l)))] 48 [(null? (car l)) (loop (cdr l))] 49 [else (cons (car l) (loop (cdr l)))]))))]) 50 (when (and (equal? scheme "file") 51 (not (url-path-absolute? url))) 52 (raise-mismatch-error 'url->string 53 "cannot convert relative file URL to a string: " 54 url)) 55 (when (and (or user host port) 56 (pair? path) 57 (not (url-path-absolute? url))) 58 (raise-mismatch-error 'url->string 59 "cannot convert relative URL with authority to a string: " 60 url)) 61 (sa* 62 (append 63 (if scheme (sa scheme ":") null) 64 (if (or user host port) 65 (sa "//" 66 (if user (sa (uri-userinfo-encode user) "@") null) 67 (if host 68 (if (regexp-match? rx:ipv6-hex host) 69 (sa "[" host "]") 70 host) 71 null) 72 (if port (sa ":" (number->string port)) null)) 73 (if (equal? "file" scheme) ; always need "//" for "file" URLs 74 '("//") 75 null)) 76 (combine-path-strings (url-path-absolute? url) path) 77 ;; (if query (sa "?" (uri-encode query)) "") 78 (if (null? query) null (sa "?" (alist->form-urlencoded query))) 79 (if fragment (sa "#" (uri-encode* fragment)) null))))) 80 81;; transliteration of code in rfc 3986, section 5.2.2 82(define (combine-url/relative Base string) 83 (let ([R (string->url string)] 84 [T (make-url #f #f #f #f #f '() '() #f)]) 85 (if (url-scheme R) 86 (begin 87 (set-url-scheme! T (url-scheme R)) 88 (set-url-user! T (url-user R)) ;; authority 89 (set-url-host! T (url-host R)) ;; authority 90 (set-url-port! T (url-port R)) ;; authority 91 (set-url-path-absolute?! T (url-path-absolute? R)) 92 (set-url-path! T (remove-dot-segments (url-path R))) 93 (set-url-query! T (url-query R))) 94 (begin 95 (if (url-host R) ;; => authority is defined 96 (begin 97 (set-url-user! T (url-user R)) ;; authority 98 (set-url-host! T (url-host R)) ;; authority 99 (set-url-port! T (url-port R)) ;; authority 100 (set-url-path-absolute?! T (url-path-absolute? R)) 101 (set-url-path! T (remove-dot-segments (url-path R))) 102 (set-url-query! T (url-query R))) 103 (begin 104 (if (null? (url-path R)) ;; => R has empty path 105 (begin 106 (set-url-path-absolute?! T (url-path-absolute? Base)) 107 (set-url-path! T (url-path Base)) 108 (if (not (null? (url-query R))) 109 (set-url-query! T (url-query R)) 110 (set-url-query! T (url-query Base)))) 111 (begin 112 (cond 113 [(url-path-absolute? R) 114 (set-url-path-absolute?! T #t) 115 (set-url-path! T (remove-dot-segments (url-path R)))] 116 [(and (null? (url-path Base)) 117 (url-host Base)) 118 (set-url-path-absolute?! T #t) 119 (set-url-path! T (remove-dot-segments (url-path R)))] 120 [else 121 (set-url-path-absolute?! T (url-path-absolute? Base)) 122 (set-url-path! T (remove-dot-segments 123 (append (all-but-last (url-path Base)) 124 (url-path R))))]) 125 (set-url-query! T (url-query R)))) 126 (set-url-user! T (url-user Base)) ;; authority 127 (set-url-host! T (url-host Base)) ;; authority 128 (set-url-port! T (url-port Base)))) ;; authority 129 (set-url-scheme! T (url-scheme Base)))) 130 (set-url-fragment! T (url-fragment R)) 131 T)) 132 133(define (all-but-last lst) 134 (cond [(null? lst) null] 135 [(null? (cdr lst)) null] 136 [else (cons (car lst) (all-but-last (cdr lst)))])) 137 138;; cribbed from 5.2.4 in rfc 3986 139;; the strange [*] cases implicitly change urls 140;; with paths segments "." and ".." at the end 141;; into "./" and "../" respectively 142(define (remove-dot-segments path) 143 (let loop ([path path] [result '()]) 144 (if (null? path) 145 (reverse result) 146 (let ([fst (path/param-path (car path))] 147 [rst (cdr path)]) 148 (loop rst 149 (cond 150 [(and (eq? fst 'same) (null? rst)) 151 (cons (make-path/param "" '()) result)] ; [*] 152 [(eq? fst 'same) 153 result] 154 [(and (eq? fst 'up) (null? rst) (not (null? result))) 155 (cons (make-path/param "" '()) (cdr result))] ; [*] 156 [(and (eq? fst 'up) (not (null? result))) 157 (cdr result)] 158 [(and (eq? fst 'up) (null? result)) 159 ;; when we go up too far, just drop the "up"s. 160 result] 161 [else 162 (cons (car path) result)])))))) 163 164;; netscape/string->url : str -> url 165(define (netscape/string->url string) 166 (let ([url (string->url string)]) 167 (cond [(url-scheme url) url] 168 [(string=? string "") 169 (url-error "can't resolve empty string as URL")] 170 [else (set-url-scheme! url 171 (if (char=? (string-ref string 0) #\/) "file" "http")) 172 url]))) 173 174;; Approximation to IPv6 literal addresses, to be recognized 175;; in "[...]" when decoding and put back in "[...]" when encoding; 176;; having at least one ":" distinguishes from other address forms: 177(define ipv6-hex "[0-9a-fA-F:]*:[0-9a-fA-F:]*") 178(define rx:ipv6-hex (regexp (string-append "^" ipv6-hex "$"))) 179 180;; URL parsing regexp 181;; this is roughly following the regexp in Appendix B of rfc 3986, except for using 182;; `*' instead of `+' for the scheme part (it is checked later anyway, and 183;; we don't want to parse it as a path element), and the user@host:port is 184;; parsed here. 185(define url-regexp 186 (regexp (string-append 187 "^" 188 "(?:" ; / scheme-colon-opt 189 "([^:/?#]*)" ; | #1 = scheme-opt 190 ":)?" ; \ 191 "(?://" ; / slash-slash-authority-opt 192 "(?:" ; | / user-at-opt 193 "([^/?#@]*)" ; | | #2 = user-opt 194 "@)?" ; | \ 195 "(?:" ; 196 "(?:\\[" ; | / #3 = ipv6-host-opt 197 "(" ipv6-hex ")" ; | | hex-addresses 198 "\\])|" ; | \ 199 "([^/?#:]*)" ; | #4 = host-opt 200 ")?" ; 201 "(?::" ; | / colon-port-opt 202 "([0-9]*)" ; | | #5 = port-opt 203 ")?" ; | \ 204 ")?" ; \ 205 "([^?#]*)" ; #6 = path 206 "(?:\\?" ; / question-query-opt 207 "([^#]*)" ; | #7 = query-opt 208 ")?" ; \ 209 "(?:#" ; / hash-fragment-opt 210 "(.*)" ; | #8 = fragment-opt 211 ")?" ; \ 212 "$"))) 213 214;; string->url : str -> url 215;; Original version by Neil Van Dyke 216(define (string->url str) 217 (apply 218 (lambda (scheme user ipv6host host port path query fragment) 219 (when (and scheme (not (regexp-match? #rx"^[a-zA-Z][a-zA-Z0-9+.-]*$" 220 scheme))) 221 (url-error "invalid URL string; bad scheme\n scheme: ~e\n in: ~e" scheme str)) 222 ;; Windows => "file://xxx:/...." specifies a "xxx:/..." path 223 (let ([win-file? (and (or (equal? "" port) (not port)) 224 (equal? "file" (and scheme (string-downcase scheme))) 225 (eq? 'windows (file-url-path-convention-type)) 226 (not (equal? host "")) 227 (or (regexp-match? "^[fF][iI][lL][eE]://[a-zA-Z]:" str) 228 (regexp-match? "^[fF][iI][lL][eE]:\\\\" str)))]) 229 (when win-file? 230 (set! path (cond [(equal? "" port) (string-append host ":" path)] 231 [(and path host) (string-append host "/" path)] 232 [else (or path host)])) 233 (set! port #f) 234 (set! host "")) 235 (define win-file-url (and win-file? 236 (path->url (bytes->path (string->bytes/utf-8 path) 'windows)))) 237 (let* ([scheme (and scheme (string-downcase scheme))] 238 [host (cond [win-file-url (url-host win-file-url)] 239 [ipv6host (and ipv6host (string-downcase ipv6host))] 240 [else (and host (string-downcase host))])] 241 [user (uri-decode/maybe user)] 242 [port (and port (string->number port))] 243 [abs? (or (equal? "file" scheme) 244 (regexp-match? #rx"^/" path))] 245 [use-abs? (or abs? 246 ;; If an authority part is provided, the (empty) path must be 247 ;; absolute, even if it isn't written with a "/": 248 (and (or host user port) #t))] 249 [path (if win-file? 250 (url-path win-file-url) 251 (separate-path-strings path))] 252 [query (if query (form-urlencoded->alist query) '())] 253 [fragment (uri-decode/maybe fragment)]) 254 (when (and (not abs?) (pair? path) host) 255 (url-error (string-append "invalid URL string;\n" 256 " host provided with non-absolute path (i.e., missing a slash)\n" 257 " in: ~e") 258 str)) 259 (make-url scheme user host port use-abs? path query fragment)))) 260 (cdr (regexp-match url-regexp str)))) 261 262(define (uri-decode/maybe f) (friendly-decode/maybe f uri-decode)) 263 264(define (friendly-decode/maybe f uri-decode) 265 ;; If #f, and leave unmolested any % that is followed by hex digit 266 ;; if a % is not followed by a hex digit, replace it with %25 267 ;; in an attempt to be "friendly" 268 (and f (uri-decode (regexp-replace* #rx"%([^0-9a-fA-F])" f "%25\\1")))) 269 270;; separate-path-strings : string[starting with /] -> (listof path/param) 271(define (separate-path-strings str) 272 (let ([strs (regexp-split #rx"/" str)]) 273 (map separate-params (if (string=? "" (car strs)) (cdr strs) strs)))) 274 275(define (separate-params s) 276 (let ([lst (map path-segment-decode (regexp-split #rx";" s))]) 277 (make-path/param (car lst) (cdr lst)))) 278 279(define (path-segment-decode p) 280 (cond [(string=? p "..") 'up] 281 [(string=? p ".") 'same] 282 [else (uri-path-segment-decode p)])) 283 284(define (path-segment-encode p) 285 (cond [(eq? p 'up) ".."] 286 [(eq? p 'same) "."] 287 [(equal? p "..") "%2e%2e"] 288 [(equal? p ".") "%2e"] 289 [else (uri-path-segment-encode* p)])) 290 291(define (combine-path-strings absolute? path/params) 292 (cond [(null? path/params) null] 293 [else (let ([p (add-between (map join-params path/params) "/")]) 294 (if (and absolute? (pair? p)) (cons "/" p) p))])) 295 296(define (join-params s) 297 (if (null? (path/param-param s)) 298 (path-segment-encode (path/param-path s)) 299 (string-join (map path-segment-encode 300 (cons (path/param-path s) (path/param-param s))) 301 ";"))) 302 303(define (path->url path) 304 (let* ([spath (simplify-path path #f)] 305 [dir? (let-values ([(b n dir?) (split-path spath)]) dir?)] 306 ;; If original path is a directory the resulting URL 307 ;; should have a trailing forward slash 308 [url-tail (if dir? (list (make-path/param "" null)) null)] 309 [host+url-path 310 (let loop ([path spath][accum null]) 311 (let-values ([(base name dir?) (split-path path)]) 312 (cond 313 [(not base) 314 (if (eq? (path-convention-type path) 'windows) 315 ;; For Windows, massage the root: 316 (append (map 317 (lambda (s) 318 (make-path/param s null)) 319 (let ([s (regexp-replace 320 #rx"[/\\\\]$" 321 (bytes->string/utf-8 (path->bytes name)) 322 "")]) 323 (cond 324 [(regexp-match? #rx"^\\\\\\\\[?]\\\\[a-zA-Z]:" s) 325 ;; \\?\<drive>: path: 326 (cons "" (regexp-split #rx"[/\\]+" (substring s 4)))] 327 [(regexp-match? #rx"^\\\\\\\\[?]\\\\UNC" s) 328 ;; \\?\ UNC path: 329 (cons "" (regexp-split #rx"[/\\]+" (substring s 7)))] 330 [(regexp-match? #rx"^[/\\]" s) 331 ;; UNC path: 332 (cdr (regexp-split #rx"[/\\]+" s))] 333 [else 334 (list "" s)]))) 335 accum) 336 ;; On other platforms, we drop the root: 337 (cons "" accum))] 338 [else 339 (let ([accum (cons (make-path/param 340 (if (symbol? name) 341 name 342 (bytes->string/utf-8 343 (path-element->bytes name))) 344 null) 345 accum)]) 346 (if (eq? base 'relative) 347 (cons "" accum) 348 (loop base accum)))])))] 349 [host (let ([h (car host+url-path)]) 350 (if (path/param? h) 351 (path/param-path h) 352 h))] 353 [url-path (cdr host+url-path)]) 354 (make-url "file" #f host #f (absolute-path? path) 355 (if (null? url-tail) url-path (append url-path url-tail)) 356 '() #f))) 357 358(define (file://->path url [kind (system-path-convention-type)]) 359 (let ([strs (map path/param-path (url-path url))] 360 [string->path-element/same 361 (lambda (e) 362 (if (symbol? e) 363 e 364 (if (string=? e "") 365 'same 366 (bytes->path-element (string->bytes/locale e) kind))))] 367 [string->path/win (lambda (s) 368 (bytes->path (string->bytes/utf-8 s) 'windows))]) 369 (if (and (url-path-absolute? url) 370 (eq? 'windows kind)) 371 ;; If initial path is "", then build UNC path. 372 ;; Also build a UNC path if the host is non-#f. 373 (cond 374 [(not (url-path-absolute? url)) 375 (apply build-path (map string->path-element/same strs))] 376 [(and ((length strs) . >= . 3) 377 (equal? (car strs) "")) 378 (apply build-path 379 (string->path/win 380 (string-append "\\\\" (cadr strs) "\\" (caddr strs) "\\")) 381 (map string->path-element/same (cdddr strs)))] 382 [(and (url-host url) 383 (not (equal? (url-host url) "")) 384 (pair? strs)) 385 (if (equal? (car strs) "") 386 (error 'file://->path "empty drive element: ~e" url) 387 (apply build-path 388 (string->path/win 389 (string-append "\\\\" (url-host url) "\\" (car strs) "\\")) 390 (map string->path-element/same (cdr strs))))] 391 [(pair? strs) 392 (apply build-path (string->path/win (car strs)) 393 (map string->path-element/same (cdr strs)))] 394 [else (error 'file://->path "no path elements: ~e" url)]) 395 (let ([elems (map string->path-element/same strs)]) 396 (if (url-path-absolute? url) 397 (apply build-path (bytes->path #"/" 'unix) elems) 398 (apply build-path elems)))))) 399 400(define (url->path url [kind (system-path-convention-type)]) 401 (file://->path url kind)) 402 403(define (relative-path->relative-url-string path) 404 (define s (string-join (for/list ([e (in-list (explode-path path))]) 405 (cond 406 [(eq? e 'same) "."] 407 [(eq? e 'up) ".."] 408 [else 409 (uri-encode* (path-element->string e))])) 410 "/")) 411 ;; Add "/" to reflect directory-ness: 412 (let-values ([(base name dir?) (split-path path)]) 413 (if dir? 414 (string-append s "/") 415 s))) 416 417(define current-url-encode-mode (make-parameter 'recommended)) 418 419(define (uri-encode* str) 420 (case (current-url-encode-mode) 421 [(unreserved) (uri-unreserved-encode str)] 422 [(recommended) (uri-encode str)])) 423 424(define (uri-path-segment-encode* str) 425 (case (current-url-encode-mode) 426 [(unreserved) (uri-path-segment-unreserved-encode str)] 427 [(recommended) (uri-path-segment-encode str)])) 428 429(provide (struct-out url) (struct-out path/param)) 430 431(provide/contract 432 (url-regexp regexp?) 433 (string->url (-> (and/c string? url-regexp) url?)) 434 (path->url ((or/c path-string? path-for-some-system?) . -> . url?)) 435 (relative-path->relative-url-string ((and/c (or/c path-string? path-for-some-system?) 436 relative-path?) 437 . -> . string?)) 438 (url->string (url? . -> . string?)) 439 (url->path (->* (url?) ((one-of/c 'unix 'windows)) path-for-some-system?)) 440 (file://->path (->* (url?) ((one-of/c 'unix 'windows)) path-for-some-system?)) 441 (netscape/string->url (string? . -> . url?)) 442 (combine-url/relative (url? string? . -> . url?)) 443 (rename -url-exception? url-exception? (any/c . -> . boolean?)) 444 (file-url-path-convention-type 445 (parameter/c (one-of/c 'unix 'windows))) 446 (current-url-encode-mode (parameter/c (one-of/c 'recommended 'unreserved)))) 447