1#lang racket/base
2(require racket/string
3         racket/contract/base
4         racket/list
5         "url-structs.rkt"
6         "url-exception.rkt"
7         "uri-codec.rkt")
8
9;; To do:
10;;   Handle HTTP/file errors.
11;;   Not throw away MIME headers.
12;;     Determine file type.
13
14(define-logger net/url)
15
16;; ----------------------------------------------------------------------
17
18;; Input ports have two statuses:
19;;   "impure" = they have text waiting
20;;   "pure" = the MIME headers have been read
21
22(define file-url-path-convention-type (make-parameter (system-path-convention-type)))
23
24(define (url-error fmt . args)
25  (raise (make-url-exception
26          (apply format fmt
27                 (map (lambda (arg) (if (url? arg) (url->string arg) arg))
28                      args))
29          (current-continuation-marks))))
30
31(define (url->string url)
32  (let ([scheme (url-scheme url)]
33        [user   (url-user url)]
34        [host   (url-host url)]
35        [port   (url-port url)]
36        [path   (url-path url)]
37        [query  (url-query url)]
38        [fragment (url-fragment url)]
39        [sa list]
40        [sa* (lambda (l)
41               (apply string-append
42                      (let loop ([l l])
43                        (cond
44                          [(null? l) l]
45                          [(pair? (car l))
46                           (append (loop (car l))
47                                   (loop (cdr l)))]
48                          [(null? (car l)) (loop (cdr l))]
49                          [else (cons (car l) (loop (cdr l)))]))))])
50    (when (and (equal? scheme "file")
51               (not (url-path-absolute? url)))
52      (raise-mismatch-error 'url->string
53                            "cannot convert relative file URL to a string: "
54                            url))
55    (when (and (or user host port)
56               (pair? path)
57               (not (url-path-absolute? url)))
58      (raise-mismatch-error 'url->string
59                            "cannot convert relative URL with authority to a string: "
60                            url))
61    (sa*
62     (append
63      (if scheme (sa scheme ":") null)
64      (if (or user host port)
65        (sa "//"
66            (if user (sa (uri-userinfo-encode user) "@") null)
67            (if host
68                (if (regexp-match? rx:ipv6-hex host)
69                    (sa "[" host "]")
70                    host)
71                null)
72            (if port (sa ":" (number->string port)) null))
73        (if (equal? "file" scheme) ; always need "//" for "file" URLs
74          '("//")
75          null))
76      (combine-path-strings (url-path-absolute? url) path)
77      ;; (if query (sa "?" (uri-encode query)) "")
78      (if (null? query) null (sa "?" (alist->form-urlencoded query)))
79      (if fragment (sa "#" (uri-encode* fragment)) null)))))
80
81;; transliteration of code in rfc 3986, section 5.2.2
82(define (combine-url/relative Base string)
83  (let ([R (string->url string)]
84        [T (make-url #f #f #f #f #f '() '() #f)])
85    (if (url-scheme R)
86      (begin
87        (set-url-scheme! T (url-scheme R))
88        (set-url-user! T (url-user R))  ;; authority
89        (set-url-host! T (url-host R))  ;; authority
90        (set-url-port! T (url-port R))  ;; authority
91        (set-url-path-absolute?! T (url-path-absolute? R))
92        (set-url-path! T (remove-dot-segments (url-path R)))
93        (set-url-query! T (url-query R)))
94      (begin
95        (if (url-host R)  ;; => authority is defined
96          (begin
97            (set-url-user! T (url-user R))  ;; authority
98            (set-url-host! T (url-host R))  ;; authority
99            (set-url-port! T (url-port R))  ;; authority
100            (set-url-path-absolute?! T (url-path-absolute? R))
101            (set-url-path! T (remove-dot-segments (url-path R)))
102            (set-url-query! T (url-query R)))
103          (begin
104            (if (null? (url-path R)) ;; => R has empty path
105              (begin
106                (set-url-path-absolute?! T (url-path-absolute? Base))
107                (set-url-path! T (url-path Base))
108                (if (not (null? (url-query R)))
109                  (set-url-query! T (url-query R))
110                  (set-url-query! T (url-query Base))))
111              (begin
112                (cond
113                  [(url-path-absolute? R)
114                   (set-url-path-absolute?! T #t)
115                   (set-url-path! T (remove-dot-segments (url-path R)))]
116                  [(and (null? (url-path Base))
117                        (url-host Base))
118                   (set-url-path-absolute?! T #t)
119                   (set-url-path! T (remove-dot-segments (url-path R)))]
120                  [else
121                   (set-url-path-absolute?! T (url-path-absolute? Base))
122                   (set-url-path! T (remove-dot-segments
123                                     (append (all-but-last (url-path Base))
124                                             (url-path R))))])
125                (set-url-query! T (url-query R))))
126            (set-url-user! T (url-user Base))   ;; authority
127            (set-url-host! T (url-host Base))   ;; authority
128            (set-url-port! T (url-port Base)))) ;; authority
129        (set-url-scheme! T (url-scheme Base))))
130    (set-url-fragment! T (url-fragment R))
131    T))
132
133(define (all-but-last lst)
134  (cond [(null? lst) null]
135        [(null? (cdr lst)) null]
136        [else (cons (car lst) (all-but-last (cdr lst)))]))
137
138;; cribbed from 5.2.4 in rfc 3986
139;; the strange [*] cases implicitly change urls
140;; with paths segments "." and ".." at the end
141;; into "./" and "../" respectively
142(define (remove-dot-segments path)
143  (let loop ([path path] [result '()])
144    (if (null? path)
145      (reverse result)
146      (let ([fst (path/param-path (car path))]
147            [rst (cdr path)])
148        (loop rst
149              (cond
150                [(and (eq? fst 'same) (null? rst))
151                 (cons (make-path/param "" '()) result)] ; [*]
152                [(eq? fst 'same)
153                 result]
154                [(and (eq? fst 'up) (null? rst) (not (null? result)))
155                 (cons (make-path/param "" '()) (cdr result))] ; [*]
156                [(and (eq? fst 'up) (not (null? result)))
157                 (cdr result)]
158                [(and (eq? fst 'up) (null? result))
159                 ;; when we go up too far, just drop the "up"s.
160                 result]
161                [else
162                 (cons (car path) result)]))))))
163
164;; netscape/string->url : str -> url
165(define (netscape/string->url string)
166  (let ([url (string->url string)])
167    (cond [(url-scheme url) url]
168          [(string=? string "")
169           (url-error "can't resolve empty string as URL")]
170          [else (set-url-scheme! url
171                                 (if (char=? (string-ref string 0) #\/) "file" "http"))
172                url])))
173
174;; Approximation to IPv6 literal addresses, to be recognized
175;; in "[...]" when decoding and put back in "[...]" when encoding;
176;; having at least one ":" distinguishes from other address forms:
177(define ipv6-hex "[0-9a-fA-F:]*:[0-9a-fA-F:]*")
178(define rx:ipv6-hex (regexp (string-append "^" ipv6-hex "$")))
179
180;; URL parsing regexp
181;; this is roughly following the regexp in Appendix B of rfc 3986, except for using
182;; `*' instead of `+' for the scheme part (it is checked later anyway, and
183;; we don't want to parse it as a path element), and the user@host:port is
184;; parsed here.
185(define url-regexp
186  (regexp (string-append
187           "^"
188           "(?:"              ; / scheme-colon-opt
189           "([^:/?#]*)"       ; | #1 = scheme-opt
190           ":)?"              ; \
191           "(?://"            ; / slash-slash-authority-opt
192           "(?:"              ; | / user-at-opt
193           "([^/?#@]*)"       ; | | #2 = user-opt
194           "@)?"              ; | \
195           "(?:"              ;
196           "(?:\\["           ; | / #3 = ipv6-host-opt
197           "(" ipv6-hex ")"   ; | | hex-addresses
198           "\\])|"            ; | \
199           "([^/?#:]*)"       ; | #4 = host-opt
200           ")?"               ;
201           "(?::"             ; | / colon-port-opt
202           "([0-9]*)"         ; | | #5 = port-opt
203           ")?"               ; | \
204           ")?"               ; \
205           "([^?#]*)"         ; #6 = path
206           "(?:\\?"           ; / question-query-opt
207           "([^#]*)"          ; | #7 = query-opt
208           ")?"               ; \
209           "(?:#"             ; / hash-fragment-opt
210           "(.*)"             ; | #8 = fragment-opt
211           ")?"               ; \
212           "$")))
213
214;; string->url : str -> url
215;; Original version by Neil Van Dyke
216(define (string->url str)
217  (apply
218   (lambda (scheme user ipv6host host port path query fragment)
219     (when (and scheme (not (regexp-match? #rx"^[a-zA-Z][a-zA-Z0-9+.-]*$"
220                                           scheme)))
221       (url-error "invalid URL string; bad scheme\n  scheme: ~e\n  in: ~e" scheme str))
222     ;; Windows => "file://xxx:/...." specifies a "xxx:/..." path
223     (let ([win-file? (and (or (equal? "" port) (not port))
224                           (equal? "file" (and scheme (string-downcase scheme)))
225                           (eq? 'windows (file-url-path-convention-type))
226                           (not (equal? host ""))
227                           (or (regexp-match? "^[fF][iI][lL][eE]://[a-zA-Z]:" str)
228                               (regexp-match? "^[fF][iI][lL][eE]:\\\\" str)))])
229       (when win-file?
230         (set! path (cond [(equal? "" port) (string-append host ":" path)]
231                          [(and path host) (string-append host "/" path)]
232                          [else (or path host)]))
233         (set! port #f)
234         (set! host ""))
235       (define win-file-url (and win-file?
236                                 (path->url (bytes->path (string->bytes/utf-8 path) 'windows))))
237       (let* ([scheme   (and scheme (string-downcase scheme))]
238              [host     (cond  [win-file-url (url-host win-file-url)]
239                               [ipv6host (and ipv6host (string-downcase ipv6host))]
240                               [else (and host (string-downcase host))])]
241              [user     (uri-decode/maybe user)]
242              [port     (and port (string->number port))]
243              [abs?     (or (equal? "file" scheme)
244                            (regexp-match? #rx"^/" path))]
245              [use-abs? (or abs?
246                            ;; If an authority part is provided, the (empty) path must be
247                            ;; absolute, even if it isn't written with a "/":
248                            (and (or host user port) #t))]
249              [path     (if win-file?
250                            (url-path win-file-url)
251                            (separate-path-strings path))]
252              [query    (if query (form-urlencoded->alist query) '())]
253              [fragment (uri-decode/maybe fragment)])
254         (when (and (not abs?) (pair? path) host)
255           (url-error (string-append "invalid URL string;\n"
256                                     " host provided with non-absolute path (i.e., missing a slash)\n"
257                                     "  in: ~e")
258                      str))
259         (make-url scheme user host port use-abs? path query fragment))))
260   (cdr (regexp-match url-regexp str))))
261
262(define (uri-decode/maybe f) (friendly-decode/maybe f uri-decode))
263
264(define (friendly-decode/maybe f uri-decode)
265  ;; If #f, and leave unmolested any % that is followed by hex digit
266  ;; if a % is not followed by a hex digit, replace it with %25
267  ;; in an attempt to be "friendly"
268  (and f (uri-decode (regexp-replace* #rx"%([^0-9a-fA-F])" f "%25\\1"))))
269
270;; separate-path-strings : string[starting with /] -> (listof path/param)
271(define (separate-path-strings str)
272  (let ([strs (regexp-split #rx"/" str)])
273    (map separate-params (if (string=? "" (car strs)) (cdr strs) strs))))
274
275(define (separate-params s)
276  (let ([lst (map path-segment-decode (regexp-split #rx";" s))])
277    (make-path/param (car lst) (cdr lst))))
278
279(define (path-segment-decode p)
280  (cond [(string=? p "..") 'up]
281        [(string=? p ".") 'same]
282        [else (uri-path-segment-decode p)]))
283
284(define (path-segment-encode p)
285  (cond [(eq?    p 'up)   ".."]
286        [(eq?    p 'same) "."]
287        [(equal? p "..")  "%2e%2e"]
288        [(equal? p ".")   "%2e"]
289        [else (uri-path-segment-encode* p)]))
290
291(define (combine-path-strings absolute? path/params)
292  (cond [(null? path/params) null]
293        [else (let ([p (add-between (map join-params path/params) "/")])
294                (if (and absolute? (pair? p)) (cons "/" p) p))]))
295
296(define (join-params s)
297  (if (null? (path/param-param s))
298    (path-segment-encode (path/param-path s))
299    (string-join (map path-segment-encode
300                      (cons (path/param-path s) (path/param-param s)))
301                 ";")))
302
303(define (path->url path)
304  (let* ([spath (simplify-path path #f)]
305         [dir? (let-values ([(b n dir?) (split-path spath)]) dir?)]
306         ;; If original path is a directory the resulting URL
307         ;; should have a trailing forward slash
308         [url-tail (if dir? (list (make-path/param "" null)) null)]
309         [host+url-path
310          (let loop ([path spath][accum null])
311            (let-values ([(base name dir?) (split-path path)])
312              (cond
313                [(not base)
314                 (if (eq? (path-convention-type path) 'windows)
315                   ;; For Windows, massage the root:
316                   (append (map
317                            (lambda (s)
318                              (make-path/param s null))
319                            (let ([s (regexp-replace
320                                      #rx"[/\\\\]$"
321                                      (bytes->string/utf-8 (path->bytes name))
322                                      "")])
323                              (cond
324                                [(regexp-match? #rx"^\\\\\\\\[?]\\\\[a-zA-Z]:" s)
325                                 ;; \\?\<drive>: path:
326                                 (cons "" (regexp-split #rx"[/\\]+" (substring s 4)))]
327                                [(regexp-match? #rx"^\\\\\\\\[?]\\\\UNC" s)
328                                 ;; \\?\ UNC path:
329                                 (cons "" (regexp-split #rx"[/\\]+" (substring s 7)))]
330                                [(regexp-match? #rx"^[/\\]" s)
331                                 ;; UNC path:
332                                 (cdr (regexp-split #rx"[/\\]+" s))]
333                                [else
334                                 (list "" s)])))
335                           accum)
336                   ;; On other platforms, we drop the root:
337                   (cons "" accum))]
338                [else
339                 (let ([accum (cons (make-path/param
340                                     (if (symbol? name)
341                                       name
342                                       (bytes->string/utf-8
343                                        (path-element->bytes name)))
344                                     null)
345                                    accum)])
346                   (if (eq? base 'relative)
347                     (cons "" accum)
348                     (loop base accum)))])))]
349         [host (let ([h (car host+url-path)])
350                 (if (path/param? h)
351                     (path/param-path h)
352                     h))]
353         [url-path (cdr host+url-path)])
354  (make-url "file" #f host #f (absolute-path? path)
355              (if (null? url-tail) url-path (append url-path url-tail))
356              '() #f)))
357
358(define (file://->path url [kind (system-path-convention-type)])
359  (let ([strs (map path/param-path (url-path url))]
360        [string->path-element/same
361         (lambda (e)
362           (if (symbol? e)
363             e
364             (if (string=? e "")
365               'same
366               (bytes->path-element (string->bytes/locale e) kind))))]
367        [string->path/win (lambda (s)
368                            (bytes->path (string->bytes/utf-8 s) 'windows))])
369    (if (and (url-path-absolute? url)
370             (eq? 'windows kind))
371      ;; If initial path is "", then build UNC path.
372      ;; Also build a UNC path if the host is non-#f.
373      (cond
374        [(not (url-path-absolute? url))
375         (apply build-path (map string->path-element/same strs))]
376        [(and ((length strs) . >= . 3)
377              (equal? (car strs) ""))
378         (apply build-path
379                (string->path/win
380                 (string-append "\\\\" (cadr strs) "\\" (caddr strs) "\\"))
381                (map string->path-element/same (cdddr strs)))]
382        [(and (url-host url)
383              (not (equal? (url-host url) ""))
384              (pair? strs))
385         (if (equal? (car strs) "")
386             (error 'file://->path "empty drive element: ~e" url)
387             (apply build-path
388                    (string->path/win
389                     (string-append "\\\\" (url-host url) "\\" (car strs) "\\"))
390                    (map string->path-element/same (cdr strs))))]
391        [(pair? strs)
392         (apply build-path (string->path/win (car strs))
393                (map string->path-element/same (cdr strs)))]
394        [else (error 'file://->path "no path elements: ~e" url)])
395      (let ([elems (map string->path-element/same strs)])
396        (if (url-path-absolute? url)
397          (apply build-path (bytes->path #"/" 'unix) elems)
398          (apply build-path elems))))))
399
400(define (url->path url [kind (system-path-convention-type)])
401  (file://->path url kind))
402
403(define (relative-path->relative-url-string path)
404  (define s (string-join (for/list ([e (in-list (explode-path path))])
405                           (cond
406                            [(eq? e 'same) "."]
407                            [(eq? e 'up) ".."]
408                            [else
409                             (uri-encode* (path-element->string e))]))
410                         "/"))
411  ;; Add "/" to reflect directory-ness:
412  (let-values ([(base name dir?) (split-path path)])
413    (if dir?
414        (string-append s "/")
415        s)))
416
417(define current-url-encode-mode (make-parameter 'recommended))
418
419(define (uri-encode* str)
420  (case (current-url-encode-mode)
421    [(unreserved) (uri-unreserved-encode str)]
422    [(recommended) (uri-encode str)]))
423
424(define (uri-path-segment-encode* str)
425  (case (current-url-encode-mode)
426    [(unreserved) (uri-path-segment-unreserved-encode str)]
427    [(recommended) (uri-path-segment-encode str)]))
428
429(provide (struct-out url) (struct-out path/param))
430
431(provide/contract
432 (url-regexp regexp?)
433 (string->url (-> (and/c string? url-regexp) url?))
434 (path->url ((or/c path-string? path-for-some-system?) . -> . url?))
435 (relative-path->relative-url-string ((and/c (or/c path-string? path-for-some-system?)
436                                             relative-path?)
437                                      . -> . string?))
438 (url->string (url? . -> . string?))
439 (url->path (->* (url?) ((one-of/c 'unix 'windows)) path-for-some-system?))
440 (file://->path (->* (url?) ((one-of/c 'unix 'windows)) path-for-some-system?))
441 (netscape/string->url (string? . -> . url?))
442 (combine-url/relative (url? string? . -> . url?))
443 (rename -url-exception? url-exception? (any/c . -> . boolean?))
444 (file-url-path-convention-type
445  (parameter/c (one-of/c 'unix 'windows)))
446 (current-url-encode-mode (parameter/c (one-of/c 'recommended 'unreserved))))
447