1;;;; (web uri) --- URI manipulation tools
2;;;;
3;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013,2014,2019-2021 Free Software Foundation, Inc.
4;;;;
5;;;; This library is free software; you can redistribute it and/or
6;;;; modify it under the terms of the GNU Lesser General Public
7;;;; License as published by the Free Software Foundation; either
8;;;; version 3 of the License, or (at your option) any later version.
9;;;;
10;;;; This library is distributed in the hope that it will be useful,
11;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13;;;; Lesser General Public License for more details.
14;;;;
15;;;; You should have received a copy of the GNU Lesser General Public
16;;;; License along with this library; if not, write to the Free Software
17;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18;;;;
19
20;;; Commentary:
21
22;; A data type for Universal Resource Identifiers, as defined in RFC
23;; 3986.
24
25;;; Code:
26
27(define-module (web uri)
28  #:use-module (srfi srfi-9)
29  #:use-module (ice-9 iconv)
30  #:use-module (ice-9 regex)
31  #:use-module (ice-9 rdelim)
32  #:use-module (ice-9 control)
33  #:use-module (rnrs bytevectors)
34  #:use-module (ice-9 binary-ports)
35  #:export (uri?
36            uri-scheme uri-userinfo uri-host uri-port
37            uri-path uri-query uri-fragment
38
39            build-uri
40            build-uri-reference
41            declare-default-port!
42            string->uri string->uri-reference
43            uri->string
44            uri-decode uri-encode
45            split-and-decode-uri-path
46            encode-and-join-uri-path
47
48            uri-reference? relative-ref?
49            build-uri-reference build-relative-ref
50            string->uri-reference string->relative-ref))
51
52(define-record-type <uri>
53  (make-uri scheme userinfo host port path query fragment)
54  uri-reference?
55  (scheme uri-scheme)
56  (userinfo uri-userinfo)
57  (host uri-host)
58  (port uri-port)
59  (path uri-path)
60  (query uri-query)
61  (fragment uri-fragment))
62
63;;;
64;;; Predicates.
65;;;
66;;; These are quick, and assume rigid validation at construction time.
67
68;;; RFC 3986, #3.
69;;;
70;;;   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
71;;;
72;;;   hier-part   = "//" authority path-abempty
73;;;               / path-absolute
74;;;               / path-rootless
75;;;               / path-empty
76
77(define (uri? obj)
78  (and (uri-reference? obj)
79       (uri-scheme obj)
80       #t))
81
82;;; RFC 3986, #4.2.
83;;;
84;;;   relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
85;;;
86;;;   relative-part = "//" authority path-abempty
87;;;                 / path-absolute
88;;;                 / path-noscheme
89;;;                 / path-empty
90
91(define (relative-ref? obj)
92  (and (uri-reference? obj)
93       (not (uri-scheme obj))))
94
95
96;;;
97;;; Constructors.
98;;;
99
100(define (uri-error message . args)
101  (throw 'uri-error message args))
102
103(define (positive-exact-integer? port)
104  (and (number? port) (exact? port) (integer? port) (positive? port)))
105
106(define (validate-uri-reference scheme userinfo host port path query fragment)
107  (cond
108   ((and scheme (not (symbol? scheme)))
109    (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
110   ((and (or userinfo port) (not host))
111    (uri-error "Expected a host, given userinfo or port"))
112   ((and port (not (positive-exact-integer? port)))
113    (uri-error "Expected port to be an integer: ~s" port))
114   ((and host (or (not (string? host)) (not (valid-host? host))))
115    (uri-error "Expected valid host: ~s" host))
116   ((and userinfo (not (string? userinfo)))
117    (uri-error "Expected string for userinfo: ~s" userinfo))
118   ((not (string? path))
119    (uri-error "Expected string for path: ~s" path))
120   ((and query (not (string? query)))
121    (uri-error "Expected string for query: ~s" query))
122   ((and fragment (not (string? fragment)))
123    (uri-error "Expected string for fragment: ~s" fragment))
124   ;; Strict validation of allowed paths, based on other components.
125   ;; Refer to RFC 3986 for the details.
126   ((not (string-null? path))
127    (if host
128        (cond
129         ((not (eqv? (string-ref path 0) #\/))
130          (uri-error
131           "Expected absolute path starting with \"/\": ~a" path)))
132        (cond
133         ((string-prefix? "//" path)
134          (uri-error
135           "Expected path not starting with \"//\" (no host): ~a" path))
136         ((and (not scheme)
137               (not (eqv? (string-ref path 0) #\/))
138               (let ((colon (string-index path #\:)))
139                 (and colon (not (string-index path #\/ 0 colon)))))
140          (uri-error
141           "Expected relative path's first segment without \":\": ~a"
142           path)))))))
143
144(define* (build-uri scheme #:key userinfo host port (path "") query fragment
145                    (validate? #t))
146  "Construct a URI object.  SCHEME should be a symbol, PORT either a
147positive, exact integer or ‘#f’, and the rest of the fields are either
148strings or ‘#f’.  If VALIDATE? is true, also run some consistency checks
149to make sure that the constructed object is a valid URI."
150  (when validate?
151    (unless scheme (uri-error "Missing URI scheme"))
152    (validate-uri-reference scheme userinfo host port path query fragment))
153  (make-uri scheme userinfo host port path query fragment))
154
155(define* (build-uri-reference #:key scheme userinfo host port (path "") query
156                              fragment (validate? #t))
157  "Construct a URI-reference object.  SCHEME should be a symbol or ‘#f’,
158PORT either a positive, exact integer or ‘#f’, and the rest of the
159fields are either strings or ‘#f’.  If VALIDATE? is true, also run some
160consistency checks to make sure that the constructed URI is a valid URI
161reference."
162  (when validate?
163    (validate-uri-reference scheme userinfo host port path query fragment))
164  (make-uri scheme userinfo host port path query fragment))
165
166(define* (build-relative-ref #:key userinfo host port (path "") query fragment
167                             (validate? #t))
168  "Construct a relative-ref URI object.  The arguments are the same as
169for ‘build-uri’ except there is no scheme."
170  (when validate?
171    (validate-uri-reference #f userinfo host port path query fragment))
172  (make-uri #f userinfo host port path query fragment))
173
174
175;;;
176;;; Converters.
177;;;
178
179;; Since character ranges in regular expressions may depend on the
180;; current locale, we use explicit lists of characters instead.  See
181;; <https://bugs.gnu.org/35785> for details.
182(define digits "0123456789")
183(define hex-digits "0123456789ABCDEFabcdef")
184(define letters "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
185
186;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
187;; 3490), and non-ASCII host names.
188;;
189(define ipv4-regexp
190  (make-regexp (string-append "^([" digits ".]+)$")))
191(define ipv6-regexp
192  (make-regexp (string-append "^([" hex-digits "]*:[" hex-digits ":.]+)$")))
193(define domain-label-regexp
194  (make-regexp
195   (string-append "^[" letters digits "]"
196                  "([" letters digits "-]*[" letters digits "])?$")))
197(define top-label-regexp
198  (make-regexp
199   (string-append "^[" letters "]"
200                  "([" letters digits "-]*[" letters digits "])?$")))
201
202(define (valid-host? host)
203  (cond
204   ((regexp-exec ipv4-regexp host)
205    (false-if-exception (inet-pton AF_INET host)))
206   ((regexp-exec ipv6-regexp host)
207    (false-if-exception (inet-pton AF_INET6 host)))
208   (else
209    (let lp ((start 0))
210      (let ((end (string-index host #\. start)))
211        (if end
212            (and (regexp-exec domain-label-regexp
213                              (substring host start end))
214                 (lp (1+ end)))
215            (regexp-exec top-label-regexp host start)))))))
216
217(define userinfo-pat
218  (string-append "[" letters digits "_.!~*'();:&=+$,-]+"))
219(define host-pat
220  (string-append "[" letters digits ".-]+"))
221(define ipv6-host-pat
222  (string-append "[" hex-digits ":.]+"))
223(define port-pat
224  (string-append "[" digits "]*"))
225(define authority-regexp
226  (make-regexp
227   (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$"
228           userinfo-pat host-pat ipv6-host-pat port-pat)))
229
230(define (parse-authority authority fail)
231  (if (equal? authority "//")
232      ;; Allow empty authorities: file:///etc/hosts is a synonym of
233      ;; file:/etc/hosts.
234      (values #f #f #f)
235      (let ((m (regexp-exec authority-regexp authority)))
236        (if (and m (valid-host? (or (match:substring m 4)
237                                    (match:substring m 6))))
238            (values (match:substring m 2)
239                    (or (match:substring m 4)
240                        (match:substring m 6))
241                    (let ((port (match:substring m 8)))
242                      (and port (not (string-null? port))
243                           (string->number port))))
244            (fail)))))
245
246
247;;; RFC 3986, #3.
248;;;
249;;;   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
250;;;
251;;;   hier-part   = "//" authority path-abempty
252;;;               / path-absolute
253;;;               / path-rootless
254;;;               / path-empty
255;;;
256;;;   A URI-reference is the same as URI, but where the scheme is
257;;;   optional.  If the scheme is not present, its colon isn't present
258;;;   either.
259
260(define scheme-pat
261  (string-append "[" letters "][" letters digits "+.-]*"))
262(define authority-pat
263  "[^/?#]*")
264(define path-pat
265  "[^?#]*")
266(define query-pat
267  "[^#]*")
268(define fragment-pat
269  ".*")
270(define uri-pat
271  (format #f "^((~a):)?(//~a)?(~a)(\\?(~a))?(#(~a))?$"
272          scheme-pat authority-pat path-pat query-pat fragment-pat))
273(define uri-regexp
274  (make-regexp uri-pat))
275
276(define (string->uri-reference string)
277  "Parse STRING into a URI-reference object.  Return ‘#f’ if the string
278could not be parsed."
279  (% (let ((m (regexp-exec uri-regexp string)))
280       (unless m (abort))
281       (let ((scheme (let ((str (match:substring m 2)))
282                       (and str (string->symbol (string-downcase str)))))
283             (authority (match:substring m 3))
284             (path (match:substring m 4))
285             (query (match:substring m 6))
286             (fragment (match:substring m 8)))
287         ;; The regular expression already ensures all of the validation
288         ;; requirements for URI-references, except the one that the
289         ;; first component of a relative-ref's path can't contain a
290         ;; colon.
291         (unless scheme
292           (let ((colon (string-index path #\:)))
293             (when (and colon (not (string-index path #\/ 0 colon)))
294               (abort))))
295         (call-with-values
296             (lambda ()
297               (if authority
298                   (parse-authority authority abort)
299                   (values #f #f #f)))
300           (lambda (userinfo host port)
301             (make-uri scheme userinfo host port path query fragment)))))
302     (lambda (k)
303       #f)))
304
305(define (string->uri string)
306  "Parse STRING into a URI object.  Return ‘#f’ if the string could not
307be parsed.  Note that this procedure will require that the URI have a
308scheme."
309  (let ((uri-reference (string->uri-reference string)))
310    (and (not (relative-ref? uri-reference))
311         uri-reference)))
312
313(define (string->relative-ref string)
314  "Parse STRING into a relative-ref URI object.  Return ‘#f’ if the
315string could not be parsed."
316  (let ((uri-reference (string->uri-reference string)))
317    (and (relative-ref? uri-reference)
318         uri-reference)))
319
320(define *default-ports* (make-hash-table))
321
322(define (declare-default-port! scheme port)
323  "Declare a default port for the given URI scheme."
324  (hashq-set! *default-ports* scheme port))
325
326(define (default-port? scheme port)
327  (or (not port)
328      (eqv? port (hashq-ref *default-ports* scheme))))
329
330(declare-default-port! 'http 80)
331(declare-default-port! 'https 443)
332
333(define* (uri->string uri #:key (include-fragment? #t))
334  "Serialize URI to a string.  If the URI has a port that is the
335default port for its scheme, the port is not included in the
336serialization."
337  (let* ((scheme (uri-scheme uri))
338         (userinfo (uri-userinfo uri))
339         (host (uri-host uri))
340         (port (uri-port uri))
341         (path (uri-path uri))
342         (query (uri-query uri))
343         (fragment (uri-fragment uri)))
344    (string-append
345     (if scheme
346         (string-append (symbol->string scheme) ":")
347         "")
348     (if host
349         (string-append "//"
350                        (if userinfo (string-append userinfo "@")
351                            "")
352                        (if (string-index host #\:)
353                            (string-append "[" host "]")
354                            host)
355                        (if (default-port? (uri-scheme uri) port)
356                            ""
357                            (string-append ":" (number->string port))))
358         "")
359     path
360     (if query
361         (string-append "?" query)
362         "")
363     (if (and fragment include-fragment?)
364         (string-append "#" fragment)
365         ""))))
366
367
368;; A note on characters and bytes: URIs are defined to be sequences of
369;; characters in a subset of ASCII. Those characters may encode a
370;; sequence of bytes (octets), which in turn may encode sequences of
371;; characters in other character sets.
372;;
373
374;; Return a new string made from uri-decoding STR.  Specifically,
375;; turn ‘+’ into space, and hex-encoded ‘%XX’ strings into
376;; their eight-bit characters.
377;;
378(define hex-chars
379  (string->char-set "0123456789abcdefABCDEF"))
380
381(define* (uri-decode str #:key (encoding "utf-8") (decode-plus-to-space? #t))
382  "Percent-decode the given STR, according to ENCODING,
383which should be the name of a character encoding.
384
385Note that this function should not generally be applied to a full URI
386string. For paths, use ‘split-and-decode-uri-path’ instead. For query
387strings, split the query on ‘&’ and ‘=’ boundaries, and decode
388the components separately.
389
390Note also that percent-encoded strings encode _bytes_, not characters.
391There is no guarantee that a given byte sequence is a valid string
392encoding. Therefore this routine may signal an error if the decoded
393bytes are not valid for the given encoding. Pass ‘#f’ for ENCODING if
394you want decoded bytes as a bytevector directly.  ‘set-port-encoding!’,
395for more information on character encodings.
396
397If DECODE-PLUS-TO-SPACE? is true, which is the default, also replace
398instances of the plus character (+) with a space character.  This is
399needed when parsing application/x-www-form-urlencoded data.
400
401Returns a string of the decoded characters, or a bytevector if
402ENCODING was ‘#f’."
403  (let* ((len (string-length str))
404         (bv
405          (call-with-output-bytevector
406           (lambda (port)
407             (let lp ((i 0))
408               (if (< i len)
409                   (let ((ch (string-ref str i)))
410                     (cond
411                      ((and (eqv? ch #\+) decode-plus-to-space?)
412                       (put-u8 port (char->integer #\space))
413                       (lp (1+ i)))
414                      ((and (< (+ i 2) len) (eqv? ch #\%)
415                            (let ((a (string-ref str (+ i 1)))
416                                  (b (string-ref str (+ i 2))))
417                              (and (char-set-contains? hex-chars a)
418                                   (char-set-contains? hex-chars b)
419                                   (string->number (string a b) 16))))
420                       => (lambda (u8)
421                            (put-u8 port u8)
422                            (lp (+ i 3))))
423                      ((< (char->integer ch) 128)
424                       (put-u8 port (char->integer ch))
425                       (lp (1+ i)))
426                      (else
427                       (uri-error "Invalid character in encoded URI ~a: ~s"
428                                  str ch))))))))))
429    (if encoding
430        (bytevector->string bv encoding)
431        ;; Otherwise return raw bytevector
432        bv)))
433
434(define ascii-alnum-chars
435  (string->char-set
436   "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
437
438;; RFC 3986, #2.2.
439(define gen-delims
440  (string->char-set ":/?#[]@"))
441(define sub-delims
442  (string->char-set "!$&'()*+,l="))
443(define reserved-chars
444  (char-set-union gen-delims sub-delims))
445
446;; RFC 3986, #2.3
447(define unreserved-chars
448  (char-set-union ascii-alnum-chars
449                  (string->char-set "-._~")))
450
451;; Return a new string made from uri-encoding STR, unconditionally
452;; transforming any characters not in UNESCAPED-CHARS.
453;;
454(define* (uri-encode str #:key (encoding "utf-8")
455                     (unescaped-chars unreserved-chars))
456  "Percent-encode any character not in the character set,
457UNESCAPED-CHARS.
458
459The default character set includes alphanumerics from ASCII, as well as
460the special characters ‘-’, ‘.’, ‘_’, and ‘~’.  Any other character will
461be percent-encoded, by writing out the character to a bytevector within
462the given ENCODING, then encoding each byte as ‘%HH’, where HH is the
463uppercase hexadecimal representation of the byte."
464  (define (needs-escaped? ch)
465    (not (char-set-contains? unescaped-chars ch)))
466  (if (string-index str needs-escaped?)
467      (call-with-output-string
468       (lambda (port)
469         (string-for-each
470          (lambda (ch)
471            (if (char-set-contains? unescaped-chars ch)
472                (display ch port)
473                (let* ((bv (string->bytevector (string ch) encoding))
474                       (len (bytevector-length bv)))
475                  (let lp ((i 0))
476                    (if (< i len)
477                        (let ((byte (bytevector-u8-ref bv i)))
478                          (display #\% port)
479                          (when (< byte 16)
480                            (display #\0 port))
481                          (display (string-upcase (number->string byte 16))
482                                   port)
483                          (lp (1+ i))))))))
484          str)))
485      str))
486
487(define (split-and-decode-uri-path path)
488  "Split PATH into its components, and decode each component,
489removing empty components.
490
491For example, ‘\"/foo/bar%20baz/\"’ decodes to the two-element list,
492‘(\"foo\" \"bar baz\")’."
493  (filter (lambda (x) (not (string-null? x)))
494          (map (lambda (s) (uri-decode s #:decode-plus-to-space? #f))
495               (string-split path #\/))))
496
497(define (encode-and-join-uri-path parts)
498  "URI-encode each element of PARTS, which should be a list of
499strings, and join the parts together with ‘/’ as a delimiter.
500
501For example, the list ‘(\"scrambled eggs\" \"biscuits&gravy\")’
502encodes as ‘\"scrambled%20eggs/biscuits%26gravy\"’."
503  (string-join (map uri-encode parts) "/"))
504