1;;;; (web uri) --- URI manipulation tools 2;;;; 3;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013,2014,2019-2021 Free Software Foundation, Inc. 4;;;; 5;;;; This library is free software; you can redistribute it and/or 6;;;; modify it under the terms of the GNU Lesser General Public 7;;;; License as published by the Free Software Foundation; either 8;;;; version 3 of the License, or (at your option) any later version. 9;;;; 10;;;; This library is distributed in the hope that it will be useful, 11;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of 12;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13;;;; Lesser General Public License for more details. 14;;;; 15;;;; You should have received a copy of the GNU Lesser General Public 16;;;; License along with this library; if not, write to the Free Software 17;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18;;;; 19 20;;; Commentary: 21 22;; A data type for Universal Resource Identifiers, as defined in RFC 23;; 3986. 24 25;;; Code: 26 27(define-module (web uri) 28 #:use-module (srfi srfi-9) 29 #:use-module (ice-9 iconv) 30 #:use-module (ice-9 regex) 31 #:use-module (ice-9 rdelim) 32 #:use-module (ice-9 control) 33 #:use-module (rnrs bytevectors) 34 #:use-module (ice-9 binary-ports) 35 #:export (uri? 36 uri-scheme uri-userinfo uri-host uri-port 37 uri-path uri-query uri-fragment 38 39 build-uri 40 build-uri-reference 41 declare-default-port! 42 string->uri string->uri-reference 43 uri->string 44 uri-decode uri-encode 45 split-and-decode-uri-path 46 encode-and-join-uri-path 47 48 uri-reference? relative-ref? 49 build-uri-reference build-relative-ref 50 string->uri-reference string->relative-ref)) 51 52(define-record-type <uri> 53 (make-uri scheme userinfo host port path query fragment) 54 uri-reference? 55 (scheme uri-scheme) 56 (userinfo uri-userinfo) 57 (host uri-host) 58 (port uri-port) 59 (path uri-path) 60 (query uri-query) 61 (fragment uri-fragment)) 62 63;;; 64;;; Predicates. 65;;; 66;;; These are quick, and assume rigid validation at construction time. 67 68;;; RFC 3986, #3. 69;;; 70;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 71;;; 72;;; hier-part = "//" authority path-abempty 73;;; / path-absolute 74;;; / path-rootless 75;;; / path-empty 76 77(define (uri? obj) 78 (and (uri-reference? obj) 79 (uri-scheme obj) 80 #t)) 81 82;;; RFC 3986, #4.2. 83;;; 84;;; relative-ref = relative-part [ "?" query ] [ "#" fragment ] 85;;; 86;;; relative-part = "//" authority path-abempty 87;;; / path-absolute 88;;; / path-noscheme 89;;; / path-empty 90 91(define (relative-ref? obj) 92 (and (uri-reference? obj) 93 (not (uri-scheme obj)))) 94 95 96;;; 97;;; Constructors. 98;;; 99 100(define (uri-error message . args) 101 (throw 'uri-error message args)) 102 103(define (positive-exact-integer? port) 104 (and (number? port) (exact? port) (integer? port) (positive? port))) 105 106(define (validate-uri-reference scheme userinfo host port path query fragment) 107 (cond 108 ((and scheme (not (symbol? scheme))) 109 (uri-error "Expected a symbol for the URI scheme: ~s" scheme)) 110 ((and (or userinfo port) (not host)) 111 (uri-error "Expected a host, given userinfo or port")) 112 ((and port (not (positive-exact-integer? port))) 113 (uri-error "Expected port to be an integer: ~s" port)) 114 ((and host (or (not (string? host)) (not (valid-host? host)))) 115 (uri-error "Expected valid host: ~s" host)) 116 ((and userinfo (not (string? userinfo))) 117 (uri-error "Expected string for userinfo: ~s" userinfo)) 118 ((not (string? path)) 119 (uri-error "Expected string for path: ~s" path)) 120 ((and query (not (string? query))) 121 (uri-error "Expected string for query: ~s" query)) 122 ((and fragment (not (string? fragment))) 123 (uri-error "Expected string for fragment: ~s" fragment)) 124 ;; Strict validation of allowed paths, based on other components. 125 ;; Refer to RFC 3986 for the details. 126 ((not (string-null? path)) 127 (if host 128 (cond 129 ((not (eqv? (string-ref path 0) #\/)) 130 (uri-error 131 "Expected absolute path starting with \"/\": ~a" path))) 132 (cond 133 ((string-prefix? "//" path) 134 (uri-error 135 "Expected path not starting with \"//\" (no host): ~a" path)) 136 ((and (not scheme) 137 (not (eqv? (string-ref path 0) #\/)) 138 (let ((colon (string-index path #\:))) 139 (and colon (not (string-index path #\/ 0 colon))))) 140 (uri-error 141 "Expected relative path's first segment without \":\": ~a" 142 path))))))) 143 144(define* (build-uri scheme #:key userinfo host port (path "") query fragment 145 (validate? #t)) 146 "Construct a URI object. SCHEME should be a symbol, PORT either a 147positive, exact integer or ‘#f’, and the rest of the fields are either 148strings or ‘#f’. If VALIDATE? is true, also run some consistency checks 149to make sure that the constructed object is a valid URI." 150 (when validate? 151 (unless scheme (uri-error "Missing URI scheme")) 152 (validate-uri-reference scheme userinfo host port path query fragment)) 153 (make-uri scheme userinfo host port path query fragment)) 154 155(define* (build-uri-reference #:key scheme userinfo host port (path "") query 156 fragment (validate? #t)) 157 "Construct a URI-reference object. SCHEME should be a symbol or ‘#f’, 158PORT either a positive, exact integer or ‘#f’, and the rest of the 159fields are either strings or ‘#f’. If VALIDATE? is true, also run some 160consistency checks to make sure that the constructed URI is a valid URI 161reference." 162 (when validate? 163 (validate-uri-reference scheme userinfo host port path query fragment)) 164 (make-uri scheme userinfo host port path query fragment)) 165 166(define* (build-relative-ref #:key userinfo host port (path "") query fragment 167 (validate? #t)) 168 "Construct a relative-ref URI object. The arguments are the same as 169for ‘build-uri’ except there is no scheme." 170 (when validate? 171 (validate-uri-reference #f userinfo host port path query fragment)) 172 (make-uri #f userinfo host port path query fragment)) 173 174 175;;; 176;;; Converters. 177;;; 178 179;; Since character ranges in regular expressions may depend on the 180;; current locale, we use explicit lists of characters instead. See 181;; <https://bugs.gnu.org/35785> for details. 182(define digits "0123456789") 183(define hex-digits "0123456789ABCDEFabcdef") 184(define letters "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") 185 186;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC 187;; 3490), and non-ASCII host names. 188;; 189(define ipv4-regexp 190 (make-regexp (string-append "^([" digits ".]+)$"))) 191(define ipv6-regexp 192 (make-regexp (string-append "^([" hex-digits "]*:[" hex-digits ":.]+)$"))) 193(define domain-label-regexp 194 (make-regexp 195 (string-append "^[" letters digits "]" 196 "([" letters digits "-]*[" letters digits "])?$"))) 197(define top-label-regexp 198 (make-regexp 199 (string-append "^[" letters "]" 200 "([" letters digits "-]*[" letters digits "])?$"))) 201 202(define (valid-host? host) 203 (cond 204 ((regexp-exec ipv4-regexp host) 205 (false-if-exception (inet-pton AF_INET host))) 206 ((regexp-exec ipv6-regexp host) 207 (false-if-exception (inet-pton AF_INET6 host))) 208 (else 209 (let lp ((start 0)) 210 (let ((end (string-index host #\. start))) 211 (if end 212 (and (regexp-exec domain-label-regexp 213 (substring host start end)) 214 (lp (1+ end))) 215 (regexp-exec top-label-regexp host start))))))) 216 217(define userinfo-pat 218 (string-append "[" letters digits "_.!~*'();:&=+$,-]+")) 219(define host-pat 220 (string-append "[" letters digits ".-]+")) 221(define ipv6-host-pat 222 (string-append "[" hex-digits ":.]+")) 223(define port-pat 224 (string-append "[" digits "]*")) 225(define authority-regexp 226 (make-regexp 227 (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$" 228 userinfo-pat host-pat ipv6-host-pat port-pat))) 229 230(define (parse-authority authority fail) 231 (if (equal? authority "//") 232 ;; Allow empty authorities: file:///etc/hosts is a synonym of 233 ;; file:/etc/hosts. 234 (values #f #f #f) 235 (let ((m (regexp-exec authority-regexp authority))) 236 (if (and m (valid-host? (or (match:substring m 4) 237 (match:substring m 6)))) 238 (values (match:substring m 2) 239 (or (match:substring m 4) 240 (match:substring m 6)) 241 (let ((port (match:substring m 8))) 242 (and port (not (string-null? port)) 243 (string->number port)))) 244 (fail))))) 245 246 247;;; RFC 3986, #3. 248;;; 249;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 250;;; 251;;; hier-part = "//" authority path-abempty 252;;; / path-absolute 253;;; / path-rootless 254;;; / path-empty 255;;; 256;;; A URI-reference is the same as URI, but where the scheme is 257;;; optional. If the scheme is not present, its colon isn't present 258;;; either. 259 260(define scheme-pat 261 (string-append "[" letters "][" letters digits "+.-]*")) 262(define authority-pat 263 "[^/?#]*") 264(define path-pat 265 "[^?#]*") 266(define query-pat 267 "[^#]*") 268(define fragment-pat 269 ".*") 270(define uri-pat 271 (format #f "^((~a):)?(//~a)?(~a)(\\?(~a))?(#(~a))?$" 272 scheme-pat authority-pat path-pat query-pat fragment-pat)) 273(define uri-regexp 274 (make-regexp uri-pat)) 275 276(define (string->uri-reference string) 277 "Parse STRING into a URI-reference object. Return ‘#f’ if the string 278could not be parsed." 279 (% (let ((m (regexp-exec uri-regexp string))) 280 (unless m (abort)) 281 (let ((scheme (let ((str (match:substring m 2))) 282 (and str (string->symbol (string-downcase str))))) 283 (authority (match:substring m 3)) 284 (path (match:substring m 4)) 285 (query (match:substring m 6)) 286 (fragment (match:substring m 8))) 287 ;; The regular expression already ensures all of the validation 288 ;; requirements for URI-references, except the one that the 289 ;; first component of a relative-ref's path can't contain a 290 ;; colon. 291 (unless scheme 292 (let ((colon (string-index path #\:))) 293 (when (and colon (not (string-index path #\/ 0 colon))) 294 (abort)))) 295 (call-with-values 296 (lambda () 297 (if authority 298 (parse-authority authority abort) 299 (values #f #f #f))) 300 (lambda (userinfo host port) 301 (make-uri scheme userinfo host port path query fragment))))) 302 (lambda (k) 303 #f))) 304 305(define (string->uri string) 306 "Parse STRING into a URI object. Return ‘#f’ if the string could not 307be parsed. Note that this procedure will require that the URI have a 308scheme." 309 (let ((uri-reference (string->uri-reference string))) 310 (and (not (relative-ref? uri-reference)) 311 uri-reference))) 312 313(define (string->relative-ref string) 314 "Parse STRING into a relative-ref URI object. Return ‘#f’ if the 315string could not be parsed." 316 (let ((uri-reference (string->uri-reference string))) 317 (and (relative-ref? uri-reference) 318 uri-reference))) 319 320(define *default-ports* (make-hash-table)) 321 322(define (declare-default-port! scheme port) 323 "Declare a default port for the given URI scheme." 324 (hashq-set! *default-ports* scheme port)) 325 326(define (default-port? scheme port) 327 (or (not port) 328 (eqv? port (hashq-ref *default-ports* scheme)))) 329 330(declare-default-port! 'http 80) 331(declare-default-port! 'https 443) 332 333(define* (uri->string uri #:key (include-fragment? #t)) 334 "Serialize URI to a string. If the URI has a port that is the 335default port for its scheme, the port is not included in the 336serialization." 337 (let* ((scheme (uri-scheme uri)) 338 (userinfo (uri-userinfo uri)) 339 (host (uri-host uri)) 340 (port (uri-port uri)) 341 (path (uri-path uri)) 342 (query (uri-query uri)) 343 (fragment (uri-fragment uri))) 344 (string-append 345 (if scheme 346 (string-append (symbol->string scheme) ":") 347 "") 348 (if host 349 (string-append "//" 350 (if userinfo (string-append userinfo "@") 351 "") 352 (if (string-index host #\:) 353 (string-append "[" host "]") 354 host) 355 (if (default-port? (uri-scheme uri) port) 356 "" 357 (string-append ":" (number->string port)))) 358 "") 359 path 360 (if query 361 (string-append "?" query) 362 "") 363 (if (and fragment include-fragment?) 364 (string-append "#" fragment) 365 "")))) 366 367 368;; A note on characters and bytes: URIs are defined to be sequences of 369;; characters in a subset of ASCII. Those characters may encode a 370;; sequence of bytes (octets), which in turn may encode sequences of 371;; characters in other character sets. 372;; 373 374;; Return a new string made from uri-decoding STR. Specifically, 375;; turn ‘+’ into space, and hex-encoded ‘%XX’ strings into 376;; their eight-bit characters. 377;; 378(define hex-chars 379 (string->char-set "0123456789abcdefABCDEF")) 380 381(define* (uri-decode str #:key (encoding "utf-8") (decode-plus-to-space? #t)) 382 "Percent-decode the given STR, according to ENCODING, 383which should be the name of a character encoding. 384 385Note that this function should not generally be applied to a full URI 386string. For paths, use ‘split-and-decode-uri-path’ instead. For query 387strings, split the query on ‘&’ and ‘=’ boundaries, and decode 388the components separately. 389 390Note also that percent-encoded strings encode _bytes_, not characters. 391There is no guarantee that a given byte sequence is a valid string 392encoding. Therefore this routine may signal an error if the decoded 393bytes are not valid for the given encoding. Pass ‘#f’ for ENCODING if 394you want decoded bytes as a bytevector directly. ‘set-port-encoding!’, 395for more information on character encodings. 396 397If DECODE-PLUS-TO-SPACE? is true, which is the default, also replace 398instances of the plus character (+) with a space character. This is 399needed when parsing application/x-www-form-urlencoded data. 400 401Returns a string of the decoded characters, or a bytevector if 402ENCODING was ‘#f’." 403 (let* ((len (string-length str)) 404 (bv 405 (call-with-output-bytevector 406 (lambda (port) 407 (let lp ((i 0)) 408 (if (< i len) 409 (let ((ch (string-ref str i))) 410 (cond 411 ((and (eqv? ch #\+) decode-plus-to-space?) 412 (put-u8 port (char->integer #\space)) 413 (lp (1+ i))) 414 ((and (< (+ i 2) len) (eqv? ch #\%) 415 (let ((a (string-ref str (+ i 1))) 416 (b (string-ref str (+ i 2)))) 417 (and (char-set-contains? hex-chars a) 418 (char-set-contains? hex-chars b) 419 (string->number (string a b) 16)))) 420 => (lambda (u8) 421 (put-u8 port u8) 422 (lp (+ i 3)))) 423 ((< (char->integer ch) 128) 424 (put-u8 port (char->integer ch)) 425 (lp (1+ i))) 426 (else 427 (uri-error "Invalid character in encoded URI ~a: ~s" 428 str ch)))))))))) 429 (if encoding 430 (bytevector->string bv encoding) 431 ;; Otherwise return raw bytevector 432 bv))) 433 434(define ascii-alnum-chars 435 (string->char-set 436 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")) 437 438;; RFC 3986, #2.2. 439(define gen-delims 440 (string->char-set ":/?#[]@")) 441(define sub-delims 442 (string->char-set "!$&'()*+,l=")) 443(define reserved-chars 444 (char-set-union gen-delims sub-delims)) 445 446;; RFC 3986, #2.3 447(define unreserved-chars 448 (char-set-union ascii-alnum-chars 449 (string->char-set "-._~"))) 450 451;; Return a new string made from uri-encoding STR, unconditionally 452;; transforming any characters not in UNESCAPED-CHARS. 453;; 454(define* (uri-encode str #:key (encoding "utf-8") 455 (unescaped-chars unreserved-chars)) 456 "Percent-encode any character not in the character set, 457UNESCAPED-CHARS. 458 459The default character set includes alphanumerics from ASCII, as well as 460the special characters ‘-’, ‘.’, ‘_’, and ‘~’. Any other character will 461be percent-encoded, by writing out the character to a bytevector within 462the given ENCODING, then encoding each byte as ‘%HH’, where HH is the 463uppercase hexadecimal representation of the byte." 464 (define (needs-escaped? ch) 465 (not (char-set-contains? unescaped-chars ch))) 466 (if (string-index str needs-escaped?) 467 (call-with-output-string 468 (lambda (port) 469 (string-for-each 470 (lambda (ch) 471 (if (char-set-contains? unescaped-chars ch) 472 (display ch port) 473 (let* ((bv (string->bytevector (string ch) encoding)) 474 (len (bytevector-length bv))) 475 (let lp ((i 0)) 476 (if (< i len) 477 (let ((byte (bytevector-u8-ref bv i))) 478 (display #\% port) 479 (when (< byte 16) 480 (display #\0 port)) 481 (display (string-upcase (number->string byte 16)) 482 port) 483 (lp (1+ i)))))))) 484 str))) 485 str)) 486 487(define (split-and-decode-uri-path path) 488 "Split PATH into its components, and decode each component, 489removing empty components. 490 491For example, ‘\"/foo/bar%20baz/\"’ decodes to the two-element list, 492‘(\"foo\" \"bar baz\")’." 493 (filter (lambda (x) (not (string-null? x))) 494 (map (lambda (s) (uri-decode s #:decode-plus-to-space? #f)) 495 (string-split path #\/)))) 496 497(define (encode-and-join-uri-path parts) 498 "URI-encode each element of PARTS, which should be a list of 499strings, and join the parts together with ‘/’ as a delimiter. 500 501For example, the list ‘(\"scrambled eggs\" \"biscuits&gravy\")’ 502encodes as ‘\"scrambled%20eggs/biscuits%26gravy\"’." 503 (string-join (map uri-encode parts) "/")) 504