#
#
#            Nim's Runtime Library
#        (c) Copyright 2015 Dominik Picheta
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements URI parsing as specified by RFC 3986.
##
## A Uniform Resource Identifier (URI) provides a simple and extensible
## means for identifying a resource. A URI can be further classified
## as a locator, a name, or both. The term "Uniform Resource Locator"
## (URL) refers to the subset of URIs.
##
## # Basic usage


## ## Combine URIs
runnableExamples:
  let host = parseUri("https://nim-lang.org")
  assert $host == "https://nim-lang.org"
  assert $(host / "/blog.html") == "https://nim-lang.org/blog.html"
  assert $(host / "blog2.html") == "https://nim-lang.org/blog2.html"

## ## Access URI item
runnableExamples:
  let res = parseUri("sftp://127.0.0.1:4343")
  assert isAbsolute(res)
  assert res.port == "4343"

## ## Data URI Base64
runnableExamples:
  doAssert getDataUri("Hello World", "text/plain") == "data:text/plain;charset=utf-8;base64,SGVsbG8gV29ybGQ="
  doAssert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"


import strutils, parseutils, base64
import std/private/[since, decode_helpers]


type
  Url* = distinct string

  Uri* = object
    ## The components of a parsed URI. All string components are empty
    ## when absent.
    scheme*, username*, password*: string
    hostname*, port*, path*, query*, anchor*: string
    opaque*: bool ## Set by `parseUri` when the scheme is not followed by
                  ## `//`; `$` then writes `scheme:` instead of `scheme://`.
    isIpv6: bool # not expose it for compatibility.

  UriParseError* = object of ValueError


proc uriParseError*(msg: string) {.noreturn.} =
  ## Raises a `UriParseError` exception with message `msg`.
  raise newException(UriParseError, msg)

func encodeUrl*(s: string, usePlus = true): string =
  ## Encodes a URL according to RFC3986.
  ##
  ## This means that characters in the set
  ## `{'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'}` are
  ## carried over to the result.
  ## All other characters are encoded as `%xx` where `xx`
  ## denotes its hexadecimal value.
  ##
  ## As a special rule, when the value of `usePlus` is true,
  ## spaces are encoded as `+` instead of `%20`.
  ##
  ## **See also:**
  ## * `decodeUrl func<#decodeUrl,string>`_
  runnableExamples:
    assert encodeUrl("https://nim-lang.org") == "https%3A%2F%2Fnim-lang.org"
    assert encodeUrl("https://nim-lang.org/this is a test") == "https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test"
    assert encodeUrl("https://nim-lang.org/this is a test", false) == "https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test"
  result = newStringOfCap(s.len + s.len shr 2) # assume 12% non-alnum-chars
  let fromSpace = if usePlus: "+" else: "%20"
  for c in s:
    case c
    # https://tools.ietf.org/html/rfc3986#section-2.3
    of 'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~': add(result, c)
    of ' ': add(result, fromSpace)
    else:
      add(result, '%')
      add(result, toHex(ord(c), 2))

func decodeUrl*(s: string, decodePlus = true): string =
  ## Decodes a URL according to RFC3986.
  ##
  ## This means that any `%xx` (where `xx` denotes a hexadecimal
  ## value) are converted to the character with ordinal number `xx`,
  ## and every other character is carried over.
  ## If `xx` is not a valid hexadecimal value, it is left intact.
  ##
  ## As a special rule, when the value of `decodePlus` is true, `+`
  ## characters are converted to a space.
  ##
  ## **See also:**
  ## * `encodeUrl func<#encodeUrl,string>`_
  runnableExamples:
    assert decodeUrl("https%3A%2F%2Fnim-lang.org") == "https://nim-lang.org"
    assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test") == "https://nim-lang.org/this is a test"
    assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test",
        false) == "https://nim-lang.org/this is a test"
    assert decodeUrl("abc%xyz") == "abc%xyz"

  # The decoded text is never longer than the input, so write into a buffer
  # of the input's length and trim it once at the end.
  result = newString(s.len)
  var i = 0
  var j = 0
  while i < s.len:
    case s[i]
    of '%':
      result[j] = decodePercent(s, i)
    of '+':
      if decodePlus:
        result[j] = ' '
      else:
        result[j] = s[i]
    else: result[j] = s[i]
    inc(i)
    inc(j)
  setLen(result, j)

func encodeQuery*(query: openArray[(string, string)], usePlus = true,
    omitEq = true): string =
  ## Encodes a set of (key, value) parameters into a URL query string.
  ##
  ## Every (key, value) pair is URL-encoded and written as `key=value`. If the
  ## value is an empty string then the `=` is omitted, unless `omitEq` is
  ## false.
  ## The pairs are joined together by a `&` character.
  ##
  ## The `usePlus` parameter is passed down to the `encodeUrl` function that
  ## is used for the URL encoding of the string values.
  ##
  ## **See also:**
  ## * `encodeUrl func<#encodeUrl,string>`_
  runnableExamples:
    assert encodeQuery({: }) == ""
    assert encodeQuery({"a": "1", "b": "2"}) == "a=1&b=2"
    assert encodeQuery({"a": "1", "b": ""}) == "a=1&b"
  for elem in query:
    # Encode the `key = value` pairs and separate them with a '&'
    if result.len > 0: result.add('&')
    let (key, val) = elem
    result.add(encodeUrl(key, usePlus))
    # Omit the '=' if the value string is empty
    if not omitEq or val.len > 0:
      result.add('=')
      result.add(encodeUrl(val, usePlus))

iterator decodeQuery*(data: string): tuple[key, value: string] =
  ## Reads and decodes query string `data` and yields the `(key, value)` pairs
  ## the data consists of. If compiled with `-d:nimLegacyParseQueryStrict`, an
  ## error is raised when there is an unencoded `=` character in a decoded
  ## value, which was the behavior in Nim < 1.5.1
  runnableExamples:
    import std/sequtils
    assert toSeq(decodeQuery("foo=1&bar=2=3")) == @[("foo", "1"), ("bar", "2=3")]
    assert toSeq(decodeQuery("&a&=b&=&&")) == @[("", ""), ("a", ""), ("", "b"), ("", ""), ("", "")]

  proc parseData(data: string, i: int, field: var string, sep: char): int =
    # Appends the decoded characters starting at index `i` to `field` and
    # returns the index of the '&' or `sep` that stopped the scan (or
    # `data.len` when the end of the input was reached).
    result = i
    while result < data.len:
      let c = data[result]
      case c
      of '%': add(field, decodePercent(data, result))
      of '+': add(field, ' ')
      of '&': break
      else:
        if c == sep: break
        else: add(field, data[result])
      inc(result)

  var i = 0
  var name = ""
  var value = ""
  # decode everything in one pass:
  while i < data.len:
    setLen(name, 0) # reuse memory
    i = parseData(data, i, name, '=')
    setLen(value, 0) # reuse memory
    if i < data.len and data[i] == '=':
      inc(i) # skip '='
      when defined(nimLegacyParseQueryStrict):
        i = parseData(data, i, value, '=')
      else:
        i = parseData(data, i, value, '&')
    yield (name, value)
    if i < data.len:
      when defined(nimLegacyParseQueryStrict):
        if data[i] != '&':
          uriParseError("'&' expected at index '$#' for '$#'" % [$i, data])
      inc(i)

func parseAuthority(authority: string, result: var Uri) =
  ## Splits `authority` (`[user[:password]@]host[:port]`) into the
  ## corresponding fields of `result`.
  ##
  ## Text is first collected as hostname/port; when an `@` is met, what was
  ## collected so far is moved to username/password and collection restarts.
  var i = 0
  var inPort = false
  var inIPv6 = false
  while i < authority.len:
    case authority[i]
    of '@':
      swap result.password, result.port
      result.port.setLen(0)
      swap result.username, result.hostname
      result.hostname.setLen(0)
      inPort = false
    of ':':
      if inIPv6:
        # Inside `[...]` a colon belongs to the address itself.
        result.hostname.add(authority[i])
      else:
        inPort = true
    of '[':
      inIPv6 = true
      result.isIpv6 = true
    of ']':
      inIPv6 = false
    of '\0':
      break
    else:
      if inPort:
        result.port.add(authority[i])
      else:
        result.hostname.add(authority[i])
    i.inc

func parsePath(uri: string, i: var int, result: var Uri) =
  ## Parses the path, query and anchor of `uri` starting at index `i` into
  ## `result`, advancing `i` past everything that was consumed.
  i.inc parseUntil(uri, result.path, {'?', '#'}, i)

  # The 'mailto' scheme's PATH actually contains the hostname/username
  if cmpIgnoreCase(result.scheme, "mailto") == 0:
    parseAuthority(result.path, result)
    result.path.setLen(0)

  if i < uri.len and uri[i] == '?':
    i.inc # Skip '?'
    i.inc parseUntil(uri, result.query, {'#'}, i)

  if i < uri.len and uri[i] == '#':
    i.inc # Skip '#'
    i.inc parseUntil(uri, result.anchor, {}, i)

func initUri*(isIpv6 = false): Uri =
  ## Initializes a URI with `scheme`, `username`, `password`,
  ## `hostname`, `port`, `path`, `query`, `anchor` and `isIpv6`.
  ##
  ## **See also:**
  ## * `Uri type <#Uri>`_ for available fields in the URI type
  runnableExamples:
    var uri2 = initUri(isIpv6 = true)
    uri2.scheme = "tcp"
    uri2.hostname = "2001:0db8:85a3:0000:0000:8a2e:0370:7334"
    uri2.port = "8080"
    assert $uri2 == "tcp://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080"
  result = Uri(scheme: "", username: "", password: "", hostname: "", port: "",
                path: "", query: "", anchor: "", isIpv6: isIpv6)

func resetUri(uri: var Uri) =
  ## Clears every field of `uri`: strings are truncated to length 0 and
  ## booleans are set to `false`.
  for f in uri.fields:
    when f is string:
      f.setLen(0)
    else:
      f = false

func parseUri*(uri: string, result: var Uri) =
  ## Parses a URI. The `result` variable will be cleared before.
  ##
  ## **See also:**
  ## * `Uri type <#Uri>`_ for available fields in the URI type
  ## * `initUri func <#initUri>`_ for initializing a URI
  runnableExamples:
    var res = initUri()
    parseUri("https://nim-lang.org/docs/manual.html", res)
    assert res.scheme == "https"
    assert res.hostname == "nim-lang.org"
    assert res.path == "/docs/manual.html"
  resetUri(result)

  var i = 0

  # Check if this is a reference URI (relative URI)
  let doubleSlash = uri.len > 1 and uri[0] == '/' and uri[1] == '/'
  if i < uri.len and uri[i] == '/':
    # Make sure `uri` doesn't begin with '//'.
    if not doubleSlash:
      parsePath(uri, i, result)
      return

  # Scheme
  i.inc parseWhile(uri, result.scheme, Letters + Digits + {'+', '-', '.'}, i)
  if (i >= uri.len or uri[i] != ':') and not doubleSlash:
    # Assume this is a reference URI (relative URI)
    i = 0
    result.scheme.setLen(0)
    parsePath(uri, i, result)
    return
  if not doubleSlash:
    i.inc # Skip ':'

  # Authority
  if i+1 < uri.len and uri[i] == '/' and uri[i+1] == '/':
    i.inc(2) # Skip //
    var authority = ""
    i.inc parseUntil(uri, authority, {'/', '?', '#'}, i)
    if authority.len > 0:
      parseAuthority(authority, result)
  else:
    result.opaque = true

  # Path
  parsePath(uri, i, result)

func parseUri*(uri: string): Uri =
  ## Parses a URI and returns it.
  ##
  ## **See also:**
  ## * `Uri type <#Uri>`_ for available fields in the URI type
  runnableExamples:
    let res = parseUri("ftp://Username:Password@Hostname")
    assert res.username == "Username"
    assert res.password == "Password"
    assert res.scheme == "ftp"
  result = initUri()
  parseUri(uri, result)

func removeDotSegments(path: string): string =
  ## Collapses `../` and `./` segments in `path` in a similar way as done in
  ## `os.normalizedPath`.
  ##
  ## A dot only counts as a dot segment when it starts a segment, so names
  ## such as `b..` or `www.ai.` are carried over unchanged.
  ##
  ## Caution: this is still not a full implementation of RFC 3986 section
  ## 5.2.4; in particular a final `..` or `.` that is not followed by `/` is
  ## kept verbatim.
  runnableExamples:
    assert removeDotSegments("a1/a2/../a3/a4/a5/./a6/a7/././") == "a1/a3/a4/a5/a6/a7/"
    assert removeDotSegments("http://www.ai.") == "http://www.ai."
    assert removeDotSegments("a/b../c") == "a/b../c"
  # xxx adapt or reuse `pathnorm.normalizePath(path, '/')` to make this more reliable, but
  # taking into account url specificities such as not collapsing leading `//` in scheme
  # `https://`. see `turi` for failing tests.
  if path.len == 0: return ""
  var collection: seq[string] = @[]
  let endsWithSlash = path.endsWith '/'
  var i = 0
  var currentSegment = ""
  while i < path.len:
    case path[i]
    of '/':
      collection.add(currentSegment)
      currentSegment = ""
    of '.':
      # `./` and `../` are only dot segments at the start of a segment;
      # otherwise the dot is an ordinary character of the segment name.
      if currentSegment.len == 0:
        if i+2 < path.len and path[i+1] == '.' and path[i+2] == '/':
          if collection.len > 0:
            discard collection.pop()
          i.inc 3
          continue
        elif i + 1 < path.len and path[i+1] == '/':
          i.inc 2
          continue
      currentSegment.add path[i]
    else:
      currentSegment.add path[i]
    i.inc
  if currentSegment != "":
    collection.add currentSegment

  result = collection.join("/")
  if endsWithSlash: result.add '/'

func merge(base, reference: Uri): string =
  ## Returns the path of `reference` merged onto the path of `base`:
  ## everything after the last `/` of `base.path` is replaced by
  ## `reference.path`.
  # http://tools.ietf.org/html/rfc3986#section-5.2.3
  if base.hostname != "" and base.path == "":
    '/' & reference.path
  else:
    let lastSegment = rfind(base.path, "/")
    if lastSegment == -1:
      reference.path
    else:
      base.path[0 .. lastSegment] & reference.path

func combine*(base: Uri, reference: Uri): Uri =
  ## Combines a base URI with a reference URI.
  ##
  ## This uses the algorithm specified in
  ## `section 5.2.2 of RFC 3986 <http://tools.ietf.org/html/rfc3986#section-5.2.2>`_.
  ##
  ## This means that the slashes inside the base URIs path as well as reference
  ## URIs path affect the resulting URI.
  ##
  ## **See also:**
  ## * `/ func <#/,Uri,string>`_ for building URIs
  runnableExamples:
    let foo = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("/baz"))
    assert foo.path == "/baz"
    let bar = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("baz"))
    assert bar.path == "/foo/baz"
    let qux = combine(parseUri("https://nim-lang.org/foo/bar/"), parseUri("baz"))
    assert qux.path == "/foo/bar/baz"

  template setAuthority(dest, src): untyped =
    dest.hostname = src.hostname
    dest.username = src.username
    dest.port = src.port
    dest.password = src.password

  result = initUri()
  if reference.scheme != base.scheme and reference.scheme != "":
    result = reference
    result.path = removeDotSegments(result.path)
  else:
    if reference.hostname != "":
      setAuthority(result, reference)
      result.path = removeDotSegments(reference.path)
      result.query = reference.query
    else:
      if reference.path == "":
        result.path = base.path
        if reference.query != "":
          result.query = reference.query
        else:
          result.query = base.query
      else:
        if reference.path.startsWith("/"):
          result.path = removeDotSegments(reference.path)
        else:
          result.path = removeDotSegments(merge(base, reference))
        result.query = reference.query
      setAuthority(result, base)
    result.scheme = base.scheme
  result.anchor = reference.anchor

func combine*(uris: varargs[Uri]): Uri =
  ## Combines multiple URIs together.
  ##
  ## The URIs are folded from left to right with
  ## `combine func <#combine,Uri,Uri>`_. An empty `Uri` is returned when no
  ## URIs are given.
  ##
  ## **See also:**
  ## * `/ func <#/,Uri,string>`_ for building URIs
  runnableExamples:
    let foo = combine(parseUri("https://nim-lang.org/"), parseUri("docs/"),
                      parseUri("manual.html"))
    assert foo.hostname == "nim-lang.org"
    assert foo.path == "/docs/manual.html"
  # Guard the `uris[0]` access below against an empty argument list.
  if uris.len == 0:
    return initUri()
  result = uris[0]
  for i in 1 ..< uris.len:
    result = combine(result, uris[i])

func isAbsolute*(uri: Uri): bool =
  ## Returns true if URI is absolute, false otherwise.
  runnableExamples:
    assert parseUri("https://nim-lang.org").isAbsolute
    assert not parseUri("nim-lang").isAbsolute
  return uri.scheme != "" and (uri.hostname != "" or uri.path != "")

func `/`*(x: Uri, path: string): Uri =
  ## Concatenates the path specified to the specified URIs path.
  ##
  ## Contrary to the `combine func <#combine,Uri,Uri>`_ you do not have to worry about
  ## the slashes at the beginning and end of the path and URIs path
  ## respectively.
  ##
  ## **See also:**
  ## * `combine func <#combine,Uri,Uri>`_
  runnableExamples:
    let foo = parseUri("https://nim-lang.org/foo/bar") / "/baz"
    assert foo.path == "/foo/bar/baz"
    let bar = parseUri("https://nim-lang.org/foo/bar") / "baz"
    assert bar.path == "/foo/bar/baz"
    let qux = parseUri("https://nim-lang.org/foo/bar/") / "baz"
    assert qux.path == "/foo/bar/baz"
  result = x

  if result.path.len == 0:
    if path.len == 0 or path[0] != '/':
      result.path = "/"
    result.path.add(path)
    return

  # Join the two paths with exactly one '/' between them.
  if result.path.len > 0 and result.path[result.path.len-1] == '/':
    if path.len > 0 and path[0] == '/':
      result.path.add(path[1 .. path.len-1])
    else:
      result.path.add(path)
  else:
    if path.len == 0 or path[0] != '/':
      result.path.add '/'
    result.path.add(path)

func `?`*(u: Uri, query: openArray[(string, string)]): Uri =
  ## Returns a copy of `u` whose query is the URL-encoded `query` parameters
  ## (see `encodeQuery func <#encodeQuery,openArray[],bool,bool>`_).
  ## Any query already present on `u` is replaced.
  runnableExamples:
    let foo = parseUri("https://example.com") / "foo" ? {"bar": "qux"}
    assert $foo == "https://example.com/foo?bar=qux"
  result = u
  result.query = encodeQuery(query)

func `$`*(u: Uri): string =
  ## Returns the string representation of the specified URI object.
  runnableExamples:
    assert $parseUri("https://nim-lang.org") == "https://nim-lang.org"
  result = ""
  if u.scheme.len > 0:
    result.add(u.scheme)
    if u.opaque:
      result.add(":")
    else:
      result.add("://")
  if u.username.len > 0:
    result.add(u.username)
    if u.password.len > 0:
      result.add(":")
      result.add(u.password)
    result.add("@")
  # A trailing '/' on the hostname is dropped; IPv6 hosts are bracketed.
  if u.hostname.endsWith('/'):
    if u.isIpv6:
      result.add("[" & u.hostname[0 .. ^2] & "]")
    else:
      result.add(u.hostname[0 .. ^2])
  else:
    if u.isIpv6:
      result.add("[" & u.hostname & "]")
    else:
      result.add(u.hostname)
  if u.port.len > 0:
    result.add(":")
    result.add(u.port)
  if u.path.len > 0:
    if u.hostname.len > 0 and u.path[0] != '/':
      result.add('/')
    result.add(u.path)
  if u.query.len > 0:
    result.add("?")
    result.add(u.query)
  if u.anchor.len > 0:
    result.add("#")
    result.add(u.anchor)

proc getDataUri*(data, mime: string, encoding = "utf-8"): string {.since: (1, 3).} =
  ## Convenience proc for `base64.encode` returns a standard Base64 Data URI (RFC-2397)
  ##
  ## **See also:**
  ## * `mimetypes <mimetypes.html>`_ for `mime` argument
  ## * https://tools.ietf.org/html/rfc2397
  ## * https://en.wikipedia.org/wiki/Data_URI_scheme
  runnableExamples: static: assert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"
  assert encoding.len > 0 and mime.len > 0 # Must *not* be URL-Safe, see RFC-2397
  result = "data:" & mime & ";charset=" & encoding & ";base64," & base64.encode(data)