1#
2#
3#            Nim's Runtime Library
4#        (c) Copyright 2015 Dominik Picheta
5#
6#    See the file "copying.txt", included in this
7#    distribution, for details about the copyright.
8#
9
10## This module implements URI parsing as specified by RFC 3986.
11##
## A Uniform Resource Identifier (URI) provides a simple and extensible
## means for identifying a resource. A URI can be further classified
## as a locator, a name, or both. The term "Uniform Resource Locator"
## (URL) refers to the subset of URIs that, in addition to identifying a
## resource, describe how to locate it (for example, its network location).
16##
17## # Basic usage
18
19
## ## Combine URIs
## The `/` operator appends a path to a parsed URI; the appended path may be
## written with or without a leading slash.
runnableExamples:
  let host = parseUri("https://nim-lang.org")
  assert $host == "https://nim-lang.org"
  assert $(host / "/blog.html") == "https://nim-lang.org/blog.html"
  assert $(host / "blog2.html") == "https://nim-lang.org/blog2.html"

## ## Access URI item
## The components of a parsed URI are available as fields of `Uri`.
runnableExamples:
  let res = parseUri("sftp://127.0.0.1:4343")
  assert isAbsolute(res)
  assert res.port == "4343"

## ## Data URI Base64
## `getDataUri` Base64-encodes the data and wraps it in a `data:` URI.
runnableExamples:
  doAssert getDataUri("Hello World", "text/plain") == "data:text/plain;charset=utf-8;base64,SGVsbG8gV29ybGQ="
  doAssert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"
37
38
39import strutils, parseutils, base64
40import std/private/[since, decode_helpers]
41
42
type
  Url* = distinct string ## A string that is type-distinct from `string`.

  Uri* = object
    ## A URI split into its components, as filled in by `parseUri`.
    # Scheme and the user-info part of the authority.
    scheme*, username*, password*: string
    # Rest of the authority, followed by the path, the query (the part after
    # '?') and the anchor (the part after '#').
    hostname*, port*, path*, query*, anchor*: string
    # Set by `parseUri` when the scheme is not followed by "//"; `$` then
    # separates the scheme with ":" instead of "://".
    opaque*: bool
    # Set when the parsed authority contains '['; `$` then wraps `hostname`
    # in square brackets.
    isIpv6: bool # not expose it for compatibility.

  UriParseError* = object of ValueError ## Raised by `uriParseError`.
53
54
proc uriParseError*(msg: string) {.noreturn.} =
  ## Aborts the current operation by raising a `UriParseError` whose
  ## message is `msg`.
  let err = newException(UriParseError, msg)
  raise err
58
func encodeUrl*(s: string, usePlus = true): string =
  ## Percent-encodes `s` following RFC3986.
  ##
  ## Unreserved characters, i.e. those in the set
  ## `{'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'}`, are copied to the
  ## result unchanged. Every other character is written as `%xx`, where `xx`
  ## is its ordinal value as two hexadecimal digits.
  ##
  ## Spaces are the one exception: they become `+` when `usePlus` is true
  ## and `%20` otherwise.
  ##
  ## **See also:**
  ## * `decodeUrl func<#decodeUrl,string>`_
  runnableExamples:
    assert encodeUrl("https://nim-lang.org") == "https%3A%2F%2Fnim-lang.org"
    assert encodeUrl("https://nim-lang.org/this is a test") == "https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test"
    assert encodeUrl("https://nim-lang.org/this is a test", false) == "https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test"
  # https://tools.ietf.org/html/rfc3986#section-2.3
  const unreserved = {'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'}
  # Reserve a quarter more than the input length as headroom for the
  # characters that expand into `%xx` escapes.
  result = newStringOfCap(s.len + s.len shr 2)
  for ch in s:
    if ch in unreserved:
      result.add ch
    elif ch == ' ':
      if usePlus: result.add '+'
      else: result.add "%20"
    else:
      result.add '%'
      result.add toHex(ord(ch), 2)
87
func decodeUrl*(s: string, decodePlus = true): string =
  ## Reverses the percent-encoding described by RFC3986.
  ##
  ## Each `%xx` sequence, with `xx` a hexadecimal value, is replaced by the
  ## character whose ordinal number is `xx`; all other characters are copied
  ## as they are. A `%` that is not followed by a valid hexadecimal value is
  ## left intact.
  ##
  ## When `decodePlus` is true, every `+` is additionally turned into a
  ## space.
  ##
  ## **See also:**
  ## * `encodeUrl func<#encodeUrl,string>`_
  runnableExamples:
    assert decodeUrl("https%3A%2F%2Fnim-lang.org") == "https://nim-lang.org"
    assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis+is+a+test") == "https://nim-lang.org/this is a test"
    assert decodeUrl("https%3A%2F%2Fnim-lang.org%2Fthis%20is%20a%20test",
        false) == "https://nim-lang.org/this is a test"
    assert decodeUrl("abc%xyz") == "abc%xyz"

  # The decoded text is never longer than the input, so allocate the input
  # length up front and trim to the number of characters written at the end.
  result = newString(s.len)
  var
    src = 0
    dst = 0
  while src < s.len:
    let ch = s[src]
    result[dst] =
      if ch == '%': decodePercent(s, src) # may move `src` past the hex digits
      elif ch == '+' and decodePlus: ' '
      else: ch
    inc src
    inc dst
  result.setLen(dst)
124
func encodeQuery*(query: openArray[(string, string)], usePlus = true,
    omitEq = true): string =
  ## Builds a URL query string out of a list of (key, value) parameters.
  ##
  ## Keys and values are URL-encoded and emitted as `key=value`, with the
  ## pairs separated by `&`. A pair whose value is the empty string is
  ## emitted as just `key`, unless `omitEq` is false.
  ##
  ## `usePlus` is forwarded to `encodeUrl`, which performs the encoding of
  ## the individual strings.
  ##
  ## **See also:**
  ## * `encodeUrl func<#encodeUrl,string>`_
  runnableExamples:
    assert encodeQuery({: }) == ""
    assert encodeQuery({"a": "1", "b": "2"}) == "a=1&b=2"
    assert encodeQuery({"a": "1", "b": ""}) == "a=1&b"
  for entry in query:
    # A separator is needed as soon as something has been written already.
    if result.len > 0:
      result.add '&'
    result.add encodeUrl(entry[0], usePlus)
    # An empty value is written without its '=' when `omitEq` allows it.
    if omitEq and entry[1].len == 0:
      continue
    result.add '='
    result.add encodeUrl(entry[1], usePlus)
152
iterator decodeQuery*(data: string): tuple[key, value: string] =
  ## Decodes the query string `data` and yields each `(key, value)` pair it
  ## contains. When compiled with `-d:nimLegacyParseQueryStrict`, an
  ## unencoded `=` character inside a value raises an error instead, which
  ## was the behavior in Nim < 1.5.1
  runnableExamples:
    import std/sequtils
    assert toSeq(decodeQuery("foo=1&bar=2=3")) == @[("foo", "1"), ("bar", "2=3")]
    assert toSeq(decodeQuery("&a&=b&=&&")) == @[("", ""), ("a", ""), ("", "b"), ("", ""), ("", "")]

  proc scanField(src: string, start: int, dest: var string, stop: char): int =
    ## Appends the decoded characters of `src` from index `start` onwards to
    ## `dest`, stopping at `&`, at `stop`, or at the end of `src`. Returns the
    ## index at which scanning stopped.
    result = start
    while result < src.len:
      let ch = src[result]
      if ch == '&' or ch == stop:
        break
      case ch
      of '%': dest.add decodePercent(src, result)
      of '+': dest.add ' '
      else: dest.add ch
      inc result

  # In legacy strict mode a value ends at a bare '=', which is then reported
  # as an error below; otherwise only '&' terminates a value.
  const valueEnd =
    when defined(nimLegacyParseQueryStrict): '='
    else: '&'

  var
    pos = 0
    key = ""
    val = ""
  # A single pass over `data`; both buffers are reused for every pair.
  while pos < data.len:
    key.setLen(0)
    pos = scanField(data, pos, key, '=')
    val.setLen(0)
    if pos < data.len and data[pos] == '=':
      inc pos # step over the '='
      pos = scanField(data, pos, val, valueEnd)
    yield (key, val)
    if pos < data.len:
      when defined(nimLegacyParseQueryStrict):
        if data[pos] != '&':
          uriParseError("'&' expected at index '$#' for '$#'" % [$pos, data])
      inc pos # step over the separator
196
func parseAuthority(authority: string, result: var Uri) =
  ## Splits the authority component `authority` (`user:password@host:port`)
  ## into the `username`, `password`, `hostname` and `port` fields of
  ## `result`, appending to whatever those fields already contain.
  ##
  ## Scanning stops at the end of the string or at the first NUL character.
  var
    readingPort = false
    insideBrackets = false
  for ch in authority:
    case ch
    of '\0':
      break
    of '@':
      # Everything collected so far was user info rather than host and port,
      # so move it over and start collecting the real host from scratch.
      swap result.password, result.port
      result.port.setLen(0)
      swap result.username, result.hostname
      result.hostname.setLen(0)
      readingPort = false
    of '[':
      insideBrackets = true
      result.isIpv6 = true
    of ']':
      insideBrackets = false
    of ':':
      # Inside `[...]` a colon is part of the host; elsewhere it switches
      # collection over to the port.
      if insideBrackets:
        result.hostname.add ch
      else:
        readingPort = true
    else:
      if readingPort:
        result.port.add ch
      else:
        result.hostname.add ch
227
func parsePath(uri: string, i: var int, result: var Uri) =
  ## Parses the path, query and anchor of `uri`, starting at index `i`, into
  ## `result`. `i` is advanced past everything that was consumed.
  let pathLen = parseUntil(uri, result.path, {'?', '#'}, i)
  i += pathLen

  # For the 'mailto' scheme the PATH actually holds the username/hostname,
  # so it is re-read as an authority and then cleared.
  if result.scheme.cmpIgnoreCase("mailto") == 0:
    parseAuthority(result.path, result)
    result.path.setLen(0)

  if i < uri.len and uri[i] == '?':
    inc i # step over '?'
    let queryLen = parseUntil(uri, result.query, {'#'}, i)
    i += queryLen

  if i < uri.len and uri[i] == '#':
    inc i # step over '#'
    # An empty stop set makes the anchor run to the end of `uri`.
    let anchorLen = parseUntil(uri, result.anchor, {}, i)
    i += anchorLen
243
func initUri*(isIpv6 = false): Uri =
  ## Creates a URI whose `scheme`, `username`, `password`, `hostname`,
  ## `port`, `path`, `query` and `anchor` are all empty, with the IPv6 flag
  ## set to `isIpv6`.
  ##
  ## **See also:**
  ## * `Uri type <#Uri>`_ for available fields in the URI type
  runnableExamples:
    var uri2 = initUri(isIpv6 = true)
    uri2.scheme = "tcp"
    uri2.hostname = "2001:0db8:85a3:0000:0000:8a2e:0370:7334"
    uri2.port = "8080"
    assert $uri2 == "tcp://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080"
  # `result` starts out zero-initialised, i.e. with empty strings and `false`
  # flags, so only the IPv6 flag has to be filled in.
  result.isIpv6 = isIpv6
258
func resetUri(uri: var Uri) =
  ## Clears `uri` in place: boolean fields become `false` and string fields
  ## are truncated to length zero.
  for field in fields(uri):
    when field is bool:
      field = false
    else:
      field.setLen(0)
265
func parseUri*(uri: string, result: var Uri) =
  ## Parses `uri` into `result`. Whatever `result` held before is cleared
  ## first.
  ##
  ## **See also:**
  ## * `Uri type <#Uri>`_ for available fields in the URI type
  ## * `initUri func <#initUri>`_ for initializing a URI
  runnableExamples:
    var res = initUri()
    parseUri("https://nim-lang.org/docs/manual.html", res)
    assert res.scheme == "https"
    assert res.hostname == "nim-lang.org"
    assert res.path == "/docs/manual.html"
  const schemeChars = Letters + Digits + {'+', '-', '.'}

  resetUri(result)
  var pos = 0

  # A leading "//" means there is no scheme and the authority starts right
  # away, so scheme detection is skipped entirely in that case.
  if not uri.startsWith("//"):
    if uri.len > 0 and uri[0] == '/':
      # A single leading slash: a reference (relative) URI made of a path.
      parsePath(uri, pos, result)
      return

    pos = parseWhile(uri, result.scheme, schemeChars, 0)
    if pos >= uri.len or uri[pos] != ':':
      # No ':' after the candidate scheme, so treat the input as a reference
      # (relative) URI and parse it again from the start as a path.
      pos = 0
      result.scheme.setLen(0)
      parsePath(uri, pos, result)
      return
    inc pos # step over ':'

  # Authority
  if uri.continuesWith("//", pos):
    inc pos, 2 # step over "//"
    var authority = ""
    pos += parseUntil(uri, authority, {'/', '?', '#'}, pos)
    if authority.len > 0:
      parseAuthority(authority, result)
  else:
    result.opaque = true

  # Path, query and anchor
  parsePath(uri, pos, result)
313
func parseUri*(uri: string): Uri =
  ## Parses `uri` and returns the resulting `Uri` object.
  ##
  ## **See also:**
  ## * `Uri type <#Uri>`_ for available fields in the URI type
  runnableExamples:
    let res = parseUri("ftp://Username:Password@Hostname")
    assert res.username == "Username"
    assert res.password == "Password"
    assert res.scheme == "ftp"
  result = initUri()
  uri.parseUri(result)
326
func removeDotSegments(path: string): string =
  ## Collapses `..` and `.` in `path` in a similar way as done in `os.normalizedPath`
  ## Caution: this is buggy.
  ##
  ## Known quirks of the current implementation:
  ## * empty segments are kept, so `"a/.//./"` yields `"a//"`;
  ## * a `../` with no preceding segment to drop is not removed cleanly,
  ##   e.g. `"../x"` yields `".x"`.
  runnableExamples:
    assert removeDotSegments("a1/a2/../a3/a4/a5/./a6/a7/././") == "a1/a3/a4/a5/a6/a7/"
    assert removeDotSegments("http://www.ai.") == "http://www.ai."
  # xxx adapt or reuse `pathnorm.normalizePath(path, '/')` to make this more reliable, but
  # taking into account url specificities such as not collapsing leading `//` in scheme
  # `https://`. see `turi` for failing tests.
  if path.len == 0: return ""
  var
    segments: seq[string] = @[]
    pending = "" # segment currently being collected
    pos = 0
  while pos < path.len:
    let ch = path[pos]
    if ch == '/':
      segments.add pending
      pending = ""
    elif ch == '.' and path.continuesWith("../", pos):
      if segments.len > 0:
        # Drop the previous segment together with the "../" itself.
        discard segments.pop()
        inc pos, 3
        continue
      # Nothing to drop: the dot is kept as an ordinary character.
      pending.add ch
    elif ch == '.' and path.continuesWith("./", pos):
      inc pos, 2
      continue
    else:
      pending.add ch
    inc pos
  if pending.len > 0:
    segments.add pending

  result = segments.join("/")
  # `join` places separators only between segments, so a trailing slash of
  # the input has to be restored by hand.
  if path.endsWith('/'): result.add '/'
364
func merge(base, reference: Uri): string =
  ## Merges the path of `reference` with the path of `base`, following
  ## http://tools.ietf.org/html/rfc3986#section-5.2.3
  if base.hostname.len > 0 and base.path.len == 0:
    # A base with an authority but no path: anchor the reference at the root.
    return "/" & reference.path
  # Otherwise the reference replaces whatever follows the last '/' of the
  # base path, or the whole base path if it contains no '/'.
  let slashAt = base.path.rfind('/')
  result =
    if slashAt < 0: reference.path
    else: base.path[0 .. slashAt] & reference.path
375
func combine*(base: Uri, reference: Uri): Uri =
  ## Combines a base URI with a reference URI.
  ##
  ## This uses the algorithm specified in
  ## `section 5.2.2 of RFC 3986 <http://tools.ietf.org/html/rfc3986#section-5.2.2>`_.
  ##
  ## This means that the slashes inside the base URIs path as well as reference
  ## URIs path affect the resulting URI.
  ##
  ## The anchor of the result is always the one of `reference`. An IPv6 host
  ## stays an IPv6 host in the result, so it is still rendered in brackets.
  ##
  ## **See also:**
  ## * `/ func <#/,Uri,string>`_ for building URIs
  runnableExamples:
    let foo = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("/baz"))
    assert foo.path == "/baz"
    let bar = combine(parseUri("https://nim-lang.org/foo/bar"), parseUri("baz"))
    assert bar.path == "/foo/baz"
    let qux = combine(parseUri("https://nim-lang.org/foo/bar/"), parseUri("baz"))
    assert qux.path == "/foo/bar/baz"
    let v6 = combine(parseUri("http://[::1]:8080/foo/bar"), parseUri("baz"))
    assert $v6 == "http://[::1]:8080/foo/baz"

  template setAuthority(dest, src): untyped =
    dest.hostname = src.hostname
    dest.username = src.username
    dest.port = src.port
    dest.password = src.password
    # `hostname` is stored without its brackets, so the IPv6 flag has to
    # travel with it; otherwise `$` would print the host without `[` and `]`.
    dest.isIpv6 = src.isIpv6

  result = initUri()
  if reference.scheme != base.scheme and reference.scheme != "":
    # A reference with a scheme of its own is taken as it is.
    result = reference
    result.path = removeDotSegments(result.path)
  else:
    if reference.hostname != "":
      # The reference brings its own authority: only the scheme comes from
      # the base.
      setAuthority(result, reference)
      result.path = removeDotSegments(reference.path)
      result.query = reference.query
    else:
      if reference.path == "":
        # No path in the reference: keep the base path, and the base query
        # too unless the reference supplies one.
        result.path = base.path
        if reference.query != "":
          result.query = reference.query
        else:
          result.query = base.query
      else:
        if reference.path.startsWith("/"):
          result.path = removeDotSegments(reference.path)
        else:
          result.path = removeDotSegments(merge(base, reference))
        result.query = reference.query
      setAuthority(result, base)
    result.scheme = base.scheme
  result.anchor = reference.anchor
426
func combine*(uris: varargs[Uri]): Uri =
  ## Combines several URIs, from left to right, into a single one.
  ##
  ## **See also:**
  ## * `/ func <#/,Uri,string>`_ for building URIs
  runnableExamples:
    let foo = combine(parseUri("https://nim-lang.org/"), parseUri("docs/"),
        parseUri("manual.html"))
    assert foo.hostname == "nim-lang.org"
    assert foo.path == "/docs/manual.html"
  # The first URI is the initial base; each following URI is resolved
  # against the result accumulated so far.
  result = uris[0]
  var next = 1
  while next < uris.len:
    result = combine(result, uris[next])
    inc next
440
func isAbsolute*(uri: Uri): bool =
  ## Tells whether `uri` is absolute: it has a scheme together with a
  ## hostname or a path.
  runnableExamples:
    assert parseUri("https://nim-lang.org").isAbsolute
    assert not parseUri("nim-lang").isAbsolute
  let hasLocation = uri.hostname.len > 0 or uri.path.len > 0
  uri.scheme.len > 0 and hasLocation
447
func `/`*(x: Uri, path: string): Uri =
  ## Appends `path` to the path of `x` and returns the resulting URI.
  ##
  ## Unlike with the `combine func <#combine,Uri,Uri>`_, a slash at the end of
  ## the URIs path or at the beginning of `path` makes no difference: the two
  ## parts are always joined by exactly one slash.
  ##
  ## **See also:**
  ## * `combine func <#combine,Uri,Uri>`_
  runnableExamples:
    let foo = parseUri("https://nim-lang.org/foo/bar") / "/baz"
    assert foo.path == "/foo/bar/baz"
    let bar = parseUri("https://nim-lang.org/foo/bar") / "baz"
    assert bar.path == "/foo/bar/baz"
    let qux = parseUri("https://nim-lang.org/foo/bar/") / "baz"
    assert qux.path == "/foo/bar/baz"
  result = x
  let
    baseHasSlash = result.path.len > 0 and result.path[^1] == '/'
    pathHasSlash = path.len > 0 and path[0] == '/'

  if baseHasSlash and pathHasSlash:
    # Both sides bring a slash: drop the one in front of `path`.
    result.path.add path[1 .. ^1]
  elif baseHasSlash or pathHasSlash:
    # Exactly one slash at the seam already.
    result.path.add path
  else:
    # No slash on either side (this includes an empty base path).
    result.path.add '/'
    result.path.add path
481
func `?`*(u: Uri, query: openArray[(string, string)]): Uri =
  ## Returns a copy of `u` whose query is the encoded form of the given
  ## `(key, value)` parameters.
  runnableExamples:
    let foo = parseUri("https://example.com") / "foo" ? {"bar": "qux"}
    assert $foo == "https://example.com/foo?bar=qux"
  let encoded = encodeQuery(query)
  result = u
  result.query = encoded
489
func `$`*(u: Uri): string =
  ## Renders the URI object `u` as a string.
  runnableExamples:
    assert $parseUri("https://nim-lang.org") == "https://nim-lang.org"
  if u.scheme.len > 0:
    result.add u.scheme
    # Opaque URIs have no "//" after the scheme.
    result.add(if u.opaque: ":" else: "://")

  if u.username.len > 0:
    result.add u.username
    if u.password.len > 0:
      result.add ':'
      result.add u.password
    result.add '@'

  # A slash at the end of the hostname is not printed.
  let host =
    if u.hostname.endsWith('/'): u.hostname[0 .. ^2]
    else: u.hostname
  if u.isIpv6:
    result.add '['
    result.add host
    result.add ']'
  else:
    result.add host

  if u.port.len > 0:
    result.add ':'
    result.add u.port

  if u.path.len > 0:
    # Keep host and path apart when the path does not bring its own slash.
    if u.hostname.len > 0 and u.path[0] != '/':
      result.add '/'
    result.add u.path

  if u.query.len > 0:
    result.add '?'
    result.add u.query

  if u.anchor.len > 0:
    result.add '#'
    result.add u.anchor
530
proc getDataUri*(data, mime: string, encoding = "utf-8"): string {.since: (1, 3).} =
  ## Convenience wrapper around `base64.encode` that returns `data` as a
  ## standard Base64 Data URI (RFC-2397) of the form
  ## `data:<mime>;charset=<encoding>;base64,<encoded data>`.
  ##
  ## **See also:**
  ## * `mimetypes <mimetypes.html>`_ for `mime` argument
  ## * https://tools.ietf.org/html/rfc2397
  ## * https://en.wikipedia.org/wiki/Data_URI_scheme
  runnableExamples: static: assert getDataUri("Nim", "text/plain") == "data:text/plain;charset=utf-8;base64,Tmlt"
  assert encoding.len > 0 and mime.len > 0 # Must *not* be URL-Safe, see RFC-2397
  let payload = base64.encode(data)
  # 22 is the combined length of the three fixed pieces below.
  result = newStringOfCap(mime.len + encoding.len + payload.len + 22)
  for piece in ["data:", mime, ";charset=", encoding, ";base64,", payload]:
    result.add piece
541