1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2017-2020. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20%%
21%% [RFC 3986, Chapter 2.2. Reserved Characters]
22%%
23%%   reserved    = gen-delims / sub-delims
24%%
25%%   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
26%%
27%%   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
28%%               / "*" / "+" / "," / ";" / "="
29%%
30%%
31%% [RFC 3986, Chapter 2.3. Unreserved Characters]
32%%
33%%   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
34%%
35%%
36%% [RFC 3986, Chapter 3. Syntax Components]
37%%
38%% The generic URI syntax consists of a hierarchical sequence of
39%% components referred to as the scheme, authority, path, query, and
40%% fragment.
41%%
42%%    URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
43%%
44%%    hier-part   = "//" authority path-abempty
45%%                   / path-absolute
46%%                   / path-rootless
47%%                   / path-empty
48%%
49%%    The scheme and path components are required, though the path may be
50%%    empty (no characters).  When authority is present, the path must
51%%    either be empty or begin with a slash ("/") character.  When
52%%    authority is not present, the path cannot begin with two slash
53%%    characters ("//").  These restrictions result in five different ABNF
54%%    rules for a path (Section 3.3), only one of which will match any
55%%    given URI reference.
56%%
57%%    The following are two example URIs and their component parts:
58%%
59%%          foo://example.com:8042/over/there?name=ferret#nose
60%%          \_/   \______________/\_________/ \_________/ \__/
61%%           |           |            |            |        |
62%%        scheme     authority       path        query   fragment
63%%           |   _____________________|__
64%%          / \ /                        \
65%%          urn:example:animal:ferret:nose
66%%
67%%
68%% [RFC 3986, Chapter 3.1. Scheme]
69%%
70%% Each URI begins with a scheme name that refers to a specification for
71%% assigning identifiers within that scheme.
72%%
73%%    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
74%%
75%%
76%% [RFC 3986, Chapter 3.2. Authority]
77%%
78%% Many URI schemes include a hierarchical element for a naming
79%% authority so that governance of the name space defined by the
80%% remainder of the URI is delegated to that authority (which may, in
81%% turn, delegate it further).
82%%
83%%    authority   = [ userinfo "@" ] host [ ":" port ]
84%%
85%%
86%% [RFC 3986, Chapter 3.2.1. User Information]
87%%
88%% The userinfo subcomponent may consist of a user name and, optionally,
89%% scheme-specific information about how to gain authorization to access
90%% the resource. The user information, if present, is followed by a
91%% commercial at-sign ("@") that delimits it from the host.
92%%
93%%    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
94%%
95%%
96%% [RFC 3986, Chapter 3.2.2. Host]
97%%
98%% The host subcomponent of authority is identified by an IP literal
99%% encapsulated within square brackets, an IPv4 address in dotted-
100%% decimal form, or a registered name.
101%%
102%%    host        = IP-literal / IPv4address / reg-name
103%%
104%%    IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
105%%
106%%    IPvFuture  = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
107%%
108%%    IPv6address =                            6( h16 ":" ) ls32
109%%                /                       "::" 5( h16 ":" ) ls32
110%%                / [               h16 ] "::" 4( h16 ":" ) ls32
111%%                / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
112%%                / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
113%%                / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
114%%                / [ *4( h16 ":" ) h16 ] "::"              ls32
115%%                / [ *5( h16 ":" ) h16 ] "::"              h16
116%%                / [ *6( h16 ":" ) h16 ] "::"
117%%
118%%    ls32        = ( h16 ":" h16 ) / IPv4address
119%%                ; least-significant 32 bits of address
120%%
121%%    h16         = 1*4HEXDIG
122%%                ; 16 bits of address represented in hexadecimal
123%%
124%%    IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
125%%
126%%    dec-octet   = DIGIT                 ; 0-9
127%%                / %x31-39 DIGIT         ; 10-99
128%%                / "1" 2DIGIT            ; 100-199
129%%                / "2" %x30-34 DIGIT     ; 200-249
130%%                / "25" %x30-35          ; 250-255
131%%
132%%    reg-name    = *( unreserved / pct-encoded / sub-delims )
133%%
134%%
%% [RFC 3986, Chapter 3.2.3. Port]
136%%
137%% The port subcomponent of authority is designated by an optional port
138%% number in decimal following the host and delimited from it by a
139%% single colon (":") character.
140%%
141%%    port        = *DIGIT
142%%
143%%
144%% [RFC 3986, Chapter 3.3. Path]
145%%
146%% The path component contains data, usually organized in hierarchical
147%% form, that, along with data in the non-hierarchical query component
148%% (Section 3.4), serves to identify a resource within the scope of the
149%% URI's scheme and naming authority (if any).  The path is terminated
150%% by the first question mark ("?") or number sign ("#") character, or
151%% by the end of the URI.
152%%
153%%    path          = path-abempty    ; begins with "/" or is empty
154%%                  / path-absolute   ; begins with "/" but not "//"
155%%                  / path-noscheme   ; begins with a non-colon segment
156%%                  / path-rootless   ; begins with a segment
157%%                  / path-empty      ; zero characters
158%%
159%%    path-abempty  = *( "/" segment )
160%%    path-absolute = "/" [ segment-nz *( "/" segment ) ]
161%%    path-noscheme = segment-nz-nc *( "/" segment )
162%%    path-rootless = segment-nz *( "/" segment )
163%%    path-empty    = 0<pchar>
164%%    segment       = *pchar
165%%    segment-nz    = 1*pchar
166%%    segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
167%%                  ; non-zero-length segment without any colon ":"
168%%
169%%    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
170%%
171%%
172%% [RFC 3986, Chapter 3.4. Query]
173%%
174%% The query component contains non-hierarchical data that, along with
175%% data in the path component (Section 3.3), serves to identify a
176%% resource within the scope of the URI's scheme and naming authority
177%% (if any).  The query component is indicated by the first question
178%% mark ("?") character and terminated by a number sign ("#") character
179%% or by the end of the URI.
180%%
181%%    query       = *( pchar / "/" / "?" )
182%%
183%%
184%% [RFC 3986, Chapter 3.5. Fragment]
185%%
186%% The fragment identifier component of a URI allows indirect
187%% identification of a secondary resource by reference to a primary
188%% resource and additional identifying information.
189%%
190%%    fragment    = *( pchar / "/" / "?" )
191%%
192%%
193%% [RFC 3986, Chapter 4.1. URI Reference]
194%%
195%% URI-reference is used to denote the most common usage of a resource
196%% identifier.
197%%
198%%    URI-reference = URI / relative-ref
199%%
200%%
201%% [RFC 3986, Chapter 4.2. Relative Reference]
202%%
203%% A relative reference takes advantage of the hierarchical syntax
204%% (Section 1.2.3) to express a URI reference relative to the name space
205%% of another hierarchical URI.
206%%
207%%    relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
208%%
209%%    relative-part = "//" authority path-abempty
210%%                  / path-absolute
211%%                  / path-noscheme
212%%                  / path-empty
213%%
214%%
215%% [RFC 3986, Chapter 4.3. Absolute URI]
216%%
217%% Some protocol elements allow only the absolute form of a URI without
218%% a fragment identifier.  For example, defining a base URI for later
219%% use by relative references calls for an absolute-URI syntax rule that
220%% does not allow a fragment.
221%%
222%%    absolute-URI  = scheme ":" hier-part [ "?" query ]
223%%
224-module(uri_string).
225
226%%-------------------------------------------------------------------------
227%% External API
228%%-------------------------------------------------------------------------
229-export([compose_query/1, compose_query/2,
230         dissect_query/1, normalize/1, normalize/2, parse/1,
231         recompose/1, resolve/2, resolve/3, transcode/2]).
232-export_type([error/0, uri_map/0, uri_string/0]).
233
234
235%%-------------------------------------------------------------------------
236%% Internal API
237%%-------------------------------------------------------------------------
238-export([is_host/1, is_path/1]).  % suppress warnings
239
240
241%%-------------------------------------------------------------------------
242%% Macros
243%%-------------------------------------------------------------------------
%% Binary pattern/construction helpers. They expand to plain binary
%% syntax, so they can be used both in function heads and in expressions.
-define(CHAR(C), <<C/utf8>>).                 % exactly one UTF-8 code point
-define(STRING_EMPTY, <<>>).                  % the empty binary
-define(STRING(Bin), <<Bin/binary>>).         % a whole binary
-define(STRING_REST(Head, Tail), <<Head/utf8, Tail/binary>>).  % Head, then the remainder

%% Value 0..15 -> upper-case hexadecimal digit character.
%% There is deliberately no catch-all branch: any other value makes the
%% 'if' expression fail.
-define(DEC2HEX(Nibble),
        if
            (Nibble) >= 0, (Nibble) =< 9 -> (Nibble) + $0;
            (Nibble) >= 10, (Nibble) =< 15 -> (Nibble) + $A - 10
        end).

%% Hexadecimal digit character (either case) -> value 0..15.
%% As above, a non-hex character makes the 'if' expression fail.
-define(HEX2DEC(Digit),
        if
            (Digit) >= $0, (Digit) =< $9 -> (Digit) - $0;
            (Digit) >= $A, (Digit) =< $F -> (Digit) - $A + 10;
            (Digit) >= $a, (Digit) =< $f -> (Digit) - $a + 10
        end).
259
260
261%%%=========================================================================
262%%%  API
263%%%=========================================================================
264
265%%-------------------------------------------------------------------------
266%% URI compliant with RFC 3986
%% ASCII %x21 - %x7A ("!" - "z") except
%%   %x22    "    double quote
%%   %x3C    <    less than
%%   %x3E    >    greater than
%%   %x5C    \    backslash
%%   %x5E    ^    caret / circumflex
%%   %x60    `    grave / accent
274%%-------------------------------------------------------------------------
%% A URI in textual form.
-type uri_string() :: iodata().
%% Error triple. The exported functions of this module turn a thrown
%% {error, Atom, RestData} into a return value of this shape.
-type error() :: {error, Reason :: atom(), Data :: term()}.
277
278
279%%-------------------------------------------------------------------------
280%% RFC 3986, Chapter 3. Syntax Components
281%%-------------------------------------------------------------------------
%% Parsed form of a URI. All keys are optional; they are listed here in
%% the order in which the components appear in a URI.
-type uri_map() ::
  #{scheme => unicode:chardata(),
    userinfo => unicode:chardata(),
    host => unicode:chardata(),
    port => non_neg_integer() | undefined,
    path => unicode:chardata(),
    query => unicode:chardata(),
    fragment => unicode:chardata()} | #{}.
290
291
292%%-------------------------------------------------------------------------
293%% Normalize URIs
294%%-------------------------------------------------------------------------
%% Normalizes a URI given as a string or as a map.
%% Shorthand for normalize(URI, []).
-spec normalize(URI) -> NormalizedURI when
      URI :: uri_string() | uri_map(),
      NormalizedURI :: uri_string()
                     | error().
normalize(URI) ->
    normalize(URI, []).
301
302
%% Normalizes a URI given as a string or as a map.
%%
%% Options:
%%   []           - the normalized map is recomposed and the result of
%%                  recompose/1 is returned
%%   [return_map] - the normalized map itself is returned
%% Any other option list is rejected by the guard (function_clause).
%%
%% A non-map input is first run through parse/1; when that does not yield
%% a map, its result is returned unchanged. An {error, Atom, RestData}
%% triple thrown while normalizing or recomposing is returned as a value.
-spec normalize(URI, Options) -> NormalizedURI when
      URI :: uri_string() | uri_map(),
      Options :: [return_map],
      NormalizedURI :: uri_string() | uri_map()
                     | error().
normalize(URI, Options) when Options =:= []; Options =:= [return_map] ->
    Parsed = case is_map(URI) of
                 true -> URI;
                 false -> parse(URI)
             end,
    case is_map(Parsed) of
        false ->
            %% parse/1 did not produce a map: hand its result back as is.
            Parsed;
        true ->
            try
                Normalized = normalize_map(Parsed),
                case Options of
                    [] -> recompose(Normalized);
                    [return_map] -> Normalized
                end
            catch
                throw:{error, Atom, RestData} -> {error, Atom, RestData}
            end
    end.
338
339
340%%-------------------------------------------------------------------------
341%% Parse URIs
342%%-------------------------------------------------------------------------
%% Parses a URI string into a uri_map().
%%
%% A binary is parsed directly. A list is converted with
%% unicode:characters_to_binary/1, parsed, and the binary fields of the
%% resulting map are converted back to lists.
%% An {error, Atom, RestData} triple thrown by the parser is returned as
%% a value.
-spec parse(URIString) -> URIMap when
      URIString :: uri_string(),
      URIMap :: uri_map()
              | error().
parse(URIString) when is_binary(URIString) ->
    try parse_uri_reference(URIString, #{}) of
        Parsed -> Parsed
    catch
        throw:{error, _, _} = Error -> Error
    end;
parse(URIString) when is_list(URIString) ->
    try
        AsBinary = unicode:characters_to_binary(URIString),
        convert_mapfields_to_list(parse_uri_reference(AsBinary, #{}))
    catch
        throw:{error, _, _} = Error -> Error
    end.
360
361
362%%-------------------------------------------------------------------------
363%% Recompose URIs
364%%-------------------------------------------------------------------------
%% Builds a URI string from a uri_map().
%%
%% Returns {error, invalid_map, Map} when is_valid_map/1 rejects the map.
%% Otherwise the components are applied in URI order (scheme, userinfo,
%% host, port, path, query, fragment), starting from the atom 'empty'.
%% An {error, Atom, RestData} triple thrown by any step is returned as a
%% value.
-spec recompose(URIMap) -> URIString when
      URIMap :: uri_map(),
      URIString :: uri_string()
                 | error().
recompose(Map) ->
    case is_valid_map(Map) of
        true ->
            Steps = [fun update_scheme/2,
                     fun update_userinfo/2,
                     fun update_host/2,
                     fun update_port/2,
                     fun update_path/2,
                     fun update_query/2,
                     fun update_fragment/2],
            try
                lists:foldl(fun(Step, Acc) -> Step(Map, Acc) end, empty, Steps)
            catch
                throw:{error, Atom, RestData} -> {error, Atom, RestData}
            end;
        false ->
            {error, invalid_map, Map}
    end.
386
387
388%%-------------------------------------------------------------------------
389%% Resolve URIs
390%%-------------------------------------------------------------------------
%% Resolves a URI reference against a base URI.
%% Shorthand for resolve(RefURI, BaseURI, []).
-spec resolve(RefURI, BaseURI) -> TargetURI when
      RefURI :: uri_string() | uri_map(),
      BaseURI :: uri_string() | uri_map(),
      TargetURI :: uri_string()
                 | error().
resolve(RefURI, BaseURI) ->
    resolve(RefURI, BaseURI, []).
398
399
%% Resolves a URI reference against a base URI.
%%
%% A reference that is not a map is parsed with parse/1 first; when that
%% does not yield a map, its result is returned unchanged. The map is then
%% passed to resolve_map/2. A non-map result of resolve_map/2 is returned
%% as is; a map result is returned directly for [return_map] and passed
%% through recompose/1 for [].
-spec resolve(RefURI, BaseURI, Options) -> TargetURI when
      RefURI :: uri_string() | uri_map(),
      BaseURI :: uri_string() | uri_map(),
      Options :: [return_map],
      TargetURI :: uri_string() | uri_map()
                 | error().
resolve(RefMap, Base, Options) when is_map(RefMap) ->
    case resolve_map(RefMap, Base) of
        NotAMap when not is_map(NotAMap) ->
            NotAMap;
        Target ->
            case Options of
                [] ->
                    recompose(Target);
                [return_map] ->
                    Target
            end
    end;
resolve(RefString, Base, Options) ->
    case parse(RefString) of
        #{} = RefMap ->
            resolve(RefMap, Base, Options);
        ParseError ->
            ParseError
    end.
425
426
427%%-------------------------------------------------------------------------
428%% Transcode URIs
429%%-------------------------------------------------------------------------
%% Transcodes a URI string from in_encoding to out_encoding.
%% Both options default to utf8.
%%
%% Binary input: decoded with convert_to_list/2, transcoded, and encoded
%% again with convert_to_binary/3.
%% List input: flattened with flatten_list/2 and transcoded; the result of
%% the internal transcode/4 is returned as is.
%%
%% Every helper call is made inside the try, so an {error, Atom, RestData}
%% triple thrown by any of them is returned as a value, as the spec
%% promises, instead of escaping as an uncaught exception.
-spec transcode(URIString, Options) -> Result when
      URIString :: uri_string(),
      Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}],
      Result :: uri_string()
              | error().
transcode(URIString, Options) when is_binary(URIString) ->
    try
        InEnc = proplists:get_value(in_encoding, Options, utf8),
        OutEnc = proplists:get_value(out_encoding, Options, utf8),
        List = convert_to_list(URIString, InEnc),
        Output = transcode(List, [], InEnc, OutEnc),
        convert_to_binary(Output, utf8, OutEnc)
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end;
transcode(URIString, Options) when is_list(URIString) ->
    try
        InEnc = proplists:get_value(in_encoding, Options, utf8),
        OutEnc = proplists:get_value(out_encoding, Options, utf8),
        %% NOTE(review): flatten_list/2 is defined elsewhere in this
        %% module; presumably it can throw the same error triple as
        %% convert_to_list/2 on undecodable input. It used to run outside
        %% the try, where such a throw would not have been caught.
        Flattened = flatten_list(URIString, InEnc),
        transcode(Flattened, [], InEnc, OutEnc)
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end.
453
454
455%%-------------------------------------------------------------------------
456%% Functions for working with the query part of a URI as a list
457%% of key/value pairs.
458%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
459%% HTML 5.0 - 4.10.22.6 URL-encoded form data - non UTF-8
460%%-------------------------------------------------------------------------
461
462%%-------------------------------------------------------------------------
463%% Compose urlencoded query string from a list of unescaped key/value pairs.
464%% (application/x-www-form-urlencoded encoding algorithm)
465%%-------------------------------------------------------------------------
%% Composes a urlencoded query string from key/value pairs.
%% Shorthand for compose_query(QueryList, [{encoding, utf8}]).
-spec compose_query(QueryList) -> QueryString when
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}],
      QueryString :: uri_string()
                   | error().
compose_query(QueryList) ->
    compose_query(QueryList, [{encoding, utf8}]).
472
473
%% Composes a urlencoded query string from key/value pairs.
%%
%% An empty pair list yields [] regardless of the options. Otherwise the
%% pairs are encoded by compose_query/4, starting from an empty binary
%% accumulator; an {error, Atom, RestData} triple thrown while encoding is
%% returned as a value.
-spec compose_query(QueryList, Options) -> QueryString when
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}],
      Options :: [{encoding, atom()}],
      QueryString :: uri_string()
                   | error().
compose_query([], _Options) ->
    [];
compose_query(QueryList, Options) ->
    try compose_query(QueryList, Options, false, <<>>) of
        QueryString -> QueryString
    catch
        throw:{error, _, _} = Error -> Error
    end.
486%%
%% Worker for compose_query/2.
%%
%% Acc0 is the binary built so far. SawList becomes true as soon as any
%% key or value is a list; in that case the finished binary is converted
%% with convert_to_list/2, otherwise it is returned as a binary.
%% A pair whose value is the atom 'true' contributes only its key (no "=").
compose_query([], _Options, true, Acc0) ->
    convert_to_list(Acc0, utf8);
compose_query([], _Options, false, Acc0) ->
    Acc0;
compose_query([{Key, true} | Tail], Options, SawList, Acc0) ->
    Sep = get_separator(Tail),
    EncKey = form_urlencode(Key, Options),
    SawList1 = SawList orelse is_list(Key),
    Acc = <<Acc0/binary, EncKey/binary, Sep/binary>>,
    compose_query(Tail, Options, SawList1, Acc);
compose_query([{Key, Value} | Tail], Options, SawList, Acc0) ->
    Sep = get_separator(Tail),
    EncKey = form_urlencode(Key, Options),
    EncValue = form_urlencode(Value, Options),
    SawList1 = SawList orelse is_list(Key) orelse is_list(Value),
    Acc = <<Acc0/binary, EncKey/binary, "=", EncValue/binary, Sep/binary>>,
    compose_query(Tail, Options, SawList1, Acc).
503
504
505%%-------------------------------------------------------------------------
506%% Dissect a query string into a list of unescaped key/value pairs.
507%% (application/x-www-form-urlencoded decoding algorithm)
508%%-------------------------------------------------------------------------
%% Dissects a query string into key/value pairs.
%%
%% An empty binary or empty list yields []. A list is converted with
%% convert_to_binary/3 and dissected with the list flag set to true; any
%% other input is dissected as is with the flag set to false. An
%% {error, Atom, RestData} triple thrown on the way is returned as a value.
-spec dissect_query(QueryString) -> QueryList when
      QueryString :: uri_string(),
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}]
                 | error().
dissect_query(Empty) when Empty =:= <<>>; Empty =:= [] ->
    [];
dissect_query(QueryString) ->
    try
        case is_list(QueryString) of
            true ->
                AsBinary = convert_to_binary(QueryString, utf8, utf8),
                dissect_query_key(AsBinary, true, [], <<>>, <<>>);
            false ->
                dissect_query_key(QueryString, false, [], <<>>, <<>>)
        end
    catch
        throw:{error, _, _} = Error -> Error
    end.
529
530
531%%%========================================================================
532%%% Internal functions
533%%%========================================================================
534
535%%-------------------------------------------------------------------------
536%% Converts Map fields to lists
537%%-------------------------------------------------------------------------
%% Returns Map with every binary value replaced by the result of
%% unicode:characters_to_list/1; all other values (for instance an integer
%% port) are left untouched.
convert_mapfields_to_list(Map) ->
    ToList = fun(_Key, Value) ->
                     case is_binary(Value) of
                         true -> unicode:characters_to_list(Value);
                         false -> Value
                     end
             end,
    maps:map(ToList, Map).
542
543
544%%-------------------------------------------------------------------------
545%% [RFC 3986, Chapter 4.1. URI Reference]
546%%
547%% URI-reference is used to denote the most common usage of a resource
548%% identifier.
549%%
550%%    URI-reference = URI / relative-ref
551%%-------------------------------------------------------------------------
%% Entry point of the parser.
%% The empty string is a reference with an empty path. Anything else is
%% first tried as a URI with a scheme; when that attempt throws any
%% 3-tuple, the same input is parsed again as a relative reference.
-spec parse_uri_reference(binary(), uri_map()) -> uri_map().
parse_uri_reference(?STRING_EMPTY, _Acc) ->
    #{path => <<>>};
parse_uri_reference(Input, Acc) ->
    try parse_scheme_start(Input, Acc) of
        WithScheme -> WithScheme
    catch
        throw:{_, _, _} ->
            parse_relative_part(Input, Acc)
    end.
560
561
562%%-------------------------------------------------------------------------
563%% [RFC 3986, Chapter 4.2. Relative Reference]
564%%
565%% A relative reference takes advantage of the hierarchical syntax
566%% (Section 1.2.3) to express a URI reference relative to the name space
567%% of another hierarchical URI.
568%%
569%%    relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
570%%
571%%    relative-part = "//" authority path-abempty
572%%                  / path-absolute
573%%                  / path-noscheme
574%%                  / path-empty
575%%-------------------------------------------------------------------------
%% Parses a relative reference and returns the resulting map.
%% The first characters select the form:
%%   "//"  authority; parsed as "userinfo@host..." first and, when that
%%         attempt throws, parsed again from the same position as a host
%%   "/"   path-absolute
%%   "?"   empty path followed by a query
%%   "#"   empty path followed by a fragment
%%   other path-noscheme; the first character must satisfy
%%         is_segment_nz_nc/1, else {error,invalid_uri,[Char]} is thrown
%% Every clause that does not store a path itself calls maybe_add_path/1.
-spec parse_relative_part(binary(), uri_map()) -> uri_map().
parse_relative_part(?STRING_REST("//", Authority), Acc) ->
    %% "//" itself is NOT part of the authority
    try parse_userinfo(Authority, Acc) of
        {Unparsed, Acc1} ->
            Userinfo = calculate_parsed_userinfo(Authority, Unparsed),
            WithPath = maybe_add_path(Acc1),
            WithPath#{userinfo => Userinfo}
    catch
        throw:{_, _, _} ->
            {Unparsed, Acc1} = parse_host(Authority, Acc),
            Host = calculate_parsed_host_port(Authority, Unparsed),
            WithPath = maybe_add_path(Acc1),
            WithPath#{host => remove_brackets(Host)}
    end;
parse_relative_part(?STRING_REST($/, AfterSlash), Acc) ->
    {Unparsed, Acc1} = parse_segment(AfterSlash, Acc),
    PathTail = calculate_parsed_part(AfterSlash, Unparsed),
    Acc1#{path => ?STRING_REST($/, PathTail)};
parse_relative_part(?STRING_REST($?, AfterMark), Acc) ->
    {Unparsed, Acc1} = parse_query(AfterMark, Acc),
    Query = calculate_parsed_query_fragment(AfterMark, Unparsed),
    WithPath = maybe_add_path(Acc1),
    WithPath#{query => Query};
parse_relative_part(?STRING_REST($#, AfterHash), Acc) ->
    {Unparsed, Acc1} = parse_fragment(AfterHash, Acc),
    Fragment = calculate_parsed_query_fragment(AfterHash, Unparsed),
    WithPath = maybe_add_path(Acc1),
    WithPath#{fragment => Fragment};
parse_relative_part(?STRING_REST(First, Tail), Acc) ->
    case is_segment_nz_nc(First) of
        false ->
            throw({error,invalid_uri,[First]});
        true ->
            {Unparsed, Acc1} = parse_segment_nz_nc(Tail, Acc),
            PathTail = calculate_parsed_part(Tail, Unparsed),
            Acc1#{path => ?STRING_REST(First, PathTail)}
    end.
613
614
615%%-------------------------------------------------------------------------
616%% [RFC 3986, Chapter 3.3. Path]
617%%
618%% The path component contains data, usually organized in hierarchical
619%% form, that, along with data in the non-hierarchical query component
620%% (Section 3.4), serves to identify a resource within the scope of the
621%% URI's scheme and naming authority (if any).  The path is terminated
622%% by the first question mark ("?") or number sign ("#") character, or
623%% by the end of the URI.
624%%
625%%    path          = path-abempty    ; begins with "/" or is empty
626%%                  / path-absolute   ; begins with "/" but not "//"
627%%                  / path-noscheme   ; begins with a non-colon segment
628%%                  / path-rootless   ; begins with a segment
629%%                  / path-empty      ; zero characters
630%%
631%%    path-abempty  = *( "/" segment )
632%%    path-absolute = "/" [ segment-nz *( "/" segment ) ]
633%%    path-noscheme = segment-nz-nc *( "/" segment )
634%%    path-rootless = segment-nz *( "/" segment )
635%%    path-empty    = 0<pchar>
636%%    segment       = *pchar
637%%    segment-nz    = 1*pchar
638%%    segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
639%%                  ; non-zero-length segment without any colon ":"
640%%
641%%    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
642%%-------------------------------------------------------------------------
643
644%%-------------------------------------------------------------------------
645%%    path-abempty
646%%-------------------------------------------------------------------------
%% Scans path characters.
%%   end of input - returns {<<>>, URI}
%%   "/"          - scanning simply continues
%%   "?" / "#"    - the remainder is handed to parse_query/2 or
%%                  parse_fragment/2, the computed component is stored in
%%                  the map, and the input following the delimiter is
%%                  returned as the first tuple element
%%   other        - must satisfy is_pchar/1, otherwise
%%                  {error,invalid_uri,[Char]} is thrown
-spec parse_segment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_segment(?STRING_EMPTY, Acc) ->
    {?STRING_EMPTY, Acc};
parse_segment(?STRING_REST($/, Tail), Acc) ->
    parse_segment(Tail, Acc);
parse_segment(?STRING_REST($?, AfterMark), Acc) ->
    {Unparsed, Acc1} = parse_query(AfterMark, Acc),
    Query = calculate_parsed_query_fragment(AfterMark, Unparsed),
    {AfterMark, Acc1#{query => Query}};
parse_segment(?STRING_REST($#, AfterHash), Acc) ->
    {Unparsed, Acc1} = parse_fragment(AfterHash, Acc),
    Fragment = calculate_parsed_query_fragment(AfterHash, Unparsed),
    {AfterHash, Acc1#{fragment => Fragment}};
parse_segment(?STRING_REST(Char, Tail), Acc) ->
    is_pchar(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_segment(Tail, Acc).
665
666
667%%-------------------------------------------------------------------------
668%%    path-noscheme
669%%-------------------------------------------------------------------------
%% Scans the first segment of a path-noscheme, in which ":" is not
%% allowed.
%%   end of input - returns {<<>>, URI}
%%   "/"          - the first segment is over; the rest is scanned by
%%                  parse_segment/2
%%   "?" / "#"    - the remainder is handed to parse_query/2 or
%%                  parse_fragment/2, the computed component is stored in
%%                  the map, and the input following the delimiter is
%%                  returned as the first tuple element
%%   other        - must satisfy is_segment_nz_nc/1, otherwise
%%                  {error,invalid_uri,[Char]} is thrown
-spec parse_segment_nz_nc(binary(), uri_map()) -> {binary(), uri_map()}.
parse_segment_nz_nc(?STRING_EMPTY, Acc) ->
    {?STRING_EMPTY, Acc};
parse_segment_nz_nc(?STRING_REST($/, Tail), Acc) ->
    parse_segment(Tail, Acc);
parse_segment_nz_nc(?STRING_REST($?, AfterMark), Acc) ->
    {Unparsed, Acc1} = parse_query(AfterMark, Acc),
    Query = calculate_parsed_query_fragment(AfterMark, Unparsed),
    {AfterMark, Acc1#{query => Query}};
parse_segment_nz_nc(?STRING_REST($#, AfterHash), Acc) ->
    {Unparsed, Acc1} = parse_fragment(AfterHash, Acc),
    Fragment = calculate_parsed_query_fragment(AfterHash, Unparsed),
    {AfterHash, Acc1#{fragment => Fragment}};
parse_segment_nz_nc(?STRING_REST(Char, Tail), Acc) ->
    is_segment_nz_nc(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_segment_nz_nc(Tail, Acc).
688
689
690%% Check if char is pchar.
%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
%% "%" is accepted here as the start of a pct-encoded triplet.
-spec is_pchar(char()) -> boolean().
is_pchar(Char) when Char =:= $%; Char =:= $:; Char =:= $@ ->
    true;
is_pchar(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).
696
697%% Check if char is segment_nz_nc.
%% segment-nz-nc character = unreserved / pct-encoded / sub-delims / "@"
%% (like pchar, but without ":"). "%" is accepted as the start of a
%% pct-encoded triplet.
-spec is_segment_nz_nc(char()) -> boolean().
is_segment_nz_nc(Char) when Char =:= $%; Char =:= $@ ->
    true;
is_segment_nz_nc(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).
702
703
704%%-------------------------------------------------------------------------
705%% [RFC 3986, Chapter 3.1. Scheme]
706%%
707%% Each URI begins with a scheme name that refers to a specification for
708%% assigning identifiers within that scheme.
709%%
710%%    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
711%%-------------------------------------------------------------------------
%% Parses a URI that starts with a scheme.
%% The first character must satisfy is_alpha/1, otherwise
%% {error,invalid_uri,[Char]} is thrown. The remaining input is handled by
%% parse_scheme/2; afterwards an empty path is added when none was stored,
%% and the scheme (first character plus the part computed by
%% calculate_parsed_scheme/2) is stored in the map.
-spec parse_scheme_start(binary(), uri_map()) -> uri_map().
parse_scheme_start(?STRING_REST(First, Tail), Acc) ->
    is_alpha(First) orelse throw({error,invalid_uri,[First]}),
    {Unparsed, Acc1} = parse_scheme(Tail, Acc),
    SchemeTail = calculate_parsed_scheme(Tail, Unparsed),
    WithPath = maybe_add_path(Acc1),
    WithPath#{scheme => ?STRING_REST(First, SchemeTail)}.
721
%% Adds an empty path component when the parsed map has none.
%% According to the URI specification every URI-reference has a path
%% component, which may be empty. A map that already has a path is
%% returned unchanged.
maybe_add_path(#{path := _} = Map) ->
    Map;
maybe_add_path(Map) ->
    Map#{path => <<>>}.
733
734
735
%% Scans the scheme characters that follow the first one.
%%   end of input - a scheme must be terminated by ":", so
%%                  {error,invalid_uri,<<>>} is thrown
%%   ":"          - the rest is parsed by parse_hier/2; the input
%%                  following the colon is returned with the updated map
%%   other        - must satisfy is_scheme/1, otherwise
%%                  {error,invalid_uri,[Char]} is thrown
-spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}.
parse_scheme(?STRING_EMPTY, _Acc) ->
    throw({error,invalid_uri,<<>>});
parse_scheme(?STRING_REST($:, AfterColon), Acc) ->
    {_Unparsed, Acc1} = parse_hier(AfterColon, Acc),
    {AfterColon, Acc1};
parse_scheme(?STRING_REST(Char, Tail), Acc) ->
    is_scheme(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_scheme(Tail, Acc).
747
748
749%% Check if char is allowed in scheme
%% Characters allowed after the first character of a scheme:
%% ALPHA / DIGIT / "+" / "-" / "."
-spec is_scheme(char()) -> boolean().
is_scheme(Char) when Char =:= $+; Char =:= $-; Char =:= $. ->
    true;
is_scheme(Char) ->
    is_alpha(Char) orelse is_digit(Char).
755
756
757%%-------------------------------------------------------------------------
758%%    hier-part   = "//" authority path-abempty
759%%                   / path-absolute
760%%                   / path-rootless
761%%                   / path-empty
762%%-------------------------------------------------------------------------
%% Parses the hier-part that follows "scheme:".
%%   end of input - nothing to add; returns {<<>>, URI}
%%   "//"  authority; parsed as "userinfo@host..." first and, when that
%%         attempt throws, parsed again from the same position as a host
%%   "/"   path-absolute
%%   "?"   empty path followed by a query
%%   "#"   empty path followed by a fragment
%%   other path-rootless; the first character must satisfy is_pchar/1,
%%         else {error,invalid_uri,[Char]} is thrown
%% Every non-empty clause returns the input that follows the matched
%% prefix together with the updated map.
-spec parse_hier(binary(), uri_map()) -> {binary(), uri_map()}.
parse_hier(?STRING_EMPTY, Acc) ->
    {<<>>, Acc};
parse_hier(?STRING_REST("//", Authority), Acc) ->
    %% "//" itself is NOT part of the authority
    try parse_userinfo(Authority, Acc) of
        {Unparsed, Acc1} ->
            Userinfo = calculate_parsed_userinfo(Authority, Unparsed),
            {Authority, Acc1#{userinfo => Userinfo}}
    catch
        throw:{_, _, _} ->
            {Unparsed, Acc1} = parse_host(Authority, Acc),
            Host = calculate_parsed_host_port(Authority, Unparsed),
            {Authority, Acc1#{host => remove_brackets(Host)}}
    end;
parse_hier(?STRING_REST($/, AfterSlash), Acc) ->
    {Unparsed, Acc1} = parse_segment(AfterSlash, Acc),
    PathTail = calculate_parsed_part(AfterSlash, Unparsed),
    {AfterSlash, Acc1#{path => ?STRING_REST($/, PathTail)}};
parse_hier(?STRING_REST($?, AfterMark), Acc) ->
    {Unparsed, Acc1} = parse_query(AfterMark, Acc),
    Query = calculate_parsed_query_fragment(AfterMark, Unparsed),
    {AfterMark, Acc1#{query => Query}};
parse_hier(?STRING_REST($#, AfterHash), Acc) ->
    {Unparsed, Acc1} = parse_fragment(AfterHash, Acc),
    Fragment = calculate_parsed_query_fragment(AfterHash, Unparsed),
    {AfterHash, Acc1#{fragment => Fragment}};
parse_hier(?STRING_REST(First, Tail), Acc) ->
    case is_pchar(First) of
        false ->
            throw({error,invalid_uri,[First]});
        true ->
            {Unparsed, Acc1} = parse_segment(Tail, Acc),
            PathTail = calculate_parsed_part(Tail, Unparsed),
            {Tail, Acc1#{path => ?STRING_REST(First, PathTail)}}
    end.
798
799
800%%-------------------------------------------------------------------------
801%% [RFC 3986, Chapter 3.2. Authority]
802%%
803%% Many URI schemes include a hierarchical element for a naming
804%% authority so that governance of the name space defined by the
805%% remainder of the URI is delegated to that authority (which may, in
806%% turn, delegate it further).
807%%
808%% The authority component is preceded by a double slash ("//") and is
809%% terminated by the next slash ("/"), question mark ("?"), or number
810%% sign ("#") character, or by the end of the URI.
811%%
812%%    authority   = [ userinfo "@" ] host [ ":" port ]
813%%
814%%
815%% [RFC 3986, Chapter 3.2.1. User Information]
816%%
817%% The userinfo subcomponent may consist of a user name and, optionally,
818%% scheme-specific information about how to gain authorization to access
819%% the resource. The user information, if present, is followed by a
820%% commercial at-sign ("@") that delimits it from the host.
821%%
822%%    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
823%%-------------------------------------------------------------------------
%% Consumes userinfo characters up to the "@" delimiter and then hands the
%% remainder to the host parser.
%% Throws {error,invalid_uri,_} on a character not allowed in userinfo or
%% when the input ends before an "@" was seen.
-spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}.
parse_userinfo(?CHAR($@), Map) ->
    %% "@" is the last character: the host is empty.
    {?STRING_EMPTY, Map#{host => <<>>}};
parse_userinfo(?STRING_REST($@, Tail), Map0) ->
    {Unparsed, Map1} = parse_host(Tail, Map0),
    HostPort = calculate_parsed_host_port(Tail, Unparsed),
    {Tail, Map1#{host => remove_brackets(HostPort)}};
parse_userinfo(?STRING_REST(Char, Tail), Map) ->
    case is_userinfo(Char) of
        false -> throw({error,invalid_uri,[Char]});
        true -> parse_userinfo(Tail, Map)
    end;
parse_userinfo(?STRING_EMPTY, _Map) ->
    %% Running out of input while still inside userinfo is an error.
    throw({error,invalid_uri,<<>>}).
839
840
%% Returns true when Char may appear in userinfo:
%% unreserved / "%" (start of pct-encoded) / sub-delims / ":"
-spec is_userinfo(char()) -> boolean().
is_userinfo(Char) ->
    Char =:= $% orelse
        Char =:= $: orelse
        is_unreserved(Char) orelse
        is_sub_delim(Char).
846
847
848%%-------------------------------------------------------------------------
849%% [RFC 3986, Chapter 3.2.2. Host]
850%%
851%% The host subcomponent of authority is identified by an IP literal
852%% encapsulated within square brackets, an IPv4 address in dotted-
853%% decimal form, or a registered name.
854%%
855%%    host        = IP-literal / IPv4address / reg-name
856%%
857%%    IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
858%%
859%%    IPvFuture  = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
860%%
861%%    IPv6address =                            6( h16 ":" ) ls32
862%%                /                       "::" 5( h16 ":" ) ls32
863%%                / [               h16 ] "::" 4( h16 ":" ) ls32
864%%                / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
865%%                / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
866%%                / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
867%%                / [ *4( h16 ":" ) h16 ] "::"              ls32
868%%                / [ *5( h16 ":" ) h16 ] "::"              h16
869%%                / [ *6( h16 ":" ) h16 ] "::"
870%%
871%%    ls32        = ( h16 ":" h16 ) / IPv4address
872%%                ; least-significant 32 bits of address
873%%
874%%    h16         = 1*4HEXDIG
875%%                ; 16 bits of address represented in hexadecimal
876%%
877%%    IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
878%%
879%%    dec-octet   = DIGIT                 ; 0-9
880%%                / %x31-39 DIGIT         ; 10-99
881%%                / "1" 2DIGIT            ; 100-199
882%%                / "2" %x30-34 DIGIT     ; 200-249
883%%                / "25" %x30-35          ; 250-255
884%%
885%%    reg-name    = *( unreserved / pct-encoded / sub-delims )
886%%-------------------------------------------------------------------------
%% Parses the host and whatever follows it (port, path, query, fragment).
%%
%% A leading "[" selects the IP-literal parser. A leading digit is first
%% tried as an IPv4 address and, if that throws, re-parsed as a reg-name.
%% Any other character is parsed as a reg-name.
-spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}.
parse_host(?STRING_REST($:, Tail), Map0) ->
    {Unparsed, Map1} = parse_port(Tail, Map0),
    PortBin = calculate_parsed_host_port(Tail, Unparsed),
    {Tail, Map1#{port => get_port(PortBin)}};
parse_host(?STRING_REST($/, Tail), Map0) ->
    %% path-abempty
    {Unparsed, Map1} = parse_segment(Tail, Map0),
    Segment = calculate_parsed_part(Tail, Unparsed),
    {Tail, Map1#{path => ?STRING_REST($/, Segment)}};
parse_host(?STRING_REST($?, Tail), Map0) ->
    %% path-empty followed by a query
    {Unparsed, Map1} = parse_query(Tail, Map0),
    {Tail, Map1#{query => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_host(?STRING_REST($[, Tail), Map) ->
    parse_ipv6_bin(Tail, [], Map);
parse_host(?STRING_REST($#, Tail), Map0) ->
    %% path-empty followed by a fragment
    {Unparsed, Map1} = parse_fragment(Tail, Map0),
    {Tail, Map1#{fragment => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_host(?STRING_REST(Char, Tail), Map) ->
    case is_digit(Char) of
        false ->
            parse_reg_name(?STRING_REST(Char, Tail), Map);
        true ->
            %% Looks like IPv4; fall back to reg-name if that fails.
            try parse_ipv4_bin(Tail, [Char], Map)
            catch
                throw:{_,_,_} ->
                    parse_reg_name(?STRING_REST(Char, Tail), Map)
            end
    end;
parse_host(?STRING_EMPTY, Map) ->
    {?STRING_EMPTY, Map}.
919
920
%% Consumes reg-name characters until a delimiter (":", "/", "?", "#") or
%% the end of input, then continues with the matching component parser.
%% Throws {error,invalid_uri,_} on a character not allowed in reg-name.
-spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}.
parse_reg_name(?STRING_REST($:, Tail), Map0) ->
    {Unparsed, Map1} = parse_port(Tail, Map0),
    PortBin = calculate_parsed_host_port(Tail, Unparsed),
    {Tail, Map1#{port => get_port(PortBin)}};
parse_reg_name(?STRING_REST($/, Tail), Map0) ->
    %% path-abempty
    {Unparsed, Map1} = parse_segment(Tail, Map0),
    Segment = calculate_parsed_part(Tail, Unparsed),
    {Tail, Map1#{path => ?STRING_REST($/, Segment)}};
parse_reg_name(?STRING_REST($?, Tail), Map0) ->
    %% path-empty followed by a query
    {Unparsed, Map1} = parse_query(Tail, Map0),
    {Tail, Map1#{query => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_reg_name(?STRING_REST($#, Tail), Map0) ->
    %% path-empty followed by a fragment
    {Unparsed, Map1} = parse_fragment(Tail, Map0),
    {Tail, Map1#{fragment => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_reg_name(?STRING_REST(Char, Tail), Map) ->
    case is_reg_name(Char) of
        false -> throw({error,invalid_uri,[Char]});
        true -> parse_reg_name(Tail, Map)
    end;
parse_reg_name(?STRING_EMPTY, Map) ->
    {?STRING_EMPTY, Map}.
946
%% Returns true when Char may appear in a reg-name:
%% unreserved / "%" (start of pct-encoded) / sub-delims
-spec is_reg_name(char()) -> boolean().
is_reg_name(Char) ->
    Char =:= $% orelse is_unreserved(Char) orelse is_sub_delim(Char).
951
952
%% Collects the characters of an IPv4 address in Seen (in reverse order).
%% When a delimiter or the end of input is reached the collected address
%% is validated before parsing continues with the next component.
%% Throws {error,invalid_uri,_} on an invalid character or address.
-spec parse_ipv4_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
parse_ipv4_bin(?STRING_REST($:, Tail), Seen, Map0) ->
    _ = validate_ipv4_address(lists:reverse(Seen)),
    {Unparsed, Map1} = parse_port(Tail, Map0),
    PortBin = calculate_parsed_host_port(Tail, Unparsed),
    {Tail, Map1#{port => get_port(PortBin)}};
parse_ipv4_bin(?STRING_REST($/, Tail), Seen, Map0) ->
    _ = validate_ipv4_address(lists:reverse(Seen)),
    %% path-abempty
    {Unparsed, Map1} = parse_segment(Tail, Map0),
    Segment = calculate_parsed_part(Tail, Unparsed),
    {Tail, Map1#{path => ?STRING_REST($/, Segment)}};
parse_ipv4_bin(?STRING_REST($?, Tail), Seen, Map0) ->
    _ = validate_ipv4_address(lists:reverse(Seen)),
    %% path-empty followed by a query
    {Unparsed, Map1} = parse_query(Tail, Map0),
    {Tail, Map1#{query => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_ipv4_bin(?STRING_REST($#, Tail), Seen, Map0) ->
    _ = validate_ipv4_address(lists:reverse(Seen)),
    %% path-empty followed by a fragment
    {Unparsed, Map1} = parse_fragment(Tail, Map0),
    {Tail, Map1#{fragment => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_ipv4_bin(?STRING_REST(Char, Tail), Seen, Map) ->
    case is_ipv4(Char) of
        false -> throw({error,invalid_uri,[Char]});
        true -> parse_ipv4_bin(Tail, [Char|Seen], Map)
    end;
parse_ipv4_bin(?STRING_EMPTY, Seen, Map) ->
    _ = validate_ipv4_address(lists:reverse(Seen)),
    {?STRING_EMPTY, Map}.
983
984
%% Returns true when Char may appear in a dotted-decimal IPv4 address,
%% i.e. it is "." or a decimal digit.
-spec is_ipv4(char()) -> boolean().
is_ipv4(Char) when Char =:= $. -> true;
is_ipv4(Char) when $0 =< Char, Char =< $9 -> true;
is_ipv4(_) -> false.
989
%% Returns Addr unchanged when inet accepts it as a strict IPv4 address;
%% otherwise throws {error,invalid_uri,Addr}.
-spec validate_ipv4_address(list()) -> list().
validate_ipv4_address(Addr) ->
    {Status, _} = inet:parse_ipv4strict_address(Addr),
    Status =:= ok orelse throw({error,invalid_uri,Addr}),
    Addr.
996
997
%% Collects the characters between "[" and "]" in Seen (reverse order).
%% On "]" the collected address is validated and parsing continues with
%% parse_ipv6_bin_end/2.
%% Throws {error,invalid_uri,_} on an invalid character, an invalid
%% address, or when the input ends before "]".
-spec parse_ipv6_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
parse_ipv6_bin(?STRING_REST($], Tail), Seen, Map) ->
    _ = validate_ipv6_address(lists:reverse(Seen)),
    parse_ipv6_bin_end(Tail, Map);
parse_ipv6_bin(?STRING_REST(Char, Tail), Seen, Map) ->
    case is_ipv6(Char) of
        false -> throw({error,invalid_uri,[Char]});
        true -> parse_ipv6_bin(Tail, [Char|Seen], Map)
    end;
parse_ipv6_bin(?STRING_EMPTY, _Seen, _Map) ->
    %% Unterminated IP literal.
    throw({error,invalid_uri,<<>>}).
1009
%% Returns true when Char may appear inside an IPv6 literal:
%% ":" / "." / hexadecimal digit (either case).
-spec is_ipv6(char()) -> boolean().
is_ipv6(Char) when Char =:= $:; Char =:= $. ->
    true;
is_ipv6(Char)
  when $0 =< Char, Char =< $9;
       $a =< Char, Char =< $f;
       $A =< Char, Char =< $F ->
    true;
is_ipv6(_) ->
    false.
1015
1016
%% Parses what follows the closing "]" of an IP literal.
%%
%% According to RFC 3986 (authority = [ userinfo "@" ] host [ ":" port ])
%% the only things that may follow the host are ":" port, a path ("/"),
%% a query ("?"), a fragment ("#") or the end of the input. Any other
%% character is rejected with {error,invalid_uri,[Char]}; in particular
%% trailing hex digits or dots after "]" are NOT silently consumed.
-spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}.
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) ->
    {T, URI1} = parse_port(Rest, URI),
    H = calculate_parsed_host_port(Rest, T),
    Port = get_port(H),
    {Rest, URI1#{port => Port}};
parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    Path = calculate_parsed_part(Rest, T),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    Query = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{query => Query}};
parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    Fragment = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{fragment => Fragment}};
parse_ipv6_bin_end(?STRING_REST(Char, _Rest), _URI) ->
    %% Nothing but a delimiter may follow the closing bracket.
    throw({error,invalid_uri,[Char]});
parse_ipv6_bin_end(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.
1042
%% Returns Addr unchanged when inet accepts it as a strict IPv6 address;
%% otherwise throws {error,invalid_uri,Addr}.
-spec validate_ipv6_address(list()) -> list().
validate_ipv6_address(Addr) ->
    {Status, _} = inet:parse_ipv6strict_address(Addr),
    Status =:= ok orelse throw({error,invalid_uri,Addr}),
    Addr.
1049
1050
1051%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2.3. Port]
1053%%
1054%% The port subcomponent of authority is designated by an optional port
1055%% number in decimal following the host and delimited from it by a
1056%% single colon (":") character.
1057%%
1058%%    port        = *DIGIT
1059%%-------------------------------------------------------------------------
%% Consumes decimal digits until a delimiter ("/", "?", "#") or the end of
%% input, then continues with the matching component parser.
%% Throws {error,invalid_uri,_} on a non-digit character.
-spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}.
parse_port(?STRING_REST($/, Tail), Map0) ->
    %% path-abempty
    {Unparsed, Map1} = parse_segment(Tail, Map0),
    Segment = calculate_parsed_part(Tail, Unparsed),
    {Tail, Map1#{path => ?STRING_REST($/, Segment)}};
parse_port(?STRING_REST($?, Tail), Map0) ->
    %% path-empty followed by a query
    {Unparsed, Map1} = parse_query(Tail, Map0),
    {Tail, Map1#{query => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_port(?STRING_REST($#, Tail), Map0) ->
    %% path-empty followed by a fragment
    {Unparsed, Map1} = parse_fragment(Tail, Map0),
    {Tail, Map1#{fragment => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_port(?STRING_REST(Char, Tail), Map) ->
    case is_digit(Char) of
        false -> throw({error,invalid_uri,[Char]});
        true -> parse_port(Tail, Map)
    end;
parse_port(?STRING_EMPTY, Map) ->
    {?STRING_EMPTY, Map}.
1080
1081
1082%%-------------------------------------------------------------------------
1083%% [RFC 3986, Chapter 3.4. Query]
1084%%
1085%% The query component contains non-hierarchical data that, along with
1086%% data in the path component (Section 3.3), serves to identify a
1087%% resource within the scope of the URI's scheme and naming authority
1088%% (if any).  The query component is indicated by the first question
1089%% mark ("?") character and terminated by a number sign ("#") character
1090%% or by the end of the URI.
1091%%
1092%%    query       = *( pchar / "/" / "?" )
1093%%-------------------------------------------------------------------------
%% Consumes query characters until "#" (start of the fragment) or the end
%% of input. Throws {error,invalid_uri,_} on a character not allowed in
%% a query.
-spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}.
parse_query(?STRING_REST($#, Tail), Map0) ->
    {Unparsed, Map1} = parse_fragment(Tail, Map0),
    {Tail, Map1#{fragment => calculate_parsed_query_fragment(Tail, Unparsed)}};
parse_query(?STRING_REST(Char, Tail), Map) ->
    case is_query(Char) of
        false -> throw({error,invalid_uri,[Char]});
        true -> parse_query(Tail, Map)
    end;
parse_query(?STRING_EMPTY, Map) ->
    {?STRING_EMPTY, Map}.
1106
1107
%% Returns true when Char may appear in a query: pchar / "/" / "?"
-spec is_query(char()) -> boolean().
is_query(Char) ->
    Char =:= $/ orelse Char =:= $? orelse is_pchar(Char).
1113
1114
1115%%-------------------------------------------------------------------------
1116%% [RFC 3986, Chapter 3.5. Fragment]
1117%%
1118%% The fragment identifier component of a URI allows indirect
1119%% identification of a secondary resource by reference to a primary
1120%% resource and additional identifying information.
1121%%
1122%%    fragment    = *( pchar / "/" / "?" )
1123%%-------------------------------------------------------------------------
%% Consumes fragment characters up to the end of input.
%% Throws {error,invalid_uri,_} on a character not allowed in a fragment.
-spec parse_fragment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_fragment(?STRING_REST(Char, Tail), Map) ->
    case is_fragment(Char) of
        false -> throw({error,invalid_uri,[Char]});
        true -> parse_fragment(Tail, Map)
    end;
parse_fragment(?STRING_EMPTY, Map) ->
    {?STRING_EMPTY, Map}.
1132
1133
%% Returns true when Char may appear in a fragment: pchar / "/" / "?"
-spec is_fragment(char()) -> boolean().
is_fragment(Char) ->
    Char =:= $/ orelse Char =:= $? orelse is_pchar(Char).
1139
1140
1141%%-------------------------------------------------------------------------
1142%% [RFC 3986, Chapter 2.2. Reserved Characters]
1143%%
1144%%   reserved    = gen-delims / sub-delims
1145%%
1146%%   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1147%%
1148%%   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
1149%%               / "*" / "+" / "," / ";" / "="
1150%%
1151%%-------------------------------------------------------------------------
1152
%% Returns true when Char belongs to the reserved set, i.e. it is one of
%% the gen-delims (: / ? # [ ] @) or sub-delims (! $ & ' ( ) * + , ; =).
-spec is_reserved(char()) -> boolean().
is_reserved(Char) ->
    GenDelims = ":/?#[]@",
    SubDelims = "!$&'()*+,;=",
    lists:member(Char, GenDelims) orelse lists:member(Char, SubDelims).
1176
1177
%% Returns true when Char is one of the sub-delims:
%% ! $ & ' ( ) * + , ; =
-spec is_sub_delim(char()) -> boolean().
is_sub_delim(Char) ->
    lists:member(Char, "!$&'()*+,;=").
1193
1194
1195%%-------------------------------------------------------------------------
1196%% [RFC 3986, Chapter 2.3. Unreserved Characters]
1197%%
1198%%   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
1199%%
1200%%-------------------------------------------------------------------------
%% Returns true when Char is unreserved:
%% ALPHA / DIGIT / "-" / "." / "_" / "~"
-spec is_unreserved(char()) -> boolean().
is_unreserved(Char)
  when Char =:= $-; Char =:= $.; Char =:= $_; Char =:= $~ ->
    true;
is_unreserved(Char)
  when $A =< Char, Char =< $Z;
       $a =< Char, Char =< $z;
       $0 =< Char, Char =< $9 ->
    true;
is_unreserved(_) ->
    false.
1207
%% Returns true when C is an ASCII letter (A-Z or a-z).
-spec is_alpha(char()) -> boolean().
is_alpha(C) ->
    (C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z).
1213
%% Returns true when C is an ASCII decimal digit (0-9).
-spec is_digit(char()) -> boolean().
is_digit(C) ->
    C >= $0 andalso C =< $9.
1218
%% Returns true when C is a hexadecimal digit (0-9, a-f or A-F).
-spec is_hex_digit(char()) -> boolean().
is_hex_digit(C) ->
    (C >= $0 andalso C =< $9)
        orelse (C >= $a andalso C =< $f)
        orelse (C >= $A andalso C =< $F).
1223
1224
%% Removes the brackets enclosing an IP literal.
%%
%%   <<"[::1]">>  -> <<"::1">>   (both brackets removed)
%%   <<"[::1">>   -> <<"::1">>   (no closing bracket: only "[" removed)
%%   <<"[">>      -> <<>>        (nothing after the opening bracket)
%%   anything else is returned unchanged.
%%
%% The lone "[" case is handled explicitly; computing
%% byte_size(Rest) - 1 on an empty remainder would otherwise yield a
%% negative split position and crash with badarg.
-spec remove_brackets(binary()) -> binary().
remove_brackets(<<$[/utf8, Rest/binary>>) when byte_size(Rest) > 0 ->
    case binary:last(Rest) of
        $] -> binary:part(Rest, 0, byte_size(Rest) - 1);
        _ -> Rest
    end;
remove_brackets(<<$[/utf8>>) ->
    <<>>;
remove_brackets(Addr) -> Addr.
1234
1235
1236%%-------------------------------------------------------------------------
1237%% Helper functions for calculating the parsed binary.
1238%%-------------------------------------------------------------------------
%% Extracts the scheme from Input given the still-unparsed remainder.
%% With nothing left unparsed, a trailing ":" is stripped from Input.
-spec calculate_parsed_scheme(binary(), binary()) -> binary().
calculate_parsed_scheme(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$:]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1244
1245
%% Extracts the parsed part from Input given the unparsed remainder.
%% With nothing left unparsed, a trailing "?" or "#" is stripped.
-spec calculate_parsed_part(binary(), binary()) -> binary().
calculate_parsed_part(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$?,$#]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1251
1252
%% Extracts the userinfo from Input given the unparsed remainder.
%% With nothing left unparsed, a trailing "?", "#" or "@" is stripped.
-spec calculate_parsed_userinfo(binary(), binary()) -> binary().
calculate_parsed_userinfo(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$?,$#,$@]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1258
1259
%% Extracts the host or port from Input given the unparsed remainder.
%% With nothing left unparsed, a trailing ":", "?", "#" or "/" is stripped.
-spec calculate_parsed_host_port(binary(), binary()) -> binary().
calculate_parsed_host_port(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$:,$?,$#,$/]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1265
1266
%% Extracts the query or fragment from Input given the unparsed remainder.
%% With nothing left unparsed, a trailing "#" is stripped.
calculate_parsed_query_fragment(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$#]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1271
1272
%% Converts the textual port to an integer.
%% An empty binary yields 'undefined'; a binary that is not an integer
%% throws {error, invalid_uri, Binary}.
get_port(PortBin) ->
    case PortBin of
        <<>> ->
            undefined;
        _ ->
            try
                binary_to_integer(PortBin)
            catch
                error:badarg ->
                    throw({error, invalid_uri, PortBin})
            end
    end.
1281
1282
%% Drops the last byte of Bin when it equals one of the given characters;
%% otherwise Bin is returned unchanged. An empty binary is returned as is.
%%
%% Only character lists of length 1 to 4 are supported. The per-length
%% clauses are deliberate: the original author measured this shape to be
%% faster than a generic list/set based lookup.
strip_last_char(<<>>, _Chars) ->
    <<>>;
strip_last_char(Bin, [A]) ->
    Last = binary:last(Bin),
    if
        Last =:= A ->
            binary:part(Bin, 0, byte_size(Bin) - 1);
        true ->
            Bin
    end;
strip_last_char(Bin, [A,B]) ->
    Last = binary:last(Bin),
    if
        Last =:= A; Last =:= B ->
            binary:part(Bin, 0, byte_size(Bin) - 1);
        true ->
            Bin
    end;
strip_last_char(Bin, [A,B,C]) ->
    Last = binary:last(Bin),
    if
        Last =:= A; Last =:= B; Last =:= C ->
            binary:part(Bin, 0, byte_size(Bin) - 1);
        true ->
            Bin
    end;
strip_last_char(Bin, [A,B,C,D]) ->
    Last = binary:last(Bin),
    if
        Last =:= A; Last =:= B; Last =:= C; Last =:= D ->
            binary:part(Bin, 0, byte_size(Bin) - 1);
        true ->
            Bin
    end.
1328
1329
%% Returns the leading part of Input that precedes Unparsed.
%% When Unparsed is non-empty, one extra byte (the character sitting
%% between the parsed part and Unparsed) is dropped as well.
get_parsed_binary(Input, Unparsed) ->
    Skip = case Unparsed of
               <<>> -> 0;
               _ -> byte_size(Unparsed) + 1
           end,
    binary:part(Input, 0, byte_size(Input) - Skip).
1334
1335
%% Returns B without its last byte. B must be non-empty.
init_binary(B) ->
    binary:part(B, 0, byte_size(B) - 1).
1341
1342
%% Returns 0 for an empty binary and byte_size(Binary) + 1 otherwise.
%% The result is used to compute the split position passed to
%% split_binary/2 in get_parsed_binary/2.
-spec byte_size_exl_head(binary()) -> number().
byte_size_exl_head(Binary) ->
    case Binary of
        <<>> -> 0;
        _ -> byte_size(Binary) + 1
    end.
1348
1349
1350%%-------------------------------------------------------------------------
1351%% [RFC 3986, Chapter 2.1.  Percent-Encoding]
1352%%
1353%% A percent-encoding mechanism is used to represent a data octet in a
1354%% component when that octet's corresponding character is outside the
1355%% allowed set or is being used as a delimiter of, or within, the
1356%% component.  A percent-encoded octet is encoded as a character
1357%% triplet, consisting of the percent character "%" followed by the two
1358%% hexadecimal digits representing that octet's numeric value.  For
1359%% example, "%20" is the percent-encoding for the binary octet
1360%% "00100000" (ABNF: %x20), which in US-ASCII corresponds to the space
1361%% character (SP).  Section 2.4 describes when percent-encoding and
1362%% decoding is applied.
1363%%
1364%%   pct-encoded = "%" HEXDIG HEXDIG
1365%%-------------------------------------------------------------------------
1366
1367%%-------------------------------------------------------------------------
1368%% Percent-encode
1369%%-------------------------------------------------------------------------
1370
%% A scheme cannot contain percent-encoded characters, so nothing is
%% encoded here: the scheme is only validated and returned unchanged.
%% Throws {error,invalid_scheme,Scheme} for an empty scheme or one that
%% contains a character not allowed in a scheme.
-spec encode_scheme(list()|binary()) -> list() | binary().
encode_scheme(Empty) when Empty =:= []; Empty =:= <<>> ->
    throw({error,invalid_scheme,Empty});
encode_scheme(Scheme) ->
    validate_scheme(Scheme) orelse throw({error,invalid_scheme,Scheme}),
    Scheme.
1382
%% Percent-encodes every character of Cs that is not allowed in userinfo.
-spec encode_userinfo(list()|binary()) -> list() | binary().
encode_userinfo(Cs) ->
    IsAllowed = fun is_userinfo/1,
    encode(Cs, IsAllowed).
1386
%% Encodes a host according to its classification:
%% reg-names and IPv4 addresses are returned unchanged, IPv6 addresses
%% are wrapped in brackets, anything else is percent-encoded.
-spec encode_host(list()|binary()) -> list() | binary().
encode_host(Cs) ->
    case classify_host(Cs) of
        Class when Class =:= regname; Class =:= ipv4 ->
            Cs;
        ipv6 ->
            bracket_ipv6(Cs);
        other ->
            IsAllowed = fun is_reg_name/1,
            encode(Cs, IsAllowed)
    end.
1395
%% Percent-encodes every character of Cs that is not allowed in a path.
-spec encode_path(list()|binary()) -> list() | binary().
encode_path(Cs) ->
    IsAllowed = fun is_path/1,
    encode(Cs, IsAllowed).
1399
%% Percent-encodes every character of Cs that is not allowed in a query.
-spec encode_query(list()|binary()) -> list() | binary().
encode_query(Cs) ->
    IsAllowed = fun is_query/1,
    encode(Cs, IsAllowed).
1403
%% Percent-encodes every character of Cs that is not allowed in a fragment.
-spec encode_fragment(list()|binary()) -> list() | binary().
encode_fragment(Cs) ->
    IsAllowed = fun is_fragment/1,
    encode(Cs, IsAllowed).
1407
1408%%-------------------------------------------------------------------------
%% Helper functions for percent-decode
1410%%-------------------------------------------------------------------------
1411
%% Percent-decodes Cs. Lists are converted to a binary, decoded and
%% converted back to a list.
%%
%% Percent-encoded octets whose value is a reserved character are kept
%% encoded (with their hex digits upper-cased); all other octets are
%% decoded. The final result is checked to be valid UTF-8.
%% Throws {error,invalid_percent_encoding,_} when "%" is followed by two
%% bytes that are not both hex digits.
-spec decode(list()|binary()) -> list() | binary().
decode(Cs) ->
    decode(Cs, <<>>).
%%
decode(Chars, Acc) when is_list(Chars) ->
    Bin = unicode:characters_to_binary(Chars),
    Decoded = decode(Bin, Acc),
    unicode:characters_to_list(Decoded);
decode(<<$%,Hi,Lo,Tail/binary>>, Acc) ->
    (is_hex_digit(Hi) andalso is_hex_digit(Lo))
        orelse throw({error,invalid_percent_encoding,<<$%,Hi,Lo>>}),
    Octet = ?HEX2DEC(Hi)*16+?HEX2DEC(Lo),
    case is_reserved(Octet) of
        false ->
            decode(Tail, <<Acc/binary, Octet>>);
        true ->
            %% [2.2] Reserved characters are protected from normalization.
            %% [2.1] Producers and normalizers should use uppercase hex
            %% digits for all percent-encodings.
            UpperHi = hex_to_upper(Hi),
            UpperLo = hex_to_upper(Lo),
            decode(Tail, <<Acc/binary,$%,UpperHi,UpperLo>>)
    end;
decode(<<Byte,Tail/binary>>, Acc) ->
    decode(Tail, <<Acc/binary, Byte>>);
decode(<<>>, Acc) ->
    check_utf8(Acc).
1443
%% Returns Cs when it is valid UTF-8; otherwise throws
%% {error,invalid_utf8,Cs}.
check_utf8(Cs) ->
    case unicode:characters_to_list(Cs) of
        {Tag,_,_} when Tag =:= incomplete; Tag =:= error ->
            throw({error,invalid_utf8,Cs});
        _ ->
            Cs
    end.
1453
%% Converts a hex digit to its uppercase form. Digits and uppercase
%% letters are returned unchanged; anything that is not a hex digit
%% throws {error,invalid_input,H}.
hex_to_upper(H) when $0 =< H, H =< $9; $A =< H, H =< $F ->
    H;
hex_to_upper(H) when $a =< H, H =< $f ->
    H - ($a - $A);
hex_to_upper(H) ->
    throw({error,invalid_input, H}).
1461
%% Returns true when Char may appear in a host:
%% ":" / unreserved / sub-delims
-spec is_host(char()) -> boolean().
is_host(Char) ->
    Char =:= $: orelse is_unreserved(Char) orelse is_sub_delim(Char).
1466
%% Returns true when Char may appear in a path: "/" / pchar
-spec is_path(char()) -> boolean().
is_path(Char) ->
    Char =:= $/ orelse is_pchar(Char).
1471
1472
1473%%-------------------------------------------------------------------------
1474%% Helper functions for percent-encode
1475%%-------------------------------------------------------------------------
%% Percent-encodes Component. Fun decides, per code point, whether the
%% character is allowed as is (true) or must be percent-encoded (false).
%% Lists are converted to a binary, encoded and converted back to a list.
%% Throws {error,invalid_input,Rest} when the binary contains a byte
%% sequence that is not valid UTF-8.
-spec encode(list()|binary(), fun()) -> list() | binary().
encode(Component, Fun) when is_list(Component) ->
    Bin = unicode:characters_to_binary(Component),
    Encoded = encode(Bin, Fun, <<>>),
    unicode:characters_to_list(Encoded);
encode(Component, Fun) when is_binary(Component) ->
    encode(Component, Fun, <<>>).
%%
encode(<<>>, _Fun, Acc) ->
    Acc;
encode(<<Char/utf8, Tail/binary>>, Fun, Acc) ->
    Piece = encode_codepoint_binary(Char, Fun),
    encode(Tail, Fun, <<Acc/binary,Piece/binary>>);
encode(<<_, _/binary>> = Invalid, _Fun, _Acc) ->
    %% The leading bytes do not form a valid UTF-8 code point.
    throw({error,invalid_input,Invalid}).
1490
1491
%% Returns the code point C as a one-byte binary when Fun accepts it,
%% otherwise its percent-encoded form.
-spec encode_codepoint_binary(integer(), fun()) -> binary().
encode_codepoint_binary(C, Fun) ->
    case Fun(C) of
        true -> <<C>>;
        false -> percent_encode_binary(C)
    end.
1498
1499
%% Percent-encodes a code point: every byte of its UTF-8 representation
%% becomes "%" followed by the hex digits of its high and low nibble.
-spec percent_encode_binary(integer()) -> binary().
percent_encode_binary(Code) ->
    percent_encode_binary(<<Code/utf8>>, <<>>).


percent_encode_binary(<<>>, Acc) ->
    Acc;
percent_encode_binary(<<Hi:4,Lo:4,Tail/binary>>, Acc) ->
    percent_encode_binary(Tail, <<Acc/binary,$%,(?DEC2HEX(Hi)),(?DEC2HEX(Lo))>>).
1509
1510
1511%%-------------------------------------------------------------------------
1512%%-------------------------------------------------------------------------
%% Returns true when every character of the scheme (list or binary) is
%% allowed in a scheme; an empty scheme yields true.
validate_scheme([]) ->
    true;
validate_scheme([Char|Tail]) ->
    is_scheme(Char) andalso validate_scheme(Tail);
validate_scheme(<<>>) ->
    true;
validate_scheme(<<Char, Tail/binary>>) ->
    is_scheme(Char) andalso validate_scheme(Tail).
1525
1526
1527%%-------------------------------------------------------------------------
1528%% Classifies hostname into the following categories:
1529%% regname, ipv4 - address does not contain reserved characters to be
1530%%           percent-encoded
1531%% ipv6 - address does not contain reserved characters but it shall be
%%        enclosed in brackets
1533%% other - address shall be percent-encoded
1534%%-------------------------------------------------------------------------
%% Entry point of the host classification. An empty list is classified
%% as 'other'; binaries are converted to a character list before the
%% IPv6 / IPv4 / reg-name checks are applied.
classify_host([]) ->
    other;
classify_host(Addr) ->
    Chars = case is_binary(Addr) of
                true -> unicode:characters_to_list(Addr);
                false -> Addr
            end,
    classify_host_ipv6(Chars).
1541
%% Returns 'ipv6' for an IPv6 address; otherwise continues with the
%% IPv4 check.
classify_host_ipv6(Addr) ->
    IsV6 = is_ipv6_address(Addr),
    if
        IsV6 -> ipv6;
        true -> classify_host_ipv4(Addr)
    end.
1547
%% Returns 'ipv4' for an IPv4 address; otherwise continues with the
%% reg-name check.
classify_host_ipv4(Addr) ->
    IsV4 = is_ipv4_address(Addr),
    if
        IsV4 -> ipv4;
        true -> classify_host_regname(Addr)
    end.
1553
%% Returns 'regname' when every character is allowed in a reg-name
%% (including the empty list), otherwise 'other'.
classify_host_regname(Addr) ->
    case lists:all(fun is_reg_name/1, Addr) of
        true -> regname;
        false -> other
    end.
1560
%% Returns true when inet accepts Addr as a strict IPv4 address.
is_ipv4_address(Addr) ->
    {Status, _} = inet:parse_ipv4strict_address(Addr),
    Status =:= ok.
1566
%% Returns true when inet accepts Addr as a strict IPv6 address.
is_ipv6_address(Addr) ->
    {Status, _} = inet:parse_ipv6strict_address(Addr),
    Status =:= ok.
1572
%% Wraps an IPv6 address (list or binary) in square brackets.
bracket_ipv6(Addr) when is_list(Addr) ->
    "[" ++ Addr ++ "]";
bracket_ipv6(Addr) when is_binary(Addr) ->
    concat(<<$[,Addr/binary>>,<<$]>>).
1577
1578
1579%%-------------------------------------------------------------------------
%% Helper functions for recompose
1581%%-------------------------------------------------------------------------
1582
1583%%-------------------------------------------------------------------------
1584%% Checks if input Map has valid combination of fields that can be
1585%% recomposed into a URI.
1586%%
1587%% The implementation is based on a decision tree that fulfills the
1588%% following rules:
1589%%   - 'path' shall always be present in the input map
1590%%       URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
1591%%       hier-part   = "//" authority path-abempty
1592%%                      / path-absolute
1593%%                      / path-rootless
1594%%                      / path-empty
1595%%   - 'host' shall be present in the input map when 'path' starts with
1596%%     two slashes ("//")
1597%%       path          = path-abempty    ; begins with "/" or is empty
1598%%                     / path-absolute   ; begins with "/" but not "//"
1599%%                     / path-noscheme   ; begins with a non-colon segment
1600%%                     / path-rootless   ; begins with a segment
1601%%                     / path-empty      ; zero characters
1602%%       path-abempty  = *( "/" segment )
1603%%       segment       = *pchar
1604%%   - 'host' shall be present if userinfo or port is present in input map
1605%%       authority   = [ userinfo "@" ] host [ ":" port ]
1606%%   - All fields shall be valid (scheme, userinfo, host, port, path, query
1607%%     or fragment).
1608%%-------------------------------------------------------------------------
%% Returns true when Map can be recomposed into a URI.
%%
%%   - 'path' must be present;
%%   - 'host' must be present when the path starts with "//" or when
%%     'userinfo' or 'port' is present;
%%   - the map must not contain keys other than the URI components.
%%
%% The host requirement is decided first and then enforced. A plain
%% orelse-chain that ends in all_fields_valid/1 would accept a map with a
%% missing host, because the final alternative succeeds regardless of the
%% host checks before it.
is_valid_map(#{path := Path} = Map) ->
    HostRequired =
        starts_with_two_slash(Path)
        orelse maps:is_key(userinfo, Map)
        orelse maps:is_key(port, Map),
    case HostRequired of
        true -> is_valid_map_host(Map);
        false -> all_fields_valid(Map)
    end;
is_valid_map(#{}) ->
    %% No 'path' key.
    false.
1619
1620
%% Returns true when Map has a 'host' key and contains only known URI
%% component keys.
is_valid_map_host(Map) ->
    case maps:is_key(host, Map) of
        true -> all_fields_valid(Map);
        false -> false
    end.
1623
1624
%% Returns true when every key of Map is one of the URI components
%% (scheme, userinfo, host, port, path, query, fragment). Values are not
%% inspected.
all_fields_valid(Map) ->
    Known = [scheme, userinfo, host, port, path, query, fragment],
    lists:all(fun(Key) -> lists:member(Key, Known) end, maps:keys(Map)).
1636
1637
%% True when the path begins with "//". The first clause covers paths
%% given as lists; the second matches through the ?STRING_REST macro
%% (defined elsewhere in this file, presumably the binary form).
starts_with_two_slash("//" ++ _) ->
    true;
starts_with_two_slash(?STRING_REST("//", _)) ->
    true;
starts_with_two_slash(_Other) ->
    false.
1643
1644
%% Returns the encoded scheme followed by ":" when Map has a scheme,
%% otherwise the atom 'empty'. The second argument is ignored.
update_scheme(#{scheme := Value}, _URI) ->
    Encoded = encode_scheme(Value),
    add_colon_postfix(Encoded);
update_scheme(#{}, _URI) ->
    empty.
1649
1650
%% Adds "//" followed by the encoded userinfo to URI. When URI is still
%% 'empty' the prefixed userinfo becomes the whole result. Without a
%% userinfo field URI is returned untouched ('empty' stays 'empty').
update_userinfo(#{userinfo := Value}, empty) ->
    add_auth_prefix(encode_userinfo(Value));
update_userinfo(#{userinfo := Value}, URI) ->
    Component = add_auth_prefix(encode_userinfo(Value)),
    concat(URI, Component);
update_userinfo(#{}, URI) ->
    URI.
1659
1660
%% Adds the encoded host to URI. With an 'empty' URI the host is
%% prefixed with "//"; otherwise add_host_prefix/2 picks "@" or "//"
%% depending on whether Map has userinfo. Without a host field URI is
%% returned untouched ('empty' stays 'empty').
update_host(#{host := Value}, empty) ->
    add_auth_prefix(encode_host(Value));
update_host(#{host := Value} = Map, URI) ->
    Component = add_host_prefix(Map, encode_host(Value)),
    concat(URI, Component);
update_host(#{}, URI) ->
    URI.
1669
1670
%% Appends ":" and the port to URI. A port of 'undefined' yields a bare
%% ":". URI cannot be empty for ports, e.g. ":8080" is not a valid URI,
%% so there is no clause for an 'empty' URI combined with a port.
update_port(#{port := Port}, URI) ->
    Suffix = case Port of
                 undefined ->
                     <<":">>;
                 _ ->
                     add_colon(encode_port(Port))
             end,
    concat(URI, Suffix);
update_port(#{}, URI) ->
    URI.
1678
1679
%% Appends the encoded path to URI; with an 'empty' URI the encoded path
%% is the result. Without a path field URI is returned untouched.
update_path(#{path := Value}, URI) ->
    Encoded = encode_path(Value),
    case URI of
        empty ->
            Encoded;
        _ ->
            concat(URI, Encoded)
    end;
update_path(#{}, URI) ->
    URI.
1688
1689
%% Appends "?" and the encoded query to URI. Without a query field URI
%% is returned untouched ('empty' stays 'empty').
%%
%% The "?" delimiter is emitted even when URI is still 'empty': a query
%% without it would read as a path. This matches update_fragment/2,
%% which always emits its "#" delimiter.
update_query(#{query := Query}, empty) ->
    add_question_mark(encode_query(Query));
update_query(#{query := Query}, URI) ->
    concat(URI, add_question_mark(encode_query(Query)));
update_query(#{}, URI) ->
    URI.
1698
1699
%% Appends "#" and the encoded fragment to URI. When there is no
%% fragment an 'empty' URI is turned into the empty string "" and any
%% other URI is returned as is.
update_fragment(#{fragment := Value}, URI) ->
    Component = add_hashmark(encode_fragment(Value)),
    case URI of
        empty ->
            Component;
        _ ->
            concat(URI, Component)
    end;
update_fragment(#{}, empty) ->
    "";
update_fragment(#{}, URI) ->
    URI.
1708
%%-------------------------------------------------------------------------
%% Concatenates its arguments, which can be lists or binaries.
%% The result is a list if at least one of the arguments is a list and
%% a binary otherwise.
%%-------------------------------------------------------------------------
concat(Left, Right) when is_list(Left) ->
    Left ++ maybe_to_list(Right);
concat(Left, Right) when is_binary(Left), is_binary(Right) ->
    <<Left/binary, Right/binary>>;
concat(Left, Right) when is_binary(Left), is_list(Right) ->
    unicode:characters_to_list(Left) ++ Right.
1720
%% Prepends "#" to a component, keeping its type (list or binary).
add_hashmark(Comp) when is_list(Comp) ->
    "#" ++ Comp;
add_hashmark(Comp) when is_binary(Comp) ->
    <<"#", Comp/binary>>.
1725
%% Prepends "?" to a component, keeping its type (list or binary).
add_question_mark(Comp) when is_list(Comp) ->
    "?" ++ Comp;
add_question_mark(Comp) when is_binary(Comp) ->
    <<"?", Comp/binary>>.
1730
%% Prepends ":" to a binary component. Only binaries are accepted.
add_colon(Comp) when is_binary(Comp) ->
    <<":", Comp/binary>>.
1733
%% Appends ":" to a component, keeping its type (list or binary).
add_colon_postfix(Comp) when is_list(Comp) ->
    Comp ++ [$:];
add_colon_postfix(Comp) when is_binary(Comp) ->
    <<Comp/binary, ":">>.
1738
%% Prepends the authority marker "//" to a component, keeping its type
%% (list or binary).
add_auth_prefix(Comp) when is_list(Comp) ->
    "//" ++ Comp;
add_auth_prefix(Comp) when is_binary(Comp) ->
    <<$/, $/, Comp/binary>>.
1743
%% Prefixes Host with "@" when the map has a userinfo field and with
%% "//" otherwise. The result has the same type (list or binary) as Host.
add_host_prefix(#{} = Map, Host) when is_binary(Host); is_list(Host) ->
    case {maps:is_key(userinfo, Map), is_binary(Host)} of
        {true, true} ->
            <<$@, Host/binary>>;
        {true, false} ->
            [$@ | Host];
        {false, true} ->
            <<"//", Host/binary>>;
        {false, false} ->
            [$/, $/ | Host]
    end.
1752
%% Converts a binary to a list with unicode:characters_to_list/1; any
%% other term is returned unchanged.
maybe_to_list(Comp) ->
    case is_binary(Comp) of
        true ->
            unicode:characters_to_list(Comp);
        false ->
            Comp
    end.
1755
%% Renders an integer port as a decimal binary.
encode_port(Port) ->
    integer_to_binary(Port, 10).
1758
1759
1760%%-------------------------------------------------------------------------
1761%% Helper functions for resolve
1762%%-------------------------------------------------------------------------
1763
%% Resolves a parsed reference against a base URI.
%%   - A reference that has its own scheme only gets its path
%%     normalized; the base is ignored.
%%   - Otherwise the base must carry a scheme. A base map without one
%%     gives {error,invalid_scheme,""}.
%%   - A base that is not a map is parsed first; a parse error is
%%     returned as is and a successfully parsed base is handled like a
%%     base map.
resolve_map(#{scheme := _} = Reference, _Base) ->
    normalize_path_segment(Reference);
resolve_map(Reference, #{scheme := _} = Base) ->
    PathType = resolve_path_type(Reference),
    resolve_map(Reference, Base, PathType);
resolve_map(_Reference, Base) when is_map(Base) ->
    {error,invalid_scheme,""};
resolve_map(Reference, BaseString) ->
    case parse(BaseString) of
        Parsed when is_map(Parsed) ->
            %% Re-enter with the parsed base: the two map clauses above
            %% decide between resolving and invalid_scheme.
            resolve_map(Reference, Parsed);
        Error ->
            Error
    end.
1779
%% Classifies the path of a reference map:
%%   empty_path    - no 'path' key, or a path without characters
%%   absolute_path - the first character is "/"
%%   relative_path - anything else
%%
%% Only the first code point is looked at. This keeps the function
%% working for paths given as lists with code points above 255 (which
%% iolist_to_binary/1 rejects with badarg) and avoids copying the path.
resolve_path_type(URIMap) ->
    case string:next_codepoint(maps:get(path, URIMap, <<>>)) of
        [] -> empty_path;
        [$/|_] -> absolute_path;
        %% Any other first character, or undecodable leading bytes.
        _ -> relative_path
    end.
1786
%% Combines a scheme-less reference with the base URI map according to
%% the kind of path the reference has.
%%   - A reference with a host only inherits the scheme of the base.
%%   - empty_path: scheme, authority fields and path are taken from the
%%     base, and the base query too unless the reference has its own.
%%     The path is not run through dot-segment removal in this case.
%%   - absolute_path: scheme and authority fields come from the base.
%%   - relative_path: as absolute_path, after merging the reference path
%%     with the base path.
resolve_map(#{host := _} = Reference, #{scheme := Scheme}, _PathType) ->
    normalize_path_segment(Reference#{scheme => Scheme});
resolve_map(Reference, Base, empty_path) ->
    Common = [scheme, userinfo, host, port, path],
    Inherited = case maps:is_key(query, Reference) of
                    true -> Common;
                    false -> Common ++ [query]
                end,
    %% Values taken from the base win over those of the reference.
    maps:merge(Reference, maps:with(Inherited, Base));
resolve_map(Reference, Base, absolute_path) ->
    Authority = maps:with([scheme, userinfo, host, port], Base),
    normalize_path_segment(maps:merge(Reference, Authority));
resolve_map(#{path := Path} = Reference, Base, relative_path) ->
    Merged = Reference#{path => merge_paths(Path, Base)},
    Authority = maps:with([scheme, userinfo, host, port], Base),
    normalize_path_segment(maps:merge(Merged, Authority)).
1803
%% Merges the relative path Path with the path of BaseURI
%% (RFC 3986, 5.2.3 Merge Paths):
%%   - base has a host and an empty path: the result is "/" ++ Path
%%   - otherwise everything after the last "/" of the base path is
%%     replaced by Path; a base path without "/" yields Path itself.
%% The result is a binary when Path is a binary and a list when Path is
%% a list.
%%
%% Emptiness of the base path is tested with string:is_empty/1 rather
%% than iolist_size/1, which raises badarg for list paths containing
%% code points above 255.
merge_paths(Path, BaseURI=#{path := BasePath0}) ->
    case maps:is_key(host, BaseURI) andalso string:is_empty(BasePath0) of
        true ->
            merge_paths_absolute(Path);
        false ->
            case string:split(BasePath0, <<$/>>, trailing) of
                [BasePath, _] when is_binary(Path) ->
                    unicode:characters_to_binary([BasePath, $/, Path]);
                [BasePath, _] when is_list(Path) ->
                    unicode:characters_to_list([BasePath, $/, Path]);
                [_] ->
                    Path
            end
    end.

%% Prefixes Path with "/", keeping its type (list or binary).
merge_paths_absolute(Path) when is_binary(Path) ->
    <<$/, Path/binary>>;
merge_paths_absolute(Path) when is_list(Path) ->
    unicode:characters_to_list([$/, Path]).
1820
1821
1822%%-------------------------------------------------------------------------
1823%% Helper functions for transcode
1824%%-------------------------------------------------------------------------
1825
1826%%-------------------------------------------------------------------------
1827%% uri_string:transcode(<<"x%00%00%00%F6"/utf32>>).
1828%% 1. Convert (transcode/2) input to list form (list of unicode codepoints)
1829%%    "x%00%00%00%F6"
1830%% 2. Accumulate characters until percent-encoded segment (transcode/4).
1831%%    Acc = "x"
1832%% 3. Convert percent-encoded triplets to binary form (transcode_pct/4)
1833%%    <<0,0,0,246>>
1834%% 4. Transcode in-encoded binary to out-encoding (utf32 -> utf8):
1835%%    <<195,182>>
1836%% 5. Percent-encode out-encoded binary:
1837%%    <<"%C3%B6"/utf8>> = <<37,67,51,37,66,54>>
1838%% 6. Convert binary to list form, reverse it and append the accumulator
1839%%    "6B%3C%" + "x"
1840%% 7. Reverse Acc and return it
1841%%-------------------------------------------------------------------------
%% transcode/4: entry point of the transcoding loop. Acc holds the
%% output produced so far in reverse order. Input starting with a
%% percent-encoded triplet is handed to transcode_pct/5, other input to
%% transcode/5. An empty input has nothing to dispatch on, so the
%% accumulated output is returned instead of failing with
%% function_clause.
transcode([$%,_C0,_C1|_Rest] = L, Acc, InEnc, OutEnc) ->
    transcode_pct(L, Acc, <<>>, InEnc, OutEnc);
transcode([_C|_Rest] = L, Acc, InEnc, OutEnc) ->
    transcode(L, Acc, [], InEnc, OutEnc);
transcode([], Acc, _InEnc, _OutEnc) ->
    lists:reverse(Acc).
%%
%% transcode/5: collects plain characters (reversed) in List until a
%% percent-encoded triplet or the end of the input is reached.
transcode([$%,_C0,_C1|_Rest] = L, Acc, List, InEncoding, OutEncoding) ->
    transcode_pct(L, List ++ Acc, <<>>, InEncoding, OutEncoding);
transcode([C|Rest], Acc, List, InEncoding, OutEncoding) ->
    transcode(Rest, Acc, [C|List], InEncoding, OutEncoding);
transcode([], Acc, List, _InEncoding, _OutEncoding) ->
    lists:reverse(List ++ Acc).
1853
1854
%% Transcode percent-encoded segment.
%% Consecutive "%XY" triplets are decoded into the binary Bytes. When
%% the run of triplets ends, Bytes is converted from the input to the
%% output encoding, percent-encoded again and added to the output.
%% Acc holds the output in reverse order. A triplet with a non-hex
%% digit throws {error, invalid_percent_encoding, Input}.
transcode_pct([$%,Hi,Lo|Tail] = Input, Acc, Bytes, InEncoding, OutEncoding) ->
    case is_hex_digit(Hi) andalso is_hex_digit(Lo) of
        false ->
            throw({error, invalid_percent_encoding,Input});
        true ->
            Byte = ?HEX2DEC(Hi)*16+?HEX2DEC(Lo),
            transcode_pct(Tail, Acc, <<Bytes/binary, Byte>>,
                          InEncoding, OutEncoding)
    end;
transcode_pct([_|_] = Input, Acc, Bytes, InEncoding, OutEncoding) ->
    Converted = convert_to_binary(Bytes, InEncoding, OutEncoding),
    Encoded = percent_encode_segment(Converted),
    %% Acc is reversed, so the new characters are reversed as well.
    Reversed = lists:reverse(convert_to_list(Encoded, utf8)),
    transcode(Input, Reversed ++ Acc, [], InEncoding, OutEncoding);
transcode_pct([], Acc, Bytes, InEncoding, OutEncoding) ->
    Converted = convert_to_binary(Bytes, InEncoding, OutEncoding),
    Encoded = percent_encode_segment(Converted),
    Chars = convert_to_list(Encoded, utf8),
    lists:reverse(Acc) ++ Chars.
1873
1874
%% Convert to binary.
%% Wraps unicode:characters_to_binary/3 and throws
%% {error, invalid_input, RestData} when the conversion reports an
%% error or incomplete input.
convert_to_binary(Binary, InEncoding, OutEncoding) ->
    case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of
        {Tag, _Converted, RestData} when Tag =:= error; Tag =:= incomplete ->
            throw({error, invalid_input, RestData});
        Converted ->
            Converted
    end.
1885
1886
%% Convert to list.
%% Wraps unicode:characters_to_list/2 and throws
%% {error, invalid_input, RestData} when the conversion reports an
%% error or incomplete input.
convert_to_list(Binary, InEncoding) ->
    case unicode:characters_to_list(Binary, InEncoding) of
        {Tag, _Converted, RestData} when Tag =:= error; Tag =:= incomplete ->
            throw({error, invalid_input, RestData});
        Converted ->
            Converted
    end.
1897
1898
%% Flatten input list.
%% Nested lists are spliced in, binaries are converted to lists using
%% the encoding InEnc, and every other element is kept as is. A tail
%% that is not a list throws {error, invalid_input, Tail}.
flatten_list([], _InEnc) ->
    [];
flatten_list(Input, InEnc) ->
    flatten_list(Input, InEnc, []).
%%
%% Acc collects the flattened elements in reverse order.
flatten_list([], _InEnc, Acc) ->
    lists:reverse(Acc);
flatten_list([Head|Tail], InEnc, Acc) when is_binary(Head) ->
    Chars = convert_to_list(Head, InEnc),
    flatten_list(Tail, InEnc, lists:reverse(Chars, Acc));
flatten_list([Head|Tail], InEnc, Acc) when is_list(Head) ->
    flatten_list(Head ++ Tail, InEnc, Acc);
flatten_list([Head|Tail], InEnc, Acc) ->
    flatten_list(Tail, InEnc, [Head|Acc]);
flatten_list(Invalid, _InEnc, _Acc) ->
    throw({error, invalid_input, Invalid}).
1916
1917
%% Percent-encodes Segment by calling percent_encode_binary/2 with an
%% empty accumulator.
percent_encode_segment(Segment) ->
    EmptyAcc = <<>>,
    percent_encode_binary(Segment, EmptyAcc).
1920
1921
1922%%-------------------------------------------------------------------------
1923%% Helper functions for compose_query
1924%%-------------------------------------------------------------------------
1925
%% Returns separator to be used between key-value pairs: nothing when
%% the argument is the empty list, "&" for anything else.
%% The empty list is matched directly instead of calling length/1 in a
%% guard, which would walk the whole list on every call.
get_separator([]) ->
    <<>>;
get_separator(_L) ->
    <<"&">>.
1931
1932
%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
%% HTML 5.0 - 4.10.22.6 URL-encoded form data - encoding (non UTF-8)
%%
%% Encodes a list or binary according to the single {encoding, Enc}
%% option:
%%   latin1         - characters above 255 are first replaced by "&#N;"
%%                    references, then the bytes are form-urlencoded
%%   utf8 | unicode - the UTF-8 bytes are form-urlencoded
%% Throws {error, invalid_encoding, Enc} for any other encoding and
%% {error, invalid_input, Cs} when Cs is neither a list nor a binary or
%% the options have another shape.
%%
%% The encoding test is parenthesised on purpose. Written as
%% "is_list(Cs), Encoding =:= utf8; Encoding =:= unicode" the ";" starts
%% a new guard alternative, so 'unicode' alone would select the list
%% clause for every Cs, including binaries and invalid terms.
form_urlencode(Cs, [{encoding, latin1}]) when is_list(Cs) ->
    B = convert_to_binary(Cs, utf8, utf8),
    html5_byte_encode(base10_encode(B));
form_urlencode(Cs, [{encoding, latin1}]) when is_binary(Cs) ->
    html5_byte_encode(base10_encode(Cs));
form_urlencode(Cs, [{encoding, Encoding}])
  when is_list(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    B = convert_to_binary(Cs, utf8, Encoding),
    html5_byte_encode(B);
form_urlencode(Cs, [{encoding, Encoding}])
  when is_binary(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    html5_byte_encode(Cs);
form_urlencode(Cs, [{encoding, Encoding}]) when is_list(Cs); is_binary(Cs) ->
    throw({error,invalid_encoding, Encoding});
form_urlencode(Cs, _) ->
    throw({error,invalid_input, Cs}).
1951
1952
%% For each character in the entry's name and value that cannot be expressed using
%% the selected character encoding, replace the character by a string consisting of
%% a U+0026 AMPERSAND character (&), a "#" (U+0023) character, one or more ASCII
%% digits representing the Unicode code point of the character in base ten, and
%% finally a ";" (U+003B) character.
%%
%% The input is a UTF-8 binary. Code points up to 255 are written as a
%% single byte, larger ones as "&#N;". A byte that does not start a
%% valid UTF-8 sequence throws {error, invalid_input, [Byte]}, like
%% base10_decode/2 does, instead of failing with function_clause.
base10_encode(Cs) ->
    base10_encode(Cs, <<>>).
%%
base10_encode(<<>>, Acc) ->
    Acc;
base10_encode(<<H/utf8,T/binary>>, Acc) when H > 255 ->
    Base10 = integer_to_binary(H),
    base10_encode(T, <<Acc/binary,"&#",Base10/binary,$;>>);
base10_encode(<<H/utf8,T/binary>>, Acc) ->
    base10_encode(T, <<Acc/binary,H>>);
base10_encode(<<H,_/binary>>, _Acc) ->
    throw({error, invalid_input, [H]}).
1968
1969
%% Form-urlencodes a binary byte by byte: a space becomes "+", bytes
%% accepted by is_url_char/1 are copied, and every other byte becomes
%% "%" followed by two hex digits produced with ?DEC2HEX. Input that is
%% not a binary throws {error, invalid_input, Input}.
html5_byte_encode(B) ->
    html5_byte_encode(B, <<>>).
%%
html5_byte_encode(<<>>, Acc) ->
    Acc;
html5_byte_encode(<<$\s,Rest/binary>>, Acc) ->
    html5_byte_encode(Rest, <<Acc/binary,$+>>);
html5_byte_encode(<<Byte,Rest/binary>>, Acc) ->
    case is_url_char(Byte) of
        true ->
            html5_byte_encode(Rest, <<Acc/binary,Byte>>);
        false ->
            %% Split the byte into its high and low nibble.
            Hi = Byte bsr 4,
            Lo = Byte band 16#0F,
            html5_byte_encode(Rest, <<Acc/binary,$%,(?DEC2HEX(Hi)),(?DEC2HEX(Lo))>>)
    end;
html5_byte_encode(Invalid, _Acc) ->
    throw({error,invalid_input, Invalid}).
1987
1988
%% Return true if input char can appear in form-urlencoded string
%% Allowed characters:
%%   0x2A, 0x2D, 0x2E, 0x30 to 0x39, 0x41 to 0x5A,
%%   0x5F, 0x61 to 0x7A
%% i.e. "*", "-", ".", "_", the ASCII digits and the ASCII letters.
is_url_char(C) when C >= $0, C =< $9 -> true;
is_url_char(C) when C >= $A, C =< $Z -> true;
is_url_char(C) when C >= $a, C =< $z -> true;
is_url_char($*) -> true;
is_url_char($-) -> true;
is_url_char($.) -> true;
is_url_char($_) -> true;
is_url_char(_) -> false.
2000
2001
2002%%-------------------------------------------------------------------------
2003%% Helper functions for dissect_query
2004%%-------------------------------------------------------------------------
%% Scans the key part of a key-value pair, byte by byte.
%%   "="  ends the key and starts the value
%%   "&#" is kept as part of the key
%%   "&" or the end of the input, with no value collected, marks a key
%%       without value; the value is then the atom 'true'
dissect_query_key(<<$=,Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_value(Rest, IsList, Pairs, Key, Value);
dissect_query_key(<<$&,$#,Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_key(Rest, IsList, Pairs, <<Key/binary,$&,$#>>, Value);
dissect_query_key(<<$&,_/binary>> = Input, IsList, Pairs, Key, <<>>) ->
    dissect_query_value(Input, IsList, Pairs, Key, true);
dissect_query_key(<<Byte,Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_key(Rest, IsList, Pairs, <<Key/binary,Byte>>, Value);
dissect_query_key(<<>>, IsList, Pairs, Key, <<>>) ->
    dissect_query_value(<<>>, IsList, Pairs, Key, true).
2015
%% Scans the value part of a key-value pair. "&" or the end of the
%% input completes the pair: key and value are decoded with
%% form_urldecode/2 and stored. At the end of the input the collected
%% pairs are returned in input order.
dissect_query_value(<<$&,Rest/binary>>, IsList, Pairs, Key, Value) ->
    Pair = {form_urldecode(IsList, Key), form_urldecode(IsList, Value)},
    dissect_query_key(Rest, IsList, [Pair|Pairs], <<>>, <<>>);
dissect_query_value(<<Byte,Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_value(Rest, IsList, Pairs, Key, <<Value/binary,Byte>>);
dissect_query_value(<<>>, IsList, Pairs, Key, Value) ->
    Pair = {form_urldecode(IsList, Key), form_urldecode(IsList, Value)},
    lists:reverse([Pair|Pairs]).
2026
%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
%% HTML 5.0 - 4.10.22.6 URL-encoded form data - decoding (non UTF-8)
%%
%% This function is used in two ways, told apart by the first argument:
%%   (IsList, Binary) - decodes Binary and resolves "&#N;" references;
%%                      the result is a list when IsList is true and a
%%                      binary when it is false. The atom 'true' (key
%%                      without value) is passed through.
%%   (Binary, Acc)    - the decoding loop: "+" becomes a space, "%XY"
%%                      becomes the byte it denotes, other UTF-8
%%                      characters are copied.
%% The clause order matters since all clauses share one function.
form_urldecode(_, true) ->
    true;
form_urldecode(true, Bin) ->
    Decoded = base10_decode(form_urldecode(Bin, <<>>)),
    convert_to_list(Decoded, utf8);
form_urldecode(false, Bin) ->
    base10_decode(form_urldecode(Bin, <<>>));
form_urldecode(<<>>, Acc) ->
    Acc;
form_urldecode(<<$+,Rest/binary>>, Acc) ->
    form_urldecode(Rest, <<Acc/binary,$\s>>);
form_urldecode(<<$%,Hi,Lo,Rest/binary>> = Input, Acc) ->
    case is_hex_digit(Hi) andalso is_hex_digit(Lo) of
        true ->
            Byte = ?HEX2DEC(Hi)*16+?HEX2DEC(Lo),
            form_urldecode(Rest, <<Acc/binary, Byte>>);
        false ->
            Remaining = convert_to_list(Input, utf8),
            throw({error, invalid_percent_encoding, Remaining})
    end;
form_urldecode(<<Char/utf8,Rest/binary>>, Acc) ->
    form_urldecode(Rest, <<Acc/binary,Char/utf8>>);
form_urldecode(<<Byte,_/binary>>, _Acc) ->
    throw({error, invalid_character, [Byte]}).
2053
%% Replaces every "&#N;" reference (N in base ten) in a UTF-8 binary by
%% the UTF-8 encoding of code point N. Other characters are copied.
%% Throws {error, invalid_input, Data} when
%%   - a byte does not start a valid UTF-8 sequence (Data = [Byte])
%%   - a reference contains something other than digits (Data = [Byte])
%%   - the input ends inside a reference (Data = [])
%%   - N is not a Unicode scalar value, i.e. a surrogate or above
%%     16#10FFFF (Data = N as a decimal string)
%% The last two cases would otherwise fail with function_clause and
%% badarg on input that comes straight from a query string.
base10_decode(Cs) ->
    base10_decode(Cs, <<>>).
%%
base10_decode(<<>>, Acc) ->
    Acc;
base10_decode(<<"&#",T/binary>>, Acc) ->
    base10_decode_unicode(T, Acc);
base10_decode(<<H/utf8,T/binary>>, Acc) ->
    base10_decode(T,<<Acc/binary,H/utf8>>);
base10_decode(<<H,_/binary>>, _) ->
    throw({error, invalid_input, [H]}).


%% Reads the digits of a reference up to the terminating ";" and appends
%% the denoted character to Acc.
base10_decode_unicode(B, Acc) ->
    base10_decode_unicode(B, 0, Acc).
%%
base10_decode_unicode(<<H/utf8,T/binary>>, Codepoint, Acc) when $0 =< H, H =< $9 ->
    Res = Codepoint * 10 + (H - $0),
    base10_decode_unicode(T, Res, Acc);
base10_decode_unicode(<<$;,T/binary>>, Codepoint, Acc)
  when Codepoint < 16#D800;
       Codepoint > 16#DFFF, Codepoint =< 16#10FFFF ->
    base10_decode(T, <<Acc/binary,Codepoint/utf8>>);
base10_decode_unicode(<<$;,_/binary>>, Codepoint, _) ->
    %% Not encodable as UTF-8; building the binary would raise badarg.
    throw({error, invalid_input, integer_to_list(Codepoint)});
base10_decode_unicode(<<H,_/binary>>, _, _) ->
    throw({error, invalid_input, [H]});
base10_decode_unicode(<<>>, _, _) ->
    throw({error, invalid_input, []}).
2077
2078
2079%%-------------------------------------------------------------------------
2080%% Helper functions for normalize
2081%%-------------------------------------------------------------------------
2082
%% Applies the normalization steps in order: case, percent-encoding,
%% scheme-based and finally path segment normalization.
normalize_map(URIMap) ->
    CaseNormalized = normalize_case(URIMap),
    Decoded = normalize_percent_encoding(CaseNormalized),
    SchemeNormalized = normalize_scheme_based(Decoded),
    normalize_path_segment(SchemeNormalized).
2088
2089
%% 6.2.2.1.  Case Normalization
%% Lower-cases the scheme and the host, whichever of them is present.
%% All other fields are left as they are.
normalize_case(#{} = Map) ->
    Lower = fun(Key, Acc) ->
                    case Acc of
                        #{Key := Value} ->
                            Acc#{Key := to_lower(Value)};
                        #{} ->
                            Acc
                    end
            end,
    lists:foldl(Lower, Map, [scheme, host]).
2100
2101
%% 6.2.2.2.  Percent-Encoding Normalization
%% Runs decode/1 on userinfo, host, path, query and fragment. Every
%% other entry (port and scheme) is kept unchanged.
normalize_percent_encoding(Map) ->
    Decodable = [userinfo, host, path, query, fragment],
    maps:map(fun(Key, Value) ->
                     case lists:member(Key, Decodable) of
                         true ->
                             decode(Value);
                         false ->
                             Value
                     end
             end, Map).
2112
2113
%% Lower-cases the ASCII letters A-Z; all other bytes are copied
%% unchanged. A list is converted to a UTF-8 binary, processed and
%% converted back, so the result has the type of the input.
to_lower(Cs) when is_binary(Cs) ->
    to_lower(Cs, <<>>);
to_lower(Cs) when is_list(Cs) ->
    Lowered = to_lower(convert_to_binary(Cs, utf8, utf8)),
    convert_to_list(Lowered, utf8).
%%
to_lower(<<>>, Acc) ->
    Acc;
to_lower(<<C,Rest/binary>>, Acc) when C >= $A, C =< $Z ->
    to_lower(Rest, <<Acc/binary,(C + ($a - $A))>>);
to_lower(<<C,Rest/binary>>, Acc) ->
    to_lower(Rest, <<Acc/binary,C>>).
2126
2127
%% 6.2.2.3. Path Segment Normalization
%% 5.2.4.   Remove Dot Segments
%% Replaces the path of Map by the result of remove_dot_segments/1.
%% A map without 'path' passes 'undefined' on, for which
%% remove_dot_segments/1 has no clause.
normalize_path_segment(Map) ->
    Cleaned = remove_dot_segments(maps:get(path, Map, undefined)),
    Map#{path => Cleaned}.
2133
2134
%% Removes "." and ".." segments from a path following the steps of
%% RFC 3986, 5.2.4. The result has the type (list or binary) of the
%% input; lists are processed as UTF-8 binaries.
remove_dot_segments(Path) when is_list(Path) ->
    Input = convert_to_binary(Path, utf8, utf8),
    Cleaned = remove_dot_segments(Input, <<>>),
    convert_to_list(Cleaned, utf8);
remove_dot_segments(Path) when is_binary(Path) ->
    remove_dot_segments(Path, <<>>).
%%
%% The first argument is the input still to be processed, Out the
%% output built so far. The clause order is significant.
remove_dot_segments(<<>>, Out) ->
    Out;
%% Step A: drop a leading "../" or "./".
remove_dot_segments(<<"../",Rest/binary>>, Out) ->
    remove_dot_segments(Rest, Out);
remove_dot_segments(<<"./",Rest/binary>>, Out) ->
    remove_dot_segments(Rest, Out);
%% Step B: "/./" or a final "/." is replaced by "/".
remove_dot_segments(<<"/./",Rest/binary>>, Out) ->
    remove_dot_segments(<<$/,Rest/binary>>, Out);
remove_dot_segments(<<"/.">>, Out) ->
    remove_dot_segments(<<$/>>, Out);
%% Step C: "/../" or a final "/.." is replaced by "/" and the last
%% segment of the output is removed.
remove_dot_segments(<<"/../",Rest/binary>>, Out) ->
    Shortened = remove_last_segment(Out),
    remove_dot_segments(<<$/,Rest/binary>>, Shortened);
remove_dot_segments(<<"/..">>, Out) ->
    Shortened = remove_last_segment(Out),
    remove_dot_segments(<<$/>>, Shortened);
%% Step D: an input consisting only of "." or ".." is dropped.
remove_dot_segments(<<$.>>, Out) ->
    remove_dot_segments(<<>>, Out);
remove_dot_segments(<<"..">>, Out) ->
    remove_dot_segments(<<>>, Out);
%% Step E: move the first path segment to the output.
remove_dot_segments(Input, Out) ->
    {Segment, Rest} = first_path_segment(Input),
    remove_dot_segments(Rest, <<Out/binary,Segment/binary>>).
2165
2166
%% Splits a non-empty binary path into {First, Rest}. First is the
%% first byte (a leading "/", if any, included) followed by all bytes
%% up to, but not including, the next "/".
first_path_segment(Input) ->
    Segment = first_path_segment(Input, <<>>),
    split_binary(Input, byte_size(Segment)).
%%
%% The first byte always belongs to the segment, "/" or not.
first_path_segment(<<First,Rest/binary>>, Acc) ->
    first_path_segment_end(Rest, <<Acc/binary,First>>).


%% Collects bytes until the next "/" or the end of the input.
first_path_segment_end(<<$/,_/binary>>, Acc) ->
    Acc;
first_path_segment_end(<<>>, Acc) ->
    Acc;
first_path_segment_end(<<Byte,Rest/binary>>, Acc) ->
    first_path_segment_end(Rest, <<Acc/binary,Byte>>).
2183
2184
%% Removes the last segment of a binary path together with the "/" in
%% front of it. Without any "/" the result is the empty binary.
remove_last_segment(<<>>) ->
    <<>>;
remove_last_segment(Bin) ->
    InitSize = byte_size(Bin) - 1,
    case Bin of
        <<Init:InitSize/binary, $/>> ->
            Init;
        <<Init:InitSize/binary, _>> ->
            remove_last_segment(Init)
    end.
2195
2196
%% RFC 3986, 6.2.3.  Scheme-Based Normalization
%% Looks up scheme, port and path ('undefined' when missing) and
%% dispatches on the scheme.
normalize_scheme_based(Map) ->
    normalize_scheme_based(Map,
                           maps:get(scheme, Map, undefined),
                           maps:get(port, Map, undefined),
                           maps:get(path, Map, undefined)).
%%
%% The scheme is compared as is, in list or binary form. http, https,
%% ftp, ssh, sftp and tftp get their own normalization; a map with any
%% other scheme is returned unchanged.
normalize_scheme_based(Map, Scheme, Port, Path) ->
    case Scheme of
        "http" -> normalize_http(Map, Port, Path);
        <<"http">> -> normalize_http(Map, Port, Path);
        "https" -> normalize_https(Map, Port, Path);
        <<"https">> -> normalize_https(Map, Port, Path);
        "ftp" -> normalize_ftp(Map, Port);
        <<"ftp">> -> normalize_ftp(Map, Port);
        "ssh" -> normalize_ssh_sftp(Map, Port);
        <<"ssh">> -> normalize_ssh_sftp(Map, Port);
        "sftp" -> normalize_ssh_sftp(Map, Port);
        <<"sftp">> -> normalize_ssh_sftp(Map, Port);
        "tftp" -> normalize_tftp(Map, Port);
        <<"tftp">> -> normalize_tftp(Map, Port);
        _ -> Map
    end.
2224
2225
%% http: drops port 80 and normalizes the path with normalize_http_path/2.
normalize_http(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 80), Path).


%% https: drops port 443 and normalizes the path with normalize_http_path/2.
normalize_https(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 443), Path).


%% ftp: drops port 21.
normalize_ftp(Map, Port) ->
    DefaultPort = 21,
    normalize_port(Map, Port, DefaultPort).


%% ssh and sftp: drops port 22.
normalize_ssh_sftp(Map, Port) ->
    DefaultPort = 22,
    normalize_port(Map, Port, DefaultPort).


%% tftp: drops port 69.
normalize_tftp(Map, Port) ->
    DefaultPort = 69,
    normalize_port(Map, Port, DefaultPort).
2246
2247
%% Removes the 'port' entry from Map when Port is exactly the Default
%% port; otherwise Map is returned unchanged.
normalize_port(Map, Default, Default) ->
    maps:remove(port, Map);
normalize_port(Map, _Port, _Default) ->
    Map.
2255
2256
%% Sets the path of Map to "/" when Path is empty, using the same type
%% (list or binary) as Path. Any other Path leaves Map unchanged.
normalize_http_path(Map, "") ->
    Map#{path => "/"};
normalize_http_path(Map, <<>>) ->
    Map#{path => <<"/">>};
normalize_http_path(Map, _Path) ->
    Map.
2266