1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2017-2020. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20%%
21%% [RFC 3986, Chapter 2.2. Reserved Characters]
22%%
23%%   reserved    = gen-delims / sub-delims
24%%
25%%   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
26%%
27%%   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
28%%               / "*" / "+" / "," / ";" / "="
29%%
30%%
31%% [RFC 3986, Chapter 2.3. Unreserved Characters]
32%%
33%%   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
34%%
35%%
36%% [RFC 3986, Chapter 3. Syntax Components]
37%%
38%% The generic URI syntax consists of a hierarchical sequence of
39%% components referred to as the scheme, authority, path, query, and
40%% fragment.
41%%
42%%    URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
43%%
44%%    hier-part   = "//" authority path-abempty
45%%                   / path-absolute
46%%                   / path-rootless
47%%                   / path-empty
48%%
49%%    The scheme and path components are required, though the path may be
50%%    empty (no characters).  When authority is present, the path must
51%%    either be empty or begin with a slash ("/") character.  When
52%%    authority is not present, the path cannot begin with two slash
53%%    characters ("//").  These restrictions result in five different ABNF
54%%    rules for a path (Section 3.3), only one of which will match any
55%%    given URI reference.
56%%
57%%    The following are two example URIs and their component parts:
58%%
59%%          foo://example.com:8042/over/there?name=ferret#nose
60%%          \_/   \______________/\_________/ \_________/ \__/
61%%           |           |            |            |        |
62%%        scheme     authority       path        query   fragment
63%%           |   _____________________|__
64%%          / \ /                        \
65%%          urn:example:animal:ferret:nose
66%%
67%%
68%% [RFC 3986, Chapter 3.1. Scheme]
69%%
70%% Each URI begins with a scheme name that refers to a specification for
71%% assigning identifiers within that scheme.
72%%
73%%    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
74%%
75%%
76%% [RFC 3986, Chapter 3.2. Authority]
77%%
78%% Many URI schemes include a hierarchical element for a naming
79%% authority so that governance of the name space defined by the
80%% remainder of the URI is delegated to that authority (which may, in
81%% turn, delegate it further).
82%%
83%%    authority   = [ userinfo "@" ] host [ ":" port ]
84%%
85%%
86%% [RFC 3986, Chapter 3.2.1. User Information]
87%%
88%% The userinfo subcomponent may consist of a user name and, optionally,
89%% scheme-specific information about how to gain authorization to access
90%% the resource. The user information, if present, is followed by a
91%% commercial at-sign ("@") that delimits it from the host.
92%%
93%%    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
94%%
95%%
96%% [RFC 3986, Chapter 3.2.2. Host]
97%%
98%% The host subcomponent of authority is identified by an IP literal
99%% encapsulated within square brackets, an IPv4 address in dotted-
100%% decimal form, or a registered name.
101%%
102%%    host        = IP-literal / IPv4address / reg-name
103%%
104%%    IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
105%%
106%%    IPvFuture  = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
107%%
108%%    IPv6address =                            6( h16 ":" ) ls32
109%%                /                       "::" 5( h16 ":" ) ls32
110%%                / [               h16 ] "::" 4( h16 ":" ) ls32
111%%                / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
112%%                / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
113%%                / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
114%%                / [ *4( h16 ":" ) h16 ] "::"              ls32
115%%                / [ *5( h16 ":" ) h16 ] "::"              h16
116%%                / [ *6( h16 ":" ) h16 ] "::"
117%%
118%%    ls32        = ( h16 ":" h16 ) / IPv4address
119%%                ; least-significant 32 bits of address
120%%
121%%    h16         = 1*4HEXDIG
122%%                ; 16 bits of address represented in hexadecimal
123%%
124%%    IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
125%%
126%%    dec-octet   = DIGIT                 ; 0-9
127%%                / %x31-39 DIGIT         ; 10-99
128%%                / "1" 2DIGIT            ; 100-199
129%%                / "2" %x30-34 DIGIT     ; 200-249
130%%                / "25" %x30-35          ; 250-255
131%%
132%%    reg-name    = *( unreserved / pct-encoded / sub-delims )
133%%
134%%
%% [RFC 3986, Chapter 3.2.3. Port]
136%%
137%% The port subcomponent of authority is designated by an optional port
138%% number in decimal following the host and delimited from it by a
139%% single colon (":") character.
140%%
141%%    port        = *DIGIT
142%%
143%%
144%% [RFC 3986, Chapter 3.3. Path]
145%%
146%% The path component contains data, usually organized in hierarchical
147%% form, that, along with data in the non-hierarchical query component
148%% (Section 3.4), serves to identify a resource within the scope of the
149%% URI's scheme and naming authority (if any).  The path is terminated
150%% by the first question mark ("?") or number sign ("#") character, or
151%% by the end of the URI.
152%%
153%%    path          = path-abempty    ; begins with "/" or is empty
154%%                  / path-absolute   ; begins with "/" but not "//"
155%%                  / path-noscheme   ; begins with a non-colon segment
156%%                  / path-rootless   ; begins with a segment
157%%                  / path-empty      ; zero characters
158%%
159%%    path-abempty  = *( "/" segment )
160%%    path-absolute = "/" [ segment-nz *( "/" segment ) ]
161%%    path-noscheme = segment-nz-nc *( "/" segment )
162%%    path-rootless = segment-nz *( "/" segment )
163%%    path-empty    = 0<pchar>
164%%    segment       = *pchar
165%%    segment-nz    = 1*pchar
166%%    segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
167%%                  ; non-zero-length segment without any colon ":"
168%%
169%%    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
170%%
171%%
172%% [RFC 3986, Chapter 3.4. Query]
173%%
174%% The query component contains non-hierarchical data that, along with
175%% data in the path component (Section 3.3), serves to identify a
176%% resource within the scope of the URI's scheme and naming authority
177%% (if any).  The query component is indicated by the first question
178%% mark ("?") character and terminated by a number sign ("#") character
179%% or by the end of the URI.
180%%
181%%    query       = *( pchar / "/" / "?" )
182%%
183%%
184%% [RFC 3986, Chapter 3.5. Fragment]
185%%
186%% The fragment identifier component of a URI allows indirect
187%% identification of a secondary resource by reference to a primary
188%% resource and additional identifying information.
189%%
190%%    fragment    = *( pchar / "/" / "?" )
191%%
192%%
193%% [RFC 3986, Chapter 4.1. URI Reference]
194%%
195%% URI-reference is used to denote the most common usage of a resource
196%% identifier.
197%%
198%%    URI-reference = URI / relative-ref
199%%
200%%
201%% [RFC 3986, Chapter 4.2. Relative Reference]
202%%
203%% A relative reference takes advantage of the hierarchical syntax
204%% (Section 1.2.3) to express a URI reference relative to the name space
205%% of another hierarchical URI.
206%%
207%%    relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
208%%
209%%    relative-part = "//" authority path-abempty
210%%                  / path-absolute
211%%                  / path-noscheme
212%%                  / path-empty
213%%
214%%
215%% [RFC 3986, Chapter 4.3. Absolute URI]
216%%
217%% Some protocol elements allow only the absolute form of a URI without
218%% a fragment identifier.  For example, defining a base URI for later
219%% use by relative references calls for an absolute-URI syntax rule that
220%% does not allow a fragment.
221%%
222%%    absolute-URI  = scheme ":" hier-part [ "?" query ]
223%%
224-module(uri_string).
225
226%%-------------------------------------------------------------------------
227%% External API
228%%-------------------------------------------------------------------------
229-export([allowed_characters/0,
230         compose_query/1,
231         compose_query/2,
232         dissect_query/1,
233         normalize/1,
234         normalize/2,
235         percent_decode/1,
236         parse/1,
237         recompose/1,
238         resolve/2,
239         resolve/3,
240         transcode/2]).
241-export_type([error/0,
242              uri_map/0,
243              uri_string/0]).
244
245
246%%-------------------------------------------------------------------------
247%% Internal API
248%%-------------------------------------------------------------------------
249-export([is_host/1, is_path/1]).  % suppress warnings
250
251
252%%-------------------------------------------------------------------------
253%% Macros
254%%-------------------------------------------------------------------------
%% Binary pattern/construction helpers used by the parsers below.
%%   ?CHAR(C)              - a binary holding exactly one UTF-8 encoded character
%%   ?STRING_EMPTY         - the empty binary
%%   ?STRING(Bin)          - a binary consisting of Bin as a whole
%%   ?STRING_REST(H, Tail) - one UTF-8 encoded element H followed by binary Tail
-define(CHAR(C), <<C/utf8>>).
-define(STRING_EMPTY, <<>>).
-define(STRING(Bin), <<Bin/binary>>).
-define(STRING_REST(Head, Tail), <<Head/utf8, Tail/binary>>).
259
%% ?DEC2HEX(N): map a number 0..15 to the character of its (upper case)
%% hexadecimal digit.  Any other value fails with an if_clause error.
-define(DEC2HEX(N),
        if (N) >= 0, (N) =< 9 -> (N) + $0;
           (N) >= 10, (N) =< 15 -> (N) + $A - 10
        end).

%% ?HEX2DEC(C): map a hexadecimal digit character (0-9, A-F or a-f) to
%% its numeric value.  Any other value fails with an if_clause error.
-define(HEX2DEC(C),
        if (C) >= $0, (C) =< $9 -> (C) - $0;
           (C) >= $A, (C) =< $F -> (C) - $A + 10;
           (C) >= $a, (C) =< $f -> (C) - $a + 10
        end).
270
271
272%%%=========================================================================
273%%%  API
274%%%=========================================================================
275
276%%-------------------------------------------------------------------------
277%% URI compliant with RFC 3986
%% ASCII %x21 - %x7A ("!" - "z") except
%%   %x22    "    double quote
%%   %x3C    <    less than
%%   %x3E    >    greater than
%%   %x5C    \    backslash
%%   %x5E    ^    caret / circumflex
%%   %x60    `    grave / accent
285%%-------------------------------------------------------------------------
%% A URI in textual form; any iodata() term is accepted by the type.
-type uri_string() :: iodata().
%% Error value used in the API specs: the tag 'error', an atom naming
%% the kind of error and a term with the related data.
-type error() :: {error, atom(), term()}.
288
289
290%%-------------------------------------------------------------------------
291%% RFC 3986, Chapter 3. Syntax Components
292%%-------------------------------------------------------------------------
%% URI components as a map.  Every key is optional in the type; the
%% fields are listed in the order in which the components appear in a
%% URI:  scheme "://" userinfo "@" host ":" port path "?" query "#" fragment
-type uri_map() ::
  #{scheme => unicode:chardata(),
    userinfo => unicode:chardata(),
    host => unicode:chardata(),
    port => non_neg_integer() | undefined,
    path => unicode:chardata(),
    query => unicode:chardata(),
    fragment => unicode:chardata()}.
301
302
303%%-------------------------------------------------------------------------
304%% Normalize URIs
305%%-------------------------------------------------------------------------
%% Normalize a URI with the default (empty) option list; shorthand for
%% normalize(URI, []).
-spec normalize(URI) -> NormalizedURI when
      URI :: uri_string() | uri_map(),
      NormalizedURI :: uri_string()
                     | error().
normalize(URI) ->
    normalize(URI, []).
312
313
%% Normalize a URI given either as a URI map or in textual form.
%%
%% Options:
%%   []           - the normalized map is passed through recompose/1
%%   [return_map] - the normalized map itself is returned
%%
%% Input that is not a map is first handed to parse/1; if parse/1 does
%% not produce a map its result is returned unchanged.  {error, _, _}
%% terms thrown while normalizing or recomposing become return values.
%% Any other option list fails with function_clause.
-spec normalize(URI, Options) -> NormalizedURI when
      URI :: uri_string() | uri_map(),
      Options :: [return_map],
      NormalizedURI :: uri_string() | uri_map()
                     | error().
normalize(URI, Options) when Options =:= []; Options =:= [return_map] ->
    Parsed = case is_map(URI) of
                 true -> URI;
                 false -> parse(URI)
             end,
    case is_map(Parsed) of
        true ->
            try
                Normalized = normalize_map(Parsed),
                case Options of
                    [] -> recompose(Normalized);
                    [return_map] -> Normalized
                end
            catch
                throw:{error, Atom, RestData} -> {error, Atom, RestData}
            end;
        false ->
            %% parse/1 did not yield a map: hand its result back as is.
            Parsed
    end.
349
350
351%%-------------------------------------------------------------------------
352%% Parse URIs
353%%-------------------------------------------------------------------------
%% Parse a URI string into a URI map.
%%
%% Binary input yields a map with binary fields.  List input is first
%% converted to a binary, parsed, and the binary fields of the result
%% are converted back to lists by convert_mapfields_to_list/1.
%%
%% {error, Atom, RestData} terms thrown by the parser are returned as
%% values.  List input that cannot be converted to a binary
%% (unicode:characters_to_binary/1 returning an 'error' or 'incomplete'
%% tuple) is reported as {error, invalid_uri, RestData}; without this
%% check the tuple would be passed on to the parser, whose clauses only
%% accept binaries.
-spec parse(URIString) -> URIMap when
      URIString :: uri_string(),
      URIMap :: uri_map()
              | error().
parse(URIString) when is_binary(URIString) ->
    try parse_uri_reference(URIString, #{})
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end;
parse(URIString) when is_list(URIString) ->
    case unicode:characters_to_binary(URIString) of
        Binary when is_binary(Binary) ->
            try
                Map = parse_uri_reference(Binary, #{}),
                convert_mapfields_to_list(Map)
            catch
                throw:{error, Atom, RestData} -> {error, Atom, RestData}
            end;
        {error, _Converted, RestData} ->
            %% Invalid character data; RestData is the part that could
            %% not be converted.
            {error, invalid_uri, RestData};
        {incomplete, _Converted, RestData} ->
            %% Input ends in the middle of an encoded character.
            {error, invalid_uri, RestData}
    end.
371
372
373%%-------------------------------------------------------------------------
374%% Recompose URIs
375%%-------------------------------------------------------------------------
%% Build a URI string from a URI map.
%%
%% The map is checked with is_valid_map/1 first; a rejected map gives
%% {error, invalid_map, Map}.  Otherwise the update_* functions are
%% applied in component order, each receiving the map and the result
%% built so far (starting from the atom 'empty').  {error, _, _} terms
%% thrown along the way are returned as values.
-spec recompose(URIMap) -> URIString when
      URIMap :: uri_map(),
      URIString :: uri_string()
                 | error().
recompose(Map) ->
    case is_valid_map(Map) of
        true ->
            Steps = [fun update_scheme/2,
                     fun update_userinfo/2,
                     fun update_host/2,
                     fun update_port/2,
                     fun update_path/2,
                     fun update_query/2,
                     fun update_fragment/2],
            try
                lists:foldl(fun(Update, Acc) -> Update(Map, Acc) end,
                            empty, Steps)
            catch
                throw:{error, Atom, RestData} -> {error, Atom, RestData}
            end;
        false ->
            {error, invalid_map, Map}
    end.
397
398
399%%-------------------------------------------------------------------------
400%% Resolve URIs
401%%-------------------------------------------------------------------------
%% Resolve a URI reference against a base URI with the default (empty)
%% option list; shorthand for resolve(RefURI, BaseURI, []).
-spec resolve(RefURI, BaseURI) -> TargetURI when
      RefURI :: uri_string() | uri_map(),
      BaseURI :: uri_string() | uri_map(),
      TargetURI :: uri_string()
                 | error().
resolve(RefURI, BaseURI) ->
    resolve(RefURI, BaseURI, []).
409
410
%% Resolve a URI reference against a base URI.
%%
%% A reference that is not a map is parsed with parse/1 first; if that
%% does not give a map, the result of parse/1 is returned.  The actual
%% resolution is done by resolve_map/2.  When it gives a map, Options
%% selects the result form:
%%   []           - the map is passed through recompose/1
%%   [return_map] - the map itself is returned
%% Anything else coming back from resolve_map/2 is returned unchanged.
-spec resolve(RefURI, BaseURI, Options) -> TargetURI when
      RefURI :: uri_string() | uri_map(),
      BaseURI :: uri_string() | uri_map(),
      Options :: [return_map],
      TargetURI :: uri_string() | uri_map()
                 | error().
resolve(URIMap, BaseURIMap, Options) when is_map(URIMap) ->
    Resolved = resolve_map(URIMap, BaseURIMap),
    case is_map(Resolved) of
        false ->
            Resolved;
        true ->
            case Options of
                [] -> recompose(Resolved);
                [return_map] -> Resolved
            end
    end;
resolve(URIString, BaseURIMap, Options) ->
    Parsed = parse(URIString),
    case is_map(Parsed) of
        true -> resolve(Parsed, BaseURIMap, Options);
        false -> Parsed
    end.
436
437
438%%-------------------------------------------------------------------------
439%% Transcode URIs
440%%-------------------------------------------------------------------------
%% Transcode a URI string between encodings.
%%
%% Options (both default to utf8):
%%   {in_encoding, Enc}  - encoding of the input
%%   {out_encoding, Enc} - encoding of the result
%%
%% Binary input gives a binary result; list input returns the result of
%% transcode/4 directly.  In both clauses every conversion step runs
%% inside the 'try', so that an {error, Atom, RestData} term thrown by
%% any of them is returned as a value.  This includes flatten_list/2 in
%% the list clause, mirroring how convert_to_list/2 is protected in the
%% binary clause.
%% NOTE(review): flatten_list/2 is defined outside this part of the
%% file; it is assumed that it may throw the same kind of error tuple.
-spec transcode(URIString, Options) -> Result when
      URIString :: uri_string(),
      Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}],
      Result :: uri_string()
              | error().
transcode(URIString, Options) when is_binary(URIString) ->
    try
        InEnc = proplists:get_value(in_encoding, Options, utf8),
        OutEnc = proplists:get_value(out_encoding, Options, utf8),
        List = convert_to_list(URIString, InEnc),
        Output = transcode(List, [], InEnc, OutEnc),
        convert_to_binary(Output, utf8, OutEnc)
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end;
transcode(URIString, Options) when is_list(URIString) ->
    try
        InEnc = proplists:get_value(in_encoding, Options, utf8),
        OutEnc = proplists:get_value(out_encoding, Options, utf8),
        Flattened = flatten_list(URIString, InEnc),
        transcode(Flattened, [], InEnc, OutEnc)
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end.
464
465
466%%-------------------------------------------------------------------------
467%% Misc
468%%-------------------------------------------------------------------------
%% Return, for each character class known to this module, the code
%% points in the range 0..127 accepted by the corresponding is_*/1
%% predicate.  The result is a list of {ClassName, Characters} pairs.
-spec allowed_characters() -> [{atom(), list()}].
allowed_characters() ->
    Ascii = lists:seq(0,127),
    Classes = [{scheme, fun is_scheme/1},
               {userinfo, fun is_userinfo/1},
               {host, fun is_host/1},
               {ipv4, fun is_ipv4/1},
               {ipv6, fun is_ipv6/1},
               {regname, fun is_reg_name/1},
               {path, fun is_path/1},
               {query, fun is_query/1},
               {fragment, fun is_fragment/1},
               {reserved, fun is_reserved/1},
               {unreserved, fun is_unreserved/1}],
    [{Class, [Char || Char <- Ascii, Accepts(Char)]}
     || {Class, Accepts} <- Classes].
494
%% Decode the percent-encoded parts of a URI using raw_decode/1.
%%
%% For a URI map, the userinfo, host, path, query and fragment values
%% are decoded; all other entries (port, scheme) are left untouched.
%% If decoding of a component fails, the result is
%% {error, {invalid, {Component, {Reason, Input}}}}.
%%
%% For a list or binary the result of raw_decode/1 is returned as is,
%% which means a failure is reported as the three-element tuple
%% {error, Reason, Input}; the spec therefore includes error() as well.
-spec percent_decode(URI) -> Result when
      URI :: uri_string() | uri_map(),
      Result :: uri_string() |
                uri_map() |
                error() |
                {error, {invalid, {atom(), {term(), term()}}}}.
percent_decode(URIMap) when is_map(URIMap)->
    Fun = fun (K,V) when K =:= userinfo; K =:= host; K =:= path;
                         K =:= query; K =:= fragment ->
                  case raw_decode(V) of
                      {error, Reason, Input} ->
                          %% Abort the traversal; caught below and
                          %% returned as the result.
                          throw({error, {invalid, {K, {Reason, Input}}}});
                      Else ->
                          Else
                  end;
              %% Handle port and scheme
              (_,V) ->
                  V
          end,
    try maps:map(Fun, URIMap)
    catch throw:Return ->
            Return
    end;
percent_decode(URI) when is_list(URI) orelse
                         is_binary(URI) ->
    raw_decode(URI).
520
521%%-------------------------------------------------------------------------
522%% Functions for working with the query part of a URI as a list
523%% of key/value pairs.
524%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
525%% HTML 5.0 - 4.10.22.6 URL-encoded form data - non UTF-8
526%%-------------------------------------------------------------------------
527
528%%-------------------------------------------------------------------------
529%% Compose urlencoded query string from a list of unescaped key/value pairs.
530%% (application/x-www-form-urlencoded encoding algorithm)
531%%-------------------------------------------------------------------------
%% Compose a query string using the default options; shorthand for
%% compose_query(QueryList, [{encoding, utf8}]).
-spec compose_query(QueryList) -> QueryString when
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}],
      QueryString :: uri_string()
                   | error().
compose_query(QueryList) ->
    compose_query(QueryList, [{encoding, utf8}]).
538
539
%% Compose a query string from a list of key/value pairs.
%%
%% An empty list gives [].  Otherwise the work is done by
%% compose_query/4, started with an empty binary accumulator and the
%% "list seen" flag cleared; an {error, Atom, RestData} term thrown
%% while composing is returned as a value.
-spec compose_query(QueryList, Options) -> QueryString when
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}],
      Options :: [{encoding, atom()}],
      QueryString :: uri_string()
                   | error().
compose_query(QueryList, Options) ->
    case QueryList of
        [] ->
            [];
        _ ->
            try compose_query(QueryList, Options, false, <<>>)
            catch
              throw:{error, Atom, RestData} -> {error, Atom, RestData}
            end
    end.
552%%
%% Worker for compose_query/2.
%%
%% Walks the pair list and appends each encoded pair, followed by the
%% separator chosen by get_separator/1 for the remaining pairs, to the
%% binary accumulator.  A pair whose value is the atom 'true' contributes
%% only its key.  SeenList records whether any key or value was given as
%% a list; if so the finished binary is converted with convert_to_list/2,
%% otherwise the binary is returned.
compose_query([], _Options, true, Acc) ->
    convert_to_list(Acc, utf8);
compose_query([], _Options, false, Acc) ->
    Acc;
compose_query([{Key,true}|Pairs], Options, SeenList, Acc) ->
    Sep = get_separator(Pairs),
    EncKey = form_urlencode(Key, Options),
    compose_query(Pairs, Options,
                  SeenList orelse is_list(Key),
                  <<Acc/binary,EncKey/binary,Sep/binary>>);
compose_query([{Key,Value}|Pairs], Options, SeenList, Acc) ->
    Sep = get_separator(Pairs),
    EncKey = form_urlencode(Key, Options),
    EncValue = form_urlencode(Value, Options),
    compose_query(Pairs, Options,
                  SeenList orelse is_list(Key) orelse is_list(Value),
                  <<Acc/binary,EncKey/binary,"=",EncValue/binary,Sep/binary>>).
569
570
571%%-------------------------------------------------------------------------
572%% Dissect a query string into a list of unescaped key/value pairs.
573%% (application/x-www-form-urlencoded decoding algorithm)
574%%-------------------------------------------------------------------------
%% Split a query string into a list of key/value pairs.
%%
%% An empty binary or empty list gives [].  List input is converted to
%% a binary with convert_to_binary/3 before being handed to
%% dissect_query_key/5; the second argument of that call tells whether
%% the input was a list.  An {error, Atom, RestData} term thrown during
%% conversion or dissection is returned as a value.
-spec dissect_query(QueryString) -> QueryList when
      QueryString :: uri_string(),
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}]
                 | error().
dissect_query(Empty) when Empty =:= <<>>; Empty =:= [] ->
    [];
dissect_query(QueryString) ->
    IsList = is_list(QueryString),
    try
        Input = case IsList of
                    true -> convert_to_binary(QueryString, utf8, utf8);
                    false -> QueryString
                end,
        dissect_query_key(Input, IsList, [], <<>>, <<>>)
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end.
595
596
597%%%========================================================================
598%%% Internal functions
599%%%========================================================================
600
601%%-------------------------------------------------------------------------
602%% Converts Map fields to lists
603%%-------------------------------------------------------------------------
%% Return a copy of Map in which every binary value has been replaced
%% by the result of unicode:characters_to_list/1; values of any other
%% type are kept as they are.
convert_mapfields_to_list(Map) ->
    ToList = fun(Value) when is_binary(Value) ->
                     unicode:characters_to_list(Value);
                (Value) ->
                     Value
             end,
    maps:from_list([{Key, ToList(Value)}
                    || {Key, Value} <- maps:to_list(Map)]).
608
609
610%%-------------------------------------------------------------------------
611%% [RFC 3986, Chapter 4.1. URI Reference]
612%%
613%% URI-reference is used to denote the most common usage of a resource
614%% identifier.
615%%
616%%    URI-reference = URI / relative-ref
617%%-------------------------------------------------------------------------
%% Entry point of the parser.  Empty input gives a map holding only an
%% empty path.  Otherwise the input is first parsed as a URI with a
%% scheme; if that attempt throws a three-element tuple, the same input
%% is parsed again from the start as a relative reference.
-spec parse_uri_reference(binary(), uri_map()) -> uri_map().
parse_uri_reference(<<>>, _) ->
    #{path => <<>>};
parse_uri_reference(Input, URI) ->
    try parse_scheme_start(Input, URI) of
        Parsed ->
            Parsed
    catch
        throw:{_,_,_} ->
            parse_relative_part(Input, URI)
    end.
626
627
628%%-------------------------------------------------------------------------
629%% [RFC 3986, Chapter 4.2. Relative Reference]
630%%
631%% A relative reference takes advantage of the hierarchical syntax
632%% (Section 1.2.3) to express a URI reference relative to the name space
633%% of another hierarchical URI.
634%%
635%%    relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
636%%
637%%    relative-part = "//" authority path-abempty
638%%                  / path-absolute
639%%                  / path-noscheme
640%%                  / path-empty
641%%-------------------------------------------------------------------------
%% Parse a relative reference.  The first character(s) select the form:
%%   "//"  - authority: userinfo is tried first; if that throws, the
%%           same input is parsed as host instead
%%   "/"   - path-absolute
%%   "?"   - empty path followed by a query
%%   "#"   - empty path followed by a fragment
%%   other - path-noscheme; the character must be accepted by
%%           is_segment_nz_nc/1, otherwise {error,invalid_uri,[Char]}
%%           is thrown
%% Each clause lets the sub-parser consume the input, cuts out the text
%% of its own component from what the sub-parser returned and stores it
%% in the map.
-spec parse_relative_part(binary(), uri_map()) -> uri_map().
parse_relative_part(?STRING_REST("//", Authority), URI) ->
    %% Parse userinfo - "//" is NOT part of authority
    try parse_userinfo(Authority, URI) of
        {Unparsed, Parsed} ->
            Userinfo = calculate_parsed_userinfo(Authority, Unparsed),
            WithPath = maybe_add_path(Parsed),
            WithPath#{userinfo => Userinfo}
    catch
        throw:{_,_,_} ->
            {HostUnparsed, HostParsed} = parse_host(Authority, URI),
            Host = calculate_parsed_host_port(Authority, HostUnparsed),
            HostWithPath = maybe_add_path(HostParsed),
            HostWithPath#{host => remove_brackets(Host)}
    end;
parse_relative_part(?STRING_REST($/, Tail), URI) ->
    %% path-absolute
    {Unparsed, Parsed} = parse_segment(Tail, URI),
    Path = calculate_parsed_part(Tail, Unparsed),
    Parsed#{path => ?STRING_REST($/, Path)};
parse_relative_part(?STRING_REST($?, Tail), URI) ->
    %% path-empty ?query
    {Unparsed, Parsed} = parse_query(Tail, URI),
    Query = calculate_parsed_query_fragment(Tail, Unparsed),
    WithPath = maybe_add_path(Parsed),
    WithPath#{query => Query};
parse_relative_part(?STRING_REST($#, Tail), URI) ->
    %% path-empty #fragment
    {Unparsed, Parsed} = parse_fragment(Tail, URI),
    Fragment = calculate_parsed_query_fragment(Tail, Unparsed),
    WithPath = maybe_add_path(Parsed),
    WithPath#{fragment => Fragment};
parse_relative_part(?STRING_REST(Char, Tail), URI) ->
    %% path-noscheme
    is_segment_nz_nc(Char) orelse throw({error,invalid_uri,[Char]}),
    {Unparsed, Parsed} = parse_segment_nz_nc(Tail, URI),
    Path = calculate_parsed_part(Tail, Unparsed),
    Parsed#{path => ?STRING_REST(Char, Path)}.
679
680
681%%-------------------------------------------------------------------------
682%% [RFC 3986, Chapter 3.3. Path]
683%%
684%% The path component contains data, usually organized in hierarchical
685%% form, that, along with data in the non-hierarchical query component
686%% (Section 3.4), serves to identify a resource within the scope of the
687%% URI's scheme and naming authority (if any).  The path is terminated
688%% by the first question mark ("?") or number sign ("#") character, or
689%% by the end of the URI.
690%%
691%%    path          = path-abempty    ; begins with "/" or is empty
692%%                  / path-absolute   ; begins with "/" but not "//"
693%%                  / path-noscheme   ; begins with a non-colon segment
694%%                  / path-rootless   ; begins with a segment
695%%                  / path-empty      ; zero characters
696%%
697%%    path-abempty  = *( "/" segment )
698%%    path-absolute = "/" [ segment-nz *( "/" segment ) ]
699%%    path-noscheme = segment-nz-nc *( "/" segment )
700%%    path-rootless = segment-nz *( "/" segment )
701%%    path-empty    = 0<pchar>
702%%    segment       = *pchar
703%%    segment-nz    = 1*pchar
704%%    segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
705%%                  ; non-zero-length segment without any colon ":"
706%%
707%%    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
708%%-------------------------------------------------------------------------
709
710%%-------------------------------------------------------------------------
711%%    path-abempty
712%%-------------------------------------------------------------------------
%% Consume path characters.  "/" and every character accepted by
%% is_pchar/1 are skipped.  "?" and "#" hand over to the query and
%% fragment parsers; the component text is stored in the map and the
%% input following the delimiter is returned as first tuple element.
%% At the end of input an empty binary is returned.  Any other
%% character throws {error,invalid_uri,[Char]}.
-spec parse_segment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_segment(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI};
parse_segment(?STRING_REST($/, Tail), URI) ->
    parse_segment(Tail, URI);  % segment
parse_segment(?STRING_REST($?, Tail), URI) ->
    {Unparsed, Parsed} = parse_query(Tail, URI),  % ?query
    Query = calculate_parsed_query_fragment(Tail, Unparsed),
    {Tail, Parsed#{query => Query}};
parse_segment(?STRING_REST($#, Tail), URI) ->
    {Unparsed, Parsed} = parse_fragment(Tail, URI),  % #fragment
    Fragment = calculate_parsed_query_fragment(Tail, Unparsed),
    {Tail, Parsed#{fragment => Fragment}};
parse_segment(?STRING_REST(Char, Tail), URI) ->
    is_pchar(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_segment(Tail, URI).
731
732
733%%-------------------------------------------------------------------------
734%%    path-noscheme
735%%-------------------------------------------------------------------------
%% Consume the first segment of a path-noscheme, i.e. characters
%% accepted by is_segment_nz_nc/1.  After the first "/" parsing
%% continues with parse_segment/2.  "?" and "#" hand over to the query
%% and fragment parsers; the component text is stored in the map and the
%% input following the delimiter is returned as first tuple element.
%% At the end of input an empty binary is returned.  Any other
%% character throws {error,invalid_uri,[Char]}.
-spec parse_segment_nz_nc(binary(), uri_map()) -> {binary(), uri_map()}.
parse_segment_nz_nc(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI};
parse_segment_nz_nc(?STRING_REST($/, Tail), URI) ->
    parse_segment(Tail, URI);  % segment
parse_segment_nz_nc(?STRING_REST($?, Tail), URI) ->
    {Unparsed, Parsed} = parse_query(Tail, URI),  % ?query
    Query = calculate_parsed_query_fragment(Tail, Unparsed),
    {Tail, Parsed#{query => Query}};
parse_segment_nz_nc(?STRING_REST($#, Tail), URI) ->
    {Unparsed, Parsed} = parse_fragment(Tail, URI),  % #fragment
    Fragment = calculate_parsed_query_fragment(Tail, Unparsed),
    {Tail, Parsed#{fragment => Fragment}};
parse_segment_nz_nc(?STRING_REST(Char, Tail), URI) ->
    is_segment_nz_nc(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_segment_nz_nc(Tail, URI).
754
755
%% True for characters allowed in a pchar: "%" (start of a pct-encoded
%% triplet), ":", "@", and anything accepted by is_unreserved/1 or
%% is_sub_delim/1.
-spec is_pchar(char()) -> boolean().
is_pchar(Char) when Char =:= $%; Char =:= $:; Char =:= $@ ->
    true;
is_pchar(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).
762
%% True for characters allowed in a segment-nz-nc: "%" (start of a
%% pct-encoded triplet), "@", and anything accepted by is_unreserved/1
%% or is_sub_delim/1.  Unlike is_pchar/1 this excludes ":".
-spec is_segment_nz_nc(char()) -> boolean().
is_segment_nz_nc(Char) when Char =:= $%; Char =:= $@ ->
    true;
is_segment_nz_nc(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).
768
769
770%%-------------------------------------------------------------------------
771%% [RFC 3986, Chapter 3.1. Scheme]
772%%
773%% Each URI begins with a scheme name that refers to a specification for
774%% assigning identifiers within that scheme.
775%%
776%%    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
777%%-------------------------------------------------------------------------
%% Parse a URI that starts with a scheme.  The first character must be
%% accepted by is_alpha/1, otherwise {error,invalid_uri,[Char]} is
%% thrown.  The remaining input is handed to parse_scheme/2; the scheme
%% text is then cut out with calculate_parsed_scheme/2, prefixed with
%% the first character and stored under 'scheme'.  A missing 'path'
%% entry is filled in by maybe_add_path/1.
-spec parse_scheme_start(binary(), uri_map()) -> uri_map().
parse_scheme_start(?STRING_REST(Char, Rest), URI) ->
    is_alpha(Char) orelse throw({error,invalid_uri,[Char]}),
    {Unparsed, Parsed} = parse_scheme(Rest, URI),
    SchemeTail = calculate_parsed_scheme(Rest, Unparsed),
    WithPath = maybe_add_path(Parsed),
    WithPath#{scheme => ?STRING_REST(Char, SchemeTail)}.
787
%% Add an empty path component if the map has none after parsing.
%% According to the URI specification every URI-reference has a path
%% component, which may be empty.  A map that already contains a 'path'
%% key is returned unchanged.
maybe_add_path(#{path := _} = Map) ->
    Map;
maybe_add_path(Map) ->
    Map#{path => <<>>}.
799
800
801
%% Consume the characters of a scheme up to the terminating ":".
%% At the colon the rest of the input is parsed by parse_hier/2 and the
%% input following the colon is returned together with the resulting
%% map.  A character rejected by is_scheme/1 throws
%% {error,invalid_uri,[Char]}; reaching the end of input without having
%% seen a colon throws {error,invalid_uri,<<>>}.
-spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}.
parse_scheme(?STRING_EMPTY, _URI) ->
    throw({error,invalid_uri,<<>>});
parse_scheme(?STRING_REST($:, AfterColon), URI) ->
    {_Unparsed, Parsed} = parse_hier(AfterColon, URI),
    {AfterColon, Parsed};
parse_scheme(?STRING_REST(Char, Tail), URI) ->
    is_scheme(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_scheme(Tail, URI).
813
814
%% Returns true for characters permitted inside a scheme: letters,
%% digits, "+", "-" and ".".
-spec is_scheme(char()) -> boolean().
is_scheme(Char) when Char =:= $+; Char =:= $-; Char =:= $. ->
    true;
is_scheme(Char) ->
    is_alpha(Char) orelse is_digit(Char).
821
822
823%%-------------------------------------------------------------------------
824%%    hier-part   = "//" authority path-abempty
825%%                   / path-absolute
826%%                   / path-rootless
827%%                   / path-empty
828%%-------------------------------------------------------------------------
%% Parses everything that follows the "scheme:" prefix.
%%
%% Dispatch on the start of the input:
%%   "//"   authority: userinfo is attempted first; if that attempt throws,
%%          the same input is parsed again as a host
%%   "/"    path-absolute
%%   "?"    empty path followed by a query
%%   "#"    empty path followed by a fragment
%%   other  path-rootless, if the first character passes is_pchar/1
%%   empty  nothing left; the map is returned unchanged
%%
%% Every non-empty clause returns {Rest, URI1}: Rest is the input without
%% the leading delimiter/character matched by the clause, URI1 is the map
%% returned by the nested parser plus the component found by this clause.
%% The component value is recovered by comparing Rest with the unparsed
%% tail T reported by the nested parser.
%%
%% Throws {error,invalid_uri,[Char]} when the first character cannot start
%% a rootless path.
-spec parse_hier(binary(), uri_map()) -> {binary(), uri_map()}.
parse_hier(?STRING_REST("//", Rest), URI) ->
    % Parse userinfo - "//" is NOT part of authority
    try parse_userinfo(Rest, URI) of
        {T, URI1} ->
            Userinfo = calculate_parsed_userinfo(Rest, T),
	    {Rest, URI1#{userinfo => Userinfo}}
    catch
        %% Any 3-tuple thrown while the input is parsed as userinfo
        %% (including throws from the parsers it calls) means: start over
        %% and treat the authority as a host without userinfo.
        throw:{_,_,_} ->
            {T, URI1} = parse_host(Rest, URI),
            Host = calculate_parsed_host_port(Rest, T),
	    {Rest, URI1#{host => remove_brackets(Host)}}
    end;
parse_hier(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-absolute
    Path = calculate_parsed_part(Rest, T),
    %% The leading "/" was consumed by the clause head; put it back.
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_hier(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    Query = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{query => Query}};
parse_hier(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    Fragment = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{fragment => Fragment}};
parse_hier(?STRING_REST(Char, Rest), URI) ->  % path-rootless
    case is_pchar(Char) of
        true ->  % segment_nz
            {T, URI1} = parse_segment(Rest, URI),
            Path = calculate_parsed_part(Rest, T),
            %% Char is the first character of the path; prepend it again.
            {Rest, URI1#{path => ?STRING_REST(Char, Path)}};
        false -> throw({error,invalid_uri,[Char]})
    end;
parse_hier(?STRING_EMPTY, URI) ->
    {<<>>, URI}.
864
865
866%%-------------------------------------------------------------------------
867%% [RFC 3986, Chapter 3.2. Authority]
868%%
869%% Many URI schemes include a hierarchical element for a naming
870%% authority so that governance of the name space defined by the
871%% remainder of the URI is delegated to that authority (which may, in
872%% turn, delegate it further).
873%%
874%% The authority component is preceded by a double slash ("//") and is
875%% terminated by the next slash ("/"), question mark ("?"), or number
876%% sign ("#") character, or by the end of the URI.
877%%
878%%    authority   = [ userinfo "@" ] host [ ":" port ]
879%%
880%%
881%% [RFC 3986, Chapter 3.2.1. User Information]
882%%
883%% The userinfo subcomponent may consist of a user name and, optionally,
884%% scheme-specific information about how to gain authorization to access
885%% the resource. The user information, if present, is followed by a
886%% commercial at-sign ("@") that delimits it from the host.
887%%
888%%    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
889%%-------------------------------------------------------------------------
%% Scans userinfo characters up to the "@" delimiter.
%%
%% Clauses, in order:
%%   - ?CHAR($@): presumably the input is exactly "@" (macro defined
%%     elsewhere in this file); host is set to <<>> and nothing is left.
%%   - "@" followed by more input: the remainder is parsed as host and
%%     the host value, with enclosing brackets removed, is stored.
%%   - any other character must pass is_userinfo/1.
%%   - end of input without "@": throws {error,invalid_uri,<<>>}.
%%
%% Returns {Unparsed, URI1}; the caller derives the userinfo value from
%% the difference between its own input and Unparsed.
%% Throws {error,invalid_uri,[Char]} for a character rejected by
%% is_userinfo/1.
-spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}.
parse_userinfo(?CHAR($@), URI) ->
    {?STRING_EMPTY, URI#{host => <<>>}};
parse_userinfo(?STRING_REST($@, Rest), URI) ->
    {T, URI1} = parse_host(Rest, URI),
    Host = calculate_parsed_host_port(Rest, T),
    {Rest, URI1#{host => remove_brackets(Host)}};
parse_userinfo(?STRING_REST(Char, Rest), URI) ->
    case is_userinfo(Char) of
        true -> parse_userinfo(Rest, URI);
        false -> throw({error,invalid_uri,[Char]})
    end;
parse_userinfo(?STRING_EMPTY, _URI) ->
    %% URI cannot end in userinfo state
    throw({error,invalid_uri,<<>>}).
905
906
%% Returns true for characters permitted in userinfo: "%" (start of a
%% percent-encoded triplet), ":", or anything accepted by
%% is_unreserved/1 or is_sub_delim/1.
-spec is_userinfo(char()) -> boolean().
is_userinfo(Char) when Char =:= $%; Char =:= $: ->
    true;
is_userinfo(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).
912
913
914%%-------------------------------------------------------------------------
915%% [RFC 3986, Chapter 3.2.2. Host]
916%%
917%% The host subcomponent of authority is identified by an IP literal
918%% encapsulated within square brackets, an IPv4 address in dotted-
919%% decimal form, or a registered name.
920%%
921%%    host        = IP-literal / IPv4address / reg-name
922%%
923%%    IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
924%%
925%%    IPvFuture  = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
926%%
927%%    IPv6address =                            6( h16 ":" ) ls32
928%%                /                       "::" 5( h16 ":" ) ls32
929%%                / [               h16 ] "::" 4( h16 ":" ) ls32
930%%                / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
931%%                / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
932%%                / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
933%%                / [ *4( h16 ":" ) h16 ] "::"              ls32
934%%                / [ *5( h16 ":" ) h16 ] "::"              h16
935%%                / [ *6( h16 ":" ) h16 ] "::"
936%%
937%%    ls32        = ( h16 ":" h16 ) / IPv4address
938%%                ; least-significant 32 bits of address
939%%
940%%    h16         = 1*4HEXDIG
941%%                ; 16 bits of address represented in hexadecimal
942%%
943%%    IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
944%%
945%%    dec-octet   = DIGIT                 ; 0-9
946%%                / %x31-39 DIGIT         ; 10-99
947%%                / "1" 2DIGIT            ; 100-199
948%%                / "2" %x30-34 DIGIT     ; 200-249
949%%                / "25" %x30-35          ; 250-255
950%%
951%%    reg-name    = *( unreserved / pct-encoded / sub-delims )
952%%-------------------------------------------------------------------------
%% Parses the host subcomponent and whatever follows it.
%%
%% Dispatch on the first character of the input:
%%   ":"    a port follows
%%   "/"    path-abempty follows
%%   "?"    a query follows
%%   "["    bracketed IP literal, handed to parse_ipv6_bin/3
%%   "#"    a fragment follows
%%   digit  tried as an IPv4 address first, falling back to reg-name
%%   other  handed to parse_reg_name/2
%%   empty  end of input
%%
%% This function never calls itself; further host characters are consumed
%% by parse_ipv4_bin/3, parse_ipv6_bin/3 or parse_reg_name/2.
%%
%% Returns {Unparsed, URI1}; the caller derives the host value from the
%% difference between its own input and Unparsed.
-spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}.
parse_host(?STRING_REST($:, Rest), URI) ->
    {T, URI1} = parse_port(Rest, URI),
    %% H holds the characters between ":" and the next delimiter.
    H = calculate_parsed_host_port(Rest, T),
    Port = get_port(H),
    {Rest, URI1#{port => Port}};
parse_host(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    Path = calculate_parsed_part(Rest, T),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_host(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    Query = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{query => Query}};
parse_host(?STRING_REST($[, Rest), URI) ->
    parse_ipv6_bin(Rest, [], URI);
parse_host(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    Fragment = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{fragment => Fragment}};
parse_host(?STRING_REST(Char, Rest), URI) ->
    case is_digit(Char) of
        true ->
            %% The try covers the whole IPv4 attempt, including parsing of
            %% the components after the host, so any 3-tuple throw from
            %% there restarts parsing of the same input as a reg-name.
            try parse_ipv4_bin(Rest, [Char], URI)
            catch
                throw:{_,_,_} ->
                    parse_reg_name(?STRING_REST(Char, Rest), URI)
            end;
        false -> parse_reg_name(?STRING_REST(Char, Rest), URI)
    end;
parse_host(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.
985
986
%% Scans a registered name (reg-name) and whatever follows it.
%%
%% Characters accepted by is_reg_name/1 are skipped until one of the
%% delimiters ":", "/", "?" or "#" or the end of input is reached. At a
%% delimiter the matching component parser is called and its result is
%% stored in the map (port, path, query or fragment).
%%
%% Returns {Unparsed, URI1}, where Unparsed is the input following the
%% delimiter that ended the name (empty at end of input).
%% Throws {error,invalid_uri,[Char]} for a character that is neither a
%% delimiter nor accepted by is_reg_name/1.
-spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}.
parse_reg_name(?STRING_REST($:, Rest), URI) ->
    {T, URI1} = parse_port(Rest, URI),
    %% H holds the characters between ":" and the next delimiter.
    H = calculate_parsed_host_port(Rest, T),
    Port = get_port(H),
    {Rest, URI1#{port => Port}};
parse_reg_name(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    Path = calculate_parsed_part(Rest, T),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_reg_name(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    Query = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{query => Query}};
parse_reg_name(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    Fragment = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{fragment => Fragment}};
parse_reg_name(?STRING_REST(Char, Rest), URI) ->
    case is_reg_name(Char) of
        true -> parse_reg_name(Rest, URI);
        false -> throw({error,invalid_uri,[Char]})
    end;
parse_reg_name(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.
1012
%% Returns true for characters permitted in a reg-name: "%" (start of a
%% percent-encoded triplet) or anything accepted by is_unreserved/1 or
%% is_sub_delim/1.
-spec is_reg_name(char()) -> boolean().
is_reg_name(Char) ->
    Char =:= $% orelse is_unreserved(Char) orelse is_sub_delim(Char).
1017
1018
%% Scans a candidate IPv4 address and whatever follows it.
%%
%% Acc collects the characters seen so far in reverse order (the caller
%% seeds it with the first digit). Digits and "." are accumulated until a
%% delimiter (":", "/", "?", "#") or the end of input is reached; at that
%% point the accumulated string is checked with validate_ipv4_address/1,
%% which throws if it is not a strict dotted-decimal address, and the
%% component after the delimiter is parsed.
%%
%% Returns {Unparsed, URI1}, where Unparsed is the input following the
%% delimiter that ended the address (empty at end of input).
%% Throws {error,invalid_uri,[Char]} for any other character and
%% {error,invalid_uri,Addr} for an invalid address.
-spec parse_ipv4_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {T, URI1} = parse_port(Rest, URI),
    %% H holds the characters between ":" and the next delimiter.
    H = calculate_parsed_host_port(Rest, T),
    Port = get_port(H),
    {Rest, URI1#{port => Port}};
parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    Path = calculate_parsed_part(Rest, T),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    Query = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{query => Query}};
parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    Fragment = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{fragment => Fragment}};
parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) ->
    case is_ipv4(Char) of
        true -> parse_ipv4_bin(Rest, [Char|Acc], URI);
        false -> throw({error,invalid_uri,[Char]})
    end;
parse_ipv4_bin(?STRING_EMPTY, Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {?STRING_EMPTY, URI}.
1049
1050
%% Returns true for characters that can occur in a dotted-decimal IPv4
%% address: "." and the decimal digits.
-spec is_ipv4(char()) -> boolean().
is_ipv4(Char) ->
    Char =:= $. orelse is_digit(Char).
1055
%% Returns Addr unchanged if inet:parse_ipv4strict_address/1 accepts it,
%% otherwise throws {error,invalid_uri,Addr}.
-spec validate_ipv4_address(list()) -> list().
validate_ipv4_address(Addr) ->
    Outcome = inet:parse_ipv4strict_address(Addr),
    case Outcome of
        {error, _Reason} -> throw({error,invalid_uri,Addr});
        {ok, _Ip} -> Addr
    end.
1062
1063
%% Collects the characters of a bracketed IP literal up to the closing "]".
%%
%% The address characters are accumulated in reverse order. On "]" the
%% address is checked with validate_ipv6_address/1 (which throws when it
%% is invalid) and parsing continues in parse_ipv6_bin_end/2.
%% Throws {error,invalid_uri,[Char]} for a character rejected by is_ipv6/1
%% and {error,invalid_uri,<<>>} if the input ends before "]".
-spec parse_ipv6_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
parse_ipv6_bin(?STRING_REST($], AfterBracket), RevAddr, URI) ->
    _ = validate_ipv6_address(lists:reverse(RevAddr)),
    parse_ipv6_bin_end(AfterBracket, URI);
parse_ipv6_bin(?STRING_REST(Char, Tail), RevAddr, URI) ->
    is_ipv6(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_ipv6_bin(Tail, [Char|RevAddr], URI);
parse_ipv6_bin(?STRING_EMPTY, _RevAddr, _URI) ->
    throw({error,invalid_uri,<<>>}).
1075
%% Returns true for characters that can occur inside an IPv6 literal:
%% ":", "." and the hexadecimal digits.
-spec is_ipv6(char()) -> boolean().
is_ipv6(Char) when Char =:= $:; Char =:= $. ->
    true;
is_ipv6(Char) ->
    is_hex_digit(Char).
1081
1082
%% Continues parsing after the closing "]" of a bracketed IP literal.
%%
%% ":" starts a port, "/" a path, "?" a query and "#" a fragment; the
%% matching parser is called and its result is stored in the map. At the
%% end of input the map is returned unchanged.
%%
%% Returns {Unparsed, URI1}, where Unparsed is the input following the
%% delimiter (empty at end of input).
%% Throws {error,invalid_uri,[Char]} for a character that is neither a
%% delimiter nor accepted by is_ipv6/1.
%%
%% NOTE(review): the catch-all clause keeps consuming characters accepted
%% by is_ipv6/1 (hex digits and ".") after the "]", although the authority
%% grammar quoted earlier in this file allows only ":" port, "/", "?", "#"
%% or the end of input at this position - confirm whether this leniency
%% is intended.
-spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}.
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) ->
    {T, URI1} = parse_port(Rest, URI),
    %% H holds the characters between ":" and the next delimiter.
    H = calculate_parsed_host_port(Rest, T),
    Port = get_port(H),
    {Rest, URI1#{port => Port}};
parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    Path = calculate_parsed_part(Rest, T),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    Query = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{query => Query}};
parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    Fragment = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{fragment => Fragment}};
parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) ->
    case is_ipv6(Char) of
        true -> parse_ipv6_bin_end(Rest, URI);
        false -> throw({error,invalid_uri,[Char]})
    end;
parse_ipv6_bin_end(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.
1108
%% Returns Addr unchanged if inet:parse_ipv6strict_address/1 accepts it,
%% otherwise throws {error,invalid_uri,Addr}.
-spec validate_ipv6_address(list()) -> list().
validate_ipv6_address(Addr) ->
    Outcome = inet:parse_ipv6strict_address(Addr),
    case Outcome of
        {error, _Reason} -> throw({error,invalid_uri,Addr});
        {ok, _Ip} -> Addr
    end.
1115
1116
1117%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2.3. Port]
1119%%
1120%% The port subcomponent of authority is designated by an optional port
1121%% number in decimal following the host and delimited from it by a
1122%% single colon (":") character.
1123%%
1124%%    port        = *DIGIT
1125%%-------------------------------------------------------------------------
%% Scans the digits of a port and whatever follows it.
%%
%% Digits are skipped until "/", "?", "#" or the end of input is reached.
%% At a delimiter the matching component parser is called and its result
%% (path, query or fragment) is stored in the map. The port value itself
%% is not stored here; the caller extracts it from the returned offset.
%%
%% Returns {Unparsed, URI1}, where Unparsed is the input following the
%% delimiter that ended the port (empty at end of input).
%% Throws {error,invalid_uri,[Char]} for a character that is neither a
%% digit nor one of the delimiters.
-spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}.
parse_port(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    Path = calculate_parsed_part(Rest, T),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_port(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    Query = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{query => Query}};
parse_port(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    Fragment = calculate_parsed_query_fragment(Rest, T),
    {Rest, URI1#{fragment => Fragment}};
parse_port(?STRING_REST(Char, Rest), URI) ->
    case is_digit(Char) of
        true -> parse_port(Rest, URI);
        false -> throw({error,invalid_uri,[Char]})
    end;
parse_port(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.
1146
1147
1148%%-------------------------------------------------------------------------
1149%% [RFC 3986, Chapter 3.4. Query]
1150%%
1151%% The query component contains non-hierarchical data that, along with
1152%% data in the path component (Section 3.3), serves to identify a
1153%% resource within the scope of the URI's scheme and naming authority
1154%% (if any).  The query component is indicated by the first question
1155%% mark ("?") character and terminated by a number sign ("#") character
1156%% or by the end of the URI.
1157%%
1158%%    query       = *( pchar / "/" / "?" )
1159%%-------------------------------------------------------------------------
%% Scans query characters up to "#" or the end of input.
%%
%% On "#" the remainder is parsed as a fragment, which is stored in the
%% map, and the binary following the "#" is returned as the unparsed
%% tail. At the end of input the map is returned unchanged.
%% Throws {error,invalid_uri,[Char]} for a character rejected by
%% is_query/1.
-spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}.
parse_query(?STRING_REST($#, AfterHash), URI) ->
    {Unparsed, Parsed} = parse_fragment(AfterHash, URI),
    Fragment = calculate_parsed_query_fragment(AfterHash, Unparsed),
    {AfterHash, Parsed#{fragment => Fragment}};
parse_query(?STRING_REST(Char, Tail), URI) ->
    is_query(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_query(Tail, URI);
parse_query(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.
1172
1173
%% Returns true for characters permitted in a query: "/", "?" or anything
%% accepted by is_pchar/1.
-spec is_query(char()) -> boolean().
is_query(Char) when Char =:= $/; Char =:= $? ->
    true;
is_query(Char) ->
    is_pchar(Char).
1179
1180
1181%%-------------------------------------------------------------------------
1182%% [RFC 3986, Chapter 3.5. Fragment]
1183%%
1184%% The fragment identifier component of a URI allows indirect
1185%% identification of a secondary resource by reference to a primary
1186%% resource and additional identifying information.
1187%%
1188%%    fragment    = *( pchar / "/" / "?" )
1189%%-------------------------------------------------------------------------
%% Scans fragment characters to the end of input.
%% Returns an empty unparsed tail together with the unchanged map; the
%% caller extracts the fragment value itself.
%% Throws {error,invalid_uri,[Char]} for a character rejected by
%% is_fragment/1.
-spec parse_fragment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_fragment(?STRING_REST(Char, Tail), URI) ->
    is_fragment(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_fragment(Tail, URI);
parse_fragment(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.
1198
1199
%% Returns true for characters permitted in a fragment: "/", "?" or
%% anything accepted by is_pchar/1.
-spec is_fragment(char()) -> boolean().
is_fragment(Char) ->
    Char =:= $/ orelse Char =:= $? orelse is_pchar(Char).
1205
1206
1207%%-------------------------------------------------------------------------
1208%% [RFC 3986, Chapter 2.2. Reserved Characters]
1209%%
1210%%   reserved    = gen-delims / sub-delims
1211%%
1212%%   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1213%%
1214%%   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
1215%%               / "*" / "+" / "," / ";" / "="
1216%%
1217%%-------------------------------------------------------------------------
1218
%% Return true if the input char belongs to the reserved set, i.e. is one
%% of the gen-delims  : / ? # [ ] @  or one of the sub-delims
%% ! $ & ' ( ) * + , ; =
-spec is_reserved(char()) -> boolean().
is_reserved(Char) ->
    %% gen-delims first, sub-delims second.
    lists:member(Char, ":/?#[]@" ++ "!$&'()*+,;=").
1242
1243
%% Returns true when Char is one of the sub-delims:
%%   ! $ & ' ( ) * + , ; =
-spec is_sub_delim(char()) -> boolean().
is_sub_delim(Char)
  when Char =:= $!; Char =:= $$; Char =:= $&; Char =:= $';
       Char =:= $(; Char =:= $); Char =:= $*; Char =:= $+;
       Char =:= $,; Char =:= $;; Char =:= $= ->
    true;
is_sub_delim(_Other) ->
    false.
1259
1260
1261%%-------------------------------------------------------------------------
1262%% [RFC 3986, Chapter 2.3. Unreserved Characters]
1263%%
1264%%   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
1265%%
1266%%-------------------------------------------------------------------------
%% Returns true when Char is unreserved: a letter, a digit, or one of
%% "-", ".", "_", "~".
-spec is_unreserved(char()) -> boolean().
is_unreserved(Char)
  when Char =:= $-; Char =:= $.; Char =:= $_; Char =:= $~ ->
    true;
is_unreserved(Char) ->
    is_alpha(Char) orelse is_digit(Char).
1273
%% Returns true for the ASCII letters A-Z and a-z.
-spec is_alpha(char()) -> boolean().
is_alpha(C) when C >= $A, C =< $Z -> true;
is_alpha(C) when C >= $a, C =< $z -> true;
is_alpha(_) -> false.
1279
%% Returns true for the ASCII decimal digits 0-9.
-spec is_digit(char()) -> boolean().
is_digit(C) ->
    $0 =< C andalso C =< $9.
1284
%% Returns true for the ASCII hexadecimal digits 0-9, a-f and A-F.
-spec is_hex_digit(char()) -> boolean().
is_hex_digit(C) when C >= $0, C =< $9 -> true;
is_hex_digit(C) when C >= $a, C =< $f -> true;
is_hex_digit(C) when C >= $A, C =< $F -> true;
is_hex_digit(_) -> false.
1289
1290
%% Remove enclosing brackets from binary.
%%
%% A binary that starts with "[" has that bracket dropped; if it also ends
%% with "]" the closing bracket is dropped as well. Any other binary is
%% returned unchanged.
%%
%% A lone "[" yields <<>>. (It used to raise badarg, because the split
%% position byte_size(Rest) - 1 became negative.)
-spec remove_brackets(binary()) -> binary().
remove_brackets(<<$[, Rest/binary>>) when byte_size(Rest) > 0 ->
    BodySize = byte_size(Rest) - 1,
    case Rest of
        <<Body:BodySize/binary, $]>> -> Body;
        _NoClosingBracket -> Rest
    end;
remove_brackets(<<$[, Rest/binary>>) ->
    %% Nothing follows the opening bracket, so there is nothing to split.
    Rest;
remove_brackets(Addr) -> Addr.
1300
1301
1302%%-------------------------------------------------------------------------
1303%% Helper functions for calculating the parsed binary.
1304%%-------------------------------------------------------------------------
%% Extracts the scheme from Input given the unparsed tail.
%% With an empty tail, Input is returned with one trailing ":" removed if
%% present; otherwise the part of Input in front of the tail and its
%% preceding delimiter is returned.
-spec calculate_parsed_scheme(binary(), binary()) -> binary().
calculate_parsed_scheme(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$:]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1310
1311
%% Extracts a path from Input given the unparsed tail.
%% With an empty tail, Input is returned with one trailing "?" or "#"
%% removed if present; otherwise the part of Input in front of the tail
%% and its preceding delimiter is returned.
-spec calculate_parsed_part(binary(), binary()) -> binary().
calculate_parsed_part(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$?,$#]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1317
1318
%% Extracts the userinfo from Input given the unparsed tail.
%% With an empty tail, Input is returned with one trailing "?", "#" or
%% "@" removed if present; otherwise the part of Input in front of the
%% tail and its preceding delimiter is returned.
-spec calculate_parsed_userinfo(binary(), binary()) -> binary().
calculate_parsed_userinfo(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$?,$#,$@]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1324
1325
%% Extracts a host or port from Input given the unparsed tail.
%% With an empty tail, Input is returned with one trailing ":", "?", "#"
%% or "/" removed if present; otherwise the part of Input in front of the
%% tail and its preceding delimiter is returned.
-spec calculate_parsed_host_port(binary(), binary()) -> binary().
calculate_parsed_host_port(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$:,$?,$#,$/]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1331
1332
%% Extracts a query or fragment from Input given the unparsed tail.
%% With an empty tail, Input is returned with one trailing "#" removed if
%% present; otherwise the part of Input in front of the tail and its
%% preceding delimiter is returned.
calculate_parsed_query_fragment(Input, Unparsed) ->
    case Unparsed of
        <<>> -> strip_last_char(Input, [$#]);
        _ -> get_parsed_binary(Input, Unparsed)
    end.
1337
1338
%% Converts the textual port to an integer.
%% An empty binary means that no port number was given and yields
%% `undefined'. Throws {error, invalid_uri, B} when B is not accepted by
%% binary_to_integer/1.
get_port(<<>>) ->
    undefined;
get_port(B) ->
    try binary_to_integer(B) of
        Port -> Port
    catch
        error:badarg ->
            throw({error, invalid_uri, B})
    end.
1347
1348
%% Strip last char if it is in list
%%
%% Input is returned without its final byte when that byte equals one of
%% the characters in the list, and unchanged otherwise. An empty Input
%% yields <<>>. Only lists of one to four characters are supported.
%%
%% This function is optimized for speed: parse/1 is about 10% faster than
%% with an alternative implementation based on lists and sets.
strip_last_char(<<>>, _Chars) ->
    <<>>;
strip_last_char(Input, [A]) ->
    case binary:last(Input) of
        Last when Last =:= A ->
            init_binary(Input);
        _Other ->
            Input
    end;
strip_last_char(Input, [A,B]) ->
    case binary:last(Input) of
        Last when Last =:= A; Last =:= B ->
            init_binary(Input);
        _Other ->
            Input
    end;
strip_last_char(Input, [A,B,C]) ->
    case binary:last(Input) of
        Last when Last =:= A; Last =:= B; Last =:= C ->
            init_binary(Input);
        _Other ->
            Input
    end;
strip_last_char(Input, [A,B,C,D]) ->
    case binary:last(Input) of
        Last when Last =:= A; Last =:= B; Last =:= C; Last =:= D ->
            init_binary(Input);
        _Other ->
            Input
    end.
1394
1395
%% Get parsed binary
%%
%% Returns the leading part of Input that lies in front of the unparsed
%% tail. The number of bytes cut off at the end is computed by
%% byte_size_exl_head/1.
get_parsed_binary(Input, Unparsed) ->
    ParsedSize = byte_size(Input) - byte_size_exl_head(Unparsed),
    binary:part(Input, 0, ParsedSize).
1400
1401
%% Drops the final byte of B and returns what is left.
%% B must hold at least one byte.
init_binary(B) ->
    binary:part(B, 0, byte_size(B) - 1).
1407
1408
%% Returns the number of bytes to cut off the end of the input when
%% calling split_binary(): 0 for an empty binary, otherwise the size of
%% the binary plus one.
-spec byte_size_exl_head(binary()) -> number().
byte_size_exl_head(Binary) ->
    case byte_size(Binary) of
        0 -> 0;
        Size -> Size + 1
    end.
1414
1415
1416%%-------------------------------------------------------------------------
1417%% [RFC 3986, Chapter 2.1.  Percent-Encoding]
1418%%
1419%% A percent-encoding mechanism is used to represent a data octet in a
1420%% component when that octet's corresponding character is outside the
1421%% allowed set or is being used as a delimiter of, or within, the
1422%% component.  A percent-encoded octet is encoded as a character
1423%% triplet, consisting of the percent character "%" followed by the two
1424%% hexadecimal digits representing that octet's numeric value.  For
1425%% example, "%20" is the percent-encoding for the binary octet
1426%% "00100000" (ABNF: %x20), which in US-ASCII corresponds to the space
1427%% character (SP).  Section 2.4 describes when percent-encoding and
1428%% decoding is applied.
1429%%
1430%%   pct-encoded = "%" HEXDIG HEXDIG
1431%%-------------------------------------------------------------------------
1432
1433%%-------------------------------------------------------------------------
1434%% Percent-encode
1435%%-------------------------------------------------------------------------
1436
%% Only validates, as a scheme cannot contain percent-encoded characters.
%%
%% Returns Scheme unchanged when validate_scheme/1 accepts it.
%% Throws {error,invalid_scheme,Scheme} for an empty scheme or one that
%% validate_scheme/1 rejects.
%%
%% NOTE(review): validate_scheme/1 applies is_scheme/1 to every character,
%% so a digit, "+", "-" or "." is accepted in first position although the
%% scheme rule quoted earlier in this file requires ALPHA first - confirm
%% whether that is intended.
-spec encode_scheme(list()|binary()) -> list() | binary().
encode_scheme(Empty) when Empty =:= []; Empty =:= <<>> ->
    throw({error,invalid_scheme,Empty});
encode_scheme(Scheme) ->
    validate_scheme(Scheme) orelse throw({error,invalid_scheme,Scheme}),
    Scheme.
1448
%% Percent-encodes every character of the userinfo that is_userinfo/1
%% rejects.
-spec encode_userinfo(list()|binary()) -> list() | binary().
encode_userinfo(Userinfo) ->
    Allowed = fun is_userinfo/1,
    encode(Userinfo, Allowed).
1452
%% Encodes a host according to its classification by classify_host/1:
%% reg-names and IPv4 addresses are returned as they are, IPv6 addresses
%% are wrapped in brackets, and everything else is percent-encoded with
%% is_reg_name/1 as the set of allowed characters.
-spec encode_host(list()|binary()) -> list() | binary().
encode_host(Cs) ->
    case classify_host(Cs) of
        ipv6 -> bracket_ipv6(Cs);
        other -> encode(Cs, fun is_reg_name/1);
        Plain when Plain =:= regname; Plain =:= ipv4 -> Cs
    end.
1461
%% Percent-encodes every character of the path that is_path/1 rejects.
-spec encode_path(list()|binary()) -> list() | binary().
encode_path(Path) ->
    Allowed = fun is_path/1,
    encode(Path, Allowed).
1465
%% Percent-encodes every character of the query that is_query/1 rejects.
-spec encode_query(list()|binary()) -> list() | binary().
encode_query(Query) ->
    Allowed = fun is_query/1,
    encode(Query, Allowed).
1469
%% Percent-encodes every character of the fragment that is_fragment/1
%% rejects.
-spec encode_fragment(list()|binary()) -> list() | binary().
encode_fragment(Fragment) ->
    Allowed = fun is_fragment/1,
    encode(Fragment, Allowed).
1473
1474%%-------------------------------------------------------------------------
%% Helper functions for percent-decode
1476%%-------------------------------------------------------------------------
1477
%% Partially percent-decodes Cs.
%%
%% A valid triplet "%XY" is replaced by the octet it encodes only when
%% that octet is accepted by is_unreserved/1. Every other valid triplet is
%% kept encoded, with its two hex digits converted to upper case. All
%% other bytes are copied through unchanged; this includes a "%" that is
%% followed by fewer than two bytes.
%%
%% List input is converted to a binary, decoded and converted back to a
%% list; binary input yields a binary.
%%
%% Throws {error,invalid_percent_encoding,<<$%,C0,C1>>} when "%" is
%% followed by two bytes that are not both hex digits, and
%% {error,invalid_utf8,Result} (from check_utf8/1) when the decoded result
%% is not valid UTF-8.
-spec decode(list()|binary()) -> list() | binary().
decode(Cs) ->
    decode(Cs, <<>>).
%%
decode(L, Acc) when is_list(L) ->
    B0 = unicode:characters_to_binary(L),
    B1 = decode(B0, Acc),
    unicode:characters_to_list(B1);
decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        true ->
            %% B is the octet encoded by the triplet (?HEX2DEC is defined
            %% elsewhere; presumably the numeric value of a hex digit).
            B = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
            %% [2.4] When a URI is dereferenced, the components and subcomponents
            %% significant to the scheme-specific dereferencing process (if any)
            %% must be parsed and separated before the percent-encoded octets within
            %% those components can be safely decoded, as otherwise the data may be
            %% mistaken for component delimiters.  The only exception is for
            %% percent-encoded octets corresponding to characters in the unreserved
            %% set, which can be decoded at any time.
            case is_unreserved(B) of
                false ->
                    %% [2.2] Characters in the reserved set are protected from
                    %% normalization.
                    %% [2.1] For consistency, URI producers and normalizers should
                    %% use uppercase hexadecimal digits for all percent-
                    %% encodings.
                    H0 = hex_to_upper(C0),
                    H1 = hex_to_upper(C1),
                    decode(Cs, <<Acc/binary,$%,H0,H1>>);
                true ->
                    decode(Cs, <<Acc/binary, B>>)
            end;
        false -> throw({error,invalid_percent_encoding,<<$%,C0,C1>>})
    end;
decode(<<C,Cs/binary>>, Acc) ->
    decode(Cs, <<Acc/binary, C>>);
decode(<<>>, Acc) ->
    %% The decoded result must itself be valid UTF-8.
    check_utf8(Acc).
1516
%% Percent-decodes Cs completely.
%%
%% Every valid triplet "%XY" is replaced by the octet it encodes, whether
%% or not the octet is a reserved character. All other bytes are copied
%% through unchanged; this includes a "%" that is followed by fewer than
%% two bytes.
%%
%% List input is converted to a binary, decoded and converted back to a
%% list; for list input a thrown {error, Atom, RestData} is caught and
%% returned as a value. For binary input the same errors are thrown:
%% {error,invalid_percent_encoding,<<$%,C0,C1>>} when "%" is followed by
%% two bytes that are not both hex digits, and {error,invalid_utf8,Result}
%% (from check_utf8/1) when the decoded result is not valid UTF-8.
-spec raw_decode(list()|binary()) -> list() | binary() | error().
raw_decode(Cs) ->
    raw_decode(Cs, <<>>).
%%
raw_decode(L, Acc) when is_list(L) ->
    try
        B0 = unicode:characters_to_binary(L),
        B1 = raw_decode(B0, Acc),
        unicode:characters_to_list(B1)
    catch
        throw:{error, Atom, RestData} ->
            {error, Atom, RestData}
    end;
raw_decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        true ->
            %% B is the octet encoded by the triplet (?HEX2DEC is defined
            %% elsewhere; presumably the numeric value of a hex digit).
            B = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
            raw_decode(Cs, <<Acc/binary, B>>);
        false ->
            throw({error,invalid_percent_encoding,<<$%,C0,C1>>})
    end;
raw_decode(<<C,Cs/binary>>, Acc) ->
    raw_decode(Cs, <<Acc/binary, C>>);
raw_decode(<<>>, Acc) ->
    %% The decoded result must itself be valid UTF-8.
    check_utf8(Acc).
1542
%% Returns Cs when unicode:characters_to_list/1 can convert it, i.e. when
%% it is UTF-8 encoded; throws {error,invalid_utf8,Cs} when the conversion
%% reports an error or incomplete input.
check_utf8(Cs) ->
    case unicode:characters_to_list(Cs) of
        {Tag,_,_} when Tag =:= incomplete; Tag =:= error ->
            throw({error,invalid_utf8,Cs});
        _Converted ->
            Cs
    end.
1552
%% Converts a hexadecimal digit to its upper-case form: a-f become A-F,
%% while 0-9 and A-F are returned as they are.
%% Throws {error,invalid_input, H} when H is not a hexadecimal digit.
hex_to_upper(H) when H >= $a, H =< $f ->
    H - ($a - $A);
hex_to_upper(H) when H >= $0, H =< $9 ->
    H;
hex_to_upper(H) when H >= $A, H =< $F ->
    H;
hex_to_upper(H) ->
    throw({error,invalid_input, H}).
1560
%% Returns true for characters permitted in a host: ":" or anything
%% accepted by is_unreserved/1 or is_sub_delim/1.
-spec is_host(char()) -> boolean().
is_host(Char) ->
    Char =:= $: orelse is_unreserved(Char) orelse is_sub_delim(Char).
1565
%% Returns true for characters permitted in a path: "/" or anything
%% accepted by is_pchar/1.
-spec is_path(char()) -> boolean().
is_path(Char) ->
    Char =:= $/ orelse is_pchar(Char).
1570
1571
1572%%-------------------------------------------------------------------------
1573%% Helper functions for percent-encode
1574%%-------------------------------------------------------------------------
%% Percent-encodes Component, leaving untouched every character for which
%% Fun returns true.
%%
%% Component is either a binary or a list of characters; the result has
%% the same type as the input. Fun is a predicate on a single code point,
%% for example fun is_path/1.
%%
%% Throws {error,invalid_input,Data} when the input is not valid Unicode:
%% for binaries Data is the remainder starting at the offending byte, for
%% lists it is the whole input.
-spec encode(list()|binary(), fun()) -> list() | binary().
encode(Component, Fun) when is_list(Component) ->
    %% characters_to_binary/1 returns an {error,_,_} or {incomplete,_,_}
    %% tuple for lists holding invalid code points. Report that the same
    %% way as an invalid binary instead of failing with function_clause
    %% in encode/3.
    case unicode:characters_to_binary(Component) of
        B when is_binary(B) ->
            unicode:characters_to_list(encode(B, Fun, <<>>));
        _Invalid ->
            throw({error,invalid_input,Component})
    end;
encode(Component, Fun) when is_binary(Component) ->
    encode(Component, Fun, <<>>).
%%
encode(<<Char/utf8, Rest/binary>>, Fun, Acc) ->
    C = encode_codepoint_binary(Char, Fun),
    encode(Rest, Fun, <<Acc/binary,C/binary>>);
encode(<<Char, Rest/binary>>, _Fun, _Acc) ->
    %% The input does not continue with a valid UTF-8 sequence.
    throw({error,invalid_input,<<Char,Rest/binary>>});
encode(<<>>, _Fun, Acc) ->
    Acc.
1589
1590
%% Encodes a single code point: when Fun accepts it, the code point is
%% emitted as one byte; otherwise it is percent-encoded.
-spec encode_codepoint_binary(integer(), fun()) -> binary().
encode_codepoint_binary(C, Fun) ->
    Allowed = Fun(C),
    case Allowed of
        true -> <<C>>;
        false -> percent_encode_binary(C)
    end.
1597
1598
%% Percent-encodes a code point: the code point is UTF-8 encoded and each
%% resulting byte is written as "%" followed by two characters produced
%% by ?DEC2HEX from the high and the low four bits of the byte.
-spec percent_encode_binary(integer()) -> binary().
percent_encode_binary(Code) ->
    Utf8 = <<Code/utf8>>,
    percent_encode_binary(Utf8, <<>>).


percent_encode_binary(<<Hi:4,Lo:4,Tail/binary>>, Out) ->
    percent_encode_binary(Tail, <<Out/binary,$%,(?DEC2HEX(Hi)),(?DEC2HEX(Lo))>>);
percent_encode_binary(<<>>, Out) ->
    Out.
1608
1609
1610%%-------------------------------------------------------------------------
1611%%-------------------------------------------------------------------------
%% Returns true when every character (list input) or every byte (binary
%% input) of the scheme is accepted by is_scheme/1. An empty scheme
%% yields true.
validate_scheme([]) ->
    true;
validate_scheme([H|T]) ->
    is_scheme(H) andalso validate_scheme(T);
validate_scheme(<<>>) ->
    true;
validate_scheme(<<H, Rest/binary>>) ->
    is_scheme(H) andalso validate_scheme(Rest).
1624
1625
1626%%-------------------------------------------------------------------------
1627%% Classifies hostname into the following categories:
1628%% regname, ipv4 - address does not contain reserved characters to be
1629%%           percent-encoded
%% ipv6 - address does not contain reserved characters but it shall be
%%        enclosed in brackets
1632%% other - address shall be percent-encoded
1633%%-------------------------------------------------------------------------
%% Entry point of the host classification chain (ipv6 -> ipv4 -> regname).
%% The empty list is classified as 'other'; binaries are converted to a
%% list of code points before classification.
classify_host([]) ->
    other;
classify_host(Addr) when is_binary(Addr) ->
    classify_host_ipv6(unicode:characters_to_list(Addr));
classify_host(Addr) ->
    classify_host_ipv6(Addr).
1640
%% Returns ipv6 for a strict IPv6 address, otherwise continues with the
%% IPv4 check.
classify_host_ipv6(Addr) ->
    IsIPv6 = is_ipv6_address(Addr),
    if
        IsIPv6 -> ipv6;
        true -> classify_host_ipv4(Addr)
    end.
1646
%% Returns ipv4 for a strict IPv4 address, otherwise continues with the
%% reg-name check.
classify_host_ipv4(Addr) ->
    IsIPv4 = is_ipv4_address(Addr),
    if
        IsIPv4 -> ipv4;
        true -> classify_host_regname(Addr)
    end.
1652
%% Returns regname when every character is accepted by is_reg_name/1
%% (including the empty list), and other as soon as one is rejected.
classify_host_regname(Addr) ->
    case lists:all(fun is_reg_name/1, Addr) of
        true -> regname;
        false -> other
    end.
1659
%% True when inet accepts Addr as a strictly formatted IPv4 address.
is_ipv4_address(Addr) ->
    Parsed = inet:parse_ipv4strict_address(Addr),
    ok =:= element(1, Parsed).
1665
%% True when inet accepts Addr as a strictly formatted IPv6 address.
is_ipv6_address(Addr) ->
    Parsed = inet:parse_ipv6strict_address(Addr),
    ok =:= element(1, Parsed).
1671
%% Wraps an address in square brackets, keeping the input type.
bracket_ipv6(Addr) when is_binary(Addr) ->
    Opened = <<"[", Addr/binary>>,
    concat(Opened, <<"]">>);
bracket_ipv6(Addr) when is_list(Addr) ->
    "[" ++ Addr ++ "]".
1676
1677
1678%%-------------------------------------------------------------------------
%% Helper functions for recompose
1680%%-------------------------------------------------------------------------
1681
1682%%-------------------------------------------------------------------------
1683%% Checks if input Map has valid combination of fields that can be
1684%% recomposed into a URI.
1685%%
1686%% The implementation is based on a decision tree that fulfills the
1687%% following rules:
1688%%   - 'path' shall always be present in the input map
1689%%       URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
1690%%       hier-part   = "//" authority path-abempty
1691%%                      / path-absolute
1692%%                      / path-rootless
1693%%                      / path-empty
1694%%   - 'host' shall be present in the input map when 'path' starts with
1695%%     two slashes ("//")
1696%%       path          = path-abempty    ; begins with "/" or is empty
1697%%                     / path-absolute   ; begins with "/" but not "//"
1698%%                     / path-noscheme   ; begins with a non-colon segment
1699%%                     / path-rootless   ; begins with a segment
1700%%                     / path-empty      ; zero characters
1701%%       path-abempty  = *( "/" segment )
1702%%       segment       = *pchar
1703%%   - 'host' shall be present if userinfo or port is present in input map
1704%%       authority   = [ userinfo "@" ] host [ ":" port ]
1705%%   - All fields shall be valid (scheme, userinfo, host, port, path, query
1706%%     or fragment).
1707%%-------------------------------------------------------------------------
%% Returns false for maps without a 'path' key. Otherwise the map is
%% accepted when a host is required and present with only known keys, or,
%% failing that, when all_fields_valid/1 accepts it.
%% NOTE(review): a failed host requirement falls through to
%% all_fields_valid/1, so a map that needs a host but lacks one is still
%% accepted as long as all of its keys are known. Confirm whether the host
%% rules in the comment above are meant to be enforced.
is_valid_map(#{path := Path} = Map) ->
    NeedsHost = starts_with_two_slash(Path)
        orelse maps:is_key(userinfo, Map)
        orelse maps:is_key(port, Map),
    (NeedsHost andalso is_valid_map_host(Map))
        orelse all_fields_valid(Map);
is_valid_map(#{}) ->
    false.
1718
1719
%% True when the map has a 'host' key and contains only known keys.
is_valid_map_host(#{host := _} = Map) ->
    all_fields_valid(Map);
is_valid_map_host(#{}) ->
    false.
1722
1723
%% True when every key of Map is one of the seven URI component names.
%% Only the keys are inspected; the values are not validated here.
all_fields_valid(Map) ->
    Known = [scheme, userinfo, host, port, path, query, fragment],
    Unknown = maps:without(Known, Map),
    map_size(Unknown) =:= 0.
1735
1736
%% True when the path (list, or whatever ?STRING_REST matches) begins with
%% "//"; false for every other term.
starts_with_two_slash("//" ++ _) ->
    true;
starts_with_two_slash(?STRING_REST("//", _)) ->
    true;
starts_with_two_slash(_Other) ->
    false.
1742
1743
%% First recompose step: yields "scheme:" when the map has a scheme, and the
%% atom 'empty' otherwise. The second argument is ignored.
update_scheme(#{scheme := Scheme}, _) ->
    Encoded = encode_scheme(Scheme),
    add_colon_postfix(Encoded);
update_scheme(#{}, _) ->
    empty.
1748
1749
%% Appends "//userinfo" to the URI built so far ('empty' meaning nothing has
%% been produced yet). Without a userinfo key the URI is returned unchanged.
update_userinfo(#{userinfo := Userinfo}, URI) ->
    Auth = add_auth_prefix(encode_userinfo(Userinfo)),
    case URI of
        empty -> Auth;
        _ -> concat(URI, Auth)
    end;
update_userinfo(#{}, URI) ->
    URI.
1758
1759
%% Appends the encoded host. When nothing has been produced yet the host is
%% prefixed with "//"; otherwise add_host_prefix/2 chooses the prefix.
%% Without a host key the URI is returned unchanged.
update_host(#{host := Host} = Map, URI) ->
    Encoded = encode_host(Host),
    case URI of
        empty -> add_auth_prefix(Encoded);
        _ -> concat(URI, add_host_prefix(Map, Encoded))
    end;
update_host(#{}, URI) ->
    URI.
1768
1769
1770%% URI cannot be empty for ports. E.g. ":8080" is not a valid URI
%% Appends ":Port" to the URI. A port of 'undefined' appends only the colon.
%% Without a port key the URI is returned unchanged.
update_port(#{port := Port}, URI) ->
    Suffix = case Port of
                 undefined -> <<":">>;
                 _ -> add_colon(encode_port(Port))
             end,
    concat(URI, Suffix);
update_port(#{}, URI) ->
    URI.
1777
1778
%% Appends the encoded path to the URI built so far.
%% If something has already been produced and the map has a host, the path
%% is first made absolute, because a path following an authority must begin
%% with "/" or be empty. Without a path key the URI is returned unchanged.
update_path(#{path := Path}, empty) ->
    encode_path(Path);
update_path(#{path := Path} = Map, URI) ->
    Effective = case maps:is_key(host, Map) of
                    true -> make_path_absolute(maybe_flatten_list(Path));
                    false -> Path
                end,
    concat(URI, encode_path(Effective));
update_path(#{}, URI) ->
    URI.
1792
1793
%% Appends "?query" to the URI built so far. When nothing has been produced
%% yet, the encoded query is returned without the question mark.
%% Without a query key the URI is returned unchanged.
update_query(#{query := Query}, URI) ->
    Encoded = encode_query(Query),
    case URI of
        empty -> Encoded;
        _ -> concat(URI, add_question_mark(Encoded))
    end;
update_query(#{}, URI) ->
    URI.
1802
1803
%% Appends "#fragment" to the URI built so far. When there is neither a
%% fragment nor any previously produced part, the empty string is returned
%% instead of the atom 'empty'.
update_fragment(#{fragment := Fragment}, URI) ->
    Tagged = add_hashmark(encode_fragment(Fragment)),
    case URI of
        empty -> Tagged;
        _ -> concat(URI, Tagged)
    end;
update_fragment(#{}, empty) ->
    "";
update_fragment(#{}, URI) ->
    URI.
1812
1813%%-------------------------------------------------------------------------
1814%% Concatenates its arguments that can be lists and binaries.
1815%% The result is a list if at least one of its argument is a list and
1816%% binary otherwise.
1817%%-------------------------------------------------------------------------
%% A list on either side makes the result a list; two binaries give a binary.
concat(A, B) when is_list(A) ->
    A ++ maybe_to_list(B);
concat(A, B) when is_binary(A), is_list(B) ->
    unicode:characters_to_list(A) ++ B;
concat(A, B) when is_binary(A), is_binary(B) ->
    <<A/binary, B/binary>>.
1824
%% Prefixes a component with "#", keeping the input type.
add_hashmark(Comp) when is_binary(Comp) ->
    <<"#", Comp/binary>>;
add_hashmark(Comp) when is_list(Comp) ->
    "#" ++ Comp.
1829
%% Prefixes a component with "?", keeping the input type.
add_question_mark(Comp) when is_binary(Comp) ->
    <<"?", Comp/binary>>;
add_question_mark(Comp) when is_list(Comp) ->
    "?" ++ Comp.
1834
%% Prefixes a binary component with ":". Only binaries are accepted.
add_colon(Comp) when is_binary(Comp) ->
    <<":", Comp/binary>>.
1837
%% Appends ":" to a component, keeping the input type.
add_colon_postfix(Comp) when is_binary(Comp) ->
    <<Comp/binary, ":">>;
add_colon_postfix(Comp) when is_list(Comp) ->
    lists:append(Comp, ":").
1842
%% Prefixes a component with the authority marker "//", keeping the input
%% type.
add_auth_prefix(Comp) when is_binary(Comp) ->
    <<$/, $/, Comp/binary>>;
add_auth_prefix(Comp) when is_list(Comp) ->
    "//" ++ Comp.
1847
%% Chooses the separator that precedes the host: "@" when the map has a
%% userinfo key, "//" otherwise. The input type of Host is kept.
add_host_prefix(#{userinfo := _}, Host) when is_binary(Host) ->
    <<"@", Host/binary>>;
add_host_prefix(#{userinfo := _}, Host) when is_list(Host) ->
    "@" ++ Host;
add_host_prefix(#{}, Host) when is_binary(Host) ->
    <<$/, $/, Host/binary>>;
add_host_prefix(#{}, Host) when is_list(Host) ->
    "//" ++ Host.
1856
%% Converts a binary to a list of code points; any other term is returned
%% unchanged.
maybe_to_list(Comp) ->
    case is_binary(Comp) of
        true -> unicode:characters_to_list(Comp);
        false -> Comp
    end.
1859
%% Renders an integer port as a decimal binary.
encode_port(Port) ->
    integer_to_binary(Port, 10).
1862
1863%% URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
1864%%
1865%% hier-part     = "//" authority path-abempty
1866%%               / path-absolute
1867%%               / path-rootless
1868%%               / path-empty
1869%%
1870%% path          = path-abempty    ; begins with "/" or is empty
1871%%               / path-absolute   ; begins with "/" but not "//"
1872%%               / path-noscheme   ; begins with a non-colon segment
1873%%               / path-rootless   ; begins with a segment
1874%%               / path-empty      ; zero characters
%% Ensures a non-empty path starts with "/". Empty paths and paths that
%% already begin with "/" are returned unchanged.
make_path_absolute(Path) when is_binary(Path) ->
    case Path of
        <<>> -> Path;
        <<$/, _/binary>> -> Path;
        _ -> concat(<<"/">>, Path)
    end;
make_path_absolute(Path) when is_list(Path) ->
    case Path of
        [] -> Path;
        [$/|_] -> Path;
        _ -> concat("/", Path)
    end.
1887
%% Binaries pass through untouched; anything else is converted to a flat
%% list of code points with unicode:characters_to_list/1.
maybe_flatten_list(Path) ->
    case is_binary(Path) of
        true -> Path;
        false -> unicode:characters_to_list(Path)
    end.
1892
1893%%-------------------------------------------------------------------------
1894%% Helper functions for resolve
1895%%-------------------------------------------------------------------------
1896
%% Resolves a reference (URIMap) against a base URI.
%%  - A reference that has a scheme only gets its path normalized.
%%  - Otherwise the base must have a scheme; a base map without one yields
%%    {error,invalid_scheme,""}.
%%  - A base that is not a map is parsed first; a parse result that is not
%%    a map is returned as it is.
resolve_map(URIMap=#{scheme := _}, _) ->
    normalize_path_segment(URIMap);
resolve_map(URIMap, #{scheme := _}=BaseURIMap) ->
    PathType = resolve_path_type(URIMap),
    resolve_map(URIMap, BaseURIMap, PathType);
resolve_map(_URIMap, BaseURIMap) when is_map(BaseURIMap) ->
    {error,invalid_scheme,""};
resolve_map(URIMap, BaseURIString) ->
    case parse(BaseURIString) of
        Parsed when is_map(Parsed) ->
            %% URIMap has no scheme here, so this dispatches on Parsed
            %% exactly like the two map clauses above.
            resolve_map(URIMap, Parsed);
        Error ->
            Error
    end.
1912
%% Classifies the 'path' of a URI map (a missing path counts as empty):
%%   empty_path    - no characters
%%   absolute_path - first character is "/"
%%   relative_path - anything else
%% Only the first code point is inspected, via string:next_codepoint/1.
%% Unlike iolist_to_binary/1, this accepts list paths that contain code
%% points above 255 instead of failing with badarg.
resolve_path_type(URIMap) ->
    Path = maps:get(path, URIMap, <<>>),
    case string:next_codepoint(Path) of
        [] -> empty_path;
        [$/|_] -> absolute_path;
        %% Any other first character, or a binary that does not start
        %% with valid UTF-8.
        _ -> relative_path
    end.
1919
%% Builds the target URI map from a scheme-less reference and a base map.
%%  - Reference with a host: only the scheme is taken from the base.
%%  - empty_path: authority and path come from the base; the base query is
%%    taken only when the reference has none. The path is not normalized.
%%  - absolute_path: authority comes from the base, the path is normalized.
%%  - relative_path: the reference path is merged with the base path first.
resolve_map(URI=#{host := _}, #{scheme := Scheme}, _) ->
    WithScheme = URI#{scheme => Scheme},
    normalize_path_segment(WithScheme);
resolve_map(URI, BaseURI, empty_path) ->
    Always = [scheme, userinfo, host, port, path],
    Inherited = case maps:is_key(query, URI) of
                    false -> Always ++ [query];
                    true -> Always
                end,
    maps:merge(URI, maps:with(Inherited, BaseURI));
resolve_map(URI, BaseURI, absolute_path) ->
    FromBase = maps:with([scheme, userinfo, host, port], BaseURI),
    normalize_path_segment(maps:merge(URI, FromBase));
resolve_map(URI=#{path := Path}, BaseURI, relative_path) ->
    Merged = URI#{path => merge_paths(Path, BaseURI)},
    FromBase = maps:with([scheme, userinfo, host, port], BaseURI),
    normalize_path_segment(maps:merge(Merged, FromBase)).
1936
%% Merges a relative reference path with the path of the base URI.
%%  - Base has a host and an empty path: the result is "/" followed by Path.
%%  - Otherwise everything after the last "/" of the base path is replaced
%%    by Path; a base path without "/" yields Path unchanged.
%% The result has the same type (list or binary) as Path.
%% The emptiness test uses string:is_empty/1 rather than iolist_size/1, so
%% base paths given as lists with code points above 255 no longer fail with
%% badarg.
merge_paths(Path, BaseURI=#{path := BasePath0}) ->
    case maps:is_key(host, BaseURI) andalso string:is_empty(BasePath0) of
        true ->
            merge_paths_absolute(Path);
        false ->
            case string:split(BasePath0, <<$/>>, trailing) of
                [BasePath, _] when is_binary(Path) ->
                    unicode:characters_to_binary([BasePath, $/, Path]);
                [BasePath, _] when is_list(Path) ->
                    unicode:characters_to_list([BasePath, $/, Path]);
                [_] ->
                    Path
            end
    end.

%% Prefixes Path with "/", keeping the input type.
merge_paths_absolute(Path) when is_binary(Path) ->
    <<$/, Path/binary>>;
merge_paths_absolute(Path) when is_list(Path) ->
    unicode:characters_to_list([$/, Path]).
1953
1954
1955%%-------------------------------------------------------------------------
1956%% Helper functions for transcode
1957%%-------------------------------------------------------------------------
1958
1959%%-------------------------------------------------------------------------
1960%% uri_string:transcode(<<"x%00%00%00%F6"/utf32>>).
1961%% 1. Convert (transcode/2) input to list form (list of unicode codepoints)
1962%%    "x%00%00%00%F6"
1963%% 2. Accumulate characters until percent-encoded segment (transcode/4).
1964%%    Acc = "x"
1965%% 3. Convert percent-encoded triplets to binary form (transcode_pct/4)
1966%%    <<0,0,0,246>>
1967%% 4. Transcode in-encoded binary to out-encoding (utf32 -> utf8):
1968%%    <<195,182>>
1969%% 5. Percent-encode out-encoded binary:
1970%%    <<"%C3%B6"/utf8>> = <<37,67,51,37,66,54>>
1971%% 6. Convert binary to list form, reverse it and append the accumulator
1972%%    "6B%3C%" + "x"
1973%% 7. Reverse Acc and return it
1974%%-------------------------------------------------------------------------
%% transcode/4: entry point working on a flat list of code points.
%% A leading percent-encoded triplet is handed to transcode_pct/5, anything
%% else is accumulated by transcode/5.
%% The [] clause covers input that is empty once flattened (flatten_list/2
%% returns [] for e.g. [[]]); without it such input fails with
%% function_clause.
%% NOTE(review): the caller is outside this chunk; confirm that it expects a
%% list result for this case.
transcode([$%,_C0,_C1|_Rest] = L, Acc, InEnc, OutEnc) ->
    transcode_pct(L, Acc, <<>>, InEnc, OutEnc);
transcode([_C|_Rest] = L, Acc, InEnc, OutEnc) ->
    transcode(L, Acc, [], InEnc, OutEnc);
transcode([], Acc, _InEnc, _OutEnc) ->
    lists:reverse(Acc).
%%
%% transcode/5: collects plain characters in List (in reverse order) until a
%% percent-encoded triplet or the end of input is reached. Acc holds the
%% output produced so far, also reversed.
transcode([$%,_C0,_C1|_Rest] = L, Acc, List, InEncoding, OutEncoding) ->
    transcode_pct(L, List ++ Acc, <<>>, InEncoding, OutEncoding);
transcode([C|Rest], Acc, List, InEncoding, OutEncoding) ->
    transcode(Rest, Acc, [C|List], InEncoding, OutEncoding);
transcode([], Acc, List, _InEncoding, _OutEncoding) ->
    lists:reverse(List ++ Acc).
1986
1987
1988%% Transcode percent-encoded segment
%% Collects consecutive percent-encoded triplets into the binary Bytes.
%% When the run ends, Bytes is converted from InEncoding to OutEncoding,
%% percent-encoded again and added to the (reversed) accumulator.
%% Throws {error,invalid_percent_encoding,L} when "%" is followed by two
%% characters that are not both hex digits.
transcode_pct([$%,C0,C1|Rest] = L, Acc, Bytes, InEncoding, OutEncoding) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        false ->
            throw({error, invalid_percent_encoding,L});
        true ->
            Byte = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
            transcode_pct(Rest, Acc, <<Bytes/binary, Byte>>,
                          InEncoding, OutEncoding)
    end;
transcode_pct([_C|_Rest] = L, Acc, Bytes, InEncoding, OutEncoding) ->
    Converted = convert_to_binary(Bytes, InEncoding, OutEncoding),
    Encoded = convert_to_list(percent_encode_segment(Converted), utf8),
    %% Acc is kept reversed, so the new segment is pushed on in reverse.
    transcode(L, lists:reverse(Encoded, Acc), [], InEncoding, OutEncoding);
transcode_pct([], Acc, Bytes, InEncoding, OutEncoding) ->
    Converted = convert_to_binary(Bytes, InEncoding, OutEncoding),
    Encoded = convert_to_list(percent_encode_segment(Converted), utf8),
    %% End of input: un-reverse Acc and put the final segment after it.
    lists:reverse(Acc, Encoded).
2006
2007
2008%% Convert to binary
%% Wrapper around unicode:characters_to_binary/3 that turns its error and
%% incomplete results into a throw of {error,invalid_input,RestData}.
convert_to_binary(Binary, InEncoding, OutEncoding) ->
    Converted = unicode:characters_to_binary(Binary, InEncoding, OutEncoding),
    case Converted of
        {Tag, _List, RestData} when Tag =:= error; Tag =:= incomplete ->
            throw({error, invalid_input, RestData});
        _ ->
            Converted
    end.
2018
2019
2020%% Convert to list
%% Wrapper around unicode:characters_to_list/2 that turns its error and
%% incomplete results into a throw of {error,invalid_input,RestData}.
convert_to_list(Binary, InEncoding) ->
    Converted = unicode:characters_to_list(Binary, InEncoding),
    case Converted of
        {Tag, _List, RestData} when Tag =:= error; Tag =:= incomplete ->
            throw({error, invalid_input, RestData});
        _ ->
            Converted
    end.
2030
2031
2032%% Flatten input list
%% Flattens a possibly nested list into a flat list. Binaries found in the
%% list are decoded with convert_to_list/2 using InEnc; nested lists are
%% spliced in; every other element is copied as it is.
flatten_list([], _) ->
    [];
flatten_list(L, InEnc) ->
    flatten_list(L, InEnc, []).
%%
%% Acc is built in reverse and reversed once at the end.
flatten_list([], _InEnc, Acc) ->
    lists:reverse(Acc);
flatten_list([Bin|Tail], InEnc, Acc) when is_binary(Bin) ->
    Chars = convert_to_list(Bin, InEnc),
    flatten_list(Tail, InEnc, lists:reverse(Chars, Acc));
flatten_list([Nested|Tail], InEnc, Acc) when is_list(Nested) ->
    flatten_list(Nested ++ Tail, InEnc, Acc);
flatten_list([Elem|Tail], InEnc, Acc) ->
    flatten_list(Tail, InEnc, [Elem|Acc]);
flatten_list(Arg, _, _) ->
    %% Not a list cell: either non-list input or an improper tail.
    throw({error, invalid_input, Arg}).
2049
2050
%% Percent-encodes every byte of Segment by calling percent_encode_binary/2
%% with an empty accumulator.
percent_encode_segment(Segment) ->
    EmptyAcc = <<>>,
    percent_encode_binary(Segment, EmptyAcc).
2053
2054
2055%%-------------------------------------------------------------------------
2056%% Helper functions for compose_query
2057%%-------------------------------------------------------------------------
2058
2059%% Returns separator to be used between key-value pairs
%% Empty binary when the argument is the empty list, "&" for anything else.
get_separator(Remaining) ->
    case Remaining of
        [] -> <<>>;
        _ -> <<"&">>
    end.
2064
2065
2066%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
2067%% HTML 5.0 - 4.10.22.6 URL-encoded form data - encoding (non UTF-8)
%% Encodes a key or value as application/x-www-form-urlencoded data.
%%   latin1       - characters above 255 are first replaced by "&#N;"
%%                  (base10_encode/1), then the bytes are encoded
%%   utf8/unicode - the UTF-8 bytes are encoded
%% Throws {error,invalid_encoding,Encoding} for any other encoding and
%% {error,invalid_input,Cs} when Cs is neither a list nor a binary.
%%
%% The utf8/unicode guards are written with orelse inside a single guard.
%% The earlier form "is_list(Cs), Encoding =:= utf8; Encoding =:= unicode"
%% is two alternative guards, so the type test was skipped whenever the
%% encoding was unicode.
form_urlencode(Cs, [{encoding, latin1}]) when is_list(Cs) ->
    B = convert_to_binary(Cs, utf8, utf8),
    html5_byte_encode(base10_encode(B));
form_urlencode(Cs, [{encoding, latin1}]) when is_binary(Cs) ->
    html5_byte_encode(base10_encode(Cs));
form_urlencode(Cs, [{encoding, Encoding}])
  when is_list(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    B = convert_to_binary(Cs, utf8, Encoding),
    html5_byte_encode(B);
form_urlencode(Cs, [{encoding, Encoding}])
  when is_binary(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    html5_byte_encode(Cs);
form_urlencode(Cs, [{encoding, Encoding}]) when is_list(Cs); is_binary(Cs) ->
    throw({error,invalid_encoding, Encoding});
form_urlencode(Cs, _) ->
    throw({error,invalid_input, Cs}).
2084
2085
2086%% For each character in the entry's name and value that cannot be expressed using
2087%% the selected character encoding, replace the character by a string consisting of
2088%% a U+0026 AMPERSAND character (&), a "#" (U+0023) character, one or more ASCII
2089%% digits representing the Unicode code point of the character in base ten, and
2090%% finally a ";" (U+003B) character.
%% Rewrites a UTF-8 binary for a single-byte target encoding: code points up
%% to 255 become one byte each, larger ones become "&#N;" with N the code
%% point in base ten.
%% Throws {error,invalid_input,Rest} when the input is not valid UTF-8, Rest
%% being the unconsumed data. This replaces a function_clause crash and
%% matches how the other encoders in this module report bad input.
base10_encode(Cs) ->
    base10_encode(Cs, <<>>).
%%
base10_encode(<<>>, Acc) ->
    Acc;
base10_encode(<<H/utf8,T/binary>>, Acc) when H > 255 ->
    Base10 = integer_to_binary(H),
    base10_encode(T, <<Acc/binary,"&#",Base10/binary,$;>>);
base10_encode(<<H/utf8,T/binary>>, Acc) ->
    base10_encode(T, <<Acc/binary,H>>);
base10_encode(Invalid, _Acc) ->
    throw({error,invalid_input,Invalid}).
2101
2102
%% Byte-wise form-urlencoding: a space becomes "+", bytes accepted by
%% is_url_char/1 are copied, every other byte becomes a "%XY" triplet.
html5_byte_encode(B) ->
    html5_byte_encode(B, <<>>).
%%
html5_byte_encode(<<>>, Acc) ->
    Acc;
html5_byte_encode(<<$\s,Rest/binary>>, Acc) ->
    html5_byte_encode(Rest, <<Acc/binary,$+>>);
html5_byte_encode(<<Byte,Rest/binary>>, Acc) ->
    case is_url_char(Byte) of
        false ->
            %% High and low nibble of the byte.
            Hi = Byte bsr 4,
            Lo = Byte band 16#0F,
            html5_byte_encode(Rest,
                              <<Acc/binary,$%,(?DEC2HEX(Hi)),(?DEC2HEX(Lo))>>);
        true ->
            html5_byte_encode(Rest, <<Acc/binary,Byte>>)
    end;
html5_byte_encode(Other, _Acc) ->
    %% Reached when the remaining input is not a byte-aligned binary.
    throw({error,invalid_input, Other}).
2120
2121
2122%% Return true if input char can appear in form-urlencoded string
%% Allowed characters:
2124%%   0x2A, 0x2D, 0x2E, 0x30 to 0x39, 0x41 to 0x5A,
2125%%   0x5F, 0x61 to 0x7A
%% "*", "-", ".", "_", digits and ASCII letters are copied unchanged.
is_url_char($*) -> true;
is_url_char($-) -> true;
is_url_char($.) -> true;
is_url_char($_) -> true;
is_url_char(C) when $0 =< C, C =< $9 -> true;
is_url_char(C) when $A =< C, C =< $Z -> true;
is_url_char(C) when $a =< C, C =< $z -> true;
is_url_char(_) -> false.
2133
2134
2135%%-------------------------------------------------------------------------
2136%% Helper functions for dissect_query
2137%%-------------------------------------------------------------------------
%% Scans the key part of a "key=value" pair.
%%  - "=" ends the key and starts the value.
%%  - "&#" is kept as part of the key (it introduces a "&#N;" reference).
%%  - A plain "&" or the end of input ends a key that has no value; the
%%    value is then the atom true.
%% The clause order matters: "&#" must be tried before the plain "&".
dissect_query_key(<<$=,Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_value(Rest, IsList, Acc, Key, Value);
dissect_query_key(<<"&#",Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_key(Rest, IsList, Acc, <<Key/binary,"&#">>, Value);
dissect_query_key(<<$&,_/binary>> = Input, IsList, Acc, Key, <<>>) ->
    dissect_query_value(Input, IsList, Acc, Key, true);
dissect_query_key(<<Byte,Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_key(Rest, IsList, Acc, <<Key/binary,Byte>>, Value);
dissect_query_key(<<>> = Input, IsList, Acc, Key, <<>>) ->
    dissect_query_value(Input, IsList, Acc, Key, true).
2148
%% Scans the value part of a pair. "&" or the end of input completes the
%% pair: key and value are decoded with form_urldecode/2 and stored. At the
%% end of input the collected pairs are returned in input order.
dissect_query_value(<<$&,Rest/binary>>, IsList, Acc, Key, Value) ->
    Pair = {form_urldecode(IsList, Key), form_urldecode(IsList, Value)},
    dissect_query_key(Rest, IsList, [Pair|Acc], <<>>, <<>>);
dissect_query_value(<<Byte,Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_value(Rest, IsList, Acc, Key, <<Value/binary,Byte>>);
dissect_query_value(<<>>, IsList, Acc, Key, Value) ->
    Pair = {form_urldecode(IsList, Key), form_urldecode(IsList, Value)},
    lists:reverse([Pair|Acc]).
2159
2160%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
2161%% HTML 5.0 - 4.10.22.6 URL-encoded form data - decoding (non UTF-8)
%% form_urldecode/2 has two roles, told apart by the clause order:
%%  - (IsList, Encoded): entry point. The atom true (a key without value)
%%    is passed through. Otherwise Encoded is decoded, "&#N;" references
%%    are resolved by base10_decode/1, and the result is returned as a list
%%    when IsList is true and as a binary when it is false.
%%  - (Binary, Decoded): worker. "+" becomes a space, "%XY" becomes the
%%    byte 16#XY, other UTF-8 characters are copied.
%% Throws {error,invalid_percent_encoding,Rest} when "%" is followed by two
%% characters that are not both hex digits, and
%% {error,invalid_character,[Byte]} for a byte that is not valid UTF-8.
form_urldecode(_, true) ->
    true;
form_urldecode(true, Encoded) ->
    Decoded = base10_decode(form_urldecode(Encoded, <<>>)),
    convert_to_list(Decoded, utf8);
form_urldecode(false, Encoded) ->
    base10_decode(form_urldecode(Encoded, <<>>));
form_urldecode(<<>>, Decoded) ->
    Decoded;
form_urldecode(<<$+,Rest/binary>>, Decoded) ->
    form_urldecode(Rest, <<Decoded/binary,$\s>>);
form_urldecode(<<$%,Hi,Lo,Rest/binary>>, Decoded) ->
    case is_hex_digit(Hi) andalso is_hex_digit(Lo) of
        false ->
            Bad = convert_to_list(<<$%,Hi,Lo,Rest/binary>>, utf8),
            throw({error, invalid_percent_encoding, Bad});
        true ->
            Byte = ?HEX2DEC(Hi)*16+?HEX2DEC(Lo),
            form_urldecode(Rest, <<Decoded/binary, Byte>>)
    end;
form_urldecode(<<Char/utf8,Rest/binary>>, Decoded) ->
    form_urldecode(Rest, <<Decoded/binary,Char/utf8>>);
form_urldecode(<<Byte,_/binary>>, _Decoded) ->
    throw({error, invalid_character, [Byte]}).
2186
%% Replaces every "&#N;" reference in a UTF-8 binary by the UTF-8 encoding
%% of code point N (base ten); all other characters are copied.
%% Throws {error,invalid_input,Data} when
%%  - a byte is not valid UTF-8 (Data = [Byte]),
%%  - a reference contains a non-digit before ";" (Data = [Byte]),
%%  - the input ends inside a reference (Data = []),
%%  - N is a surrogate or above 16#10FFFF (Data = N as a decimal string).
%% The last two cases previously failed with function_clause and badarg.
base10_decode(Cs) ->
    base10_decode(Cs, <<>>).
%%
base10_decode(<<>>, Acc) ->
    Acc;
base10_decode(<<"&#",T/binary>>, Acc) ->
    base10_decode_unicode(T, Acc);
base10_decode(<<H/utf8,T/binary>>, Acc) ->
    base10_decode(T,<<Acc/binary,H/utf8>>);
base10_decode(<<H,_/binary>>, _) ->
    throw({error, invalid_input, [H]}).


%% Reads the digits of a reference up to the terminating ";".
base10_decode_unicode(B, Acc) ->
    base10_decode_unicode(B, 0, Acc).
%%
%% Digits are accepted only while the value is still within the Unicode
%% range, which bounds the work done on overlong digit strings.
base10_decode_unicode(<<H,T/binary>>, Codepoint, Acc)
  when $0 =< H, H =< $9, Codepoint =< 16#10FFFF ->
    base10_decode_unicode(T, Codepoint * 10 + (H - $0), Acc);
base10_decode_unicode(<<$;,T/binary>>, Codepoint, Acc)
  when Codepoint < 16#D800;
       Codepoint > 16#DFFF, Codepoint =< 16#10FFFF ->
    base10_decode(T, <<Acc/binary,Codepoint/utf8>>);
base10_decode_unicode(<<$;,_/binary>>, Codepoint, _) ->
    %% Not encodable as UTF-8 (surrogate or beyond the Unicode range).
    throw({error, invalid_input, integer_to_list(Codepoint)});
base10_decode_unicode(<<H,_/binary>>, _, _) ->
    throw({error, invalid_input, [H]});
base10_decode_unicode(<<>>, _, _) ->
    %% Input ended before the terminating ";".
    throw({error, invalid_input, []}).
2210
2211
2212%%-------------------------------------------------------------------------
2213%% Helper functions for normalize
2214%%-------------------------------------------------------------------------
2215
%% Applies the normalization steps in order: case, percent-encoding,
%% scheme-based and finally path segment normalization.
normalize_map(URIMap) ->
    CaseNormalized = normalize_case(URIMap),
    Decoded = normalize_percent_encoding(CaseNormalized),
    SchemeNormalized = normalize_scheme_based(Decoded),
    normalize_path_segment(SchemeNormalized).
2221
2222
2223%% 6.2.2.1.  Case Normalization
%% Lower-cases the scheme and the host when they are present; all other
%% components are left untouched.
normalize_case(#{} = Map) ->
    Lower = fun(Key, M) ->
                    case M of
                        #{Key := Value} -> M#{Key => to_lower(Value)};
                        #{} -> M
                    end
            end,
    lists:foldl(Lower, Map, [scheme, host]).
2233
2234
2235%% 6.2.2.2.  Percent-Encoding Normalization
%% Passes the userinfo, host, path, query and fragment values through
%% decode/1. Every other entry (scheme and port among them) keeps its value.
normalize_percent_encoding(Map) ->
    Decodable = [userinfo, host, path, query, fragment],
    Normalize = fun(Key, Value) ->
                        case lists:member(Key, Decodable) of
                            true -> decode(Value);
                            false -> Value
                        end
                end,
    maps:map(Normalize, Map).
2245
2246
%% Lower-cases the ASCII letters A-Z; every other byte is copied unchanged.
%% Lists are converted to a UTF-8 binary, processed, and converted back.
to_lower(Cs) when is_list(Cs) ->
    Lowered = to_lower(convert_to_binary(Cs, utf8, utf8)),
    convert_to_list(Lowered, utf8);
to_lower(Cs) when is_binary(Cs) ->
    to_lower(Cs, <<>>).
%%
to_lower(<<>>, Acc) ->
    Acc;
to_lower(<<C,Cs/binary>>, Acc) ->
    Lower = if
                $A =< C, C =< $Z -> C + ($a - $A);
                true -> C
            end,
    to_lower(Cs, <<Acc/binary,Lower>>).
2259
2260
2261%% 6.2.2.3. Path Segment Normalization
2262%% 5.2.4.   Remove Dot Segments
%% Replaces the path of the map by the result of remove_dot_segments/1.
%% A map without a 'path' key is returned unchanged. Previously the atom
%% 'undefined' was passed to remove_dot_segments/1 in that case, which
%% fails with function_clause.
normalize_path_segment(#{path := Path} = Map) ->
    Map#{path => remove_dot_segments(Path)};
normalize_path_segment(#{} = Map) ->
    Map.
2266
2267
%% Removes "." and ".." segments from a path. Lists are converted to a UTF-8
%% binary, processed, and converted back.
remove_dot_segments(Path) when is_binary(Path) ->
    remove_dot_segments(Path, <<>>);
remove_dot_segments(Path) when is_list(Path) ->
    Cleaned = remove_dot_segments(convert_to_binary(Path, utf8, utf8), <<>>),
    convert_to_list(Cleaned, utf8).
%%
%% Input is the part still to be processed, Output the result so far.
%% The clauses are tried in order.
remove_dot_segments(<<>>, Output) ->
    Output;
%% Leading "../" or "./" is dropped.
remove_dot_segments(<<"../",Rest/binary>>, Output) ->
    remove_dot_segments(Rest, Output);
remove_dot_segments(<<"./",Rest/binary>>, Output) ->
    remove_dot_segments(Rest, Output);
%% "/./" and a trailing "/." collapse to "/".
remove_dot_segments(<<"/./",Rest/binary>>, Output) ->
    remove_dot_segments(<<$/,Rest/binary>>, Output);
remove_dot_segments(<<"/.">>, Output) ->
    remove_dot_segments(<<$/>>, Output);
%% "/../" and a trailing "/.." collapse to "/" and also remove the last
%% segment already written to Output.
remove_dot_segments(<<"/../",Rest/binary>>, Output) ->
    remove_dot_segments(<<$/,Rest/binary>>, remove_last_segment(Output));
remove_dot_segments(<<"/..">>, Output) ->
    remove_dot_segments(<<$/>>, remove_last_segment(Output));
%% Input consisting only of "." or ".." is dropped.
remove_dot_segments(<<$.>>, Output) ->
    Output;
remove_dot_segments(<<"..">>, Output) ->
    Output;
%% Otherwise move the first path segment from Input to Output.
remove_dot_segments(Input, Output) ->
    {Segment, Rest} = first_path_segment(Input),
    remove_dot_segments(Rest, <<Output/binary,Segment/binary>>).
2298
2299
%% Splits a non-empty Input into {FirstSegment, Rest}. The first segment is
%% the first byte (which may be "/") plus all following bytes up to, but
%% not including, the next "/".
first_path_segment(Input) ->
    Segment = first_path_segment(Input, <<>>),
    split_binary(Input, byte_size(Segment)).
%%
%% The first byte always belongs to the segment, whether it is "/" or not.
first_path_segment(<<First,Rest/binary>>, Acc) ->
    first_path_segment_end(Rest, <<Acc/binary,First>>).
2308
2309
%% Appends bytes to Acc until the input ends or the next byte is "/".
first_path_segment_end(<<C,Rest/binary>>, Acc) when C =/= $/ ->
    first_path_segment_end(Rest, <<Acc/binary,C>>);
first_path_segment_end(Stop, Acc) when is_binary(Stop) ->
    %% Stop is either empty or begins with "/".
    Acc.
2316
2317
%% Drops the last segment of a path together with the "/" in front of it.
%% If the binary contains no "/", the result is empty.
remove_last_segment(<<>>) ->
    <<>>;
remove_last_segment(Bin) ->
    InitSize = byte_size(Bin) - 1,
    case Bin of
        <<Init:InitSize/binary, $/>> ->
            Init;
        <<Init:InitSize/binary, _>> ->
            remove_last_segment(Init)
    end.
2328
2329
2330%% RFC 3986, 6.2.3.  Scheme-Based Normalization
%% Dispatches to the scheme-specific normalization for http, https, ftp,
%% ssh, sftp and tftp. Maps with any other scheme, or none, are returned
%% unchanged. The scheme is compared as it is, as a list or a binary.
normalize_scheme_based(Map) ->
    normalize_scheme_based(Map,
                           maps:get(scheme, Map, undefined),
                           maps:get(port, Map, undefined),
                           maps:get(path, Map, undefined)).
%%
normalize_scheme_based(Map, Scheme, Port, Path) ->
    case Scheme of
        S when S =:= "http"; S =:= <<"http">> ->
            normalize_http(Map, Port, Path);
        S when S =:= "https"; S =:= <<"https">> ->
            normalize_https(Map, Port, Path);
        S when S =:= "ftp"; S =:= <<"ftp">> ->
            normalize_ftp(Map, Port);
        S when S =:= "ssh"; S =:= <<"ssh">> ->
            normalize_ssh_sftp(Map, Port);
        S when S =:= "sftp"; S =:= <<"sftp">> ->
            normalize_ssh_sftp(Map, Port);
        S when S =:= "tftp"; S =:= <<"tftp">> ->
            normalize_tftp(Map, Port);
        _ ->
            Map
    end.
2357
2358
%% http: drop port 80 and turn an empty path into "/".
normalize_http(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 80), Path).
2362
2363
%% https: drop port 443 and turn an empty path into "/".
normalize_https(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 443), Path).
2367
2368
%% ftp: drop port 21.
normalize_ftp(Map, Port) ->
    DefaultPort = 21,
    normalize_port(Map, Port, DefaultPort).
2371
2372
%% ssh and sftp: drop port 22.
normalize_ssh_sftp(Map, Port) ->
    DefaultPort = 22,
    normalize_port(Map, Port, DefaultPort).
2375
2376
%% tftp: drop port 69.
normalize_tftp(Map, Port) ->
    DefaultPort = 69,
    normalize_port(Map, Port, DefaultPort).
2379
2380
%% Removes the 'port' key when Port equals the scheme's default port;
%% otherwise the map is returned unchanged.
normalize_port(Map, Port, Default) when Port =:= Default ->
    maps:remove(port, Map);
normalize_port(Map, _Port, _Default) ->
    Map.
2388
2389
%% Sets the path to "/" when Path is empty, keeping the type (list or
%% binary) of Path. For any other Path the map is returned unchanged.
normalize_http_path(Map, "") ->
    Map#{path => "/"};
normalize_http_path(Map, <<>>) ->
    Map#{path => <<"/">>};
normalize_http_path(Map, _NonEmpty) ->
    Map.
2399