1%% 2%% %CopyrightBegin% 3%% 4%% Copyright Ericsson AB 2017-2020. All Rights Reserved. 5%% 6%% Licensed under the Apache License, Version 2.0 (the "License"); 7%% you may not use this file except in compliance with the License. 8%% You may obtain a copy of the License at 9%% 10%% http://www.apache.org/licenses/LICENSE-2.0 11%% 12%% Unless required by applicable law or agreed to in writing, software 13%% distributed under the License is distributed on an "AS IS" BASIS, 14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15%% See the License for the specific language governing permissions and 16%% limitations under the License. 17%% 18%% %CopyrightEnd% 19%% 20%% 21%% [RFC 3986, Chapter 2.2. Reserved Characters] 22%% 23%% reserved = gen-delims / sub-delims 24%% 25%% gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 26%% 27%% sub-delims = "!" / "$" / "&" / "'" / "(" / ")" 28%% / "*" / "+" / "," / ";" / "=" 29%% 30%% 31%% [RFC 3986, Chapter 2.3. Unreserved Characters] 32%% 33%% unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 34%% 35%% 36%% [RFC 3986, Chapter 3. Syntax Components] 37%% 38%% The generic URI syntax consists of a hierarchical sequence of 39%% components referred to as the scheme, authority, path, query, and 40%% fragment. 41%% 42%% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 43%% 44%% hier-part = "//" authority path-abempty 45%% / path-absolute 46%% / path-rootless 47%% / path-empty 48%% 49%% The scheme and path components are required, though the path may be 50%% empty (no characters). When authority is present, the path must 51%% either be empty or begin with a slash ("/") character. When 52%% authority is not present, the path cannot begin with two slash 53%% characters ("//"). These restrictions result in five different ABNF 54%% rules for a path (Section 3.3), only one of which will match any 55%% given URI reference. 
56%% 57%% The following are two example URIs and their component parts: 58%% 59%% foo://example.com:8042/over/there?name=ferret#nose 60%% \_/ \______________/\_________/ \_________/ \__/ 61%% | | | | | 62%% scheme authority path query fragment 63%% | _____________________|__ 64%% / \ / \ 65%% urn:example:animal:ferret:nose 66%% 67%% 68%% [RFC 3986, Chapter 3.1. Scheme] 69%% 70%% Each URI begins with a scheme name that refers to a specification for 71%% assigning identifiers within that scheme. 72%% 73%% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 74%% 75%% 76%% [RFC 3986, Chapter 3.2. Authority] 77%% 78%% Many URI schemes include a hierarchical element for a naming 79%% authority so that governance of the name space defined by the 80%% remainder of the URI is delegated to that authority (which may, in 81%% turn, delegate it further). 82%% 83%% authority = [ userinfo "@" ] host [ ":" port ] 84%% 85%% 86%% [RFC 3986, Chapter 3.2.1. User Information] 87%% 88%% The userinfo subcomponent may consist of a user name and, optionally, 89%% scheme-specific information about how to gain authorization to access 90%% the resource. The user information, if present, is followed by a 91%% commercial at-sign ("@") that delimits it from the host. 92%% 93%% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 94%% 95%% 96%% [RFC 3986, Chapter 3.2.2. Host] 97%% 98%% The host subcomponent of authority is identified by an IP literal 99%% encapsulated within square brackets, an IPv4 address in dotted- 100%% decimal form, or a registered name. 101%% 102%% host = IP-literal / IPv4address / reg-name 103%% 104%% IP-literal = "[" ( IPv6address / IPvFuture ) "]" 105%% 106%% IPvFuture = "v" 1*HEXDIG "." 
1*( unreserved / sub-delims / ":" ) 107%% 108%% IPv6address = 6( h16 ":" ) ls32 109%% / "::" 5( h16 ":" ) ls32 110%% / [ h16 ] "::" 4( h16 ":" ) ls32 111%% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 112%% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 113%% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 114%% / [ *4( h16 ":" ) h16 ] "::" ls32 115%% / [ *5( h16 ":" ) h16 ] "::" h16 116%% / [ *6( h16 ":" ) h16 ] "::" 117%% 118%% ls32 = ( h16 ":" h16 ) / IPv4address 119%% ; least-significant 32 bits of address 120%% 121%% h16 = 1*4HEXDIG 122%% ; 16 bits of address represented in hexadecimal 123%% 124%% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet 125%% 126%% dec-octet = DIGIT ; 0-9 127%% / %x31-39 DIGIT ; 10-99 128%% / "1" 2DIGIT ; 100-199 129%% / "2" %x30-34 DIGIT ; 200-249 130%% / "25" %x30-35 ; 250-255 131%% 132%% reg-name = *( unreserved / pct-encoded / sub-delims ) 133%% 134%% 135%% [RFC 3986, Chapter 3.2.2. Port] 136%% 137%% The port subcomponent of authority is designated by an optional port 138%% number in decimal following the host and delimited from it by a 139%% single colon (":") character. 140%% 141%% port = *DIGIT 142%% 143%% 144%% [RFC 3986, Chapter 3.3. Path] 145%% 146%% The path component contains data, usually organized in hierarchical 147%% form, that, along with data in the non-hierarchical query component 148%% (Section 3.4), serves to identify a resource within the scope of the 149%% URI's scheme and naming authority (if any). The path is terminated 150%% by the first question mark ("?") or number sign ("#") character, or 151%% by the end of the URI. 
152%% 153%% path = path-abempty ; begins with "/" or is empty 154%% / path-absolute ; begins with "/" but not "//" 155%% / path-noscheme ; begins with a non-colon segment 156%% / path-rootless ; begins with a segment 157%% / path-empty ; zero characters 158%% 159%% path-abempty = *( "/" segment ) 160%% path-absolute = "/" [ segment-nz *( "/" segment ) ] 161%% path-noscheme = segment-nz-nc *( "/" segment ) 162%% path-rootless = segment-nz *( "/" segment ) 163%% path-empty = 0<pchar> 164%% segment = *pchar 165%% segment-nz = 1*pchar 166%% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) 167%% ; non-zero-length segment without any colon ":" 168%% 169%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" 170%% 171%% 172%% [RFC 3986, Chapter 3.4. Query] 173%% 174%% The query component contains non-hierarchical data that, along with 175%% data in the path component (Section 3.3), serves to identify a 176%% resource within the scope of the URI's scheme and naming authority 177%% (if any). The query component is indicated by the first question 178%% mark ("?") character and terminated by a number sign ("#") character 179%% or by the end of the URI. 180%% 181%% query = *( pchar / "/" / "?" ) 182%% 183%% 184%% [RFC 3986, Chapter 3.5. Fragment] 185%% 186%% The fragment identifier component of a URI allows indirect 187%% identification of a secondary resource by reference to a primary 188%% resource and additional identifying information. 189%% 190%% fragment = *( pchar / "/" / "?" ) 191%% 192%% 193%% [RFC 3986, Chapter 4.1. URI Reference] 194%% 195%% URI-reference is used to denote the most common usage of a resource 196%% identifier. 197%% 198%% URI-reference = URI / relative-ref 199%% 200%% 201%% [RFC 3986, Chapter 4.2. Relative Reference] 202%% 203%% A relative reference takes advantage of the hierarchical syntax 204%% (Section 1.2.3) to express a URI reference relative to the name space 205%% of another hierarchical URI. 
%%
%%    relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
%%
%%    relative-part = "//" authority path-abempty
%%                  / path-absolute
%%                  / path-noscheme
%%                  / path-empty
%%
%%
%% [RFC 3986, Chapter 4.3. Absolute URI]
%%
%%    Some protocol elements allow only the absolute form of a URI without
%%    a fragment identifier.  For example, defining a base URI for later
%%    use by relative references calls for an absolute-URI syntax rule that
%%    does not allow a fragment.
%%
%%    absolute-URI  = scheme ":" hier-part [ "?" query ]
%%
-module(uri_string).

%%-------------------------------------------------------------------------
%% External API
%%-------------------------------------------------------------------------
-export([compose_query/1, compose_query/2,
         dissect_query/1, normalize/1, normalize/2, parse/1,
         recompose/1, resolve/2, resolve/3, transcode/2]).
-export_type([error/0, uri_map/0, uri_string/0]).


%%-------------------------------------------------------------------------
%% Internal API
%%-------------------------------------------------------------------------
-export([is_host/1, is_path/1]).  % suppress warnings


%%-------------------------------------------------------------------------
%% Macros
%%-------------------------------------------------------------------------
%% Binary consisting of exactly one UTF-8 encoded character.
-define(CHAR(Char), <<Char/utf8>>).
%% The empty binary.
-define(STRING_EMPTY, <<>>).
%% A whole binary.
-define(STRING(MatchStr), <<MatchStr/binary>>).
%% A UTF-8 encoded prefix (a character or a string literal) followed by the
%% remaining bytes as a binary.
-define(STRING_REST(MatchStr, Rest), <<MatchStr/utf8, Rest/binary>>).

%% Maps an integer 0..15 to its upper-case hexadecimal digit character.
%% There is no branch for other values, so the `if' fails for them.
%% The argument is expanded several times.
-define(DEC2HEX(X),
        if ((X) >= 0) andalso ((X) =< 9) -> (X) + $0;
           ((X) >= 10) andalso ((X) =< 15) -> (X) + $A - 10
        end).

%% Maps a hexadecimal digit character ($0..$9, $A..$F or $a..$f) to its
%% integer value 0..15.  There is no branch for other values, so the `if'
%% fails for them.  The argument is expanded several times.
-define(HEX2DEC(X),
        if ((X) >= $0) andalso ((X) =< $9) -> (X) - $0;
           ((X) >= $A) andalso ((X) =< $F) -> (X) - $A + 10;
           ((X) >= $a) andalso ((X) =< $f) -> (X) - $a + 10
        end).


%%%=========================================================================
%%%  API
%%%=========================================================================

%%-------------------------------------------------------------------------
%% URI compliant with RFC 3986
%% ASCII %x21 - %x7A ("!" - "z") except
%%   %x22    "     double quote
%%   %x3C    <     less than
%%   %x3E    >     greater than
%%   %x5C    \     backslash
%%   %x5E    ^     caret / circumflex
%%   %x60    `     grave / accent
%%-------------------------------------------------------------------------
-type uri_string() :: iodata().
%% Error value returned by the API functions of this module.
-type error() :: {error, atom(), term()}.


%%-------------------------------------------------------------------------
%% RFC 3986, Chapter 3. Syntax Components
%%-------------------------------------------------------------------------
-type uri_map() ::
  #{fragment => unicode:chardata(),
    host => unicode:chardata(),
    path => unicode:chardata(),
    port => non_neg_integer() | undefined,
    query => unicode:chardata(),
    scheme => unicode:chardata(),
    userinfo => unicode:chardata()} | #{}.


%%-------------------------------------------------------------------------
%% Normalize URIs
%%-------------------------------------------------------------------------
%% normalize/1 is normalize/2 called with an empty option list.
-spec normalize(URI) -> NormalizedURI when
      URI :: uri_string() | uri_map(),
      NormalizedURI :: uri_string()
                     | error().
normalize(URIMap) ->
    normalize(URIMap, []).


-spec normalize(URI, Options) -> NormalizedURI when
      URI :: uri_string() | uri_map(),
      Options :: [return_map],
      NormalizedURI :: uri_string() | uri_map()
                     | error().
%% A map is normalized as is; any other input is parsed first and a parse
%% result that is not a map is returned unchanged.  With [] the normalized
%% map is recomposed, with [return_map] it is returned as a map.
normalize(URI, []) when is_map(URI) ->
    normalize_recomposed(URI);
normalize(URI, [return_map]) when is_map(URI) ->
    normalize_as_map(URI);
normalize(URI, []) ->
    case parse(URI) of
        Parsed when is_map(Parsed) -> normalize_recomposed(Parsed);
        ParseError -> ParseError
    end;
normalize(URI, [return_map]) ->
    case parse(URI) of
        Parsed when is_map(Parsed) -> normalize_as_map(Parsed);
        ParseError -> ParseError
    end.

%% Normalize a URI map and recompose it; a thrown {error, _, _} becomes
%% the return value.
normalize_recomposed(URIMap) ->
    try
        recompose(normalize_map(URIMap))
    catch
        throw:{error, _, _} = Error -> Error
    end.

%% Normalize a URI map; a thrown {error, _, _} becomes the return value.
normalize_as_map(URIMap) ->
    try
        normalize_map(URIMap)
    catch
        throw:{error, _, _} = Error -> Error
    end.


%%-------------------------------------------------------------------------
%% Parse URIs
%%-------------------------------------------------------------------------
-spec parse(URIString) -> URIMap when
      URIString :: uri_string(),
      URIMap :: uri_map()
              | error().
%% Binary input is parsed directly.  List input is first converted to a
%% binary and the binary fields of the resulting map are converted back to
%% lists.  A thrown {error, _, _} becomes the return value.
parse(URIString) when is_binary(URIString) ->
    try
        parse_uri_reference(URIString, #{})
    catch
        throw:{error, _, _} = Error -> Error
    end;
parse(URIString) when is_list(URIString) ->
    try
        convert_mapfields_to_list(
          parse_uri_reference(unicode:characters_to_binary(URIString), #{}))
    catch
        throw:{error, _, _} = Error -> Error
    end.


%%-------------------------------------------------------------------------
%% Recompose URIs
%%-------------------------------------------------------------------------
-spec recompose(URIMap) -> URIString when
      URIMap :: uri_map(),
      URIString :: uri_string()
                 | error().
%% A map rejected by is_valid_map/1 yields {error, invalid_map, Map}.
%% Otherwise the result is built by running the update steps in order
%% (scheme, userinfo, host, port, path, query, fragment), starting from
%% the atom 'empty'.  A thrown {error, _, _} becomes the return value.
recompose(Map) ->
    case is_valid_map(Map) of
        true ->
            Steps = [fun update_scheme/2,
                     fun update_userinfo/2,
                     fun update_host/2,
                     fun update_port/2,
                     fun update_path/2,
                     fun update_query/2,
                     fun update_fragment/2],
            try
                lists:foldl(fun(Step, Acc) -> Step(Map, Acc) end, empty, Steps)
            catch
                throw:{error, _, _} = Error -> Error
            end;
        false ->
            {error, invalid_map, Map}
    end.


%%-------------------------------------------------------------------------
%% Resolve URIs
%%-------------------------------------------------------------------------
-spec resolve(RefURI, BaseURI) -> TargetURI when
      RefURI :: uri_string() | uri_map(),
      BaseURI :: uri_string() | uri_map(),
      TargetURI :: uri_string()
                 | error().
%% resolve/2 is resolve/3 called with an empty option list.
resolve(RefURI, BaseURI) ->
    resolve(RefURI, BaseURI, []).


-spec resolve(RefURI, BaseURI, Options) -> TargetURI when
      RefURI :: uri_string() | uri_map(),
      BaseURI :: uri_string() | uri_map(),
      Options :: [return_map],
      TargetURI :: uri_string() | uri_map()
                 | error().
%% A reference that is not a map is parsed first; a parse result that is
%% not a map is returned unchanged.  A non-map result of resolve_map/2 is
%% likewise returned unchanged.
resolve(RefURI, BaseURI, Options) when is_map(RefURI) ->
    Resolved = resolve_map(RefURI, BaseURI),
    if
        is_map(Resolved) ->
            case Options of
                [] ->
                    recompose(Resolved);
                [return_map] ->
                    Resolved
            end;
        true ->
            Resolved
    end;
resolve(RefURI, BaseURI, Options) ->
    Parsed = parse(RefURI),
    if
        is_map(Parsed) -> resolve(Parsed, BaseURI, Options);
        true -> Parsed
    end.


%%-------------------------------------------------------------------------
%% Transcode URIs
%%-------------------------------------------------------------------------
-spec transcode(URIString, Options) -> Result when
      URIString :: uri_string(),
      Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}],
      Result :: uri_string()
              | error().
%% Options in_encoding and out_encoding both default to utf8.
%% Binary input is converted to a list, transcoded and converted back to
%% a binary; list input is flattened and transcoded.
%% In both clauses every conversion step runs inside the try, so a thrown
%% {error, Atom, RestData} is returned as the error() tuple promised by
%% the spec.
%% NOTE(review): flatten_list/2 is defined elsewhere in this file; it is
%% kept inside the try on the assumption that it can throw like the other
%% conversion helpers do - confirm.
transcode(URIString, Options) when is_binary(URIString) ->
    try
        InEnc = proplists:get_value(in_encoding, Options, utf8),
        OutEnc = proplists:get_value(out_encoding, Options, utf8),
        List = convert_to_list(URIString, InEnc),
        Output = transcode(List, [], InEnc, OutEnc),
        convert_to_binary(Output, utf8, OutEnc)
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end;
transcode(URIString, Options) when is_list(URIString) ->
    try
        InEnc = proplists:get_value(in_encoding, Options, utf8),
        OutEnc = proplists:get_value(out_encoding, Options, utf8),
        Flattened = flatten_list(URIString, InEnc),
        transcode(Flattened, [], InEnc, OutEnc)
    catch
        throw:{error, Atom, RestData} -> {error, Atom, RestData}
    end.


%%-------------------------------------------------------------------------
%% Functions for working with the query part of a URI as a list
%% of key/value pairs.
%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
%% HTML 5.0 - 4.10.22.6 URL-encoded form data - non UTF-8
%%-------------------------------------------------------------------------

%%-------------------------------------------------------------------------
%% Compose urlencoded query string from a list of unescaped key/value pairs.
%% (application/x-www-form-urlencoded encoding algorithm)
%%-------------------------------------------------------------------------
-spec compose_query(QueryList) -> QueryString when
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}],
      QueryString :: uri_string()
                   | error().
%% compose_query/1 is compose_query/2 called with [{encoding, utf8}].
compose_query(List) ->
    compose_query(List, [{encoding, utf8}]).


-spec compose_query(QueryList, Options) -> QueryString when
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}],
      Options :: [{encoding, atom()}],
      QueryString :: uri_string()
                   | error().
%% An empty pair list yields []; otherwise the pairs are encoded one by
%% one and a thrown {error, _, _} becomes the return value.
compose_query([], _Options) ->
    [];
compose_query(List, Options) ->
    try
        compose_query(List, Options, false, <<>>)
    catch
        throw:{error, _, _} = Error -> Error
    end.

%% compose_query(Pairs, Options, IsList, Acc)
%% Acc is the binary built so far.  IsList becomes true as soon as any key
%% or value is a list; in that case the finished binary is converted to a
%% list before it is returned.  A value of 'true' emits the key alone,
%% without "=".
compose_query([], _Options, true, Acc) ->
    convert_to_list(Acc, utf8);
compose_query([], _Options, false, Acc) ->
    Acc;
compose_query([{Key, true} | Tail], Options, IsList, Acc) ->
    Sep = get_separator(Tail),
    EncKey = form_urlencode(Key, Options),
    compose_query(Tail, Options,
                  IsList orelse is_list(Key),
                  <<Acc/binary, EncKey/binary, Sep/binary>>);
compose_query([{Key, Value} | Tail], Options, IsList, Acc) ->
    Sep = get_separator(Tail),
    EncKey = form_urlencode(Key, Options),
    EncValue = form_urlencode(Value, Options),
    compose_query(Tail, Options,
                  IsList orelse is_list(Key) orelse is_list(Value),
                  <<Acc/binary, EncKey/binary, "=", EncValue/binary, Sep/binary>>).


%%-------------------------------------------------------------------------
%% Dissect a query string into a list of unescaped key/value pairs.
%% (application/x-www-form-urlencoded decoding algorithm)
%%-------------------------------------------------------------------------
-spec dissect_query(QueryString) -> QueryList when
      QueryString :: uri_string(),
      QueryList :: [{unicode:chardata(), unicode:chardata() | true}]
                 | error().
%% Empty input yields [].  List input is converted to a binary first and
%% flagged as list input (second argument 'true') for dissect_query_key/5.
%% A thrown {error, _, _} becomes the return value.
dissect_query(Empty) when Empty =:= <<>>; Empty =:= [] ->
    [];
dissect_query(QueryString) when is_list(QueryString) ->
    try
        dissect_query_key(convert_to_binary(QueryString, utf8, utf8),
                          true, [], <<>>, <<>>)
    catch
        throw:{error, _, _} = Error -> Error
    end;
dissect_query(QueryString) ->
    try
        dissect_query_key(QueryString, false, [], <<>>, <<>>)
    catch
        throw:{error, _, _} = Error -> Error
    end.


%%%========================================================================
%%% Internal functions
%%%========================================================================

%%-------------------------------------------------------------------------
%% Converts Map fields to lists
%%-------------------------------------------------------------------------
%% Only binary values are converted; every other value is kept as it is.
convert_mapfields_to_list(Map) ->
    maps:map(fun binary_field_to_list/2, Map).

binary_field_to_list(_Key, Value) when is_binary(Value) ->
    unicode:characters_to_list(Value);
binary_field_to_list(_Key, Value) ->
    Value.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 4.1. URI Reference]
%%
%%    URI-reference is used to denote the most common usage of a resource
%%    identifier.
%%
%%       URI-reference = URI / relative-ref
%%-------------------------------------------------------------------------
-spec parse_uri_reference(binary(), uri_map()) -> uri_map().
%% The empty string is a reference with an empty path.  Anything else is
%% first parsed as a URI with a scheme; if that throws any 3-tuple the same
%% input is parsed again as a relative reference.
parse_uri_reference(<<>>, _) ->
    #{path => <<>>};
parse_uri_reference(URIString, URI) ->
    try parse_scheme_start(URIString, URI) of
        Parsed -> Parsed
    catch
        throw:{_,_,_} ->
            parse_relative_part(URIString, URI)
    end.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 4.2. Relative Reference]
%%
%%    A relative reference takes advantage of the hierarchical syntax
%%    (Section 1.2.3) to express a URI reference relative to the name space
%%    of another hierarchical URI.
%%
%%    relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
%%
%%    relative-part = "//" authority path-abempty
%%                  / path-absolute
%%                  / path-noscheme
%%                  / path-empty
%%-------------------------------------------------------------------------
-spec parse_relative_part(binary(), uri_map()) -> uri_map().
%% Dispatches on the first character(s) of a relative reference.  Clauses
%% that may finish without having seen a path add an empty one through
%% maybe_add_path/1.
parse_relative_part(?STRING_REST("//", AfterSlashes), URI) ->
    %% "//" is NOT part of the authority.  Try the userinfo form first; if
    %% that throws any 3-tuple, parse the same input again as a host.
    try parse_userinfo(AfterSlashes, URI) of
        {Tail, Parsed} ->
            Userinfo = calculate_parsed_userinfo(AfterSlashes, Tail),
            (maybe_add_path(Parsed))#{userinfo => Userinfo}
    catch
        throw:{_,_,_} ->
            {Tail, Parsed} = parse_host(AfterSlashes, URI),
            Host = calculate_parsed_host_port(AfterSlashes, Tail),
            (maybe_add_path(Parsed))#{host => remove_brackets(Host)}
    end;
parse_relative_part(?STRING_REST($/, AfterSlash), URI) ->
    %% path-absolute
    {Tail, Parsed} = parse_segment(AfterSlash, URI),
    Path = calculate_parsed_part(AfterSlash, Tail),
    Parsed#{path => ?STRING_REST($/, Path)};
parse_relative_part(?STRING_REST($?, AfterMark), URI) ->
    %% path-empty ?query
    {Tail, Parsed} = parse_query(AfterMark, URI),
    Query = calculate_parsed_query_fragment(AfterMark, Tail),
    (maybe_add_path(Parsed))#{query => Query};
parse_relative_part(?STRING_REST($#, AfterHash), URI) ->
    %% path-empty #fragment
    {Tail, Parsed} = parse_fragment(AfterHash, URI),
    Fragment = calculate_parsed_query_fragment(AfterHash, Tail),
    (maybe_add_path(Parsed))#{fragment => Fragment};
parse_relative_part(?STRING_REST(Char, AfterChar), URI) ->
    %% path-noscheme: the first character must be allowed in segment-nz-nc
    is_segment_nz_nc(Char) orelse throw({error,invalid_uri,[Char]}),
    {Tail, Parsed} = parse_segment_nz_nc(AfterChar, URI),
    Path = calculate_parsed_part(AfterChar, Tail),
    Parsed#{path => ?STRING_REST(Char, Path)}.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.3. Path]
%%
%%    The path component contains data, usually organized in hierarchical
%%    form, that, along with data in the non-hierarchical query component
%%    (Section 3.4), serves to identify a resource within the scope of the
%%    URI's scheme and naming authority (if any).  The path is terminated
%%    by the first question mark ("?") or number sign ("#") character, or
%%    by the end of the URI.
%%
%%    path          = path-abempty    ; begins with "/" or is empty
%%                  / path-absolute   ; begins with "/" but not "//"
%%                  / path-noscheme   ; begins with a non-colon segment
%%                  / path-rootless   ; begins with a segment
%%                  / path-empty      ; zero characters
%%
%%    path-abempty  = *( "/" segment )
%%    path-absolute = "/" [ segment-nz *( "/" segment ) ]
%%    path-noscheme = segment-nz-nc *( "/" segment )
%%    path-rootless = segment-nz *( "/" segment )
%%    path-empty    = 0<pchar>
%%    segment       = *pchar
%%    segment-nz    = 1*pchar
%%    segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
%%                  ; non-zero-length segment without any colon ":"
%%
%%    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
%%-------------------------------------------------------------------------

%%-------------------------------------------------------------------------
%%    path-abempty
%%-------------------------------------------------------------------------
%% Consumes "/" and pchar characters.  At "?" or "#" the query or fragment
%% is parsed and stored in the map, and the input following that delimiter
%% is returned.  At the end of input the empty binary is returned.  Any
%% other character throws {error, invalid_uri, [Char]}.
-spec parse_segment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_segment(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI};
parse_segment(?STRING_REST($/, Tail), URI) ->
    parse_segment(Tail, URI);  % segment
parse_segment(?STRING_REST($?, Tail), URI) ->
    {QueryTail, Parsed} = parse_query(Tail, URI),  % ?query
    Query = calculate_parsed_query_fragment(Tail, QueryTail),
    {Tail, Parsed#{query => Query}};
parse_segment(?STRING_REST($#, Tail), URI) ->
    {FragmentTail, Parsed} = parse_fragment(Tail, URI),
    Fragment = calculate_parsed_query_fragment(Tail, FragmentTail),
    {Tail, Parsed#{fragment => Fragment}};
parse_segment(?STRING_REST(Char, Tail), URI) ->
    is_pchar(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_segment(Tail, URI).


%%-------------------------------------------------------------------------
%%    path-noscheme
%%-------------------------------------------------------------------------
%% Consumes segment-nz-nc characters (no colon).  After the first "/" the
%% rest of the path is handled by parse_segment/2.  At "?" or "#" the query
%% or fragment is parsed and stored in the map, and the input following
%% that delimiter is returned.  Any other character throws
%% {error, invalid_uri, [Char]}.
-spec parse_segment_nz_nc(binary(), uri_map()) -> {binary(), uri_map()}.
parse_segment_nz_nc(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI};
parse_segment_nz_nc(?STRING_REST($/, Tail), URI) ->
    parse_segment(Tail, URI);  % segment
parse_segment_nz_nc(?STRING_REST($?, Tail), URI) ->
    {QueryTail, Parsed} = parse_query(Tail, URI),  % ?query
    Query = calculate_parsed_query_fragment(Tail, QueryTail),
    {Tail, Parsed#{query => Query}};
parse_segment_nz_nc(?STRING_REST($#, Tail), URI) ->
    {FragmentTail, Parsed} = parse_fragment(Tail, URI),
    Fragment = calculate_parsed_query_fragment(Tail, FragmentTail),
    {Tail, Parsed#{fragment => Fragment}};
parse_segment_nz_nc(?STRING_REST(Char, Tail), URI) ->
    is_segment_nz_nc(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_segment_nz_nc(Tail, URI).


%% Check if char is pchar.
%% "%" is accepted as the start of a pct-encoded triplet.
-spec is_pchar(char()) -> boolean().
is_pchar(Char) when Char =:= $%; Char =:= $:; Char =:= $@ ->
    true;
is_pchar(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).

%% Check if char is segment_nz_nc.
%% Same as pchar, except that ":" is not accepted.
-spec is_segment_nz_nc(char()) -> boolean().
is_segment_nz_nc(Char) when Char =:= $%; Char =:= $@ ->
    true;
is_segment_nz_nc(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.1. Scheme]
%%
%% Each URI begins with a scheme name that refers to a specification for
%% assigning identifiers within that scheme.
%%
%%    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
%%-------------------------------------------------------------------------
-spec parse_scheme_start(binary(), uri_map()) -> uri_map().
%% The first character of a scheme must be ALPHA; otherwise
%% {error, invalid_uri, [Char]} is thrown.  The remainder of the URI is
%% parsed through parse_scheme/2 and the scheme is stored in the map.
parse_scheme_start(?STRING_REST(Char, Rest), URI) ->
    is_alpha(Char) orelse throw({error,invalid_uri,[Char]}),
    {Tail, Parsed} = parse_scheme(Rest, URI),
    Scheme = calculate_parsed_scheme(Rest, Tail),
    (maybe_add_path(Parsed))#{scheme => ?STRING_REST(Char, Scheme)}.

%% Add an empty path component if none was set while parsing the URI.
%% According to the URI specification every URI-reference has a path
%% component, and it can be empty.
maybe_add_path(#{path := _} = Map) ->
    Map;
maybe_add_path(Map) ->
    Map#{path => <<>>}.



%% Consumes scheme characters up to ":" and hands the remainder to
%% parse_hier/2.  A scheme that is not terminated by ":" or that contains
%% a character not allowed in a scheme throws an invalid_uri error.
-spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}.
parse_scheme(?STRING_EMPTY, _URI) ->
    throw({error,invalid_uri,<<>>});
parse_scheme(?STRING_REST($:, Tail), URI) ->
    {_, Parsed} = parse_hier(Tail, URI),
    {Tail, Parsed};
parse_scheme(?STRING_REST(Char, Tail), URI) ->
    is_scheme(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_scheme(Tail, URI).


%% Check if char is allowed in scheme
-spec is_scheme(char()) -> boolean().
is_scheme(Char) when Char =:= $+; Char =:= $-; Char =:= $. ->
    true;
is_scheme(Char) ->
    is_alpha(Char) orelse is_digit(Char).


%%-------------------------------------------------------------------------
%%    hier-part   = "//" authority path-abempty
%%                   / path-absolute
%%                   / path-rootless
%%                   / path-empty
%%-------------------------------------------------------------------------
-spec parse_hier(binary(), uri_map()) -> {binary(), uri_map()}.
%% Dispatches on the first character(s) of the hier-part.  Every clause
%% that consumes a delimiter returns the input following that delimiter
%% together with the updated map.
parse_hier(?STRING_EMPTY, URI) ->
    {<<>>, URI};
parse_hier(?STRING_REST("//", AfterSlashes), URI) ->
    %% "//" is NOT part of the authority.  Try the userinfo form first; if
    %% that throws any 3-tuple, parse the same input again as a host.
    try parse_userinfo(AfterSlashes, URI) of
        {Tail, Parsed} ->
            Userinfo = calculate_parsed_userinfo(AfterSlashes, Tail),
            {AfterSlashes, Parsed#{userinfo => Userinfo}}
    catch
        throw:{_,_,_} ->
            {Tail, Parsed} = parse_host(AfterSlashes, URI),
            Host = calculate_parsed_host_port(AfterSlashes, Tail),
            {AfterSlashes, Parsed#{host => remove_brackets(Host)}}
    end;
parse_hier(?STRING_REST($/, AfterSlash), URI) ->
    %% path-absolute
    {Tail, Parsed} = parse_segment(AfterSlash, URI),
    Path = calculate_parsed_part(AfterSlash, Tail),
    {AfterSlash, Parsed#{path => ?STRING_REST($/, Path)}};
parse_hier(?STRING_REST($?, AfterMark), URI) ->
    %% path-empty ?query
    {Tail, Parsed} = parse_query(AfterMark, URI),
    Query = calculate_parsed_query_fragment(AfterMark, Tail),
    {AfterMark, Parsed#{query => Query}};
parse_hier(?STRING_REST($#, AfterHash), URI) ->
    %% path-empty #fragment
    {Tail, Parsed} = parse_fragment(AfterHash, URI),
    Fragment = calculate_parsed_query_fragment(AfterHash, Tail),
    {AfterHash, Parsed#{fragment => Fragment}};
parse_hier(?STRING_REST(Char, AfterChar), URI) ->
    %% path-rootless: the first character must be a pchar (segment-nz)
    is_pchar(Char) orelse throw({error,invalid_uri,[Char]}),
    {Tail, Parsed} = parse_segment(AfterChar, URI),
    Path = calculate_parsed_part(AfterChar, Tail),
    {AfterChar, Parsed#{path => ?STRING_REST(Char, Path)}}.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2. Authority]
%%
%%    Many URI schemes include a hierarchical element for a naming
%%    authority so that governance of the name space defined by the
%%    remainder of the URI is delegated to that authority (which may, in
%%    turn, delegate it further).
%%
%%    The authority component is preceded by a double slash ("//") and is
%%    terminated by the next slash ("/"), question mark ("?"), or number
%%    sign ("#") character, or by the end of the URI.
%%
%%    authority   = [ userinfo "@" ] host [ ":" port ]
%%
%%
%% [RFC 3986, Chapter 3.2.1. User Information]
%%
%%    The userinfo subcomponent may consist of a user name and, optionally,
%%    scheme-specific information about how to gain authorization to access
%%    the resource.  The user information, if present, is followed by a
%%    commercial at-sign ("@") that delimits it from the host.
%%
%%    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
%%-------------------------------------------------------------------------
%% Consumes userinfo characters up to "@".  An "@" that ends the input
%% sets an empty host; otherwise the host is parsed from what follows and
%% the input after "@" is returned.  Input that ends before "@" or that
%% contains a character not allowed in userinfo throws an invalid_uri
%% error.
-spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}.
parse_userinfo(?CHAR($@), URI) ->
    {?STRING_EMPTY, URI#{host => <<>>}};
parse_userinfo(?STRING_REST($@, AfterAt), URI) ->
    {Tail, Parsed} = parse_host(AfterAt, URI),
    Host = calculate_parsed_host_port(AfterAt, Tail),
    {AfterAt, Parsed#{host => remove_brackets(Host)}};
parse_userinfo(?STRING_REST(Char, Tail), URI) ->
    is_userinfo(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_userinfo(Tail, URI);
parse_userinfo(?STRING_EMPTY, _URI) ->
    %% URI cannot end in userinfo state
    throw({error,invalid_uri,<<>>}).


%% Check if char is allowed in userinfo
%% "%" is accepted as the start of a pct-encoded triplet.
-spec is_userinfo(char()) -> boolean().
is_userinfo(Char) when Char =:= $%; Char =:= $: ->
    true;
is_userinfo(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2.2. Host]
%%
%%    The host subcomponent of authority is identified by an IP literal
%%    encapsulated within square brackets, an IPv4 address in dotted-
%%    decimal form, or a registered name.
%%
%%    host        = IP-literal / IPv4address / reg-name
%%
%%    IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
%%
%%    IPvFuture     = "v" 1*HEXDIG "."
%%                    1*( unreserved / sub-delims / ":" )
%%
%%    IPv6address   =                            6( h16 ":" ) ls32
%%                  /                       "::" 5( h16 ":" ) ls32
%%                  / [               h16 ] "::" 4( h16 ":" ) ls32
%%                  / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
%%                  / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
%%                  / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
%%                  / [ *4( h16 ":" ) h16 ] "::"              ls32
%%                  / [ *5( h16 ":" ) h16 ] "::"              h16
%%                  / [ *6( h16 ":" ) h16 ] "::"
%%
%%    ls32          = ( h16 ":" h16 ) / IPv4address
%%                  ; least-significant 32 bits of address
%%
%%    h16           = 1*4HEXDIG
%%                  ; 16 bits of address represented in hexadecimal
%%
%%    IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
%%
%%    dec-octet     = DIGIT                 ; 0-9
%%                  / %x31-39 DIGIT         ; 10-99
%%                  / "1" 2DIGIT            ; 100-199
%%                  / "2" %x30-34 DIGIT     ; 200-249
%%                  / "25" %x30-35          ; 250-255
%%
%%    reg-name    = *( unreserved / pct-encoded / sub-delims )
%%-------------------------------------------------------------------------
%% Dispatches on the first character of the host.  ":" starts a port,
%% "/", "?" and "#" start the path, query and fragment, and "[" starts a
%% bracketed IP literal.  A host that starts with a digit is tried as an
%% IPv4 address first; if that throws any 3-tuple, or if the host starts
%% with any other character, the whole input is parsed as a reg-name.
-spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}.
parse_host(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI};
parse_host(?STRING_REST($:, AfterColon), URI) ->
    {Tail, Parsed} = parse_port(AfterColon, URI),
    PortPart = calculate_parsed_host_port(AfterColon, Tail),
    Port = get_port(PortPart),
    {AfterColon, Parsed#{port => Port}};
parse_host(?STRING_REST($/, AfterSlash), URI) ->
    %% path-abempty
    {Tail, Parsed} = parse_segment(AfterSlash, URI),
    Path = calculate_parsed_part(AfterSlash, Tail),
    {AfterSlash, Parsed#{path => ?STRING_REST($/, Path)}};
parse_host(?STRING_REST($?, AfterMark), URI) ->
    %% path-empty ?query
    {Tail, Parsed} = parse_query(AfterMark, URI),
    Query = calculate_parsed_query_fragment(AfterMark, Tail),
    {AfterMark, Parsed#{query => Query}};
parse_host(?STRING_REST($[, AfterBracket), URI) ->
    parse_ipv6_bin(AfterBracket, [], URI);
parse_host(?STRING_REST($#, AfterHash), URI) ->
    %% path-empty #fragment
    {Tail, Parsed} = parse_fragment(AfterHash, URI),
    Fragment = calculate_parsed_query_fragment(AfterHash, Tail),
    {AfterHash, Parsed#{fragment => Fragment}};
parse_host(?STRING_REST(Char, Tail) = Input, URI) ->
    case is_digit(Char) of
        false ->
            parse_reg_name(Input, URI);
        true ->
            try
                parse_ipv4_bin(Tail, [Char], URI)
            catch
                throw:{_,_,_} ->
                    parse_reg_name(Input, URI)
            end
    end.


-spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}.
%% Scans a registered name.  A ":", "/", "?" or "#" ends the name and starts
%% the port, path, query or fragment, which is parsed and stored in the map.
%% Any other character has to satisfy is_reg_name/1, otherwise
%% {error,invalid_uri,[Char]} is thrown.
parse_reg_name(?STRING_REST($:, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_port(Rest, URI),
    PortBin = calculate_parsed_host_port(Rest, Unparsed),
    {Rest, ParsedURI#{port => get_port(PortBin)}};
parse_reg_name(?STRING_REST($/, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_segment(Rest, URI),  % path-abempty
    Segments = calculate_parsed_part(Rest, Unparsed),
    {Rest, ParsedURI#{path => ?STRING_REST($/, Segments)}};
parse_reg_name(?STRING_REST($?, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_query(Rest, URI),  % path-empty ?query
    {Rest, ParsedURI#{query => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_reg_name(?STRING_REST($#, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_fragment(Rest, URI),  % path-empty
    {Rest, ParsedURI#{fragment => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_reg_name(?STRING_REST(Char, Rest), URI) ->
    is_reg_name(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_reg_name(Rest, URI);
parse_reg_name(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% True when Char may appear in a reg-name: "%" (pct-encoded), an unreserved
%% character or a sub-delim.
-spec is_reg_name(char()) -> boolean().
is_reg_name(Char) ->
    Char =:= $% orelse is_unreserved(Char) orelse is_sub_delim(Char).


-spec parse_ipv4_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
%% Scans an IPv4 address.  Acc collects the address characters in reverse
%% order; as soon as a delimiter (":", "/", "?", "#") or the end of input is
%% reached the collected address is validated, and the component that follows
%% the delimiter is parsed and stored in the map.
%% Throws {error,invalid_uri,_} for a character that is neither a digit nor
%% "." and for an address that fails validation.
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {Unparsed, ParsedURI} = parse_port(Rest, URI),
    PortBin = calculate_parsed_host_port(Rest, Unparsed),
    {Rest, ParsedURI#{port => get_port(PortBin)}};
parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {Unparsed, ParsedURI} = parse_segment(Rest, URI),  % path-abempty
    Segments = calculate_parsed_part(Rest, Unparsed),
    {Rest, ParsedURI#{path => ?STRING_REST($/, Segments)}};
parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {Unparsed, ParsedURI} = parse_query(Rest, URI),  % path-empty ?query
    {Rest, ParsedURI#{query => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {Unparsed, ParsedURI} = parse_fragment(Rest, URI),  % path-empty
    {Rest, ParsedURI#{fragment => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) ->
    is_ipv4(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_ipv4_bin(Rest, [Char|Acc], URI);
parse_ipv4_bin(?STRING_EMPTY, Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {?STRING_EMPTY, URI}.


%% True when Char may appear in an IPv4 address: a digit or ".".
-spec is_ipv4(char()) -> boolean().
is_ipv4(Char) ->
    Char =:= $. orelse is_digit(Char).

%% Returns Addr unchanged when inet:parse_ipv4strict_address/1 accepts it
%% (checked through is_ipv4_address/1); throws {error,invalid_uri,Addr}
%% otherwise.
-spec validate_ipv4_address(list()) -> list().
validate_ipv4_address(Addr) ->
    is_ipv4_address(Addr) orelse throw({error,invalid_uri,Addr}),
    Addr.


-spec parse_ipv6_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
%% Scans the inside of a bracketed IP literal.  Acc collects the address
%% characters in reverse order until the closing "]" is found; the address is
%% then validated and parse_ipv6_bin_end/2 takes over.
%% Throws {error,invalid_uri,_} for a character rejected by is_ipv6/1, for an
%% address that fails validation and when the input ends before "]".
parse_ipv6_bin(?STRING_REST($], Rest), Acc, URI) ->
    _ = validate_ipv6_address(lists:reverse(Acc)),
    parse_ipv6_bin_end(Rest, URI);
parse_ipv6_bin(?STRING_REST(Char, Rest), Acc, URI) ->
    is_ipv6(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_ipv6_bin(Rest, [Char|Acc], URI);
parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) ->
    throw({error,invalid_uri,<<>>}).

%% True when Char may appear in an IPv6 address: ":", "." or a hex digit.
-spec is_ipv6(char()) -> boolean().
is_ipv6(Char) ->
    Char =:= $: orelse Char =:= $. orelse is_hex_digit(Char).


%% Handles the input that follows the closing "]" of an IP literal: a
%% delimiter starts the port, path, query or fragment, which is parsed and
%% stored in the map.
%% NOTE(review): characters accepted by is_ipv6/1 are skipped here without
%% further validation - confirm that this is intended.
-spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}.
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_port(Rest, URI),
    PortBin = calculate_parsed_host_port(Rest, Unparsed),
    {Rest, ParsedURI#{port => get_port(PortBin)}};
parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_segment(Rest, URI),  % path-abempty
    Segments = calculate_parsed_part(Rest, Unparsed),
    {Rest, ParsedURI#{path => ?STRING_REST($/, Segments)}};
parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_query(Rest, URI),  % path-empty ?query
    {Rest, ParsedURI#{query => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_fragment(Rest, URI),  % path-empty
    {Rest, ParsedURI#{fragment => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) ->
    is_ipv6(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_ipv6_bin_end(Rest, URI);
parse_ipv6_bin_end(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

-spec validate_ipv6_address(list()) -> list().
%% Returns Addr unchanged when inet:parse_ipv6strict_address/1 accepts it
%% (checked through is_ipv6_address/1); throws {error,invalid_uri,Addr}
%% otherwise.
validate_ipv6_address(Addr) ->
    is_ipv6_address(Addr) orelse throw({error,invalid_uri,Addr}),
    Addr.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2.3. Port]
%%
%% The port subcomponent of authority is designated by an optional port
%% number in decimal following the host and delimited from it by a
%% single colon (":") character.
%%
%%    port        = *DIGIT
%%-------------------------------------------------------------------------
%% Skips over the digits of the port.  A "/", "?" or "#" ends the port and
%% starts the path, query or fragment, which is parsed and stored in the map.
%% Throws {error,invalid_uri,[Char]} for any other non-digit character.
-spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}.
parse_port(?STRING_REST($/, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_segment(Rest, URI),  % path-abempty
    Segments = calculate_parsed_part(Rest, Unparsed),
    {Rest, ParsedURI#{path => ?STRING_REST($/, Segments)}};
parse_port(?STRING_REST($?, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_query(Rest, URI),  % path-empty ?query
    {Rest, ParsedURI#{query => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_port(?STRING_REST($#, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_fragment(Rest, URI),  % path-empty
    {Rest, ParsedURI#{fragment => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_port(?STRING_REST(Char, Rest), URI) ->
    is_digit(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_port(Rest, URI);
parse_port(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.4. Query]
%%
%% The query component contains non-hierarchical data that, along with
%% data in the path component (Section 3.3), serves to identify a
%% resource within the scope of the URI's scheme and naming authority
%% (if any).  The query component is indicated by the first question
%% mark ("?") character and terminated by a number sign ("#") character
%% or by the end of the URI.
%%
%%    query       = *( pchar / "/" / "?" )
%%-------------------------------------------------------------------------
%% Skips over the characters of the query.  A "#" ends the query and starts
%% the fragment, which is parsed and stored in the map.
%% Throws {error,invalid_uri,[Char]} for a character rejected by is_query/1.
-spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}.
parse_query(?STRING_REST($#, Rest), URI) ->
    {Unparsed, ParsedURI} = parse_fragment(Rest, URI),
    {Rest, ParsedURI#{fragment => calculate_parsed_query_fragment(Rest, Unparsed)}};
parse_query(?STRING_REST(Char, Rest), URI) ->
    is_query(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_query(Rest, URI);
parse_query(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.


%% True when Char may appear in a query: "/", "?" or a pchar.
-spec is_query(char()) -> boolean().
is_query(Char) ->
    Char =:= $/ orelse Char =:= $? orelse is_pchar(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.5. Fragment]
%%
%% The fragment identifier component of a URI allows indirect
%% identification of a secondary resource by reference to a primary
%% resource and additional identifying information.
%%
%%    fragment    = *( pchar / "/" / "?" )
%%-------------------------------------------------------------------------
%% Skips over the characters of the fragment up to the end of the input.
%% Throws {error,invalid_uri,[Char]} for a character rejected by
%% is_fragment/1.
-spec parse_fragment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_fragment(?STRING_REST(Char, Rest), URI) ->
    is_fragment(Char) orelse throw({error,invalid_uri,[Char]}),
    parse_fragment(Rest, URI);
parse_fragment(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.


%% True when Char may appear in a fragment: "/", "?" or a pchar.
-spec is_fragment(char()) -> boolean().
is_fragment(Char) ->
    Char =:= $/ orelse Char =:= $? orelse is_pchar(Char).
%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 2.2. Reserved Characters]
%%
%%   reserved    = gen-delims / sub-delims
%%
%%   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
%%
%%   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
%%               / "*" / "+" / "," / ";" / "="
%%
%%-------------------------------------------------------------------------

%% True when Char is reserved, i.e. it is a gen-delim or a sub-delim.
-spec is_reserved(char()) -> boolean().
is_reserved(Char)
  when Char =:= $:;
       Char =:= $/;
       Char =:= $?;
       Char =:= $#;
       Char =:= $[;
       Char =:= $];
       Char =:= $@ ->
    true;
is_reserved(Char) ->
    is_sub_delim(Char).


%% True when Char is one of the sub-delims listed above.
-spec is_sub_delim(char()) -> boolean().
is_sub_delim(Char)
  when Char =:= $!;
       Char =:= $$;
       Char =:= $&;
       Char =:= $';
       Char =:= $(;
       Char =:= $);
       Char =:= $*;
       Char =:= $+;
       Char =:= $,;
       Char =:= $;;
       Char =:= $= ->
    true;
is_sub_delim(_) ->
    false.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 2.3. Unreserved Characters]
%%
%%   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
%%
%%-------------------------------------------------------------------------
-spec is_unreserved(char()) -> boolean().
is_unreserved(Char)
  when Char =:= $-;
       Char =:= $.;
       Char =:= $_;
       Char =:= $~ ->
    true;
is_unreserved(Char) ->
    is_alpha(Char) orelse is_digit(Char).

%% True for the ASCII letters A-Z and a-z.
-spec is_alpha(char()) -> boolean().
is_alpha(C) ->
    ($A =< C andalso C =< $Z) orelse ($a =< C andalso C =< $z).

%% True for the ASCII digits 0-9.
-spec is_digit(char()) -> boolean().
is_digit(C) ->
    $0 =< C andalso C =< $9.

%% True for the ASCII hexadecimal digits 0-9, a-f and A-F.
-spec is_hex_digit(char()) -> boolean().
is_hex_digit(C) ->
    ($0 =< C andalso C =< $9)
        orelse ($a =< C andalso C =< $f)
        orelse ($A =< C andalso C =< $F).


%% Removes the brackets enclosing a binary.  A binary that does not start
%% with "[" is returned unchanged.  When it starts with "[" but does not end
%% with "]", only the leading "[" is dropped.
-spec remove_brackets(binary()) -> binary().
remove_brackets(<<$[/utf8, Rest/binary>>) ->
    case split_binary(Rest, byte_size(Rest) - 1) of
        {Inner, <<$]/utf8>>} -> Inner;
        _ -> Rest
    end;
remove_brackets(Addr) ->
    Addr.


%%-------------------------------------------------------------------------
%% Helper functions for calculating the parsed binary.
%%
%% Each of them takes the input a parser was started with and the remainder
%% that parser returned.  With an empty remainder the input itself is the
%% result, except that one trailing delimiter (the set differs per helper) is
%% stripped; otherwise get_parsed_binary/2 computes the result.
%%-------------------------------------------------------------------------
-spec calculate_parsed_scheme(binary(), binary()) -> binary().
calculate_parsed_scheme(Input, <<>>) ->
    strip_last_char(Input, ":");
calculate_parsed_scheme(Input, Remaining) ->
    get_parsed_binary(Input, Remaining).


-spec calculate_parsed_part(binary(), binary()) -> binary().
calculate_parsed_part(Input, <<>>) ->
    strip_last_char(Input, "?#");
calculate_parsed_part(Input, Remaining) ->
    get_parsed_binary(Input, Remaining).


-spec calculate_parsed_userinfo(binary(), binary()) -> binary().
calculate_parsed_userinfo(Input, <<>>) ->
    strip_last_char(Input, "?#@");
calculate_parsed_userinfo(Input, Remaining) ->
    get_parsed_binary(Input, Remaining).


-spec calculate_parsed_host_port(binary(), binary()) -> binary().
%% Parsed host or port: with an empty remainder the input minus one trailing
%% ":", "?", "#" or "/"; otherwise whatever get_parsed_binary/2 computes.
calculate_parsed_host_port(Input, <<>>) ->
    strip_last_char(Input, ":?#/");
calculate_parsed_host_port(Input, Remaining) ->
    get_parsed_binary(Input, Remaining).


%% Parsed query or fragment: with an empty remainder the input minus one
%% trailing "#"; otherwise whatever get_parsed_binary/2 computes.
calculate_parsed_query_fragment(Input, <<>>) ->
    strip_last_char(Input, "#");
calculate_parsed_query_fragment(Input, Remaining) ->
    get_parsed_binary(Input, Remaining).


%% Converts the parsed port to an integer.  An empty binary yields
%% 'undefined'; a binary that is not an integer makes the function throw
%% {error, invalid_uri, Binary}.
get_port(<<>>) ->
    undefined;
get_port(PortBin) ->
    try binary_to_integer(PortBin) of
        Port -> Port
    catch
        error:badarg ->
            throw({error, invalid_uri, PortBin})
    end.


%% Strip last char if it is in list
%%
%% Only lists of one to four characters are supported.
%%
%% This function is optimized for speed: parse/1 is about 10% faster than
%% with an alternative implementation based on lists and sets.
strip_last_char(<<>>, _) ->
    <<>>;
strip_last_char(Input, [C0]) ->
    case binary:last(Input) of
        Last when Last =:= C0 ->
            init_binary(Input);
        _ ->
            Input
    end;
strip_last_char(Input, [C0,C1]) ->
    case binary:last(Input) of
        Last when Last =:= C0; Last =:= C1 ->
            init_binary(Input);
        _ ->
            Input
    end;
strip_last_char(Input, [C0,C1,C2]) ->
    case binary:last(Input) of
        Last when Last =:= C0; Last =:= C1; Last =:= C2 ->
            init_binary(Input);
        _ ->
            Input
    end;
strip_last_char(Input, [C0,C1,C2,C3]) ->
    case binary:last(Input) of
        Last when Last =:= C0; Last =:= C1; Last =:= C2; Last =:= C3 ->
            init_binary(Input);
        _ ->
            Input
    end.


%% Get parsed binary: the leading part of Input that is left after cutting
%% byte_size_exl_head(Unparsed) bytes off its end.
get_parsed_binary(Input, Unparsed) ->
    ParsedSize = byte_size(Input) - byte_size_exl_head(Unparsed),
    element(1, split_binary(Input, ParsedSize)).


%% Return all bytes of the binary except the last one. The binary must be non-empty.
init_binary(B) ->
    element(1, split_binary(B, byte_size(B) - 1)).
%% Returns 0 for an empty binary and byte_size(Binary) + 1 otherwise.
%% Used in calls to split_binary().
%% NOTE(review): the extra byte presumably accounts for the one-byte
%% delimiter that precedes the unparsed remainder - confirm against the
%% callers of get_parsed_binary/2.
-spec byte_size_exl_head(binary()) -> number().
byte_size_exl_head(Binary) ->
    case byte_size(Binary) of
        0 -> 0;
        Size -> Size + 1
    end.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 2.1. Percent-Encoding]
%%
%% A percent-encoding mechanism is used to represent a data octet in a
%% component when that octet's corresponding character is outside the
%% allowed set or is being used as a delimiter of, or within, the
%% component.  A percent-encoded octet is encoded as a character
%% triplet, consisting of the percent character "%" followed by the two
%% hexadecimal digits representing that octet's numeric value.  For
%% example, "%20" is the percent-encoding for the binary octet
%% "00100000" (ABNF: %x20), which in US-ASCII corresponds to the space
%% character (SP).  Section 2.4 describes when percent-encoding and
%% decoding is applied.
%%
%%    pct-encoded = "%" HEXDIG HEXDIG
%%-------------------------------------------------------------------------

%%-------------------------------------------------------------------------
%% Percent-encode
%%-------------------------------------------------------------------------

%% Only validates, as a scheme cannot have percent-encoded characters.
%% Returns Scheme unchanged when validate_scheme/1 accepts it; throws
%% {error,invalid_scheme,Scheme} for an empty or rejected scheme.
-spec encode_scheme(list()|binary()) -> list() | binary().
encode_scheme(Empty) when Empty =:= []; Empty =:= <<>> ->
    throw({error,invalid_scheme,Empty});
encode_scheme(Scheme) ->
    validate_scheme(Scheme) orelse throw({error,invalid_scheme,Scheme}),
    Scheme.

%% Percent-encodes every character of the userinfo that is_userinfo/1
%% rejects.
-spec encode_userinfo(list()|binary()) -> list() | binary().
encode_userinfo(Userinfo) ->
    encode(Userinfo, fun is_userinfo/1).
%% Encodes a host according to its class (see classify_host/1): reg-names and
%% IPv4 addresses are returned as they are, IPv6 addresses are put in
%% brackets, anything else is percent-encoded.
-spec encode_host(list()|binary()) -> list() | binary().
encode_host(Host) ->
    case classify_host(Host) of
        ipv6 -> bracket_ipv6(Host);
        other -> encode(Host, fun is_reg_name/1);
        Class when Class =:= regname; Class =:= ipv4 -> Host
    end.

%% Percent-encodes every character of the path that is_path/1 rejects.
-spec encode_path(list()|binary()) -> list() | binary().
encode_path(Path) ->
    encode(Path, fun is_path/1).

%% Percent-encodes every character of the query that is_query/1 rejects.
-spec encode_query(list()|binary()) -> list() | binary().
encode_query(Query) ->
    encode(Query, fun is_query/1).

%% Percent-encodes every character of the fragment that is_fragment/1
%% rejects.
-spec encode_fragment(list()|binary()) -> list() | binary().
encode_fragment(Fragment) ->
    encode(Fragment, fun is_fragment/1).

%%-------------------------------------------------------------------------
%% Helper functions for percent-decode
%%-------------------------------------------------------------------------

%% Decodes the percent-encoded triplets of Cs.  A list is converted to a
%% binary, decoded and converted back to a list.  Triplets that decode to a
%% reserved character are kept encoded (with uppercase hex digits).
%% Throws {error,invalid_percent_encoding,_} for "%" followed by two
%% characters that are not both hex digits; the result is finally passed
%% through check_utf8/1.
-spec decode(list()|binary()) -> list() | binary().
decode(Cs) ->
    decode(Cs, <<>>).
%%
decode(Chars, Acc) when is_list(Chars) ->
    Encoded = unicode:characters_to_binary(Chars),
    Decoded = decode(Encoded, Acc),
    unicode:characters_to_list(Decoded);
decode(<<$%,Hi,Lo,Tail/binary>>, Acc) ->
    (is_hex_digit(Hi) andalso is_hex_digit(Lo))
        orelse throw({error,invalid_percent_encoding,<<$%,Hi,Lo>>}),
    Octet = ?HEX2DEC(Hi)*16+?HEX2DEC(Lo),
    case is_reserved(Octet) of
        false ->
            decode(Tail, <<Acc/binary, Octet>>);
        true ->
            %% [2.2] Characters in the reserved set are protected from
            %%       normalization.
            %% [2.1] For consistency, URI producers and normalizers should
            %%       use uppercase hexadecimal digits for all percent-
            %%       encodings.
            UpperHi = hex_to_upper(Hi),
            UpperLo = hex_to_upper(Lo),
            decode(Tail, <<Acc/binary,$%,UpperHi,UpperLo>>)
    end;
decode(<<Byte,Tail/binary>>, Acc) ->
    %% Any other byte - including a "%" with fewer than two bytes after it -
    %% is copied as it is.
    decode(Tail, <<Acc/binary, Byte>>);
decode(<<>>, Acc) ->
    check_utf8(Acc).

%% Returns Cs if it is utf8 encoded.
check_utf8(Cs) ->
    %% Both failure shapes of unicode:characters_to_list/1 are reported as
    %% invalid_utf8; the converted list itself is discarded.
    case unicode:characters_to_list(Cs) of
        {Tag,_,_} when Tag =:= incomplete; Tag =:= error ->
            throw({error,invalid_utf8,Cs});
        _ ->
            Cs
    end.

%% Convert hex digit to uppercase form.  Throws {error,invalid_input, H}
%% when H is not a hex digit.
hex_to_upper(H) ->
    if
        $a =< H, H =< $f -> H - ($a - $A);
        $0 =< H, H =< $9 -> H;
        $A =< H, H =< $F -> H;
        true -> throw({error,invalid_input, H})
    end.

%% True when Char may appear in a host: ":", an unreserved character or a
%% sub-delim.
-spec is_host(char()) -> boolean().
is_host(Char) ->
    Char =:= $: orelse is_unreserved(Char) orelse is_sub_delim(Char).

%% True when Char may appear in a path: "/" or a pchar.
-spec is_path(char()) -> boolean().
is_path(Char) ->
    Char =:= $/ orelse is_pchar(Char).


%%-------------------------------------------------------------------------
%% Helper functions for percent-encode
%%-------------------------------------------------------------------------
%% Percent-encodes every codepoint of Component for which Fun returns false.
%% A list is converted to a binary, encoded and converted back to a list.
%% Throws {error,invalid_input,Rest} at the first byte that does not start a
%% valid utf8 codepoint, Rest being the input from that byte on.
-spec encode(list()|binary(), fun()) -> list() | binary().
encode(Component, Fun) when is_binary(Component) ->
    encode(Component, Fun, <<>>);
encode(Component, Fun) when is_list(Component) ->
    Bin = unicode:characters_to_binary(Component),
    unicode:characters_to_list(encode(Bin, Fun, <<>>)).
%%
encode(<<>>, _Fun, Acc) ->
    Acc;
encode(<<Codepoint/utf8, Rest/binary>>, Fun, Acc) ->
    Encoded = encode_codepoint_binary(Codepoint, Fun),
    encode(Rest, Fun, <<Acc/binary,Encoded/binary>>);
encode(<<_, _/binary>> = Invalid, _Fun, _Acc) ->
    throw({error,invalid_input,Invalid}).


%% Returns the codepoint as a one-byte binary when Fun accepts it and its
%% percent-encoded form otherwise.
-spec encode_codepoint_binary(integer(), fun()) -> binary().
encode_codepoint_binary(Codepoint, Fun) ->
    case Fun(Codepoint) of
        true -> <<Codepoint>>;
        false -> percent_encode_binary(Codepoint)
    end.


%% Percent-encodes every byte of the utf8 encoding of Code.
-spec percent_encode_binary(integer()) -> binary().
percent_encode_binary(Code) ->
    Utf8 = <<Code/utf8>>,
    percent_encode_binary(Utf8, <<>>).
%% Appends "%" plus two hex digits to Acc for every byte of the first
%% argument; each byte is split into its high and low four bits.
percent_encode_binary(<<>>, Acc) ->
    Acc;
percent_encode_binary(<<High:4,Low:4,Rest/binary>>, Acc) ->
    percent_encode_binary(Rest, <<Acc/binary,$%,(?DEC2HEX(High)),(?DEC2HEX(Low))>>).


%%-------------------------------------------------------------------------
%% True when every character of the scheme (list or binary) satisfies
%% is_scheme/1; an empty scheme is accepted here as well.
%%-------------------------------------------------------------------------
validate_scheme([]) ->
    true;
validate_scheme(<<>>) ->
    true;
validate_scheme([Char|Rest]) ->
    is_scheme(Char) andalso validate_scheme(Rest);
validate_scheme(<<Char, Rest/binary>>) ->
    is_scheme(Char) andalso validate_scheme(Rest).


%%-------------------------------------------------------------------------
%% Classifies hostname into the following categories:
%% regname, ipv4 - address does not contain reserved characters to be
%%                 percent-encoded
%% ipv6 - address does not contain reserved characters but it shall be
%%        enclosed in brackets
%% other - address shall be percent-encoded
%%
%% The checks run in the order ipv6, ipv4, regname.  A binary is converted
%% to a list first; the empty list is classified as 'other'.
%%-------------------------------------------------------------------------
classify_host([]) ->
    other;
classify_host(Addr) when is_binary(Addr) ->
    classify_host_ipv6(unicode:characters_to_list(Addr));
classify_host(Addr) ->
    classify_host_ipv6(Addr).

classify_host_ipv6(Addr) ->
    case is_ipv6_address(Addr) of
        false -> classify_host_ipv4(Addr);
        true -> ipv6
    end.

classify_host_ipv4(Addr) ->
    case is_ipv4_address(Addr) of
        false -> classify_host_regname(Addr);
        true -> ipv4
    end.

%% 'regname' when all characters satisfy is_reg_name/1, 'other' as soon as
%% one of them does not.
classify_host_regname([Char|Rest]) ->
    case is_reg_name(Char) of
        false -> other;
        true -> classify_host_regname(Rest)
    end;
classify_host_regname([]) ->
    regname.

%% True when inet:parse_ipv4strict_address/1 accepts Addr.
is_ipv4_address(Addr) ->
    case inet:parse_ipv4strict_address(Addr) of
        {error, _} -> false;
        {ok, _} -> true
    end.
%% True when inet:parse_ipv6strict_address/1 accepts Addr.
is_ipv6_address(Addr) ->
    case inet:parse_ipv6strict_address(Addr) of
        {error, _} -> false;
        {ok, _} -> true
    end.

%% Encloses an IPv6 address (binary or list) in square brackets.
bracket_ipv6(Addr) when is_list(Addr) ->
    "[" ++ Addr ++ "]";
bracket_ipv6(Addr) when is_binary(Addr) ->
    <<$[, Addr/binary, $]>>.


%%-------------------------------------------------------------------------
%% Helper functions for recompose
%%-------------------------------------------------------------------------

%%-------------------------------------------------------------------------
%% Checks if input Map has valid combination of fields that can be
%% recomposed into a URI.
%%
%% The implementation is based on a decision tree that fulfills the
%% following rules:
%%   - 'path' shall always be present in the input map
%%       URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
%%       hier-part   = "//" authority path-abempty
%%                      / path-absolute
%%                      / path-rootless
%%                      / path-empty
%%   - 'host' shall be present in the input map when 'path' starts with
%%     two slashes ("//")
%%       path          = path-abempty    ; begins with "/" or is empty
%%                     / path-absolute   ; begins with "/" but not "//"
%%                     / path-noscheme   ; begins with a non-colon segment
%%                     / path-rootless   ; begins with a segment
%%                     / path-empty      ; zero characters
%%       path-abempty  = *( "/" segment )
%%       segment       = *pchar
%%   - 'host' shall be present if userinfo or port is present in input map
%%       authority   = [ userinfo "@" ] host [ ":" port ]
%%   - All fields shall be valid (scheme, userinfo, host, port, path, query
%%     or fragment).
%%
%% NOTE(review): whenever the host check fails the expression falls through
%% to all_fields_valid/1, so for a map with a 'path' key the result equals
%% all_fields_valid(Map) and the two 'host' rules are not enforced - confirm
%% whether that is intended.
%%-------------------------------------------------------------------------
is_valid_map(#{path := Path} = Map) ->
    NeedsHost = starts_with_two_slash(Path)
        orelse maps:is_key(userinfo, Map)
        orelse maps:is_key(port, Map),
    (NeedsHost andalso is_valid_map_host(Map))
        orelse all_fields_valid(Map);
is_valid_map(#{}) ->
    false.


%% True when the map has a 'host' key and only known keys.
is_valid_map_host(Map) ->
    maps:is_key(host, Map) andalso all_fields_valid(Map).


%% True when every key of the map is one of the seven URI components.  The
%% values are not inspected.
all_fields_valid(Map) ->
    Known = [scheme, userinfo, host, port, path, query, fragment],
    map_size(maps:without(Known, Map)) =:= 0.


%% True when the path (list or binary) starts with "//".
starts_with_two_slash("//" ++ _) ->
    true;
starts_with_two_slash(?STRING_REST("//", _)) ->
    true;
starts_with_two_slash(_) ->
    false.


%% The update_* functions below each take the input map and the URI built
%% so far ('empty' when nothing was produced yet) and return the URI
%% extended with their own component, or the URI unchanged when the map does
%% not have that component.

%% Scheme followed by ":", or 'empty' without a scheme.
update_scheme(#{scheme := Scheme}, _) ->
    add_colon_postfix(encode_scheme(Scheme));
update_scheme(#{}, _) ->
    empty.


%% "//" followed by the encoded userinfo.
update_userinfo(#{userinfo := Userinfo}, URI) ->
    Authority = add_auth_prefix(encode_userinfo(Userinfo)),
    case URI of
        empty -> Authority;
        _ -> concat(URI, Authority)
    end;
update_userinfo(#{}, URI) ->
    URI.


%% Encoded host, prefixed by "//" or - when the map has userinfo and the URI
%% is not empty - by "@" (see add_host_prefix/2).
update_host(#{host := Host} = Map, URI) ->
    Encoded = encode_host(Host),
    case URI of
        empty -> add_auth_prefix(Encoded);
        _ -> concat(URI, add_host_prefix(Map, Encoded))
    end;
update_host(#{}, URI) ->
    URI.


%% URI cannot be empty for ports. E.g. ":8080" is not a valid URI
%% An 'undefined' port contributes a bare ":".
update_port(#{port := Port}, URI) ->
    Suffix = case Port of
                 undefined -> <<":">>;
                 _ -> add_colon(encode_port(Port))
             end,
    concat(URI, Suffix);
update_port(#{}, URI) ->
    URI.


%% Encoded path.
update_path(#{path := Path}, URI) ->
    Encoded = encode_path(Path),
    case URI of
        empty -> Encoded;
        _ -> concat(URI, Encoded)
    end;
update_path(#{}, URI) ->
    URI.


%% "?" followed by the encoded query.
%% NOTE(review): with an 'empty' URI the query is returned without the "?"
%% prefix, unlike the fragment case below - confirm that this is intended.
update_query(#{query := Query}, URI) ->
    Encoded = encode_query(Query),
    case URI of
        empty -> Encoded;
        _ -> concat(URI, add_question_mark(Encoded))
    end;
update_query(#{}, URI) ->
    URI.


%% "#" followed by the encoded fragment.  As the last step it turns an
%% 'empty' URI into the empty string.
update_fragment(#{fragment := Fragment}, URI) ->
    Suffix = add_hashmark(encode_fragment(Fragment)),
    case URI of
        empty -> Suffix;
        _ -> concat(URI, Suffix)
    end;
update_fragment(#{}, empty) ->
    "";
update_fragment(#{}, URI) ->
    URI.

%%-------------------------------------------------------------------------
%% Concatenates its arguments that can be lists and binaries.
%% The result is a list if at least one of its argument is a list and
%% binary otherwise.
%%-------------------------------------------------------------------------
concat(A, B) when is_list(A) ->
    A ++ maybe_to_list(B);
concat(A, B) when is_binary(A), is_list(B) ->
    unicode:characters_to_list(A) ++ B;
concat(A, B) when is_binary(A), is_binary(B) ->
    <<A/binary, B/binary>>.

%% Prefixes the component with "#".
add_hashmark(Comp) when is_list(Comp) ->
    "#" ++ Comp;
add_hashmark(Comp) when is_binary(Comp) ->
    <<"#", Comp/binary>>.

%% Prefixes the component with "?".
add_question_mark(Comp) when is_list(Comp) ->
    "?" ++ Comp;
add_question_mark(Comp) when is_binary(Comp) ->
    <<"?", Comp/binary>>.

%% Prefixes the (binary) component with ":".
add_colon(Comp) when is_binary(Comp) ->
    <<":", Comp/binary>>.
%% Appends ":" to the component.
add_colon_postfix(Comp) when is_list(Comp) ->
    Comp ++ ":";
add_colon_postfix(Comp) when is_binary(Comp) ->
    <<Comp/binary, ":">>.

%% Prefixes the component with "//".
add_auth_prefix(Comp) when is_list(Comp) ->
    "//" ++ Comp;
add_auth_prefix(Comp) when is_binary(Comp) ->
    <<"//", Comp/binary>>.

%% Prefixes the host with "@" when the map has userinfo and with "//"
%% otherwise.
add_host_prefix(#{userinfo := _}, Host) when is_list(Host) ->
    "@" ++ Host;
add_host_prefix(#{userinfo := _}, Host) when is_binary(Host) ->
    <<"@", Host/binary>>;
add_host_prefix(#{}, Host) when is_list(Host) ->
    "//" ++ Host;
add_host_prefix(#{}, Host) when is_binary(Host) ->
    <<"//", Host/binary>>.

%% Converts a binary to a list of characters; anything else is returned as
%% it is.
maybe_to_list(Comp) when is_binary(Comp) ->
    unicode:characters_to_list(Comp);
maybe_to_list(Comp) ->
    Comp.

%% Port number as a binary.
encode_port(Port) ->
    integer_to_binary(Port).


%%-------------------------------------------------------------------------
%% Helper functions for resolve
%%-------------------------------------------------------------------------

%% Resolves a URI map against a base URI that is given as a map or as a
%% string.  A reference that has a scheme of its own is only normalized.
%% Otherwise the base has to provide a scheme: a base map without one gives
%% {error,invalid_scheme,""}, and a base string is parsed first, with any
%% non-map result of parse/1 returned as it is.
resolve_map(URIMap=#{scheme := _}, _) ->
    normalize_path_segment(URIMap);
resolve_map(URIMap, #{scheme := _}=BaseURIMap) ->
    resolve_map(URIMap, BaseURIMap, resolve_path_type(URIMap));
resolve_map(_URIMap, BaseURIMap) when is_map(BaseURIMap) ->
    {error,invalid_scheme,""};
resolve_map(URIMap, BaseURIString) ->
    case parse(BaseURIString) of
        BaseURIMap when is_map(BaseURIMap) ->
            %% The map clauses above handle both a base with and a base
            %% without a scheme.
            resolve_map(URIMap, BaseURIMap);
        Error ->
            Error
    end.

%% Classifies the path of the map (a missing path counts as empty) as
%% empty_path, absolute_path (starts with "/") or relative_path.
resolve_path_type(URIMap) ->
    Path = maps:get(path, URIMap, <<>>),
    case iolist_to_binary(Path) of
        <<>> -> empty_path;
        <<"/", _/bits>> -> absolute_path;
        _ -> relative_path
    end.
%% Resolves a reference without a scheme against a base that has one; the
%% third argument is the result of resolve_path_type/1 for the reference.
%%   - a reference with a host only takes over the scheme of the base
%%   - an empty path takes over authority and path of the base, and also
%%     its query unless the reference has a query of its own; this result
%%     is not passed through normalize_path_segment/1
%%   - an absolute path takes over scheme and authority of the base
%%   - a relative path is merged with the base path in addition
resolve_map(URI=#{host := _}, #{scheme := Scheme}, _) ->
    normalize_path_segment(URI#{scheme => Scheme});
resolve_map(URI, BaseURI, empty_path) ->
    Inherited = [scheme, userinfo, host, port, path]
        ++ [query || not maps:is_key(query, URI)],
    maps:merge(URI, maps:with(Inherited, BaseURI));
resolve_map(URI, BaseURI, absolute_path) ->
    Authority = maps:with([scheme, userinfo, host, port], BaseURI),
    normalize_path_segment(maps:merge(URI, Authority));
resolve_map(URI=#{path := Path}, BaseURI, relative_path) ->
    Merged = URI#{path => merge_paths(Path, BaseURI)},
    Authority = maps:with([scheme, userinfo, host, port], BaseURI),
    normalize_path_segment(maps:merge(Merged, Authority)).

%% Merges a relative path with the path of the base URI.  With a host and an
%% empty base path the result is "/" followed by Path.  Otherwise Path
%% replaces everything after the last "/" of the base path; a base path
%% without any "/" is dropped altogether.  The result is a binary or a list,
%% following the type of Path.
merge_paths(Path, #{path := BasePath} = BaseURI) ->
    BaseIsEmpty = iolist_size(BasePath) =:= 0,
    case BaseIsEmpty andalso maps:is_key(host, BaseURI) of
        true ->
            merge_paths_absolute(Path);
        false ->
            case string:split(BasePath, <<$/>>, trailing) of
                [_] ->
                    Path;
                [Dir, _] when is_list(Path) ->
                    unicode:characters_to_list([Dir, $/, Path]);
                [Dir, _] when is_binary(Path) ->
                    unicode:characters_to_binary([Dir, $/, Path])
            end
    end.

%% Prefixes the path with "/".
merge_paths_absolute(Path) when is_list(Path) ->
    unicode:characters_to_list([$/, Path]);
merge_paths_absolute(Path) when is_binary(Path) ->
    <<"/", Path/binary>>.


%%-------------------------------------------------------------------------
%% Helper functions for transcode
%%-------------------------------------------------------------------------

%%-------------------------------------------------------------------------
%% uri_string:transcode(<<"x%00%00%00%F6"/utf32>>).
%% 1. Convert (transcode/2) input to list form (list of unicode codepoints)
%%    "x%00%00%00%F6"
%% 2. Accumulate characters until percent-encoded segment (transcode/4).
%%    Acc = "x"
%% 3. Convert percent-encoded triplets to binary form (transcode_pct/4)
%%    <<0,0,0,246>>
%% 4. Transcode in-encoded binary to out-encoding (utf32 -> utf8):
%%    <<195,182>>
%% 5. Percent-encode out-encoded binary:
%%    <<"%C3%B6"/utf8>> = <<37,67,51,37,66,54>>
%% 6. Convert binary to list form, reverse it and append the accumulator
%%    "6B%3C%" + "x"
%% 7. Reverse Acc and return it
%%-------------------------------------------------------------------------
%% Entry point for a non-empty list of characters.  Acc holds the output in
%% reverse order.
transcode([$%,_,_|_] = Input, Acc, InEnc, OutEnc) ->
    transcode_pct(Input, Acc, <<>>, InEnc, OutEnc);
transcode([_|_] = Input, Acc, InEnc, OutEnc) ->
    transcode(Input, Acc, [], InEnc, OutEnc).
%%
%% Pending holds, in reverse order, the plain characters seen since the last
%% percent-encoded segment.
transcode([$%,_,_|_] = Input, Acc, Pending, InEncoding, OutEncoding) ->
    transcode_pct(Input, Pending ++ Acc, <<>>, InEncoding, OutEncoding);
transcode([Char|Rest], Acc, Pending, InEncoding, OutEncoding) ->
    transcode(Rest, Acc, [Char|Pending], InEncoding, OutEncoding);
transcode([], Acc, Pending, _InEncoding, _OutEncoding) ->
    lists:reverse(Pending ++ Acc).


%% Transcode percent-encoded segment
%%
%% Octets collects the decoded bytes of consecutive triplets.  When the
%% segment ends, the bytes are converted from the input to the output
%% encoding, percent-encoded again and added to the output.
%% Throws {error, invalid_percent_encoding, Input} for "%" followed by two
%% characters that are not both hex digits.
transcode_pct([$%,Hi,Lo|Rest] = Input, Acc, Octets, InEncoding, OutEncoding) ->
    (is_hex_digit(Hi) andalso is_hex_digit(Lo))
        orelse throw({error, invalid_percent_encoding,Input}),
    Octet = ?HEX2DEC(Hi)*16+?HEX2DEC(Lo),
    transcode_pct(Rest, Acc, <<Octets/binary, Octet>>, InEncoding, OutEncoding);
transcode_pct([_|_] = Input, Acc, Octets, InEncoding, OutEncoding) ->
    Converted = convert_to_binary(Octets, InEncoding, OutEncoding),
    Encoded = convert_to_list(percent_encode_segment(Converted), utf8),
    %% Acc is kept in reverse order, so the segment is pushed reversed.
    transcode(Input, lists:reverse(Encoded, Acc), [], InEncoding, OutEncoding);
transcode_pct([], Acc, Octets, InEncoding, OutEncoding) ->
    Converted = convert_to_binary(Octets, InEncoding, OutEncoding),
    Encoded = convert_to_list(percent_encode_segment(Converted), utf8),
    lists:reverse(Acc, Encoded).


%% Convert to binary
%% Wraps unicode:characters_to_binary/3. A conversion that fails or ends
%% on incomplete data throws {error, invalid_input, RestData}.
convert_to_binary(Binary, InEncoding, OutEncoding) ->
    case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of
        {error, _List, RestData} ->
            throw({error, invalid_input, RestData});
        {incomplete, _List, RestData} ->
            throw({error, invalid_input, RestData});
        Result ->
            Result
    end.


%% Convert to list
%% Wraps unicode:characters_to_list/2. A conversion that fails or ends on
%% incomplete data throws {error, invalid_input, RestData}.
convert_to_list(Binary, InEncoding) ->
    case unicode:characters_to_list(Binary, InEncoding) of
        {error, _List, RestData} ->
            throw({error, invalid_input, RestData});
        {incomplete, _List, RestData} ->
            throw({error, invalid_input, RestData});
        Result ->
            Result
    end.


%% Flatten input list
%% Binaries found in the (possibly nested) list are decoded with InEnc,
%% nested lists are spliced in, and any other element is kept as is.
%% Throws {error, invalid_input, Arg} when the remaining input is not a
%% list (for example an improper tail).
flatten_list([], _) ->
    [];
flatten_list(L, InEnc) ->
    flatten_list(L, InEnc, []).
%%
flatten_list([H|T], InEnc, Acc) when is_binary(H) ->
    L = convert_to_list(H, InEnc),
    %% Acc is built in reverse order.
    flatten_list(T, InEnc, lists:reverse(L) ++ Acc);
flatten_list([H|T], InEnc, Acc) when is_list(H) ->
    flatten_list(H ++ T, InEnc, Acc);
flatten_list([H|T], InEnc, Acc) ->
    flatten_list(T, InEnc, [H|Acc]);
flatten_list([], _InEnc, Acc) ->
    lists:reverse(Acc);
flatten_list(Arg, _, _) ->
    throw({error, invalid_input, Arg}).


%% Percent-encode a binary segment, starting from an empty accumulator.
percent_encode_segment(Segment) ->
    percent_encode_binary(Segment, <<>>).


%%-------------------------------------------------------------------------
%% Helper functions for compose_query
%%-------------------------------------------------------------------------

%% Returns separator to be used between key-value pairs:
%% <<>> for the empty list, <<"&">> for anything else.
%% The empty list is matched directly instead of calling length/1 in a
%% guard, which would walk the whole list just to test for emptiness.
get_separator([]) ->
    <<>>;
get_separator(_L) ->
    <<"&">>.


%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
%% HTML 5.0 - 4.10.22.6 URL-encoded form data - encoding (non UTF-8)
%%
%% form_urlencode(Cs, [{encoding, Encoding}]) -> binary()
%%   Cs       - list or binary to encode
%%   Encoding - latin1 | utf8 | unicode
%% Throws {error, invalid_encoding, Encoding} for any other encoding and
%% {error, invalid_input, Cs} when Cs is neither a list nor a binary (or
%% the option list has another shape).
%%
%% The encoding alternatives are parenthesised with orelse on purpose.
%% Written as "is_list(Cs), Encoding =:= utf8; Encoding =:= unicode" the
%% guard means "(is_list and utf8) or unicode", so every term with encoding
%% unicode would be sent to the list clause regardless of its type.
form_urlencode(Cs, [{encoding, latin1}]) when is_list(Cs) ->
    B = convert_to_binary(Cs, utf8, utf8),
    html5_byte_encode(base10_encode(B));
form_urlencode(Cs, [{encoding, latin1}]) when is_binary(Cs) ->
    html5_byte_encode(base10_encode(Cs));
form_urlencode(Cs, [{encoding, Encoding}])
  when is_list(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    B = convert_to_binary(Cs, utf8, Encoding),
    html5_byte_encode(B);
form_urlencode(Cs, [{encoding, Encoding}])
  when is_binary(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    html5_byte_encode(Cs);
form_urlencode(Cs, [{encoding, Encoding}]) when is_list(Cs); is_binary(Cs) ->
    throw({error,invalid_encoding, Encoding});
form_urlencode(Cs, _) ->
    throw({error,invalid_input, Cs}).


%% For each character in the entry's name and value that cannot be expressed using
%% the selected character encoding, replace the character by a string consisting of
%% a U+0026 AMPERSAND character (&), a "#" (U+0023) character, one or more ASCII
%% digits representing the Unicode code point of the character in base ten, and
%% finally a ";" (U+003B) character.
%%
%% The input is read as UTF-8. Code points above 255 become "&#N;"; every
%% other code point is written as a single byte.
base10_encode(Cs) ->
    base10_encode(Cs, <<>>).
%%
base10_encode(<<>>, Acc) ->
    Acc;
base10_encode(<<H/utf8,T/binary>>, Acc) when H > 255 ->
    Base10 = convert_to_binary(integer_to_list(H,10), utf8, utf8),
    base10_encode(T, <<Acc/binary,"&#",Base10/binary,$;>>);
base10_encode(<<H/utf8,T/binary>>, Acc) ->
    base10_encode(T, <<Acc/binary,H>>).


%% Byte-wise form-urlencoding of a binary, starting from an empty
%% accumulator.
html5_byte_encode(B) ->
    html5_byte_encode(B, <<>>).
%%
%% html5_byte_encode(Input, Acc)
%% A space becomes "+", bytes accepted by is_url_char/1 are copied, and
%% every other byte becomes "%" followed by its two hex digits.
%% Throws {error, invalid_input, Input} when the remaining input is not a
%% byte-aligned binary.
html5_byte_encode(<<>>, Encoded) ->
    Encoded;
html5_byte_encode(<<$\s, Rest/binary>>, Encoded) ->
    html5_byte_encode(Rest, <<Encoded/binary, "+">>);
html5_byte_encode(<<Byte, Rest/binary>>, Encoded) ->
    case is_url_char(Byte) of
        false ->
            %% Split the byte into its high and low nibble.
            Hi = Byte bsr 4,
            Lo = Byte band 16#0F,
            html5_byte_encode(Rest, <<Encoded/binary, $%, (?DEC2HEX(Hi)), (?DEC2HEX(Lo))>>);
        true ->
            html5_byte_encode(Rest, <<Encoded/binary, Byte>>)
    end;
html5_byte_encode(Invalid, _Encoded) ->
    throw({error, invalid_input, Invalid}).


%% Return true if input char can appear in form-urlencoded string
%% Allowed characters:
%%   "*" (0x2A), "-" (0x2D), "." (0x2E), "_" (0x5F),
%%   digits (0x30 to 0x39), "A".."Z" (0x41 to 0x5A), "a".."z" (0x61 to 0x7A)
is_url_char(C) when $0 =< C, C =< $9 -> true;
is_url_char(C) when $A =< C, C =< $Z -> true;
is_url_char(C) when $a =< C, C =< $z -> true;
is_url_char($*) -> true;
is_url_char($-) -> true;
is_url_char($.) -> true;
is_url_char($_) -> true;
is_url_char(_) -> false.


%%-------------------------------------------------------------------------
%% Helper functions for dissect_query
%%-------------------------------------------------------------------------
%% dissect_query_key(Input, IsList, Acc, Key, Value)
%% Collect the bytes of a key. "=" switches to reading the value. "&#" is
%% kept as part of the key. A key that ends at "&" or at the end of the
%% input has no value; the atom true is passed on in its place.
dissect_query_key(<<"=", Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_value(Rest, IsList, Acc, Key, Value);
dissect_query_key(<<"&#", Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_key(Rest, IsList, Acc, <<Key/binary, $&, $#>>, Value);
dissect_query_key(<<"&", _/binary>> = Input, IsList, Acc, Key, <<>>) ->
    dissect_query_value(Input, IsList, Acc, Key, true);
dissect_query_key(<<Byte, Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_key(Rest, IsList, Acc, <<Key/binary, Byte>>, Value);
dissect_query_key(<<>> = Input, IsList, Acc, Key, <<>>) ->
    dissect_query_value(Input, IsList, Acc, Key, true).

%% dissect_query_value(Input, IsList, Acc, Key, Value)
%% Collect the bytes of a value. "&" or the end of the input completes the
%% pair: key and value are decoded and the pair is added to Acc. At the
%% end of the input the pairs are returned in input order.
dissect_query_value(<<"&", Rest/binary>>, IsList, Acc, Key, Value) ->
    Pair = decode_query_pair(IsList, Key, Value),
    dissect_query_key(Rest, IsList, [Pair | Acc], <<>>, <<>>);
dissect_query_value(<<Byte, Rest/binary>>, IsList, Acc, Key, Value) ->
    dissect_query_value(Rest, IsList, Acc, Key, <<Value/binary, Byte>>);
dissect_query_value(<<>>, IsList, Acc, Key, Value) ->
    Pair = decode_query_pair(IsList, Key, Value),
    lists:reverse([Pair | Acc]).

%% Decode the key first, then the value, and return them as a pair.
decode_query_pair(IsList, Key, Value) ->
    DecodedKey = form_urldecode(IsList, Key),
    DecodedValue = form_urldecode(IsList, Value),
    {DecodedKey, DecodedValue}.

%% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8
%% HTML 5.0 - 4.10.22.6 URL-encoded form data - decoding (non UTF-8)
%%
%% form_urldecode/2 is used in two ways:
%%   form_urldecode(IsList, Binary | true)
%%       entry point; the atom true (a key without value) is passed
%%       through, a binary is decoded and returned as a list when IsList
%%       is true and as a binary when it is false
%%   form_urldecode(Binary, Acc)
%%       decoding loop; "+" becomes a space and "%XY" becomes the byte XY
%% Throws {error, invalid_percent_encoding, Rest} when "%" is followed by
%% two characters that are not both hex digits, and
%% {error, invalid_character, [Byte]} for a byte that does not start a
%% valid UTF-8 sequence.
form_urldecode(_, true) ->
    true;
form_urldecode(true, Encoded) ->
    PctDecoded = form_urldecode(Encoded, <<>>),
    convert_to_list(base10_decode(PctDecoded), utf8);
form_urldecode(false, Encoded) ->
    PctDecoded = form_urldecode(Encoded, <<>>),
    base10_decode(PctDecoded);
form_urldecode(<<>>, Decoded) ->
    Decoded;
form_urldecode(<<"+", Rest/binary>>, Decoded) ->
    form_urldecode(Rest, <<Decoded/binary, $\s>>);
form_urldecode(<<$%, Hi, Lo, Rest/binary>> = Input, Decoded) ->
    case is_hex_digit(Hi) andalso is_hex_digit(Lo) of
        false ->
            Remaining = convert_to_list(Input, utf8),
            throw({error, invalid_percent_encoding, Remaining});
        true ->
            Byte = ?HEX2DEC(Hi)*16+?HEX2DEC(Lo),
            form_urldecode(Rest, <<Decoded/binary, Byte>>)
    end;
form_urldecode(<<Char/utf8, Rest/binary>>, Decoded) ->
    form_urldecode(Rest, <<Decoded/binary, Char/utf8>>);
form_urldecode(<<Byte, _/binary>>, _Decoded) ->
    throw({error, invalid_character, [Byte]}).

%% Replace every "&#N;" sequence by the UTF-8 encoding of code point N and
%% copy all other characters.
%% Throws {error, invalid_input, [Byte]} for a byte that does not start a
%% valid UTF-8 sequence.
base10_decode(Encoded) ->
    base10_decode(Encoded, <<>>).
%
base10_decode(<<>>, Decoded) ->
    Decoded;
base10_decode(<<"&#", Rest/binary>>, Decoded) ->
    base10_decode_unicode(Rest, Decoded);
base10_decode(<<Char/utf8, Rest/binary>>, Decoded) ->
    base10_decode(Rest, <<Decoded/binary, Char/utf8>>);
base10_decode(<<Byte, _/binary>>, _Decoded) ->
    throw({error, invalid_input, [Byte]}).


%% Decode the digits of a "&#N;" sequence (the "&#" has already been
%% consumed) and append code point N, UTF-8 encoded, to Acc. Decoding then
%% continues with base10_decode/2.
%% Throws {error, invalid_input, Data} when
%%   - a character other than a digit or ";" is found (Data = [Char]),
%%   - N cannot be encoded as UTF-8, i.e. it is a surrogate
%%     (16#D800..16#DFFF) or larger than 16#10FFFF (Data = ";"),
%%   - the input ends before the terminating ";" (Data = []).
base10_decode_unicode(B, Acc) ->
    base10_decode_unicode(B, 0, Acc).
%%
base10_decode_unicode(<<H/utf8,T/binary>>, Codepoint, Acc) when $0 =< H, H =< $9 ->
    Res = Codepoint * 10 + (H - $0),
    base10_decode_unicode(T, Res, Acc);
base10_decode_unicode(<<$;,T/binary>>, Codepoint, Acc)
  when Codepoint =< 16#D7FF;
       16#E000 =< Codepoint, Codepoint =< 16#10FFFF ->
    %% The guard keeps Codepoint inside the range accepted by a utf8
    %% segment; anything else would raise badarg here. Out-of-range values
    %% fall through to the invalid_input clause below.
    base10_decode(T, <<Acc/binary,Codepoint/utf8>>);
base10_decode_unicode(<<H,_/binary>>, _, _) ->
    throw({error, invalid_input, [H]});
base10_decode_unicode(<<>>, _, _) ->
    %% Unterminated sequence: no ";" before the end of the input.
    throw({error, invalid_input, []}).


%%-------------------------------------------------------------------------
%% Helper functions for normalize
%%-------------------------------------------------------------------------

%% Apply the normalization steps in order: case, percent-encoding,
%% scheme-based and finally path segment normalization.
normalize_map(URIMap) ->
    normalize_path_segment(
      normalize_scheme_based(
        normalize_percent_encoding(
          normalize_case(URIMap)))).


%% 6.2.2.1.  Case Normalization
%% Lower-case the scheme and the host when they are present.
normalize_case(#{scheme := Scheme, host := Host} = Map) ->
    Map#{scheme => to_lower(Scheme),
         host => to_lower(Host)};
normalize_case(#{host := Host} = Map) ->
    Map#{host => to_lower(Host)};
normalize_case(#{scheme := Scheme} = Map) ->
    Map#{scheme => to_lower(Scheme)};
normalize_case(#{} = Map) ->
    Map.


%% 6.2.2.2.  Percent-Encoding Normalization
%% userinfo, host, path, query and fragment are passed through decode/1;
%% all other values are left unchanged.
normalize_percent_encoding(Map) ->
    Fun = fun (K,V) when K =:= userinfo; K =:= host; K =:= path;
                         K =:= query; K =:= fragment ->
                  decode(V);
              %% Handle port and scheme
              (_,V) ->
                  V
          end,
    maps:map(Fun, Map).


%% Lower-case the ASCII letters A-Z; every other byte is copied unchanged.
%% A list is converted to a UTF-8 binary first and back to a list
%% afterwards.
to_lower(Cs) when is_list(Cs) ->
    B = convert_to_binary(Cs, utf8, utf8),
    convert_to_list(to_lower(B), utf8);
to_lower(Cs) when is_binary(Cs) ->
    to_lower(Cs, <<>>).
%%
to_lower(<<C,Cs/binary>>, Acc) when $A =< C, C =< $Z ->
    to_lower(Cs, <<Acc/binary,(C + 32)>>);
to_lower(<<C,Cs/binary>>, Acc) ->
    to_lower(Cs, <<Acc/binary,C>>);
to_lower(<<>>, Acc) ->
    Acc.


%% 6.2.2.3. Path Segment Normalization
%% 5.2.4.   Remove Dot Segments
%% Replace the path of the map by the same path without dot segments.
normalize_path_segment(Map) ->
    Path = maps:get(path, Map, undefined),
    Map#{path => remove_dot_segments(Path)}.


%% Remove "." and ".." segments from a path. A list is processed as a
%% UTF-8 binary and converted back to a list afterwards.
remove_dot_segments(Path) when is_binary(Path) ->
    remove_dot_segments(Path, <<>>);
remove_dot_segments(Path) when is_list(Path) ->
    B = convert_to_binary(Path, utf8, utf8),
    B1 = remove_dot_segments(B, <<>>),
    convert_to_list(B1, utf8).
%%
%% remove_dot_segments(Input, Output): Output collects the segments that
%% are kept.
remove_dot_segments(<<>>, Output) ->
    Output;
%% A leading "../" or "./" is dropped.
remove_dot_segments(<<"../",T/binary>>, Output) ->
    remove_dot_segments(T, Output);
remove_dot_segments(<<"./",T/binary>>, Output) ->
    remove_dot_segments(T, Output);
%% "/./" and a final "/." are replaced by "/".
remove_dot_segments(<<"/./",T/binary>>, Output) ->
    remove_dot_segments(<<$/,T/binary>>, Output);
remove_dot_segments(<<"/.">>, Output) ->
    remove_dot_segments(<<$/>>, Output);
%% "/../" and a final "/.." are replaced by "/" and the last segment
%% written to Output is removed.
remove_dot_segments(<<"/../",T/binary>>, Output) ->
    Out1 = remove_last_segment(Output),
    remove_dot_segments(<<$/,T/binary>>, Out1);
remove_dot_segments(<<"/..">>, Output) ->
    Out1 = remove_last_segment(Output),
    remove_dot_segments(<<$/>>, Out1);
%% An input consisting only of "." or ".." is dropped.
remove_dot_segments(<<$.>>, Output) ->
    remove_dot_segments(<<>>, Output);
remove_dot_segments(<<"..">>, Output) ->
    remove_dot_segments(<<>>, Output);
%% Otherwise the first path segment is moved to the end of Output.
remove_dot_segments(Input, Output) ->
    {First, Rest} = first_path_segment(Input),
    remove_dot_segments(Rest, <<Output/binary,First/binary>>).


%% Split Input into {FirstSegment, Rest}. The first segment includes the
%% first character (a leading "/" if there is one) and everything up to,
%% but not including, the next "/".
first_path_segment(Input) ->
    F = first_path_segment(Input, <<>>),
    split_binary(Input, byte_size(F)).
%%
first_path_segment(<<$/,T/binary>>, Acc) ->
    first_path_segment_end(<<T/binary>>, <<Acc/binary,$/>>);
first_path_segment(<<C,T/binary>>, Acc) ->
    first_path_segment_end(<<T/binary>>, <<Acc/binary,C>>).


%% Collect bytes into Segment until the input ends or the next "/" is
%% reached; the "/" itself is not consumed.
first_path_segment_end(<<>>, Segment) ->
    Segment;
first_path_segment_end(<<"/", _/binary>>, Segment) ->
    Segment;
first_path_segment_end(<<Byte, Rest/binary>>, Segment) ->
    first_path_segment_end(Rest, <<Segment/binary, Byte>>).


%% Drop bytes from the end of the binary up to and including the last "/".
%% A binary without "/" becomes empty.
remove_last_segment(<<>>) ->
    <<>>;
remove_last_segment(Output) ->
    KeepBytes = byte_size(Output) - 1,
    case split_binary(Output, KeepBytes) of
        {Kept, <<"/">>} ->
            Kept;
        {Kept, _OtherByte} ->
            remove_last_segment(Kept)
    end.


%% RFC 3986, 6.2.3.  Scheme-Based Normalization
%% Scheme, port and path are read from the map (undefined when missing)
%% and the scheme selects the normalization to apply.
normalize_scheme_based(Map) ->
    normalize_scheme_based(Map,
                           maps:get(scheme, Map, undefined),
                           maps:get(port, Map, undefined),
                           maps:get(path, Map, undefined)).
%%
%% The scheme is compared in list and in binary form. Maps with any other
%% scheme are returned unchanged.
normalize_scheme_based(Map, Scheme, Port, Path) ->
    case Scheme of
        S when S =:= "http"; S =:= <<"http">> ->
            normalize_http(Map, Port, Path);
        S when S =:= "https"; S =:= <<"https">> ->
            normalize_https(Map, Port, Path);
        S when S =:= "ftp"; S =:= <<"ftp">> ->
            normalize_ftp(Map, Port);
        S when S =:= "ssh"; S =:= <<"ssh">> ->
            normalize_ssh_sftp(Map, Port);
        S when S =:= "sftp"; S =:= <<"sftp">> ->
            normalize_ssh_sftp(Map, Port);
        S when S =:= "tftp"; S =:= <<"tftp">> ->
            normalize_tftp(Map, Port);
        _Other ->
            Map
    end.


%% http: port 80 is the default port; the path is then normalized.
normalize_http(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 80), Path).


%% https: port 443 is the default port; the path is then normalized.
normalize_https(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 443), Path).


%% ftp: port 21 is the default port.
normalize_ftp(URIMap, Port) ->
    normalize_port(URIMap, Port, 21).


%% ssh and sftp: port 22 is the default port.
normalize_ssh_sftp(URIMap, Port) ->
    normalize_port(URIMap, Port, 22).


%% tftp: port 69 is the default port.
normalize_tftp(URIMap, Port) ->
    normalize_port(URIMap, Port, 69).


%% normalize_port(Map, Port, Default)
%% Remove the port key from the map when Port matches the scheme's default
%% port; otherwise return the map unchanged.
normalize_port(URIMap, Default, Default) ->
    maps:remove(port, URIMap);
normalize_port(URIMap, _Port, _Default) ->
    URIMap.


%% normalize_http_path(Map, Path)
%% An empty path is replaced by "/", in the same representation (list or
%% binary) as the empty path; any other path leaves the map unchanged.
normalize_http_path(URIMap, "") ->
    URIMap#{path => "/"};
normalize_http_path(URIMap, <<>>) ->
    URIMap#{path => <<"/">>};
normalize_http_path(URIMap, _Path) ->
    URIMap.