# frozen_string_literal: false
#--
# = uri/common.rb
#
# Author:: Akira Yamada <akira@ruby-lang.org>
# Revision:: $Id$
# License::
#   You can redistribute it and/or modify it under the same term as Ruby.
#
# See URI for general documentation
#

module URI
  #
  # Includes URI::REGEXP::PATTERN
  #
  module RFC2396_REGEXP
    #
    # Patterns used to parse URI's
    #
    module PATTERN
      # :stopdoc:

      # RFC 2396 (URI Generic Syntax)
      # RFC 2732 (IPv6 Literal Addresses in URL's)
      # RFC 2373 (IPv6 Addressing Architecture)

      # alpha         = lowalpha | upalpha
      ALPHA = "a-zA-Z"
      # alphanum      = alpha | digit
      ALNUM = "#{ALPHA}\\d"

      # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
      #                         "a" | "b" | "c" | "d" | "e" | "f"
      HEX     = "a-fA-F\\d"
      # escaped       = "%" hex hex
      ESCAPED = "%[#{HEX}]{2}"
      # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
      #                 "(" | ")"
      # unreserved    = alphanum | mark
      UNRESERVED = "\\-_.!~*'()#{ALNUM}"
      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | ","
      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
      #                 "$" | "," | "[" | "]" (RFC 2732)
      RESERVED = ";/?:@&=+$,\\[\\]"

      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
      DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
      TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
      # hostname      = *( domainlabel "." ) toplabel [ "." ]
      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"

      # :startdoc:
    end # PATTERN

    # :startdoc:
  end # REGEXP

  # Class that parses String's into URI's.
  #
  # It contains a Hash set of patterns and Regexp's that match and validate.
  #
  class RFC2396_Parser
    include RFC2396_REGEXP

    #
    # == Synopsis
    #
    #   URI::Parser.new([opts])
    #
    # == Args
    #
    # The constructor accepts a hash as options for parser.
    # Keys of options are pattern names of URI components
    # and values of options are pattern strings.
    # The constructor generates set of regexps for parsing URIs.
    #
    # The options hash is only read; it is never modified, so the same
    # (possibly frozen) hash can be passed to several parsers.
    # Note that the pattern strings themselves are frozen by the constructor.
    #
    # You can use the following keys:
    #
    #   * :ESCAPED (URI::PATTERN::ESCAPED in default)
    #   * :UNRESERVED (URI::PATTERN::UNRESERVED in default)
    #   * :RESERVED (URI::PATTERN::RESERVED in default)
    #   * :DOMLABEL (URI::PATTERN::DOMLABEL in default)
    #   * :TOPLABEL (URI::PATTERN::TOPLABEL in default)
    #   * :HOSTNAME ("(?:[a-zA-Z0-9\\-.]|%\\h\\h)+" in default)
    #
    # == Examples
    #
    #   p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
    #   u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP http://example.jp/%uABCD>
    #   URI.parse(u.to_s) #=> raises URI::InvalidURIError
    #
    #   s = "http://example.com/ABCD"
    #   u1 = p.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
    #   u2 = URI.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
    #   u1 == u2 #=> true
    #   u1.eql?(u2) #=> false
    #
    def initialize(opts = {})
      @pattern = initialize_pattern(opts)
      @pattern.each_value(&:freeze)
      @pattern.freeze

      @regexp = initialize_regexp(@pattern)
      @regexp.each_value(&:freeze)
      @regexp.freeze
    end

    # The Hash of patterns.
    #
    # See also URI::Parser.initialize_pattern.
    attr_reader :pattern

    # The Hash of Regexp.
    #
    # See also URI::Parser.initialize_regexp.
    attr_reader :regexp

    # Returns a split URI against regexp[:ABS_URI].
    #
    # The result is the Array
    # <tt>[scheme, userinfo, host, port, registry, path, opaque, query, fragment]</tt>;
    # components that are absent from +uri+ are +nil+, except that +path+
    # is <tt>''</tt> when neither a path nor an opaque part is present.
    #
    # Raises InvalidURIError when +uri+ matches neither regexp[:ABS_URI]
    # nor regexp[:REL_URI], or when an absolute URI has no scheme or no
    # path/host/registry/opaque part.
    def split(uri)
      case uri
      when ''
        # null uri

      when @regexp[:ABS_URI]
        scheme, opaque, userinfo, host, port,
          registry, path, query, fragment = $~[1..-1]

        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]

        # absoluteURI   = scheme ":" ( hier_part | opaque_part )
        # hier_part     = ( net_path | abs_path ) [ "?" query ]
        # opaque_part   = uric_no_slash *uric

        # abs_path      = "/"  path_segments
        # net_path      = "//" authority [ abs_path ]

        # authority     = server | reg_name
        # server        = [ [ userinfo "@" ] hostport ]

        unless scheme
          raise InvalidURIError,
            "bad URI(absolute but no scheme): #{uri}"
        end
        if !opaque && !path && !host && !registry
          raise InvalidURIError,
            "bad URI(absolute but no path): #{uri}"
        end

      when @regexp[:REL_URI]
        scheme = nil
        opaque = nil

        userinfo, host, port, registry,
          rel_segment, abs_path, query, fragment = $~[1..-1]
        if rel_segment && abs_path
          path = rel_segment + abs_path
        elsif rel_segment
          path = rel_segment
        elsif abs_path
          path = abs_path
        end

        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]

        # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]

        # net_path      = "//" authority [ abs_path ]
        # abs_path      = "/"  path_segments
        # rel_path      = rel_segment [ abs_path ]

        # authority     = server | reg_name
        # server        = [ [ userinfo "@" ] hostport ]

      else
        raise InvalidURIError, "bad URI(is not URI?): #{uri}"
      end

      path = '' if !path && !opaque # (see RFC2396 Section 5.2)
      [
        scheme,
        userinfo, host, port,         # X
        registry,                     # X
        path,                         # Y
        opaque,                       # Y
        query,
        fragment
      ]
    end

    #
    # == Args
    #
    # +uri+::
    #    String
    #
    # == Description
    #
    # Parses +uri+ and constructs either matching URI scheme object
    # (File, FTP, HTTP, HTTPS, LDAP, LDAPS, or MailTo) or URI::Generic.
    #
    # == Usage
    #
    #   p = URI::Parser.new
    #   p.parse("ldap://ldap.example.com/dc=example?user=john")
    #   #=> #<URI::LDAP ldap://ldap.example.com/dc=example?user=john>
    #
    def parse(uri)
      scheme, userinfo, host, port,
        registry, path, opaque, query, fragment = self.split(uri)

      # Look the scheme up once; fall back to Generic for unknown or
      # missing schemes.
      klass = (scheme && URI.scheme_list[scheme.upcase]) || Generic
      klass.new(scheme, userinfo, host, port,
                registry, path, opaque, query,
                fragment, self)
    end


    #
    # == Args
    #
    # +uris+::
    #    an Array of Strings
    #
    # == Description
    #
    # Attempts to parse and merge a set of URIs.
    #
    def join(*uris)
      uris[0] = convert_to_uri(uris[0])
      uris.inject(:merge)
    end

    #
    # :call-seq:
    #   extract( str )
    #   extract( str, schemes )
    #   extract( str, schemes ) {|item| block }
    #
    # == Args
    #
    # +str+::
    #    String to search
    # +schemes+::
    #    Patterns to apply to +str+
    #
    # == Description
    #
    # Scans +str+ for URIs, optionally limited to the given +schemes+.
    # If no +block+ given, then returns the Array of matched substrings,
    # else it calls +block+ for each matched substring and returns +nil+.
    #
    # See also URI::Parser.make_regexp.
    #
    def extract(str, schemes = nil)
      if block_given?
        str.scan(make_regexp(schemes)) { yield $& }
        nil
      else
        result = []
        str.scan(make_regexp(schemes)) { result.push $& }
        result
      end
    end

    # Returns Regexp that is default self.regexp[:ABS_URI_REF],
    # unless +schemes+ is provided. Then it is a Regexp.union with self.pattern[:X_ABS_URI].
    def make_regexp(schemes = nil)
      if schemes
        /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
      else
        @regexp[:ABS_URI_REF]
      end
    end

    #
    # :call-seq:
    #   escape( str )
    #   escape( str, unsafe )
    #
    # == Args
    #
    # +str+::
    #    String to make safe
    # +unsafe+::
    #    Regexp to apply. Defaults to self.regexp[:UNSAFE].
    #    A String is treated as a set of characters to escape.
    #
    # == Description
    #
    # Constructs a safe String from +str+, removing unsafe characters,
    # replacing them with codes. The result is tagged as US-ASCII.
    #
    def escape(str, unsafe = @regexp[:UNSAFE])
      unless unsafe.kind_of?(Regexp)
        # perhaps unsafe is String object
        unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false)
      end
      escaped = str.gsub(unsafe) do |match|
        # Percent-encode every byte of the match (multibyte characters
        # become several %XX triplets).
        match.each_byte.map { |byte| sprintf('%%%02X', byte) }.join
      end
      escaped.force_encoding(Encoding::US_ASCII)
    end

    #
    # :call-seq:
    #   unescape( str )
    #   unescape( str, escaped )
    #
    # == Args
    #
    # +str+::
    #    String to remove escapes from
    # +escaped+::
    #    Regexp to apply. Defaults to self.regexp[:ESCAPED]
    #
    # == Description
    #
    # Removes escapes from +str+.
    #
    def unescape(str, escaped = @regexp[:ESCAPED])
      enc = str.encoding
      # Decoded bytes may be non-ASCII, so a US-ASCII input is decoded as UTF-8.
      enc = Encoding::UTF_8 if enc == Encoding::US_ASCII
      str.gsub(escaped) { [$&[1, 2]].pack('H2').force_encoding(enc) }
    end

    # Kernel#to_s is used for #inspect so that the large pattern and
    # regexp tables are not dumped.
    @@to_s = Kernel.instance_method(:to_s)
    def inspect
      @@to_s.bind(self).call
    end

    private

    # Constructs the default Hash of patterns.
    #
    # +opts+ may override :ESCAPED, :UNRESERVED, :RESERVED, :DOMLABEL,
    # :TOPLABEL and :HOSTNAME. It is only read, never modified, so callers
    # can reuse (or freeze) the hash they pass in.
    def initialize_pattern(opts = {})
      ret = {}
      ret[:ESCAPED] = escaped = (opts[:ESCAPED] || PATTERN::ESCAPED)
      ret[:UNRESERVED] = unreserved = opts[:UNRESERVED] || PATTERN::UNRESERVED
      ret[:RESERVED] = reserved = opts[:RESERVED] || PATTERN::RESERVED
      ret[:DOMLABEL] = opts[:DOMLABEL] || PATTERN::DOMLABEL
      ret[:TOPLABEL] = opts[:TOPLABEL] || PATTERN::TOPLABEL
      # hostname      = *( domainlabel "." ) toplabel [ "." ]
      # reg-name      = *( unreserved / pct-encoded / sub-delims ) # RFC3986
      ret[:HOSTNAME] = hostname = opts[:HOSTNAME] || "(?:[a-zA-Z0-9\\-.]|%\\h\\h)+"

      # RFC 2396 (URI Generic Syntax)
      # RFC 2732 (IPv6 Literal Addresses in URL's)
      # RFC 2373 (IPv6 Addressing Architecture)

      # uric          = reserved | unreserved | escaped
      ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
      # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
      #                 "&" | "=" | "+" | "$" | ","
      ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
      # query         = *uric
      ret[:QUERY] = query = "#{uric}*"
      # fragment      = *uric
      ret[:FRAGMENT] = fragment = "#{uric}*"

      # RFC 2373, APPENDIX B:
      # IPv6address = hexpart [ ":" IPv4address ]
      # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
      # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
      # hexseq  = hex4 *( ":" hex4)
      # hex4    = 1*4HEXDIG
      #
      # XXX: This definition has a flaw. "::" + IPv4address must be
      # allowed too.  Here is a replacement.
      #
      # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
      ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
      # hex4     = 1*4HEXDIG
      hex4 = "[#{PATTERN::HEX}]{1,4}"
      # lastpart = hex4 | IPv4address
      lastpart = "(?:#{hex4}|#{ipv4addr})"
      # hexseq1  = *( hex4 ":" ) hex4
      hexseq1 = "(?:#{hex4}:)*#{hex4}"
      # hexseq2  = *( hex4 ":" ) lastpart
      hexseq2 = "(?:#{hex4}:)*#{lastpart}"
      # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
      ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"

      # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
      # unused

      # ipv6reference = "[" IPv6address "]" (RFC 2732)
      ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"

      # host          = hostname | IPv4address
      # host          = hostname | IPv4address | IPv6reference (RFC 2732)
      ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
      # port          = *digit
      ret[:PORT] = port = '\d*'
      # hostport      = host [ ":" port ]
      ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"

      # userinfo      = *( unreserved | escaped |
      #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
      ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"

      # pchar         = unreserved | escaped |
      #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
      pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
      # param         = *pchar
      param = "#{pchar}*"
      # segment       = *pchar *( ";" param )
      segment = "#{pchar}*(?:;#{param})*"
      # path_segments = segment *( "/" segment )
      ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"

      # server        = [ [ userinfo "@" ] hostport ]
      server = "(?:#{userinfo}@)?#{hostport}"
      # reg_name      = 1*( unreserved | escaped | "$" | "," |
      #                     ";" | ":" | "@" | "&" | "=" | "+" )
      ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
      # authority     = server | reg_name
      authority = "(?:#{server}|#{reg_name})"

      # rel_segment   = 1*( unreserved | escaped |
      #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
      ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"

      # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
      ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][\\-+.#{PATTERN::ALPHA}\\d]*"

      # abs_path      = "/"  path_segments
      ret[:ABS_PATH] = abs_path = "/#{path_segments}"
      # rel_path      = rel_segment [ abs_path ]
      ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
      # net_path      = "//" authority [ abs_path ]
      ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"

      # hier_part     = ( net_path | abs_path ) [ "?" query ]
      ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
      # opaque_part   = uric_no_slash *uric
      ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"

      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
      ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
      ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"

      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
      ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"

      # The X_* patterns are written for Regexp::EXTENDED: whitespace is
      # ignored and the (?# ...) groups label the capture numbers that
      # #split relies on.
      ret[:X_ABS_URI] = "
        (#{scheme}):                           (?# 1: scheme)
        (?:
           (#{opaque_part})                    (?# 2: opaque)
        |
           (?:(?:
             //(?:
                 (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
                   (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
               |
                 (#{reg_name})                 (?# 6: registry)
               )
             |
             (?!//))                           (?# XXX: '//' is the mark for hostport)
             (#{abs_path})?                    (?# 7: path)
           )(?:\\?(#{query}))?                 (?# 8: query)
        )
        (?:\\#(#{fragment}))?                  (?# 9: fragment)
      "

      ret[:X_REL_URI] = "
        (?:
          (?:
            //
            (?:
              (?:(#{userinfo})@)?       (?# 1: userinfo)
                (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
            |
              (#{reg_name})             (?# 4: registry)
            )
          )
        |
          (#{rel_segment})              (?# 5: rel_segment)
        )?
        (#{abs_path})?                  (?# 6: abs_path)
        (?:\\?(#{query}))?              (?# 7: query)
        (?:\\#(#{fragment}))?           (?# 8: fragment)
      "

      ret
    end

    # Constructs the default Hash of Regexp's.
    #
    # +pattern+ is the Hash of pattern strings built by initialize_pattern.
    def initialize_regexp(pattern)
      ret = {}

      # for URI::split
      ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
      ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)

      # for URI::extract
      ret[:URI_REF]     = Regexp.new(pattern[:URI_REF])
      ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
      ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)

      # for URI::escape/unescape
      ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
      ret[:UNSAFE]  = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")

      # for Generic#initialize
      ret[:SCHEME]   = Regexp.new("\\A#{pattern[:SCHEME]}\\z")
      ret[:USERINFO] = Regexp.new("\\A#{pattern[:USERINFO]}\\z")
      ret[:HOST]     = Regexp.new("\\A#{pattern[:HOST]}\\z")
      ret[:PORT]     = Regexp.new("\\A#{pattern[:PORT]}\\z")
      ret[:OPAQUE]   = Regexp.new("\\A#{pattern[:OPAQUE_PART]}\\z")
      ret[:REGISTRY] = Regexp.new("\\A#{pattern[:REG_NAME]}\\z")
      ret[:ABS_PATH] = Regexp.new("\\A#{pattern[:ABS_PATH]}\\z")
      ret[:REL_PATH] = Regexp.new("\\A#{pattern[:REL_PATH]}\\z")
      ret[:QUERY]    = Regexp.new("\\A#{pattern[:QUERY]}\\z")
      ret[:FRAGMENT] = Regexp.new("\\A#{pattern[:FRAGMENT]}\\z")

      ret
    end

    # Returns +uri+ unchanged when it is a URI::Generic, parses it when it
    # can be converted to a String, and raises ArgumentError otherwise.
    def convert_to_uri(uri)
      if uri.is_a?(URI::Generic)
        uri
      elsif uri = String.try_convert(uri)
        parse(uri)
      else
        raise ArgumentError,
          "bad argument (expected URI object or URI string)"
      end
    end

  end # class Parser
end # module URI