1# frozen_string_literal: false
2#--
3# = uri/common.rb
4#
5# Author:: Akira Yamada <akira@ruby-lang.org>
6# Revision:: $Id$
7# License::
8#   You can redistribute it and/or modify it under the same term as Ruby.
9#
10# See URI for general documentation
11#
12
13module URI
14  #
15  # Includes URI::REGEXP::PATTERN
16  #
17  module RFC2396_REGEXP
18    #
19    # Patterns used to parse URI's
20    #
21    module PATTERN
22      # :stopdoc:
23
24      # RFC 2396 (URI Generic Syntax)
25      # RFC 2732 (IPv6 Literal Addresses in URL's)
26      # RFC 2373 (IPv6 Addressing Architecture)
27
28      # alpha         = lowalpha | upalpha
29      ALPHA = "a-zA-Z"
30      # alphanum      = alpha | digit
31      ALNUM = "#{ALPHA}\\d"
32
33      # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
34      #                         "a" | "b" | "c" | "d" | "e" | "f"
35      HEX     = "a-fA-F\\d"
36      # escaped       = "%" hex hex
37      ESCAPED = "%[#{HEX}]{2}"
38      # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
39      #                 "(" | ")"
40      # unreserved    = alphanum | mark
41      UNRESERVED = "\\-_.!~*'()#{ALNUM}"
42      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
43      #                 "$" | ","
44      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
45      #                 "$" | "," | "[" | "]" (RFC 2732)
46      RESERVED = ";/?:@&=+$,\\[\\]"
47
48      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
49      DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
50      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
51      TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
52      # hostname      = *( domainlabel "." ) toplabel [ "." ]
53      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
54
55      # :startdoc:
56    end # PATTERN
57
58    # :startdoc:
59  end # REGEXP
60
  # Class that parses Strings into URIs.
  #
  # It holds a Hash of pattern Strings and a Hash of Regexps built from them,
  # which are used to match and validate URIs and their components.
  #
65  class RFC2396_Parser
66    include RFC2396_REGEXP
67
68    #
69    # == Synopsis
70    #
71    #   URI::Parser.new([opts])
72    #
73    # == Args
74    #
75    # The constructor accepts a hash as options for parser.
76    # Keys of options are pattern names of URI components
77    # and values of options are pattern strings.
78    # The constructor generates set of regexps for parsing URIs.
79    #
80    # You can use the following keys:
81    #
82    #   * :ESCAPED (URI::PATTERN::ESCAPED in default)
83    #   * :UNRESERVED (URI::PATTERN::UNRESERVED in default)
84    #   * :DOMLABEL (URI::PATTERN::DOMLABEL in default)
85    #   * :TOPLABEL (URI::PATTERN::TOPLABEL in default)
86    #   * :HOSTNAME (URI::PATTERN::HOSTNAME in default)
87    #
88    # == Examples
89    #
90    #   p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
91    #   u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP http://example.jp/%uABCD>
92    #   URI.parse(u.to_s) #=> raises URI::InvalidURIError
93    #
94    #   s = "http://example.com/ABCD"
95    #   u1 = p.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
96    #   u2 = URI.parse(s) #=> #<URI::HTTP http://example.com/ABCD>
97    #   u1 == u2 #=> true
98    #   u1.eql?(u2) #=> false
99    #
100    def initialize(opts = {})
101      @pattern = initialize_pattern(opts)
102      @pattern.each_value(&:freeze)
103      @pattern.freeze
104
105      @regexp = initialize_regexp(@pattern)
106      @regexp.each_value(&:freeze)
107      @regexp.freeze
108    end
109
    # The Hash of patterns.
    #
    # Returns the table produced by initialize_pattern. The Hash and its
    # values are frozen by #initialize.
    #
    # See also URI::Parser.initialize_pattern.
    attr_reader :pattern

    # The Hash of Regexp.
    #
    # Returns the table produced by initialize_regexp from #pattern. The Hash
    # and its values are frozen by #initialize.
    #
    # See also URI::Parser.initialize_regexp.
    attr_reader :regexp
119
120    # Returns a split URI against regexp[:ABS_URI].
121    def split(uri)
122      case uri
123      when ''
124        # null uri
125
126      when @regexp[:ABS_URI]
127        scheme, opaque, userinfo, host, port,
128          registry, path, query, fragment = $~[1..-1]
129
130        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
131
132        # absoluteURI   = scheme ":" ( hier_part | opaque_part )
133        # hier_part     = ( net_path | abs_path ) [ "?" query ]
134        # opaque_part   = uric_no_slash *uric
135
136        # abs_path      = "/"  path_segments
137        # net_path      = "//" authority [ abs_path ]
138
139        # authority     = server | reg_name
140        # server        = [ [ userinfo "@" ] hostport ]
141
142        if !scheme
143          raise InvalidURIError,
144            "bad URI(absolute but no scheme): #{uri}"
145        end
146        if !opaque && (!path && (!host && !registry))
147          raise InvalidURIError,
148            "bad URI(absolute but no path): #{uri}"
149        end
150
151      when @regexp[:REL_URI]
152        scheme = nil
153        opaque = nil
154
155        userinfo, host, port, registry,
156          rel_segment, abs_path, query, fragment = $~[1..-1]
157        if rel_segment && abs_path
158          path = rel_segment + abs_path
159        elsif rel_segment
160          path = rel_segment
161        elsif abs_path
162          path = abs_path
163        end
164
165        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
166
167        # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
168
169        # net_path      = "//" authority [ abs_path ]
170        # abs_path      = "/"  path_segments
171        # rel_path      = rel_segment [ abs_path ]
172
173        # authority     = server | reg_name
174        # server        = [ [ userinfo "@" ] hostport ]
175
176      else
177        raise InvalidURIError, "bad URI(is not URI?): #{uri}"
178      end
179
180      path = '' if !path && !opaque # (see RFC2396 Section 5.2)
181      ret = [
182        scheme,
183        userinfo, host, port,         # X
184        registry,                     # X
185        path,                         # Y
186        opaque,                       # Y
187        query,
188        fragment
189      ]
190      return ret
191    end
192
193    #
194    # == Args
195    #
196    # +uri+::
197    #    String
198    #
199    # == Description
200    #
201    # Parses +uri+ and constructs either matching URI scheme object
202    # (File, FTP, HTTP, HTTPS, LDAP, LDAPS, or MailTo) or URI::Generic.
203    #
204    # == Usage
205    #
206    #   p = URI::Parser.new
207    #   p.parse("ldap://ldap.example.com/dc=example?user=john")
208    #   #=> #<URI::LDAP ldap://ldap.example.com/dc=example?user=john>
209    #
210    def parse(uri)
211      scheme, userinfo, host, port,
212        registry, path, opaque, query, fragment = self.split(uri)
213
214      if scheme && URI.scheme_list.include?(scheme.upcase)
215        URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port,
216                                           registry, path, opaque, query,
217                                           fragment, self)
218      else
219        Generic.new(scheme, userinfo, host, port,
220                    registry, path, opaque, query,
221                    fragment, self)
222      end
223    end
224
225
226    #
227    # == Args
228    #
229    # +uris+::
230    #    an Array of Strings
231    #
232    # == Description
233    #
234    # Attempts to parse and merge a set of URIs.
235    #
236    def join(*uris)
237      uris[0] = convert_to_uri(uris[0])
238      uris.inject :merge
239    end
240
241    #
242    # :call-seq:
243    #   extract( str )
244    #   extract( str, schemes )
245    #   extract( str, schemes ) {|item| block }
246    #
247    # == Args
248    #
249    # +str+::
250    #    String to search
251    # +schemes+::
252    #    Patterns to apply to +str+
253    #
254    # == Description
255    #
256    # Attempts to parse and merge a set of URIs.
257    # If no +block+ given, then returns the result,
258    # else it calls +block+ for each element in result.
259    #
260    # See also URI::Parser.make_regexp.
261    #
262    def extract(str, schemes = nil)
263      if block_given?
264        str.scan(make_regexp(schemes)) { yield $& }
265        nil
266      else
267        result = []
268        str.scan(make_regexp(schemes)) { result.push $& }
269        result
270      end
271    end
272
273    # Returns Regexp that is default self.regexp[:ABS_URI_REF],
274    # unless +schemes+ is provided. Then it is a Regexp.union with self.pattern[:X_ABS_URI].
275    def make_regexp(schemes = nil)
276      unless schemes
277        @regexp[:ABS_URI_REF]
278      else
279        /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
280      end
281    end
282
283    #
284    # :call-seq:
285    #   escape( str )
286    #   escape( str, unsafe )
287    #
288    # == Args
289    #
290    # +str+::
291    #    String to make safe
292    # +unsafe+::
293    #    Regexp to apply. Defaults to self.regexp[:UNSAFE]
294    #
295    # == Description
296    #
297    # Constructs a safe String from +str+, removing unsafe characters,
298    # replacing them with codes.
299    #
300    def escape(str, unsafe = @regexp[:UNSAFE])
301      unless unsafe.kind_of?(Regexp)
302        # perhaps unsafe is String object
303        unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false)
304      end
305      str.gsub(unsafe) do
306        us = $&
307        tmp = ''
308        us.each_byte do |uc|
309          tmp << sprintf('%%%02X', uc)
310        end
311        tmp
312      end.force_encoding(Encoding::US_ASCII)
313    end
314
315    #
316    # :call-seq:
317    #   unescape( str )
318    #   unescape( str, escaped )
319    #
320    # == Args
321    #
322    # +str+::
323    #    String to remove escapes from
324    # +escaped+::
325    #    Regexp to apply. Defaults to self.regexp[:ESCAPED]
326    #
327    # == Description
328    #
329    # Removes escapes from +str+.
330    #
331    def unescape(str, escaped = @regexp[:ESCAPED])
332      enc = str.encoding
333      enc = Encoding::UTF_8 if enc == Encoding::US_ASCII
334      str.gsub(escaped) { [$&[1, 2]].pack('H2').force_encoding(enc) }
335    end
336
337    @@to_s = Kernel.instance_method(:to_s)
338    def inspect
339      @@to_s.bind(self).call
340    end
341
342    private
343
344    # Constructs the default Hash of patterns.
345    def initialize_pattern(opts = {})
346      ret = {}
347      ret[:ESCAPED] = escaped = (opts.delete(:ESCAPED) || PATTERN::ESCAPED)
348      ret[:UNRESERVED] = unreserved = opts.delete(:UNRESERVED) || PATTERN::UNRESERVED
349      ret[:RESERVED] = reserved = opts.delete(:RESERVED) || PATTERN::RESERVED
350      ret[:DOMLABEL] = opts.delete(:DOMLABEL) || PATTERN::DOMLABEL
351      ret[:TOPLABEL] = opts.delete(:TOPLABEL) || PATTERN::TOPLABEL
352      ret[:HOSTNAME] = hostname = opts.delete(:HOSTNAME)
353
354      # RFC 2396 (URI Generic Syntax)
355      # RFC 2732 (IPv6 Literal Addresses in URL's)
356      # RFC 2373 (IPv6 Addressing Architecture)
357
358      # uric          = reserved | unreserved | escaped
359      ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
360      # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
361      #                 "&" | "=" | "+" | "$" | ","
362      ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
363      # query         = *uric
364      ret[:QUERY] = query = "#{uric}*"
365      # fragment      = *uric
366      ret[:FRAGMENT] = fragment = "#{uric}*"
367
368      # hostname      = *( domainlabel "." ) toplabel [ "." ]
369      # reg-name      = *( unreserved / pct-encoded / sub-delims ) # RFC3986
370      unless hostname
371        ret[:HOSTNAME] = hostname = "(?:[a-zA-Z0-9\\-.]|%\\h\\h)+"
372      end
373
374      # RFC 2373, APPENDIX B:
375      # IPv6address = hexpart [ ":" IPv4address ]
376      # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
377      # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
378      # hexseq  = hex4 *( ":" hex4)
379      # hex4    = 1*4HEXDIG
380      #
381      # XXX: This definition has a flaw. "::" + IPv4address must be
382      # allowed too.  Here is a replacement.
383      #
384      # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
385      ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
386      # hex4     = 1*4HEXDIG
387      hex4 = "[#{PATTERN::HEX}]{1,4}"
388      # lastpart = hex4 | IPv4address
389      lastpart = "(?:#{hex4}|#{ipv4addr})"
390      # hexseq1  = *( hex4 ":" ) hex4
391      hexseq1 = "(?:#{hex4}:)*#{hex4}"
392      # hexseq2  = *( hex4 ":" ) lastpart
393      hexseq2 = "(?:#{hex4}:)*#{lastpart}"
394      # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
395      ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"
396
397      # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
398      # unused
399
400      # ipv6reference = "[" IPv6address "]" (RFC 2732)
401      ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"
402
403      # host          = hostname | IPv4address
404      # host          = hostname | IPv4address | IPv6reference (RFC 2732)
405      ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
406      # port          = *digit
407      ret[:PORT] = port = '\d*'
408      # hostport      = host [ ":" port ]
409      ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"
410
411      # userinfo      = *( unreserved | escaped |
412      #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
413      ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"
414
415      # pchar         = unreserved | escaped |
416      #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
417      pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
418      # param         = *pchar
419      param = "#{pchar}*"
420      # segment       = *pchar *( ";" param )
421      segment = "#{pchar}*(?:;#{param})*"
422      # path_segments = segment *( "/" segment )
423      ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"
424
425      # server        = [ [ userinfo "@" ] hostport ]
426      server = "(?:#{userinfo}@)?#{hostport}"
427      # reg_name      = 1*( unreserved | escaped | "$" | "," |
428      #                     ";" | ":" | "@" | "&" | "=" | "+" )
429      ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
430      # authority     = server | reg_name
431      authority = "(?:#{server}|#{reg_name})"
432
433      # rel_segment   = 1*( unreserved | escaped |
434      #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
435      ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"
436
437      # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
438      ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][\\-+.#{PATTERN::ALPHA}\\d]*"
439
440      # abs_path      = "/"  path_segments
441      ret[:ABS_PATH] = abs_path = "/#{path_segments}"
442      # rel_path      = rel_segment [ abs_path ]
443      ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
444      # net_path      = "//" authority [ abs_path ]
445      ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"
446
447      # hier_part     = ( net_path | abs_path ) [ "?" query ]
448      ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
449      # opaque_part   = uric_no_slash *uric
450      ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"
451
452      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
453      ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
454      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
455      ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"
456
457      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
458      ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"
459
460      ret[:X_ABS_URI] = "
461        (#{scheme}):                           (?# 1: scheme)
462        (?:
463           (#{opaque_part})                    (?# 2: opaque)
464        |
465           (?:(?:
466             //(?:
467                 (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
468                   (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
469               |
470                 (#{reg_name})                 (?# 6: registry)
471               )
472             |
473             (?!//))                           (?# XXX: '//' is the mark for hostport)
474             (#{abs_path})?                    (?# 7: path)
475           )(?:\\?(#{query}))?                 (?# 8: query)
476        )
477        (?:\\#(#{fragment}))?                  (?# 9: fragment)
478      "
479
480      ret[:X_REL_URI] = "
481        (?:
482          (?:
483            //
484            (?:
485              (?:(#{userinfo})@)?       (?# 1: userinfo)
486                (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
487            |
488              (#{reg_name})             (?# 4: registry)
489            )
490          )
491        |
492          (#{rel_segment})              (?# 5: rel_segment)
493        )?
494        (#{abs_path})?                  (?# 6: abs_path)
495        (?:\\?(#{query}))?              (?# 7: query)
496        (?:\\#(#{fragment}))?           (?# 8: fragment)
497      "
498
499      ret
500    end
501
502    # Constructs the default Hash of Regexp's.
503    def initialize_regexp(pattern)
504      ret = {}
505
506      # for URI::split
507      ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
508      ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)
509
510      # for URI::extract
511      ret[:URI_REF]     = Regexp.new(pattern[:URI_REF])
512      ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
513      ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)
514
515      # for URI::escape/unescape
516      ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
517      ret[:UNSAFE]  = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")
518
519      # for Generic#initialize
520      ret[:SCHEME]   = Regexp.new("\\A#{pattern[:SCHEME]}\\z")
521      ret[:USERINFO] = Regexp.new("\\A#{pattern[:USERINFO]}\\z")
522      ret[:HOST]     = Regexp.new("\\A#{pattern[:HOST]}\\z")
523      ret[:PORT]     = Regexp.new("\\A#{pattern[:PORT]}\\z")
524      ret[:OPAQUE]   = Regexp.new("\\A#{pattern[:OPAQUE_PART]}\\z")
525      ret[:REGISTRY] = Regexp.new("\\A#{pattern[:REG_NAME]}\\z")
526      ret[:ABS_PATH] = Regexp.new("\\A#{pattern[:ABS_PATH]}\\z")
527      ret[:REL_PATH] = Regexp.new("\\A#{pattern[:REL_PATH]}\\z")
528      ret[:QUERY]    = Regexp.new("\\A#{pattern[:QUERY]}\\z")
529      ret[:FRAGMENT] = Regexp.new("\\A#{pattern[:FRAGMENT]}\\z")
530
531      ret
532    end
533
534    def convert_to_uri(uri)
535      if uri.is_a?(URI::Generic)
536        uri
537      elsif uri = String.try_convert(uri)
538        parse(uri)
539      else
540        raise ArgumentError,
541          "bad argument (expected URI object or URI string)"
542      end
543    end
544
545  end # class Parser
546end # module URI
547