1# frozen_string_literal: false
2require "forwardable"
3require "open-uri"
4
5require_relative "rss"
6require_relative "xml"
7
8module RSS
9
10  class NotWellFormedError < Error
11    attr_reader :line, :element
12
13    # Create a new NotWellFormedError for an error at +line+
14    # in +element+.  If a block is given the return value of
15    # the block ends up in the error message.
16    def initialize(line=nil, element=nil)
17      message = "This is not well formed XML"
18      if element or line
19        message << "\nerror occurred"
20        message << " in #{element}" if element
21        message << " at about #{line} line" if line
22      end
23      message << "\n#{yield}" if block_given?
24      super(message)
25    end
26  end
27
28  class XMLParserNotFound < Error
29    def initialize
30      super("available XML parser was not found in " <<
31            "#{AVAILABLE_PARSER_LIBRARIES.inspect}.")
32    end
33  end
34
35  class NotValidXMLParser < Error
36    def initialize(parser)
37      super("#{parser} is not an available XML parser. " <<
38            "Available XML parser" <<
39            (AVAILABLE_PARSERS.size > 1 ? "s are " : " is ") <<
40            "#{AVAILABLE_PARSERS.inspect}.")
41    end
42  end
43
44  class NSError < InvalidRSSError
45    attr_reader :tag, :prefix, :uri
46    def initialize(tag, prefix, require_uri)
47      @tag, @prefix, @uri = tag, prefix, require_uri
48      super("prefix <#{prefix}> doesn't associate uri " <<
49            "<#{require_uri}> in tag <#{tag}>")
50    end
51  end
52
53  class Parser
54
55    extend Forwardable
56
57    class << self
58
59      @@default_parser = nil
60
61      def default_parser
62        @@default_parser || AVAILABLE_PARSERS.first
63      end
64
65      # Set @@default_parser to new_value if it is one of the
66      # available parsers. Else raise NotValidXMLParser error.
67      def default_parser=(new_value)
68        if AVAILABLE_PARSERS.include?(new_value)
69          @@default_parser = new_value
70        else
71          raise NotValidXMLParser.new(new_value)
72        end
73      end
74
75      def parse(rss, *args)
76        if args.last.is_a?(Hash)
77          options = args.pop
78        else
79          options = {}
80        end
81        do_validate = boolean_argument(args[0], options[:validate], true)
82        ignore_unknown_element =
83          boolean_argument(args[1], options[:ignore_unknown_element], true)
84        parser_class = args[2] || options[:parser_class] || default_parser
85        parser = new(rss, parser_class)
86        parser.do_validate = do_validate
87        parser.ignore_unknown_element = ignore_unknown_element
88        parser.parse
89      end
90
91      private
92      def boolean_argument(positioned_value, option_value, default)
93        value = positioned_value
94        if value.nil? and not option_value.nil?
95          value = option_value
96        end
97        value = default if value.nil?
98        value
99      end
100    end
101
102    def_delegators(:@parser, :parse, :rss,
103                   :ignore_unknown_element,
104                   :ignore_unknown_element=, :do_validate,
105                   :do_validate=)
106
107    def initialize(rss, parser_class=self.class.default_parser)
108      @parser = parser_class.new(normalize_rss(rss))
109    end
110
111    private
112
113    # Try to get the XML associated with +rss+.
114    # Return +rss+ if it already looks like XML, or treat it as a URI,
115    # or a file to get the XML,
116    def normalize_rss(rss)
117      return rss if maybe_xml?(rss)
118
119      uri = to_uri(rss)
120
121      if uri.respond_to?(:read)
122        uri.read
123      elsif !rss.tainted? and File.readable?(rss)
124        File.open(rss) {|f| f.read}
125      else
126        rss
127      end
128    end
129
130    # maybe_xml? tests if source is a string that looks like XML.
131    def maybe_xml?(source)
132      source.is_a?(String) and /</ =~ source
133    end
134
135    # Attempt to convert rss to a URI, but just return it if
136    # there's a ::URI::Error
137    def to_uri(rss)
138      return rss if rss.is_a?(::URI::Generic)
139
140      begin
141        ::URI.parse(rss)
142      rescue ::URI::Error
143        rss
144      end
145    end
146  end
147
148  class BaseParser
149
150    class << self
151      def raise_for_undefined_entity?
152        listener.raise_for_undefined_entity?
153      end
154    end
155
156    def initialize(rss)
157      @listener = self.class.listener.new
158      @rss = rss
159    end
160
161    def rss
162      @listener.rss
163    end
164
165    def ignore_unknown_element
166      @listener.ignore_unknown_element
167    end
168
169    def ignore_unknown_element=(new_value)
170      @listener.ignore_unknown_element = new_value
171    end
172
173    def do_validate
174      @listener.do_validate
175    end
176
177    def do_validate=(new_value)
178      @listener.do_validate = new_value
179    end
180
181    def parse
182      if @listener.rss.nil?
183        _parse
184      end
185      @listener.rss
186    end
187
188  end
189
190  class BaseListener
191
192    extend Utils
193
194    class << self
195
196      @@accessor_bases = {}
197      @@registered_uris = {}
198      @@class_names = {}
199
200      # return the setter for the uri, tag_name pair, or nil.
201      def setter(uri, tag_name)
202        _getter = getter(uri, tag_name)
203        if _getter
204          "#{_getter}="
205        else
206          nil
207        end
208      end
209
210      def getter(uri, tag_name)
211        (@@accessor_bases[uri] || {})[tag_name]
212      end
213
214      # return the tag_names for setters associated with uri
215      def available_tags(uri)
216        (@@accessor_bases[uri] || {}).keys
217      end
218
219      # register uri against this name.
220      def register_uri(uri, name)
221        @@registered_uris[name] ||= {}
222        @@registered_uris[name][uri] = nil
223      end
224
225      # test if this uri is registered against this name
226      def uri_registered?(uri, name)
227        @@registered_uris[name].has_key?(uri)
228      end
229
230      # record class_name for the supplied uri and tag_name
231      def install_class_name(uri, tag_name, class_name)
232        @@class_names[uri] ||= {}
233        @@class_names[uri][tag_name] = class_name
234      end
235
236      # retrieve class_name for the supplied uri and tag_name
237      # If it doesn't exist, capitalize the tag_name
238      def class_name(uri, tag_name)
239        name = (@@class_names[uri] || {})[tag_name]
240        return name if name
241
242        tag_name = tag_name.gsub(/[_\-]([a-z]?)/) {$1.upcase}
243        tag_name[0, 1].upcase + tag_name[1..-1]
244      end
245
246      def install_get_text_element(uri, name, accessor_base)
247        install_accessor_base(uri, name, accessor_base)
248        def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
249      end
250
251      def raise_for_undefined_entity?
252        true
253      end
254
255      private
256      # set the accessor for the uri, tag_name pair
257      def install_accessor_base(uri, tag_name, accessor_base)
258        @@accessor_bases[uri] ||= {}
259        @@accessor_bases[uri][tag_name] = accessor_base.chomp("=")
260      end
261
262      def def_get_text_element(uri, element_name, file, line)
263        register_uri(uri, element_name)
264        method_name = "start_#{element_name}"
265        unless private_method_defined?(method_name)
266          define_method(method_name) do |name, prefix, attrs, ns|
267            uri = _ns(ns, prefix)
268            if self.class.uri_registered?(uri, element_name)
269              start_get_text_element(name, prefix, ns, uri)
270            else
271              start_else_element(name, prefix, attrs, ns)
272            end
273          end
274          private(method_name)
275        end
276      end
277    end
278  end
279
280  module ListenerMixin
281    attr_reader :rss
282
283    attr_accessor :ignore_unknown_element
284    attr_accessor :do_validate
285
286    def initialize
287      @rss = nil
288      @ignore_unknown_element = true
289      @do_validate = true
290      @ns_stack = [{"xml" => :xml}]
291      @tag_stack = [[]]
292      @text_stack = ['']
293      @proc_stack = []
294      @last_element = nil
295      @version = @encoding = @standalone = nil
296      @xml_stylesheets = []
297      @xml_child_mode = false
298      @xml_element = nil
299      @last_xml_element = nil
300    end
301
302    # set instance vars for version, encoding, standalone
303    def xmldecl(version, encoding, standalone)
304      @version, @encoding, @standalone = version, encoding, standalone
305    end
306
307    def instruction(name, content)
308      if name == "xml-stylesheet"
309        params = parse_pi_content(content)
310        if params.has_key?("href")
311          @xml_stylesheets << XMLStyleSheet.new(params)
312        end
313      end
314    end
315
316    def tag_start(name, attributes)
317      @text_stack.push('')
318
319      ns = @ns_stack.last.dup
320      attrs = {}
321      attributes.each do |n, v|
322        if /\Axmlns(?:\z|:)/ =~ n
323          ns[$POSTMATCH] = v
324        else
325          attrs[n] = v
326        end
327      end
328      @ns_stack.push(ns)
329
330      prefix, local = split_name(name)
331      @tag_stack.last.push([_ns(ns, prefix), local])
332      @tag_stack.push([])
333      if @xml_child_mode
334        previous = @last_xml_element
335        element_attrs = attributes.dup
336        unless previous
337          ns.each do |ns_prefix, value|
338            next if ns_prefix == "xml"
339            key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}"
340            element_attrs[key] ||= value
341          end
342        end
343        next_element = XML::Element.new(local,
344                                        prefix.empty? ? nil : prefix,
345                                        _ns(ns, prefix),
346                                        element_attrs)
347        previous << next_element if previous
348        @last_xml_element = next_element
349        pr = Proc.new do |text, tags|
350          if previous
351            @last_xml_element = previous
352          else
353            @xml_element = @last_xml_element
354            @last_xml_element = nil
355          end
356        end
357        @proc_stack.push(pr)
358      else
359        if @rss.nil? and respond_to?("initial_start_#{local}", true)
360          __send__("initial_start_#{local}", local, prefix, attrs, ns.dup)
361        elsif respond_to?("start_#{local}", true)
362          __send__("start_#{local}", local, prefix, attrs, ns.dup)
363        else
364          start_else_element(local, prefix, attrs, ns.dup)
365        end
366      end
367    end
368
369    def tag_end(name)
370      if DEBUG
371        p "end tag #{name}"
372        p @tag_stack
373      end
374      text = @text_stack.pop
375      tags = @tag_stack.pop
376      pr = @proc_stack.pop
377      pr.call(text, tags) unless pr.nil?
378      @ns_stack.pop
379    end
380
381    def text(data)
382      if @xml_child_mode
383        @last_xml_element << data if @last_xml_element
384      else
385        @text_stack.last << data
386      end
387    end
388
389    private
390    def _ns(ns, prefix)
391      ns.fetch(prefix, "")
392    end
393
394    CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
395    # Extract the first name="value" pair from content.
396    # Works with single quotes according to the constant
397    # CONTENT_PATTERN. Return a Hash.
398    def parse_pi_content(content)
399      params = {}
400      content.scan(CONTENT_PATTERN) do |name, quote, value|
401        params[name] = value
402      end
403      params
404    end
405
406    def start_else_element(local, prefix, attrs, ns)
407      class_name = self.class.class_name(_ns(ns, prefix), local)
408      current_class = @last_element.class
409      if known_class?(current_class, class_name)
410        next_class = current_class.const_get(class_name)
411        start_have_something_element(local, prefix, attrs, ns, next_class)
412      else
413        if !@do_validate or @ignore_unknown_element
414          @proc_stack.push(setup_next_element_in_unknown_element)
415        else
416          parent = "ROOT ELEMENT???"
417          if current_class.tag_name
418            parent = current_class.tag_name
419          end
420          raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
421        end
422      end
423    end
424
425    if Module.method(:const_defined?).arity == -1
426      def known_class?(target_class, class_name)
427        class_name and
428          (target_class.const_defined?(class_name, false) or
429           target_class.constants.include?(class_name.to_sym))
430      end
431    else
432      def known_class?(target_class, class_name)
433        class_name and
434          (target_class.const_defined?(class_name) or
435           target_class.constants.include?(class_name))
436      end
437    end
438
439    NAMESPLIT = /^(?:([\w:][-\w.]*):)?([\w:][-\w.]*)/
440    def split_name(name)
441      name =~ NAMESPLIT
442      [$1 || '', $2]
443    end
444
445    def check_ns(tag_name, prefix, ns, require_uri, ignore_unknown_element=nil)
446      if _ns(ns, prefix) == require_uri
447        true
448      else
449        if ignore_unknown_element.nil?
450          ignore_unknown_element = @ignore_unknown_element
451        end
452
453        if ignore_unknown_element
454          false
455        elsif @do_validate
456          raise NSError.new(tag_name, prefix, require_uri)
457        else
458          # Force bind required URI with prefix
459          @ns_stack.last[prefix] = require_uri
460          true
461        end
462      end
463    end
464
465    def start_get_text_element(tag_name, prefix, ns, required_uri)
466      pr = Proc.new do |text, tags|
467        setter = self.class.setter(required_uri, tag_name)
468        if setter and @last_element.respond_to?(setter)
469          if @do_validate
470            getter = self.class.getter(required_uri, tag_name)
471            if @last_element.__send__(getter)
472              raise TooMuchTagError.new(tag_name, @last_element.tag_name)
473            end
474          end
475          @last_element.__send__(setter, text.to_s)
476        else
477          if @do_validate and !@ignore_unknown_element
478            raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
479                                          @last_element.tag_name)
480          end
481        end
482      end
483      @proc_stack.push(pr)
484    end
485
486    def start_have_something_element(tag_name, prefix, attrs, ns, klass)
487      if check_ns(tag_name, prefix, ns, klass.required_uri)
488        attributes = collect_attributes(tag_name, prefix, attrs, ns, klass)
489        @proc_stack.push(setup_next_element(tag_name, klass, attributes))
490      else
491        @proc_stack.push(setup_next_element_in_unknown_element)
492      end
493    end
494
495    def collect_attributes(tag_name, prefix, attrs, ns, klass)
496      attributes = {}
497      klass.get_attributes.each do |a_name, a_uri, required, element_name|
498        if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
499          a_uri = [a_uri]
500        end
501        unless a_uri == [""]
502          for prefix, uri in ns
503            if a_uri.include?(uri)
504              val = attrs["#{prefix}:#{a_name}"]
505              break if val
506            end
507          end
508        end
509        if val.nil? and a_uri.include?("")
510          val = attrs[a_name]
511        end
512
513        if @do_validate and required and val.nil?
514          unless a_uri.include?("")
515            for prefix, uri in ns
516              if a_uri.include?(uri)
517                a_name = "#{prefix}:#{a_name}"
518              end
519            end
520          end
521          raise MissingAttributeError.new(tag_name, a_name)
522        end
523
524        attributes[a_name] = val
525      end
526      attributes
527    end
528
    # Create the element for +tag_name+ as an instance of +klass+ built
    # from +attributes+, attach it to the current element and make it
    # the current element.
    #
    # Returns a Proc taking the element's accumulated text and the list
    # of its child tags.  When run, the Proc fills in the element's
    # content, validates the element if validation is on, and makes the
    # previous element current again.
    def setup_next_element(tag_name, klass, attributes)
      previous = @last_element
      next_element = klass.new(@do_validate, attributes)
      previous.set_next_element(tag_name, next_element)
      @last_element = next_element
      @last_element.parent = previous if klass.need_parent?
      # While this flag is set, tag_start and text build an XML tree
      # from the element's children instead of handling them as
      # regular elements and text.
      @xml_child_mode = @last_element.have_xml_content?

      Proc.new do |text, tags|
        p(@last_element.class) if DEBUG
        if @xml_child_mode
          # Hand over the captured XML both as a string (content) and
          # as an object (through the class's xml_setter), then leave
          # XML child mode.
          @last_element.content = @xml_element.to_s
          xml_setter = @last_element.class.xml_setter
          @last_element.__send__(xml_setter, @xml_element)
          @xml_element = nil
          @xml_child_mode = false
        else
          if klass.have_content?
            if @last_element.need_base64_encode?
              # unpack("m") decodes the Base64 text.
              text = text.lstrip.unpack("m").first
            end
            @last_element.content = text
          end
        end
        if @do_validate
          @last_element.validate_for_stream(tags, @ignore_unknown_element)
        end
        # Step back to the element that was current before this one.
        @last_element = previous
      end
    end
559
560    def setup_next_element_in_unknown_element
561      current_element, @last_element = @last_element, nil
562      Proc.new {@last_element = current_element}
563    end
564  end
565
566  unless const_defined? :AVAILABLE_PARSER_LIBRARIES
567    # The list of all available libraries for parsing.
568    AVAILABLE_PARSER_LIBRARIES = [
569      ["rss/xmlparser", :XMLParserParser],
570      ["rss/xmlscanner", :XMLScanParser],
571      ["rss/rexmlparser", :REXMLParser],
572    ]
573  end
574
575  # The list of all available parsers, in constant form.
576  AVAILABLE_PARSERS = []
577
578  AVAILABLE_PARSER_LIBRARIES.each do |lib, parser|
579    begin
580      require lib
581      AVAILABLE_PARSERS.push(const_get(parser))
582    rescue LoadError
583    end
584  end
585
586  if AVAILABLE_PARSERS.empty?
587    raise XMLParserNotFound
588  end
589end
590