# frozen_string_literal: false
# NOTE: string literals must stay mutable in this file; several error
# messages below are built by appending to a literal with <<.
require "forwardable"
require "open-uri"

require_relative "rss"
require_relative "xml"

module RSS

  # Raised when the input is not well formed XML.
  class NotWellFormedError < Error
    attr_reader :line, :element

    # Create a new NotWellFormedError for an error at +line+
    # in +element+.  If a block is given the return value of
    # the block ends up in the error message.
    #
    # +line+ and +element+ are kept and exposed through the
    # readers of the same name.
    def initialize(line=nil, element=nil)
      # Store the location so that #line and #element report it
      # instead of always returning nil.
      @line, @element = line, element
      message = "This is not well formed XML"
      if element or line
        message << "\nerror occurred"
        message << " in #{element}" if element
        message << " at about #{line} line" if line
      end
      message << "\n#{yield}" if block_given?
      super(message)
    end
  end

  # Raised when none of the libraries listed in
  # AVAILABLE_PARSER_LIBRARIES could be loaded.
  class XMLParserNotFound < Error
    def initialize
      super("available XML parser was not found in " <<
            "#{AVAILABLE_PARSER_LIBRARIES.inspect}.")
    end
  end

  # Raised when a parser that is not in AVAILABLE_PARSERS is
  # selected as the default parser.
  class NotValidXMLParser < Error
    # +parser+ is the rejected value; it is interpolated into the message.
    def initialize(parser)
      super("#{parser} is not an available XML parser. " <<
            "Available XML parser" <<
            (AVAILABLE_PARSERS.size > 1 ? "s are " : " is ") <<
            "#{AVAILABLE_PARSERS.inspect}.")
    end
  end

  # Raised when the namespace bound to +prefix+ in +tag+ is not the
  # required URI.
  class NSError < InvalidRSSError
    attr_reader :tag, :prefix, :uri
    def initialize(tag, prefix, require_uri)
      @tag, @prefix, @uri = tag, prefix, require_uri
      super("prefix <#{prefix}> doesn't associate uri " <<
            "<#{require_uri}> in tag <#{tag}>")
    end
  end

  # Front end for the concrete XML parsers.  An instance wraps one
  # parser object and delegates parsing and option accessors to it.
  class Parser

    extend Forwardable

    class << self

      @@default_parser = nil

      # Return the explicitly selected default parser, or the first
      # entry of AVAILABLE_PARSERS when none has been selected.
      def default_parser
        @@default_parser || AVAILABLE_PARSERS.first
      end

      # Set @@default_parser to new_value if it is one of the
      # available parsers. Else raise NotValidXMLParser error.
      def default_parser=(new_value)
        if AVAILABLE_PARSERS.include?(new_value)
          @@default_parser = new_value
        else
          raise NotValidXMLParser.new(new_value)
        end
      end

      # Parse +rss+ and return the result of the wrapped parser's #parse.
      #
      # Options may be given positionally (+do_validate+,
      # +ignore_unknown_element+, +parser_class+, in that order) and/or
      # as a trailing Hash with the keys +:validate+,
      # +:ignore_unknown_element+ and +:parser_class+.  A non-nil
      # positional value wins over the Hash value.  Both boolean
      # options default to true.
      def parse(rss, *args)
        if args.last.is_a?(Hash)
          options = args.pop
        else
          options = {}
        end
        do_validate = boolean_argument(args[0], options[:validate], true)
        ignore_unknown_element =
          boolean_argument(args[1], options[:ignore_unknown_element], true)
        parser_class = args[2] || options[:parser_class] || default_parser
        parser = new(rss, parser_class)
        parser.do_validate = do_validate
        parser.ignore_unknown_element = ignore_unknown_element
        parser.parse
      end

      private
      # Pick the first non-nil value among +positioned_value+,
      # +option_value+ and +default+.  nil checks are used rather than
      # truthiness so that an explicit false is respected.
      def boolean_argument(positioned_value, option_value, default)
        value = positioned_value
        if value.nil? and not option_value.nil?
          value = option_value
        end
        value = default if value.nil?
        value
      end
    end

    def_delegators(:@parser, :parse, :rss,
                   :ignore_unknown_element,
                   :ignore_unknown_element=, :do_validate,
                   :do_validate=)

    # Wrap a new +parser_class+ instance built from the normalized +rss+.
    def initialize(rss, parser_class=self.class.default_parser)
      @parser = parser_class.new(normalize_rss(rss))
    end

    private

    # Try to get the XML associated with +rss+.
    # Return +rss+ if it already looks like XML, or treat it as a URI,
    # or a file to get the XML.  If none of these apply, +rss+ is
    # returned unchanged.
    def normalize_rss(rss)
      return rss if maybe_xml?(rss)

      uri = to_uri(rss)

      if uri.respond_to?(:read)
        uri.read
      elsif !tainted_source?(rss) and File.readable?(rss)
        File.open(rss) {|f| f.read}
      else
        rss
      end
    end

    # Report whether +rss+ is tainted.  Objects that do not respond to
    # tainted? (the taint API is absent on newer Rubies) are treated
    # as untainted instead of raising NoMethodError.
    def tainted_source?(rss)
      rss.respond_to?(:tainted?) and rss.tainted?
    end

    # maybe_xml? tests if source is a string that looks like XML,
    # i.e. a String containing a "<".
    def maybe_xml?(source)
      source.is_a?(String) and /</ =~ source
    end

    # Attempt to convert rss to a URI, but just return it if
    # there's a ::URI::Error
    def to_uri(rss)
      return rss if rss.is_a?(::URI::Generic)

      begin
        ::URI.parse(rss)
      rescue ::URI::Error
        rss
      end
    end
  end

  # Common behavior of the concrete parsers.  The code below calls
  # +self.class.listener+ and +_parse+, neither of which is defined in
  # this file; presumably each concrete parser provides them.
  class BaseParser

    class << self
      # Forwarded to the listener class.
      def raise_for_undefined_entity?
        listener.raise_for_undefined_entity?
      end
    end

    # Create the listener instance and remember the source +rss+.
    def initialize(rss)
      @listener = self.class.listener.new
      @rss = rss
    end

    # The object built by the listener (nil until one has been built).
    def rss
      @listener.rss
    end

    def ignore_unknown_element
      @listener.ignore_unknown_element
    end

    def ignore_unknown_element=(new_value)
      @listener.ignore_unknown_element = new_value
    end

    def do_validate
      @listener.do_validate
    end

    def do_validate=(new_value)
      @listener.do_validate = new_value
    end

    # Run +_parse+ unless the listener already holds a result, then
    # return the listener's result.
    def parse
      if @listener.rss.nil?
        _parse
      end
      @listener.rss
    end

  end

  # Class-level registries shared by the listeners: accessor names,
  # registered URIs and class names, all keyed by namespace URI and
  # tag name.
  class BaseListener

    extend Utils

    class << self

      @@accessor_bases = {}
      @@registered_uris = {}
      @@class_names = {}

      # return the setter for the uri, tag_name pair, or nil.
      def setter(uri, tag_name)
        _getter = getter(uri, tag_name)
        if _getter
          "#{_getter}="
        else
          nil
        end
      end

      # return the getter for the uri, tag_name pair, or nil.
      def getter(uri, tag_name)
        (@@accessor_bases[uri] || {})[tag_name]
      end

      # return the tag_names for setters associated with uri
      def available_tags(uri)
        (@@accessor_bases[uri] || {}).keys
      end

      # register uri against this name.
      def register_uri(uri, name)
        @@registered_uris[name] ||= {}
        @@registered_uris[name][uri] = nil
      end

      # test if this uri is registered against this name.
      # A name that was never registered yields false.
      def uri_registered?(uri, name)
        (@@registered_uris[name] || {}).has_key?(uri)
      end

      # record class_name for the supplied uri and tag_name
      def install_class_name(uri, tag_name, class_name)
        @@class_names[uri] ||= {}
        @@class_names[uri][tag_name] = class_name
      end

      # retrieve class_name for the supplied uri and tag_name
      # If it doesn't exist, derive it from tag_name: each "_" or "-"
      # is dropped, the lower-case letter following it (if any) is
      # upcased, and the first character is upcased.
      def class_name(uri, tag_name)
        name = (@@class_names[uri] || {})[tag_name]
        return name if name

        tag_name = tag_name.gsub(/[_\-]([a-z]?)/) {$1.upcase}
        tag_name[0, 1].upcase + tag_name[1..-1]
      end

      # Record +accessor_base+ for the uri, name pair and make sure a
      # start_<name> handler exists for text elements.
      def install_get_text_element(uri, name, accessor_base)
        install_accessor_base(uri, name, accessor_base)
        def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
      end

      def raise_for_undefined_entity?
        true
      end

      private
      # set the accessor for the uri, tag_name pair
      # (a trailing "=" on accessor_base is stripped).
      def install_accessor_base(uri, tag_name, accessor_base)
        @@accessor_bases[uri] ||= {}
        @@accessor_bases[uri][tag_name] = accessor_base.chomp("=")
      end

      # Register +uri+ for +element_name+ and define a private
      # start_<element_name> method unless one already exists.
      # +file+ and +line+ are accepted but not used here.
      def def_get_text_element(uri, element_name, file, line)
        register_uri(uri, element_name)
        method_name = "start_#{element_name}"
        unless private_method_defined?(method_name)
          define_method(method_name) do |name, prefix, attrs, ns|
            # Resolve the namespace actually bound to this element's prefix.
            element_uri = _ns(ns, prefix)
            if self.class.uri_registered?(element_uri, element_name)
              start_get_text_element(name, prefix, ns, element_uri)
            else
              start_else_element(name, prefix, attrs, ns)
            end
          end
          private(method_name)
        end
      end
    end
  end

  # Event handlers (xmldecl, instruction, tag_start, tag_end, text)
  # that build the RSS object tree while an XML parser walks the
  # document.
  module ListenerMixin
    attr_reader :rss

    attr_accessor :ignore_unknown_element
    attr_accessor :do_validate

    def initialize
      @rss = nil
      @ignore_unknown_element = true
      @do_validate = true
      @ns_stack = [{"xml" => :xml}]
      @tag_stack = [[]]
      @text_stack = ['']
      @proc_stack = []
      @last_element = nil
      @version = @encoding = @standalone = nil
      @xml_stylesheets = []
      @xml_child_mode = false
      @xml_element = nil
      @last_xml_element = nil
    end

    # set instance vars for version, encoding, standalone
    def xmldecl(version, encoding, standalone)
      @version, @encoding, @standalone = version, encoding, standalone
    end

    # Handle a processing instruction.  Only "xml-stylesheet"
    # instructions that carry an href are recorded.
    def instruction(name, content)
      if name == "xml-stylesheet"
        params = parse_pi_content(content)
        if params.has_key?("href")
          @xml_stylesheets << XMLStyleSheet.new(params)
        end
      end
    end

    # Handle an opening tag: push a fresh text buffer, extend the
    # namespace scope with any xmlns declarations, record the tag, and
    # then either extend the XML::Element tree (XML child mode) or
    # dispatch to a start handler.
    def tag_start(name, attributes)
      @text_stack.push('')

      ns = @ns_stack.last.dup
      attrs = {}
      attributes.each do |n, v|
        if /\Axmlns(?:\z|:)/ =~ n
          # The text after "xmlns" / "xmlns:" is the declared prefix
          # ("" for the default namespace).  Read it from the match
          # data directly so this does not depend on the English
          # library having been loaded.
          ns[Regexp.last_match.post_match] = v
        else
          attrs[n] = v
        end
      end
      @ns_stack.push(ns)

      prefix, local = split_name(name)
      @tag_stack.last.push([_ns(ns, prefix), local])
      @tag_stack.push([])
      if @xml_child_mode
        previous = @last_xml_element
        element_attrs = attributes.dup
        unless previous
          # Topmost XML element: copy the in-scope namespace
          # declarations onto it, without overriding its own.
          ns.each do |ns_prefix, value|
            next if ns_prefix == "xml"
            key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}"
            element_attrs[key] ||= value
          end
        end
        next_element = XML::Element.new(local,
                                        prefix.empty? ? nil : prefix,
                                        _ns(ns, prefix),
                                        element_attrs)
        previous << next_element if previous
        @last_xml_element = next_element
        # Runs from tag_end: step back to the parent element, or
        # publish the finished topmost element.
        pr = Proc.new do |text, tags|
          if previous
            @last_xml_element = previous
          else
            @xml_element = @last_xml_element
            @last_xml_element = nil
          end
        end
        @proc_stack.push(pr)
      else
        if @rss.nil? and respond_to?("initial_start_#{local}", true)
          __send__("initial_start_#{local}", local, prefix, attrs, ns.dup)
        elsif respond_to?("start_#{local}", true)
          __send__("start_#{local}", local, prefix, attrs, ns.dup)
        else
          start_else_element(local, prefix, attrs, ns.dup)
        end
      end
    end

    # Handle a closing tag: pop the per-element state and run the
    # callback pushed for this element, if any.
    def tag_end(name)
      if DEBUG
        p "end tag #{name}"
        p @tag_stack
      end
      text = @text_stack.pop
      tags = @tag_stack.pop
      pr = @proc_stack.pop
      pr.call(text, tags) unless pr.nil?
      @ns_stack.pop
    end

    # Handle character data: append it to the current XML element in
    # XML child mode, otherwise to the current text buffer.
    def text(data)
      if @xml_child_mode
        @last_xml_element << data if @last_xml_element
      else
        @text_stack.last << data
      end
    end

    private
    # Return the URI bound to +prefix+ in +ns+, or "" if unbound.
    def _ns(ns, prefix)
      ns.fetch(prefix, "")
    end

    # NOTE(review): "\2" inside the character class is most likely not
    # a back-reference there; the non-greedy quantifier followed by the
    # real "\2" back-reference is what ends the value at the closing
    # quote.  The pattern is left as is.
    CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
    # Extract every name="value" pair from content.
    # Works with single or double quotes according to the constant
    # CONTENT_PATTERN. Return a Hash mapping name to value.
    def parse_pi_content(content)
      params = {}
      content.scan(CONTENT_PATTERN) do |name, quote, value|
        params[name] = value
      end
      params
    end

    # Fallback start handler.  If the class of the current element has
    # a constant named after the tag, start an element of that class.
    # Otherwise skip the element, or raise NotExpectedTagError when
    # validating and unknown elements are not ignored.
    def start_else_element(local, prefix, attrs, ns)
      class_name = self.class.class_name(_ns(ns, prefix), local)
      current_class = @last_element.class
      if known_class?(current_class, class_name)
        next_class = current_class.const_get(class_name)
        start_have_something_element(local, prefix, attrs, ns, next_class)
      else
        if !@do_validate or @ignore_unknown_element
          @proc_stack.push(setup_next_element_in_unknown_element)
        else
          parent = "ROOT ELEMENT???"
          # With no current element current_class is NilClass, which
          # has no tag_name; keep the placeholder in that case.
          if current_class.respond_to?(:tag_name) and current_class.tag_name
            parent = current_class.tag_name
          end
          raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
        end
      end
    end

    # Test whether +target_class+ itself defines the constant
    # +class_name+.  Two variants exist because const_defined? only
    # accepts the "inherit" flag when its arity is -1.
    if Module.method(:const_defined?).arity == -1
      def known_class?(target_class, class_name)
        class_name and
          (target_class.const_defined?(class_name, false) or
           target_class.constants.include?(class_name.to_sym))
      end
    else
      def known_class?(target_class, class_name)
        class_name and
          (target_class.const_defined?(class_name) or
           target_class.constants.include?(class_name))
      end
    end

    NAMESPLIT = /^(?:([\w:][-\w.]*):)?([\w:][-\w.]*)/
    # Split a qualified name into [prefix, local]; prefix is '' when
    # the name has none.
    def split_name(name)
      name =~ NAMESPLIT
      [$1 || '', $2]
    end

    # Check that +prefix+ is bound to +require_uri+.
    #
    # Returns true on a match.  On a mismatch: returns false when
    # unknown elements are ignored (+ignore_unknown_element+ defaults
    # to the listener setting), raises NSError when validating, and
    # otherwise binds +require_uri+ to +prefix+ and returns true.
    def check_ns(tag_name, prefix, ns, require_uri, ignore_unknown_element=nil)
      if _ns(ns, prefix) == require_uri
        true
      else
        if ignore_unknown_element.nil?
          ignore_unknown_element = @ignore_unknown_element
        end

        if ignore_unknown_element
          false
        elsif @do_validate
          raise NSError.new(tag_name, prefix, require_uri)
        else
          # Force bind required URI with prefix
          @ns_stack.last[prefix] = require_uri
          true
        end
      end
    end

    # Push a callback that, when the element ends, stores its text on
    # the current element through the registered setter.
    #
    # When validating, the callback raises TooMuchTagError if the value
    # is already set, and NotExpectedTagError if the current element
    # has no such setter and unknown elements are not ignored.
    def start_get_text_element(tag_name, prefix, ns, required_uri)
      pr = Proc.new do |text, tags|
        setter = self.class.setter(required_uri, tag_name)
        if setter and @last_element.respond_to?(setter)
          if @do_validate
            getter = self.class.getter(required_uri, tag_name)
            if @last_element.__send__(getter)
              raise TooMuchTagError.new(tag_name, @last_element.tag_name)
            end
          end
          @last_element.__send__(setter, text.to_s)
        else
          if @do_validate and !@ignore_unknown_element
            # There may be no current element (nil); report the same
            # placeholder parent that start_else_element uses.
            parent = "ROOT ELEMENT???"
            if @last_element.respond_to?(:tag_name)
              parent = @last_element.tag_name
            end
            raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
                                          parent)
          end
        end
      end
      @proc_stack.push(pr)
    end

    # Start an element of class +klass+ if its namespace checks out,
    # otherwise treat it as an unknown element.
    def start_have_something_element(tag_name, prefix, attrs, ns, klass)
      if check_ns(tag_name, prefix, ns, klass.required_uri)
        attributes = collect_attributes(tag_name, prefix, attrs, ns, klass)
        @proc_stack.push(setup_next_element(tag_name, klass, attributes))
      else
        @proc_stack.push(setup_next_element_in_unknown_element)
      end
    end

    # Build the attribute Hash for +klass+ from the raw +attrs+.
    #
    # Each declared attribute is looked up as "prefix:name" for every
    # in-scope prefix bound to one of its URIs, then as a bare name if
    # "" is among its URIs.  Raises MissingAttributeError when
    # validating and a required attribute is absent.
    def collect_attributes(tag_name, prefix, attrs, ns, klass)
      attributes = {}
      klass.get_attributes.each do |a_name, a_uri, required, element_name|
        if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
          a_uri = [a_uri]
        end
        val = nil
        unless a_uri == [""]
          ns.each do |ns_prefix, uri|
            if a_uri.include?(uri)
              val = attrs["#{ns_prefix}:#{a_name}"]
              break if val
            end
          end
        end
        if val.nil? and a_uri.include?("")
          val = attrs[a_name]
        end

        if @do_validate and required and val.nil?
          unless a_uri.include?("")
            # Qualify the reported name with the first in-scope prefix
            # bound to one of the attribute's URIs.  Only one prefix is
            # used so the name never becomes "p2:p1:name".
            matched = ns.find {|_ns_prefix, uri| a_uri.include?(uri)}
            a_name = "#{matched[0]}:#{a_name}" if matched
          end
          raise MissingAttributeError.new(tag_name, a_name)
        end

        attributes[a_name] = val
      end
      attributes
    end

    # Create the next element, make it the current one and return the
    # callback that finishes it when its tag ends (sets its content,
    # validates it if requested, and restores the previous element).
    def setup_next_element(tag_name, klass, attributes)
      previous = @last_element
      next_element = klass.new(@do_validate, attributes)
      previous.set_next_element(tag_name, next_element)
      @last_element = next_element
      @last_element.parent = previous if klass.need_parent?
      @xml_child_mode = @last_element.have_xml_content?

      Proc.new do |text, tags|
        p(@last_element.class) if DEBUG
        if @xml_child_mode
          @last_element.content = @xml_element.to_s
          xml_setter = @last_element.class.xml_setter
          @last_element.__send__(xml_setter, @xml_element)
          @xml_element = nil
          @xml_child_mode = false
        else
          if klass.have_content?
            if @last_element.need_base64_encode?
              # "m" is the base64 directive of String#unpack.
              text = text.lstrip.unpack("m").first
            end
            @last_element.content = text
          end
        end
        if @do_validate
          @last_element.validate_for_stream(tags, @ignore_unknown_element)
        end
        @last_element = previous
      end
    end

    # Clear the current element while inside an unknown element and
    # return the callback that restores it.
    def setup_next_element_in_unknown_element
      current_element, @last_element = @last_element, nil
      Proc.new {@last_element = current_element}
    end
  end

  unless const_defined? :AVAILABLE_PARSER_LIBRARIES
    # The list of all available libraries for parsing.
    AVAILABLE_PARSER_LIBRARIES = [
      ["rss/xmlparser", :XMLParserParser],
      ["rss/xmlscanner", :XMLScanParser],
      ["rss/rexmlparser", :REXMLParser],
    ]
  end

  # The list of all available parsers, in constant form.
  AVAILABLE_PARSERS = []

  # Libraries that fail to load are skipped on purpose; at least one
  # must load (checked below).
  AVAILABLE_PARSER_LIBRARIES.each do |lib, parser|
    begin
      require lib
      AVAILABLE_PARSERS.push(const_get(parser))
    rescue LoadError
    end
  end

  if AVAILABLE_PARSERS.empty?
    raise XMLParserNotFound
  end
end
