# coding: US-ASCII
# frozen_string_literal: false
require_relative 'encoding'

module REXML
  # Generates Source-s. USE THIS CLASS.
  class SourceFactory
    # Generates a Source object
    # @param arg Either a String, an IO, or a Source
    # @return a Source (an IOSource for IO-like and String-like arguments;
    #   the argument itself if it already is a Source)
    # @raise [RuntimeError] if arg is none of the supported kinds of input
    def self.create_from(arg)
      if arg.respond_to?(:read) &&
         arg.respond_to?(:readline) &&
         arg.respond_to?(:nil?) &&
         arg.respond_to?(:eof?)
        IOSource.new(arg)
      elsif arg.respond_to?(:to_str)
        # Strings are wrapped in a StringIO so they take the IOSource path.
        require 'stringio'
        IOSource.new(StringIO.new(arg))
      elsif arg.kind_of?(Source)
        arg
      else
        raise "#{arg.class} is not a valid input stream. It must walk \n"+
          "like either a String, an IO, or a Source."
      end
    end
  end

  # A Source can be searched for patterns, and wraps buffers and other
  # objects and provides consumption of text
  class Source
    include Encoding
    # The current buffer (what we're going to read next)
    attr_reader :buffer
    # The line number of the last consumed text
    attr_reader :line
    attr_reader :encoding

    # Constructor
    # @param arg must be a String, and should be a valid XML document
    # @param encoding if non-null, sets the encoding of the source to this
    # value, overriding all encoding detection
    def initialize(arg, encoding=nil)
      @orig = @buffer = arg
      if encoding
        self.encoding = encoding
      else
        detect_encoding
      end
      @line = 0
    end


    # Inherited from Encoding
    # Overridden to support optimized en/decoding
    def encoding=(enc)
      # encoding_updated only runs when the inherited setter returns a
      # truthy value.
      return unless super
      encoding_updated
    end

    # Scans the source for a given pattern. Note, that this is not your
    # usual scan() method. For one thing, the pattern argument has some
    # requirements; for another, the source can be consumed. You can easily
    # confuse this method. Originally, the patterns were easier
    # to construct and this method more robust, because this method
    # generated search regexps on the fly; however, this was
    # computationally expensive and slowed down the entire REXML package
    # considerably, since this is by far the most commonly called method.
    # @param pattern must be a Regexp, and must be in the form of
    # /^\s*(#{your pattern, with no groups})(.*)/. The first group
    # will be returned; the second group is used if the consume flag is
    # set.
    # @param consume if true, the pattern returned will be consumed, leaving
    # everything after it in the Source.
    # @return the pattern, if found, or nil if the Source is empty or the
    # pattern is not found.
    def scan(pattern, cons=false)
      return nil if @buffer.nil?
      rv = @buffer.scan(pattern)
      # $' is the text following the last match made by String#scan above.
      @buffer = $' if cons && !rv.empty?
      rv
    end

    # No-op here; IOSource overrides it to pull more data into the buffer.
    def read
    end

    # Drops everything up to and including the first match of pattern from
    # the buffer. The buffer is left untouched when nothing matches.
    def consume( pattern )
      md = pattern.match( @buffer )
      @buffer = md.post_match if md
    end

    # Matches pattern against the buffer without consuming anything.
    # The char argument is not used by this implementation.
    # @return the MatchData, or nil if the pattern does not match
    def match_to( char, pattern )
      pattern.match(@buffer)
    end

    # Matches pattern against the buffer and replaces the buffer with the
    # text following the match. The char argument is not used.
    # NOTE(review): when the pattern does not match, $' is nil and the
    # buffer therefore becomes nil -- confirm callers expect this.
    # @return the MatchData, or nil if the pattern does not match
    def match_to_consume( char, pattern )
      md = pattern.match(@buffer)
      @buffer = $'
      md
    end

    # Matches pattern against the buffer.
    # @param cons if true and the pattern matched, the buffer is replaced by
    # the text following the match
    # @return the MatchData, or nil if the pattern does not match
    def match(pattern, cons=false)
      md = pattern.match(@buffer)
      @buffer = md.post_match if cons && md
      md
    end

    # @return true if the Source is exhausted
    def empty?
      @buffer == ""
    end

    # @return the index of the current buffer within the original string
    def position
      @orig.index( @buffer )
    end

    # @return the current line in the source
    # NOTE(review): the original string is split on whitespace (not on line
    # breaks) and grep with a String argument selects by equality, so the
    # result is the index of a whitespace-separated token equal to the first
    # 31 characters of the buffer -- confirm this is what callers need.
    def current_line
      lines = @orig.split
      res = lines.grep @buffer[0..30]
      res = res[-1] if res.kind_of? Array
      lines.index( res ) if res
    end

    private
    # Looks for a byte order mark at the start of the buffer, removes it in
    # place when present, and sets the encoding accordingly. Defaults to
    # UTF-8 when no byte order mark is found.
    def detect_encoding
      buffer_encoding = @buffer.encoding
      detected_encoding = "UTF-8"
      begin
        # Compare raw bytes; the buffer's own encoding is restored below.
        @buffer.force_encoding("ASCII-8BIT")
        if @buffer[0, 2] == "\xfe\xff"
          @buffer[0, 2] = ""
          detected_encoding = "UTF-16BE"
        elsif @buffer[0, 2] == "\xff\xfe"
          @buffer[0, 2] = ""
          detected_encoding = "UTF-16LE"
        elsif @buffer[0, 3] == "\xef\xbb\xbf"
          @buffer[0, 3] = ""
          detected_encoding = "UTF-8"
        end
      ensure
        @buffer.force_encoding(buffer_encoding)
      end
      self.encoding = detected_encoding
    end

    # Called after the encoding changed: decodes the buffer for any
    # non-UTF-8 encoding, otherwise just tags the buffer as UTF-8.
    def encoding_updated
      if @encoding != 'UTF-8'
        @buffer = decode(@buffer)
        @to_utf = true
      else
        @to_utf = false
        @buffer.force_encoding ::Encoding::UTF_8
      end
    end
  end

  # A Source that wraps an IO. See the Source class for method
  # documentation
  class IOSource < Source
    #attr_reader :block_size

    # block_size has been deprecated
    # @param arg the IO-like object to read from
    # @param block_size ignored
    # @param encoding if given, used instead of detecting the encoding from
    # the first bytes of the stream
    def initialize(arg, block_size=500, encoding=nil)
      @er_source = @source = arg
      @to_utf = false
      @pending_buffer = nil

      if encoding
        super("", encoding)
      else
        # Read just enough bytes for the byte order mark check.
        super(@source.read(3) || "")
      end

      if !@to_utf &&
         @buffer.respond_to?(:force_encoding) &&
         @source.respond_to?(:external_encoding) &&
         @source.external_encoding != ::Encoding::UTF_8
        @force_utf8 = true
      else
        @force_utf8 = false
      end
    end

    def scan(pattern, cons=false)
      rv = super
      # You'll notice that this next section is very similar to the same
      # section in match(), but just a liiittle different. This is
      # because it is a touch faster to do it this way with scan()
      # than the way match() does it; enough faster to warrant duplicating
      # some code
      if rv.size == 0
        until @buffer =~ pattern or @source.nil?
          begin
            @buffer << readline
          rescue
            # Any read failure (including end of file) marks the source as
            # exhausted, exactly as match() does. The former
            # `rescue Iconv::IllegalSequence` clause was dropped: Iconv no
            # longer exists, so resolving that constant raised NameError
            # whenever an exception reached this point.
            @source = nil
          end
        end
        rv = super
      end
      # Object#taint was removed in Ruby 3.2; only call it where it still
      # has an effect.
      rv.taint if RUBY_VERSION < '2.7'
      rv
    end

    # Appends the next chunk of the underlying IO to the buffer. Any
    # failure (including end of file) marks the source as exhausted.
    def read
      begin
        @buffer << readline
      rescue
        # StandardError only, so Interrupt and SystemExit still propagate.
        @source = nil
      end
    end

    def consume( pattern )
      match( pattern, true )
    end

    # Like Source#match, but keeps reading from the IO until the pattern
    # matches or the IO is exhausted.
    def match( pattern, cons=false )
      rv = pattern.match(@buffer)
      @buffer = $' if cons and rv
      while !rv and @source
        begin
          @buffer << readline
          rv = pattern.match(@buffer)
          @buffer = $' if cons and rv
        rescue
          @source = nil
        end
      end
      # Object#taint was removed in Ruby 3.2; only call it where it still
      # has an effect.
      rv.taint if RUBY_VERSION < '2.7'
      rv
    end

    def empty?
      super and ( @source.nil? || @source.eof? )
    end

    # @return the position reported by the underlying IO, or 0 if it cannot
    # be determined
    def position
      @er_source.pos rescue 0
    end

    # @return the current line in the source
    # The result is an Array of [pos, lineno, line]; pos and line are -1 if
    # an IOError is raised while inspecting the IO.
    def current_line
      begin
        pos = @er_source.pos # The byte position in the source
        lineno = @er_source.lineno # The XML < position in the source
        @er_source.rewind
        line = 0 # The \r\n position in the source
        begin
          while @er_source.pos < pos
            @er_source.readline
            line += 1
          end
        rescue
        end
        @er_source.seek(pos)
      rescue IOError
        pos = -1
        line = -1
      end
      [pos, lineno, line]
    end

    private
    # Reads from the IO up to and including the next @line_break, prepends
    # any pending buffer left over from an encoding change, and returns the
    # text decoded (or tagged as UTF-8) as configured.
    def readline
      str = @source.readline(@line_break)
      if @pending_buffer
        if str.nil?
          str = @pending_buffer
        else
          str = @pending_buffer + str
        end
        @pending_buffer = nil
      end
      return nil if str.nil?

      if @to_utf
        decode(str)
      else
        str.force_encoding(::Encoding::UTF_8) if @force_utf8
        str
      end
    end

    # Called after the encoding changed: reconfigures the IO for UTF-16,
    # recomputes the encoded ">" used as the read separator, and parks the
    # current buffer in @pending_buffer so readline can prepend it to the
    # next chunk.
    def encoding_updated
      case @encoding
      when "UTF-16BE", "UTF-16LE"
        @source.binmode
        @source.set_encoding(@encoding, @encoding)
      end
      @line_break = encode(">")
      @pending_buffer, @buffer = @buffer, ""
      @pending_buffer.force_encoding(@encoding)
      super
    end
  end
end