1# coding: US-ASCII
2# frozen_string_literal: false
3require_relative 'encoding'
4
5module REXML
6  # Generates Source-s.  USE THIS CLASS.
7  class SourceFactory
8    # Generates a Source object
9    # @param arg Either a String, or an IO
10    # @return a Source, or nil if a bad argument was given
11    def SourceFactory::create_from(arg)
12      if arg.respond_to? :read and
13          arg.respond_to? :readline and
14          arg.respond_to? :nil? and
15          arg.respond_to? :eof?
16        IOSource.new(arg)
17      elsif arg.respond_to? :to_str
18        require 'stringio'
19        IOSource.new(StringIO.new(arg))
20      elsif arg.kind_of? Source
21        arg
22      else
23        raise "#{arg.class} is not a valid input stream.  It must walk \n"+
24          "like either a String, an IO, or a Source."
25      end
26    end
27  end
28
29  # A Source can be searched for patterns, and wraps buffers and other
30  # objects and provides consumption of text
31  class Source
32    include Encoding
33    # The current buffer (what we're going to read next)
34    attr_reader :buffer
35    # The line number of the last consumed text
36    attr_reader :line
37    attr_reader :encoding
38
39    # Constructor
40    # @param arg must be a String, and should be a valid XML document
41    # @param encoding if non-null, sets the encoding of the source to this
42    # value, overriding all encoding detection
43    def initialize(arg, encoding=nil)
44      @orig = @buffer = arg
45      if encoding
46        self.encoding = encoding
47      else
48        detect_encoding
49      end
50      @line = 0
51    end
52
53
54    # Inherited from Encoding
55    # Overridden to support optimized en/decoding
56    def encoding=(enc)
57      return unless super
58      encoding_updated
59    end
60
61    # Scans the source for a given pattern.  Note, that this is not your
62    # usual scan() method.  For one thing, the pattern argument has some
63    # requirements; for another, the source can be consumed.  You can easily
64    # confuse this method.  Originally, the patterns were easier
65    # to construct and this method more robust, because this method
66    # generated search regexps on the fly; however, this was
67    # computationally expensive and slowed down the entire REXML package
68    # considerably, since this is by far the most commonly called method.
69    # @param pattern must be a Regexp, and must be in the form of
70    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
71    # will be returned; the second group is used if the consume flag is
72    # set.
73    # @param consume if true, the pattern returned will be consumed, leaving
74    # everything after it in the Source.
75    # @return the pattern, if found, or nil if the Source is empty or the
76    # pattern is not found.
77    def scan(pattern, cons=false)
78      return nil if @buffer.nil?
79      rv = @buffer.scan(pattern)
80      @buffer = $' if cons and rv.size>0
81      rv
82    end
83
84    def read
85    end
86
87    def consume( pattern )
88      @buffer = $' if pattern.match( @buffer )
89    end
90
91    def match_to( char, pattern )
92      return pattern.match(@buffer)
93    end
94
95    def match_to_consume( char, pattern )
96      md = pattern.match(@buffer)
97      @buffer = $'
98      return md
99    end
100
101    def match(pattern, cons=false)
102      md = pattern.match(@buffer)
103      @buffer = $' if cons and md
104      return md
105    end
106
107    # @return true if the Source is exhausted
108    def empty?
109      @buffer == ""
110    end
111
112    def position
113      @orig.index( @buffer )
114    end
115
116    # @return the current line in the source
117    def current_line
118      lines = @orig.split
119      res = lines.grep @buffer[0..30]
120      res = res[-1] if res.kind_of? Array
121      lines.index( res ) if res
122    end
123
124    private
125    def detect_encoding
126      buffer_encoding = @buffer.encoding
127      detected_encoding = "UTF-8"
128      begin
129        @buffer.force_encoding("ASCII-8BIT")
130        if @buffer[0, 2] == "\xfe\xff"
131          @buffer[0, 2] = ""
132          detected_encoding = "UTF-16BE"
133        elsif @buffer[0, 2] == "\xff\xfe"
134          @buffer[0, 2] = ""
135          detected_encoding = "UTF-16LE"
136        elsif @buffer[0, 3] == "\xef\xbb\xbf"
137          @buffer[0, 3] = ""
138          detected_encoding = "UTF-8"
139        end
140      ensure
141        @buffer.force_encoding(buffer_encoding)
142      end
143      self.encoding = detected_encoding
144    end
145
146    def encoding_updated
147      if @encoding != 'UTF-8'
148        @buffer = decode(@buffer)
149        @to_utf = true
150      else
151        @to_utf = false
152        @buffer.force_encoding ::Encoding::UTF_8
153      end
154    end
155  end
156
157  # A Source that wraps an IO.  See the Source class for method
158  # documentation
159  class IOSource < Source
160    #attr_reader :block_size
161
162    # block_size has been deprecated
163    def initialize(arg, block_size=500, encoding=nil)
164      @er_source = @source = arg
165      @to_utf = false
166      @pending_buffer = nil
167
168      if encoding
169        super("", encoding)
170      else
171        super(@source.read(3) || "")
172      end
173
174      if !@to_utf and
175          @buffer.respond_to?(:force_encoding) and
176          @source.respond_to?(:external_encoding) and
177          @source.external_encoding != ::Encoding::UTF_8
178        @force_utf8 = true
179      else
180        @force_utf8 = false
181      end
182    end
183
184    def scan(pattern, cons=false)
185      rv = super
186      # You'll notice that this next section is very similar to the same
187      # section in match(), but just a liiittle different.  This is
188      # because it is a touch faster to do it this way with scan()
189      # than the way match() does it; enough faster to warrant duplicating
190      # some code
191      if rv.size == 0
192        until @buffer =~ pattern or @source.nil?
193          begin
194            @buffer << readline
195          rescue Iconv::IllegalSequence
196            raise
197          rescue
198            @source = nil
199          end
200        end
201        rv = super
202      end
203      rv.taint
204      rv
205    end
206
207    def read
208      begin
209        @buffer << readline
210      rescue Exception, NameError
211        @source = nil
212      end
213    end
214
215    def consume( pattern )
216      match( pattern, true )
217    end
218
219    def match( pattern, cons=false )
220      rv = pattern.match(@buffer)
221      @buffer = $' if cons and rv
222      while !rv and @source
223        begin
224          @buffer << readline
225          rv = pattern.match(@buffer)
226          @buffer = $' if cons and rv
227        rescue
228          @source = nil
229        end
230      end
231      rv.taint
232      rv
233    end
234
235    def empty?
236      super and ( @source.nil? || @source.eof? )
237    end
238
239    def position
240      @er_source.pos rescue 0
241    end
242
243    # @return the current line in the source
244    def current_line
245      begin
246        pos = @er_source.pos        # The byte position in the source
247        lineno = @er_source.lineno  # The XML < position in the source
248        @er_source.rewind
249        line = 0                    # The \r\n position in the source
250        begin
251          while @er_source.pos < pos
252            @er_source.readline
253            line += 1
254          end
255        rescue
256        end
257        @er_source.seek(pos)
258      rescue IOError
259        pos = -1
260        line = -1
261      end
262      [pos, lineno, line]
263    end
264
265    private
266    def readline
267      str = @source.readline(@line_break)
268      if @pending_buffer
269        if str.nil?
270          str = @pending_buffer
271        else
272          str = @pending_buffer + str
273        end
274        @pending_buffer = nil
275      end
276      return nil if str.nil?
277
278      if @to_utf
279        decode(str)
280      else
281        str.force_encoding(::Encoding::UTF_8) if @force_utf8
282        str
283      end
284    end
285
286    def encoding_updated
287      case @encoding
288      when "UTF-16BE", "UTF-16LE"
289        @source.binmode
290        @source.set_encoding(@encoding, @encoding)
291      end
292      @line_break = encode(">")
293      @pending_buffer, @buffer = @buffer, ""
294      @pending_buffer.force_encoding(@encoding)
295      super
296    end
297  end
298end
299