1# frozen_string_literal: false
2require_relative 'security'
3require_relative 'entity'
4require_relative 'doctype'
5require_relative 'child'
6require_relative 'doctype'
7require_relative 'parseexception'
8
9module REXML
10  # Represents text nodes in an XML document
11  class Text < Child
    # Ordering of Text nodes goes through #<=>, which compares #to_s output.
    include Comparable
    # Patterns escaped when writing, in the order in which the substitutions
    # occur.  SPECIALS[i] is replaced by SUBSTITUTES[i] in
    # #write_with_substitution.  The first pattern only matches an ampersand
    # that is not already followed by something shaped like a reference
    # ("name;" or "#name;").
    SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
    SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
    # Characters which are substituted in written strings.  These are the
    # reverse tables (the names are SPECIALS and SUBSTITUTES spelled
    # backwards): Text::read_with_substitution replaces SETUTITSBUS[i] with
    # SLAICEPS[i], handling "&amp;" last.
    SLAICEPS = [ '<', '>', '"', "'", '&' ]
    SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]

    # If +raw+ is true, then REXML leaves the value alone
    attr_accessor :raw

    # Default +illegal+ pattern of #initialize, handed to Text.check.  It
    # matches a "<", or an "&" optionally followed by a named reference or a
    # numeric character reference.  The capture-group numbering depends on
    # how Entity::NAME is defined, which is not visible in this file.
    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
    # A numeric character reference; group 1 is the number without leading
    # zeros, carrying an "x" prefix when it is hexadecimal.
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
25    VALID_CHAR = [
26      0x9, 0xA, 0xD,
27      (0x20..0xD7FF),
28      (0xE000..0xFFFD),
29      (0x10000..0x10FFFF)
30    ]
31
32    if String.method_defined? :encode
33      VALID_XML_CHARS = Regexp.new('^['+
34        VALID_CHAR.map { |item|
35          case item
36          when Integer
37            [item].pack('U').force_encoding('utf-8')
38          when Range
39            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
40          end
41        }.join +
42      ']*$')
43    else
44      VALID_XML_CHARS = /^(
45           [\x09\x0A\x0D\x20-\x7E]            # ASCII
46         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
47         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
48         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
49         |  \xEF[\x80-\xBE]{2}                #
50         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
51         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
52         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
53         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
54         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
55       )*$/nx;
56    end
57
58    # Constructor
59    # +arg+ if a String, the content is set to the String.  If a Text,
60    # the object is shallowly cloned.
61    #
62    # +respect_whitespace+ (boolean, false) if true, whitespace is
63    # respected
64    #
65    # +parent+ (nil) if this is a Parent object, the parent
66    # will be set to this.
67    #
68    # +raw+ (nil) This argument can be given three values.
    # If true, then the value used to construct this object is expected to
70    # contain no unescaped XML markup, and REXML will not change the text. If
71    # this value is false, the string may contain any characters, and REXML will
72    # escape any and all defined entities whose values are contained in the
73    # text.  If this value is nil (the default), then the raw value of the
74    # parent will be used as the raw value for this node.  If there is no raw
75    # value for the parent, and no value is supplied, the default is false.
76    # Use this field if you have entities defined for some text, and you don't
77    # want REXML to escape that text in output.
78    #   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
79    #   Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
80    #   Text.new( "<&", false, nil, true )  #-> Parse exception
81    #   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
82    #   # Assume that the entity "s" is defined to be "sean"
83    #   # and that the entity    "r" is defined to be "russell"
84    #   Text.new( "sean russell" )          #-> "&s; &r;"
85    #   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
86    #
87    # +entity_filter+ (nil) This can be an array of entities to match in the
88    # supplied text.  This argument is only useful if +raw+ is set to false.
89    #   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
90    #   Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
91    # In the last example, the +entity_filter+ argument is ignored.
92    #
93    # +illegal+ INTERNAL USE ONLY
94    def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
95      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )
96
97      @raw = false
98      @parent = nil
99      @entity_filter = nil
100
101      if parent
102        super( parent )
103        @raw = parent.raw
104      end
105
106      if arg.kind_of? String
107        @string = arg.dup
108      elsif arg.kind_of? Text
109        @string = arg.instance_variable_get(:@string).dup
110        @raw = arg.raw
111        @entity_filter = arg.instance_variable_get(:@entity_filter)
112      elsif
113        raise "Illegal argument of type #{arg.type} for Text constructor (#{arg})"
114      end
115
116      @string.squeeze!(" \n\t") unless respect_whitespace
117      @string.gsub!(/\r\n?/, "\n")
118      @raw = raw unless raw.nil?
119      @entity_filter = entity_filter if entity_filter
120      clear_cache
121
122      Text.check(@string, illegal, doctype) if @raw
123    end
124
125    def parent= parent
126      super(parent)
127      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
128    end
129
130    # check for illegal characters
131    def Text.check string, pattern, doctype
132
133      # illegal anywhere
134      if string !~ VALID_XML_CHARS
135        if String.method_defined? :encode
136          string.chars.each do |c|
137            case c.ord
138            when *VALID_CHAR
139            else
140              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
141            end
142          end
143        else
144          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
145            case c.unpack('U')
146            when *VALID_CHAR
147            else
148              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
149            end
150          end
151        end
152      end
153
154      # context sensitive
155      string.scan(pattern) do
156        if $1[-1] != ?;
157          raise "Illegal character '#{$1}' in raw string \"#{string}\""
158        elsif $1[0] == ?&
159          if $5 and $5[0] == ?#
160            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
161            when *VALID_CHAR
162            else
163              raise "Illegal character '#{$1}' in raw string \"#{string}\""
164            end
165          # FIXME: below can't work but this needs API change.
166          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
167          #   if !doctype or !doctype.entities.has_key?($3)
168          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
169          #   end
170          end
171        end
172      end
173    end
174
    # Identifies the kind of node; always the Symbol +:text+.
    def node_type
      :text
    end
178
179    def empty?
180      @string.size==0
181    end
182
183
184    def clone
185      return Text.new(self, true)
186    end
187
188
189    # Appends text to this text node.  The text is appended in the +raw+ mode
190    # of this text node.
191    #
192    # +returns+ the text itself to enable method chain like
193    # 'text << "XXX" << "YYY"'.
194    def <<( to_append )
195      @string << to_append.gsub( /\r\n?/, "\n" )
196      clear_cache
197      self
198    end
199
200
201    # +other+ a String or a Text
    # +returns+ the result of (to_s <=> other.to_s)
203    def <=>( other )
204      to_s() <=> other.to_s
205    end
206
207    def doctype
208      if @parent
209        doc = @parent.document
210        doc.doctype if doc
211      end
212    end
213
    # Regexp built from Entity::REFERENCE; Text::unnormalize uses it to find
    # the references it hands to Text.expand.
    REFERENCE = /#{Entity::REFERENCE}/
215    # Returns the string value of this text node.  This string is always
216    # escaped, meaning that it is a valid XML text node string, and all
217    # entities that can be escaped, have been inserted.  This method respects
218    # the entity filter set in the constructor.
219    #
220    #   # Assume that the entity "s" is defined to be "sean", and that the
221    #   # entity "r" is defined to be "russell"
222    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
223    #   t.to_s   #-> "&lt; &amp; &s; russell"
224    #   t = Text.new( "< & &s; russell", false, nil, false )
225    #   t.to_s   #-> "&lt; &amp; &s; russell"
226    #   u = Text.new( "sean russell", false, nil, true )
227    #   u.to_s   #-> "sean russell"
228    def to_s
229      return @string if @raw
230      @normalized ||= Text::normalize( @string, doctype, @entity_filter )
231    end
232
    # Returns the +inspect+ form of the stored string rather than of the Text
    # object itself.
    def inspect
      @string.inspect
    end
236
237    # Returns the string value of this text.  This is the text without
238    # entities, as it might be used programmatically, or printed to the
239    # console.  This ignores the 'raw' attribute setting, and any
240    # entity_filter.
241    #
242    #   # Assume that the entity "s" is defined to be "sean", and that the
243    #   # entity "r" is defined to be "russell"
244    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
245    #   t.value   #-> "< & sean russell"
246    #   t = Text.new( "< & &s; russell", false, nil, false )
247    #   t.value   #-> "< & sean russell"
248    #   u = Text.new( "sean russell", false, nil, true )
249    #   u.value   #-> "sean russell"
250    def value
251      @unnormalized ||= Text::unnormalize( @string, doctype )
252    end
253
    # Sets the contents of this text node.  This expects the text to be
    # unnormalized.  Line endings are converted to "\n", cached values are
    # cleared, and the node's +raw+ flag is reset to false.
256    #
257    #   e = Element.new( "a" )
258    #   e.add_text( "foo" )   # <a>foo</a>
259    #   e[0].value = "bar"    # <a>bar</a>
260    #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
261    def value=( val )
262      @string = val.gsub( /\r\n?/, "\n" )
263      clear_cache
264      @raw = false
265    end
266
267    def wrap(string, width, addnewline=false)
268      # Recursively wrap string at width.
269      return string if string.length <= width
270      place = string.rindex(' ', width) # Position in string with last ' ' before cutoff
271      if addnewline then
272        return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
273      else
274        return string[0,place] + "\n" + wrap(string[place+1..-1], width)
275      end
276    end
277
278    def indent_text(string, level=1, style="\t", indentfirstline=true)
279      return string if level < 0
280      new_string = ''
281      string.each_line { |line|
282        indent_string = style * level
283        new_line = (indent_string + line).sub(/[\s]+$/,'')
284        new_string << new_line
285      }
286      new_string.strip! unless indentfirstline
287      return new_string
288    end
289
290    # == DEPRECATED
291    # See REXML::Formatters
292    #
293    def write( writer, indent=-1, transitive=false, ie_hack=false )
294      Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters", uplevel: 1)
295      formatter = if indent > -1
296          REXML::Formatters::Pretty.new( indent )
297        else
298          REXML::Formatters::Default.new
299        end
300      formatter.write( self, writer )
301    end
302
303    # FIXME
304    # This probably won't work properly
305    def xpath
306      path = @parent.xpath
307      path += "/text()"
308      return path
309    end
310
311    # Writes out text, substituting special characters beforehand.
312    # +out+ A String, IO, or any other object supporting <<( String )
    # +input+ the text to substitute and then write out
314    #
315    #   z=utf8.unpack("U*")
316    #   ascOut=""
317    #   z.each{|r|
318    #     if r <  0x100
319    #       ascOut.concat(r.chr)
320    #     else
321    #       ascOut.concat(sprintf("&#x%x;", r))
322    #     end
323    #   }
324    #   puts ascOut
325    def write_with_substitution out, input
326      copy = input.clone
327      # Doing it like this rather than in a loop improves the speed
328      copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
329      copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
330      copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
331      copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
332      copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
333      copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
334      out << copy
335    end
336
337    private
338    def clear_cache
339      @normalized = nil
340      @unnormalized = nil
341    end
342
343    # Reads text, substituting entities
344    def Text::read_with_substitution( input, illegal=nil )
345      copy = input.clone
346
347      if copy =~ illegal
348        raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
349      end if illegal
350
351      copy.gsub!( /\r\n?/, "\n" )
352      if copy.include? ?&
353        copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
354        copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
355        copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
356        copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
357        copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
358        copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {
359          m=$1
360          #m='0' if m==''
361          m = "0#{m}" if m[0] == ?x
362          [Integer(m)].pack('U*')
363        }
364      end
365      copy
366    end
367
    # An ampersand that is not followed by Entity::NAME and a ";".  Within
    # this file it is only mentioned by a commented-out line in
    # Text::normalize.
    EREFERENCE = /&(?!#{Entity::NAME};)/
369    # Escapes all possible entities
370    def Text::normalize( input, doctype=nil, entity_filter=nil )
371      copy = input.to_s
372      # Doing it like this rather than in a loop improves the speed
373      #copy = copy.gsub( EREFERENCE, '&amp;' )
374      copy = copy.gsub( "&", "&amp;" )
375      if doctype
376        # Replace all ampersands that aren't part of an entity
377        doctype.entities.each_value do |entity|
378          copy = copy.gsub( entity.value,
379            "&#{entity.name};" ) if entity.value and
380              not( entity_filter and entity_filter.include?(entity.name) )
381        end
382      else
383        # Replace all ampersands that aren't part of an entity
384        DocType::DEFAULT_ENTITIES.each_value do |entity|
385          copy = copy.gsub(entity.value, "&#{entity.name};" )
386        end
387      end
388      copy
389    end
390
391    # Unescapes all possible entities
392    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
393      sum = 0
394      string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
395        s = Text.expand($&, doctype, filter)
396        if sum + s.bytesize > Security.entity_expansion_text_limit
397          raise "entity expansion has grown too large"
398        else
399          sum += s.bytesize
400        end
401        s
402      }
403    end
404
405    def Text.expand(ref, doctype, filter)
406      if ref[1] == ?#
407        if ref[2] == ?x
408          [ref[3...-1].to_i(16)].pack('U*')
409        else
410          [ref[2...-1].to_i].pack('U*')
411        end
412      elsif ref == '&amp;'
413        '&'
414      elsif filter and filter.include?( ref[1...-1] )
415        ref
416      elsif doctype
417        doctype.entity( ref[1...-1] ) or ref
418      else
419        entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
420        entity_value ? entity_value.value : ref
421      end
422    end
423  end
424end
425