# frozen_string_literal: false
require_relative 'security'
require_relative 'entity'
require_relative 'doctype'
require_relative 'child'
require_relative 'doctype'
require_relative 'parseexception'

module REXML
  # Represents text nodes in an XML document
  class Text < Child
    include Comparable
    # The order in which the substitutions occur.  SPECIALS[i] is replaced by
    # SUBSTITUTES[i] in #write_with_substitution.
    SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
    SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
    # Characters which are substituted in written strings.  SETUTITSBUS[i] is
    # replaced by SLAICEPS[i] in Text::read_with_substitution.  '&amp;' comes
    # last so that "&amp;lt;" is read as "&lt;" rather than "<".
    SLAICEPS = [ '<', '>', '"', "'", '&' ]
    SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]

    # If +raw+ is true, then REXML leaves the value alone
    attr_accessor :raw

    # Default +pattern+ handed to Text.check: matches a '<', or an '&'
    # optionally followed by a complete named or numeric reference.
    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
    # A numeric character reference (decimal, or hexadecimal with an x prefix).
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
    # Code points (single values and ranges) that Text.check accepts.
    VALID_CHAR = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]

    # Matches a string made up solely of VALID_CHAR characters.  The pattern
    # is anchored with \A and \z (not ^ and $, which match per line) so that
    # the whole string is validated, not just one of its lines.
    if String.method_defined? :encode
      VALID_XML_CHARS = Regexp.new('\A['+
        VALID_CHAR.map { |item|
          case item
          when Integer
            [item].pack('U').force_encoding('utf-8')
          when Range
            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
          end
        }.join +
      ']*\z')
    else
      VALID_XML_CHARS = /\A(
           [\x09\x0A\x0D\x20-\x7E]            # ASCII
         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
         |  \xEF[\x80-\xBE]{2}                #
         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
       )*\z/nx
    end

    # Constructor
    # +arg+ if a String, the content is set to the String.  If a Text,
    # the object is shallowly cloned.  Any other argument raises a
    # RuntimeError.
    #
    # +respect_whitespace+ (boolean, false) if true, whitespace is
    # respected; otherwise runs of the same space, newline or tab character
    # are squeezed to a single character.
    #
    # +parent+ (nil) if this is a Parent object, the parent
    # will be set to this.
    #
    # +raw+ (nil) This argument can be given three values.
    # If true, then the value used to construct this object is expected to
    # contain no unescaped XML markup, and REXML will not change the text. If
    # this value is false, the string may contain any characters, and REXML will
    # escape any and all defined entities whose values are contained in the
    # text.  If this value is nil (the default), then the raw value of the
    # parent will be used as the raw value for this node.  If there is no raw
    # value for the parent, and no value is supplied, the default is false.
    # Use this field if you have entities defined for some text, and you don't
    # want REXML to escape that text in output.
    #   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
    #   Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
    #   Text.new( "<&", false, nil, true )  #-> Parse exception
    #   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
    #   # Assume that the entity "s" is defined to be "sean"
    #   # and that the entity    "r" is defined to be "russell"
    #   Text.new( "sean russell" )          #-> "&s; &r;"
    #   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
    #
    # +entity_filter+ (nil) This can be an array of entities to match in the
    # supplied text.  This argument is only useful if +raw+ is set to false.
    #   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
    #   Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
    # In the last example, the +entity_filter+ argument is ignored.
    #
    # +illegal+ INTERNAL USE ONLY
    def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

      @raw = false
      @parent = nil
      @entity_filter = nil

      if parent
        super( parent )
        @raw = parent.raw
      end

      if arg.kind_of? String
        @string = arg.dup
      elsif arg.kind_of? Text
        @string = arg.instance_variable_get(:@string).dup
        @raw = arg.raw
        @entity_filter = arg.instance_variable_get(:@entity_filter)
      else
        # Object#type no longer exists, so report the class of the argument.
        raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
      end

      @string.squeeze!(" \n\t") unless respect_whitespace
      @string.gsub!(/\r\n?/, "\n")
      # An explicit +raw+ / +entity_filter+ overrides whatever was inherited
      # from the parent or copied from the source Text.
      @raw = raw unless raw.nil?
      @entity_filter = entity_filter if entity_filter
      clear_cache

      # Raw text is emitted unchanged by #to_s, so it is validated up front.
      Text.check(@string, illegal, doctype) if @raw
    end

    # Sets the parent of this node, then re-validates the content with
    # Text.check when the node is raw and the parent is set.
    def parent= parent
      super(parent)
      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
    end

    # check for illegal characters
    #
    # +string+ the text to validate
    # +pattern+ a Regexp whose matches are inspected one by one; callers in
    # this file pass NEEDS_A_SECOND_CHECK (or the constructor's +illegal+)
    # +doctype+ currently unused; it is referenced only by the commented-out
    # code below
    #
    # Raises a RuntimeError when the string contains a character outside
    # VALID_CHAR, when a match of +pattern+ does not end in ';', or when a
    # numeric character reference names a code point outside VALID_CHAR.
    def Text.check string, pattern, doctype

      # illegal anywhere
      if string !~ VALID_XML_CHARS
        if String.method_defined? :encode
          string.chars.each do |c|
            case c.ord
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        else
          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
            case c.unpack('U')
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        end
      end

      # context sensitive
      string.scan(pattern) do
        # With the default pattern, a match that does not end in ';' is a
        # bare '<' or an '&' that does not start a complete reference.
        if $1[-1] != ?;
          raise "Illegal character '#{$1}' in raw string \"#{string}\""
        elsif $1[0] == ?&
          if $5 and $5[0] == ?#
            # Numeric reference: decode the code point (hexadecimal when it
            # is prefixed with x) and require it to be a valid character.
            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
            when *VALID_CHAR
            else
              raise "Illegal character '#{$1}' in raw string \"#{string}\""
            end
          # FIXME: below can't work but this needs API change.
          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
          #   if !doctype or !doctype.entities.has_key?($3)
          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
          #   end
          end
        end
      end
    end

    # Returns :text
    def node_type
      :text
    end

    # Returns true if the underlying string has no characters.
    def empty?
      @string.size==0
    end


    # Returns a new Text built from this one.  Whitespace is respected so
    # that the copied string is not squeezed again.
    def clone
      return Text.new(self, true)
    end


    # Appends text to this text node.  The text is appended in the +raw+ mode
    # of this text node.  Carriage-return line endings in +to_append+ are
    # converted to "\n" and the cached #to_s / #value results are discarded.
    #
    # +returns+ the text itself to enable method chain like
    # 'text << "XXX" << "YYY"'.
    def <<( to_append )
      @string << to_append.gsub( /\r\n?/, "\n" )
      clear_cache
      self
    end


    # +other+ a String or a Text
    # +returns+ the result of (to_s <=> arg.to_s)
    def <=>( other )
      to_s() <=> other.to_s
    end

    # Returns the doctype of the document the parent belongs to, or nil when
    # there is no parent or the parent has no document.
    def doctype
      if @parent
        doc = @parent.document
        doc.doctype if doc
      end
    end

    REFERENCE = /#{Entity::REFERENCE}/
    # Returns the string value of this text node.  This string is always
    # escaped, meaning that it is a valid XML text node string, and all
    # entities that can be escaped, have been inserted.  This method respects
    # the entity filter set in the constructor.
    #
    #   # Assume that the entity "s" is defined to be "sean", and that the
    #   # entity "r" is defined to be "russell"
    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
    #   t.to_s   #-> "&lt; &amp; &s; russell"
    #   t = Text.new( "< & &s; russell", false, nil, false )
    #   t.to_s   #-> "&lt; &amp; &s; russell"
    #   u = Text.new( "sean russell", false, nil, true )
    #   u.to_s   #-> "sean russell"
    def to_s
      return @string if @raw
      # Cached until the content changes (see clear_cache).
      @normalized ||= Text::normalize( @string, doctype, @entity_filter )
    end

    # Returns the inspect form of the underlying, unprocessed string.
    def inspect
      @string.inspect
    end

    # Returns the string value of this text.  This is the text without
    # entities, as it might be used programmatically, or printed to the
    # console.  This ignores the 'raw' attribute setting, and any
    # entity_filter.
    #
    #   # Assume that the entity "s" is defined to be "sean", and that the
    #   # entity "r" is defined to be "russell"
    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
    #   t.value   #-> "< & sean russell"
    #   t = Text.new( "&lt; &amp; &s; russell", false, nil, false )
    #   t.value   #-> "< & sean russell"
    #   u = Text.new( "sean russell", false, nil, true )
    #   u.value   #-> "sean russell"
    def value
      # Cached until the content changes (see clear_cache).
      @unnormalized ||= Text::unnormalize( @string, doctype )
    end

    # Sets the contents of this text node.  This expects the text to be
    # unnormalized.  The +raw+ flag of the node is reset to false.  (As with
    # any Ruby assignment, the expression evaluates to +val+, not to self.)
    #
    #   e = Element.new( "a" )
    #   e.add_text( "foo" )   # <a>foo</a>
    #   e[0].value = "bar"    # <a>bar</a>
    #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
    def value=( val )
      @string = val.gsub( /\r\n?/, "\n" )
      clear_cache
      @raw = false
    end

    # Recursively wraps +string+ so that lines break at a space at or before
    # +width+.
    #
    # +string+ the text to wrap
    # +width+ the cutoff position
    # +addnewline+ (false) if true, a newline is also put in front of the
    # first line
    #
    # A leading word longer than +width+ cannot be broken: the break is then
    # made at the first space after the cutoff, and text without any space
    # is returned unchanged.
    def wrap(string, width, addnewline=false)
      return string if string.length <= width
      # Position of the last ' ' at or before the cutoff.  rindex returns nil
      # when there is none, which used to raise a TypeError below.
      place = string.rindex(' ', width) || string.index(' ', width)
      return string unless place
      if addnewline then
        return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
      else
        return string[0,place] + "\n" + wrap(string[place+1..-1], width)
      end
    end

    # Prefixes every line of +string+ with +style+ repeated +level+ times and
    # returns the result as a new string.
    #
    # +string+ the text to indent; returned unchanged if +level+ is negative
    # +level+ (1) how many times +style+ is repeated
    # +style+ ("\t") the indentation unit
    # +indentfirstline+ (true) if false, the result is stripped of leading
    # and trailing whitespace
    #
    # NOTE(review): trailing whitespace, including the line terminator, is
    # removed from each line before it is appended, so the indented lines are
    # joined without newlines -- confirm whether that is intended.
    def indent_text(string, level=1, style="\t", indentfirstline=true)
      return string if level < 0
      new_string = ''
      string.each_line { |line|
        indent_string = style * level
        new_line = (indent_string + line).sub(/[\s]+$/,'')
        new_string << new_line
      }
      new_string.strip! unless indentfirstline
      return new_string
    end

    # == DEPRECATED
    # See REXML::Formatters
    #
    # Emits a deprecation warning, then writes this node to +writer+ with
    # REXML::Formatters::Pretty when +indent+ is greater than -1 and with
    # REXML::Formatters::Default otherwise.  +transitive+ and +ie_hack+ are
    # accepted but not used.
    def write( writer, indent=-1, transitive=false, ie_hack=false )
      Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1)
      formatter = if indent > -1
          REXML::Formatters::Pretty.new( indent )
        else
          REXML::Formatters::Default.new
        end
      formatter.write( self, writer )
    end

    # FIXME
    # This probably won't work properly
    #
    # Returns the xpath of the parent with "/text()" appended.  Requires a
    # parent to be set.
    def xpath
      path = @parent.xpath
      path += "/text()"
      return path
    end

    # Writes out text, substituting special characters beforehand.
    # +out+ A String, IO, or any other object supporting <<( String )
    # +input+ the text to substitute and then write out
    #
    #   z=utf8.unpack("U*")
    #   ascOut=""
    #   z.each{|r|
    #     if r <  0x100
    #       ascOut.concat(r.chr)
    #     else
    #       ascOut.concat(sprintf("&#x%x;", r))
    #     end
    #   }
    #   puts ascOut
    def write_with_substitution out, input
      # Work on a copy so that +input+ itself is not modified.
      copy = input.clone
      # Doing it like this rather than in a loop improves the speed
      copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
      copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
      copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
      copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
      copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
      copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
      out << copy
    end

    private
    # Discards the cached results of #to_s and #value.
    def clear_cache
      @normalized = nil
      @unnormalized = nil
    end

    # NOTE: +private+ above only affects instance methods; the singleton
    # methods defined below (Text::read_with_substitution, Text::normalize,
    # Text::unnormalize, Text.expand) remain publicly callable.

    # Reads text, substituting entities
    #
    # +input+ the text to read; it is not modified
    # +illegal+ (nil) if given, a pattern that must not match +input+;
    # a ParseException is raised when it does
    #
    # Returns a copy of +input+ with line endings converted to "\n" and, if
    # it contains an '&', with the SETUTITSBUS references and numeric
    # character references replaced by the characters they stand for.
    def Text::read_with_substitution( input, illegal=nil )
      copy = input.clone

      if copy =~ illegal
        raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
      end if illegal

      copy.gsub!( /\r\n?/, "\n" )
      if copy.include? ?&
        copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
        copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
        copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
        copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
        copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
        copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {
          m=$1
          #m='0' if m==''
          # Turn "x1f" into "0x1f" so that Integer() parses it as hexadecimal.
          m = "0#{m}" if m[0] == ?x
          [Integer(m)].pack('U*')
        }
      end
      copy
    end

    EREFERENCE = /&(?!#{Entity::NAME};)/
    # Escapes all possible entities
    #
    # +input+ the text to escape; converted with to_s
    # +doctype+ (nil) if given, its entities are used; otherwise
    # DocType::DEFAULT_ENTITIES are used
    # +entity_filter+ (nil) names of doctype entities that are skipped
    #
    # Returns a new string in which every '&' has been replaced by '&amp;'
    # and then every occurrence of an entity's value has been replaced by a
    # reference to that entity.
    def Text::normalize( input, doctype=nil, entity_filter=nil )
      copy = input.to_s
      # Doing it like this rather than in a loop improves the speed
      #copy = copy.gsub( EREFERENCE, '&amp;' )
      copy = copy.gsub( "&", "&amp;" )
      if doctype
        # Replace each entity value with a reference to the entity, except
        # for entities without a value or named in +entity_filter+
        doctype.entities.each_value do |entity|
          copy = copy.gsub( entity.value,
            "&#{entity.name};" ) if entity.value and
              not( entity_filter and entity_filter.include?(entity.name) )
        end
      else
        # Replace each default entity value with a reference to the entity
        DocType::DEFAULT_ENTITIES.each_value do |entity|
          copy = copy.gsub(entity.value, "&#{entity.name};" )
        end
      end
      copy
    end

    # Unescapes all possible entities
    #
    # +string+ the text to unescape
    # +doctype+ (nil) passed on to Text.expand to resolve entity names
    # +filter+ (nil) names of entities that are left unexpanded
    # +illegal+ (nil) accepted but not used
    #
    # Returns a new string with line endings converted to "\n" and every
    # match of REFERENCE replaced by its expansion.  Raises a RuntimeError
    # when the accumulated size of the expansions, in bytes, exceeds
    # Security.entity_expansion_text_limit.
    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
      sum = 0
      string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
        s = Text.expand($&, doctype, filter)
        if sum + s.bytesize > Security.entity_expansion_text_limit
          raise "entity expansion has grown too large"
        else
          sum += s.bytesize
        end
        s
      }
    end

    # Expands a single reference.
    #
    # +ref+ the complete reference text, from the leading '&' to the
    # trailing ';'
    # +doctype+ if given, used to look up named entities
    # +filter+ if given, names of entities that are returned unexpanded
    #
    # Numeric references are converted to the character with that code
    # point, and '&amp;' to '&'.  Other names are looked up in +doctype+ or,
    # without one, in DocType::DEFAULT_ENTITIES; +ref+ itself is returned
    # when the lookup finds nothing.
    def Text.expand(ref, doctype, filter)
      if ref[1] == ?#
        if ref[2] == ?x
          [ref[3...-1].to_i(16)].pack('U*')
        else
          [ref[2...-1].to_i].pack('U*')
        end
      elsif ref == '&amp;'
        '&'
      elsif filter and filter.include?( ref[1...-1] )
        ref
      elsif doctype
        doctype.entity( ref[1...-1] ) or ref
      else
        entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
        entity_value ? entity_value.value : ref
      end
    end
  end
end