1 /* 2 * File : SimpleXMLParserDocumentImpl.java 3 * Created : 5 Oct. 2003 4 * By : Parg 5 * 6 * Azureus - a Java Bittorrent client 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details ( see the LICENSE file ). 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with this program; if not, write to the Free Software 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 */ 22 23 package org.gudy.azureus2.pluginsimpl.local.utils.xml.simpleparser; 24 25 import javax.xml.parsers.*; 26 27 import org.xml.sax.*; 28 import org.apache.commons.lang.Entities; 29 import org.gudy.azureus2.core3.util.AENetworkClassifier; 30 import org.gudy.azureus2.core3.util.Constants; 31 import org.gudy.azureus2.core3.util.Debug; 32 import org.gudy.azureus2.core3.util.FileUtil; 33 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocument; 34 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentAttribute; 35 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentException; 36 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentNode; 37 import org.w3c.dom.*; 38 39 import com.aelitis.azureus.core.util.UncloseableInputStream; 40 41 import java.io.*; 42 import java.net.InetAddress; 43 import java.net.URL; 44 import java.net.URLConnection; 45 import java.net.UnknownHostException; 46 import java.util.*; 47 48 public class 49 SimpleXMLParserDocumentImpl 50 implements SimpleXMLParserDocument 51 { 52 private static DocumentBuilderFactory dbf_singleton; 53 54 private URL source_url; 55 56 private Document document; 57 private SimpleXMLParserDocumentNodeImpl root_node; 58 59 60 public SimpleXMLParserDocumentImpl( File file )61 SimpleXMLParserDocumentImpl( 62 File file ) 63 64 throws SimpleXMLParserDocumentException 65 { 66 try{ 67 68 create( new FileInputStream( file )); 69 70 }catch( Throwable e ){ 71 72 throw( new SimpleXMLParserDocumentException( e )); 73 } 74 } 75 76 public SimpleXMLParserDocumentImpl( String data )77 SimpleXMLParserDocumentImpl( 78 String data ) 79 80 throws SimpleXMLParserDocumentException 81 { 82 try{ 83 create( new ByteArrayInputStream( data.getBytes( Constants.DEFAULT_ENCODING ))); 84 85 }catch( UnsupportedEncodingException e ){ 86 87 } 88 } 89 90 /** 91 * @deprecated 92 * @param _input_stream 93 * @throws SimpleXMLParserDocumentException 94 */ 95 public SimpleXMLParserDocumentImpl( InputStream _input_stream )96 SimpleXMLParserDocumentImpl( 97 InputStream _input_stream ) 98 99 throws SimpleXMLParserDocumentException 100 { 101 this( null, _input_stream ); 102 } 103 104 public SimpleXMLParserDocumentImpl( URL _source_url, InputStream _input_stream )105 SimpleXMLParserDocumentImpl( 106 URL _source_url, 107 InputStream _input_stream ) 108 109 throws SimpleXMLParserDocumentException 110 { 111 source_url = _source_url; 112 113 create( _input_stream ); 114 } 115 116 protected static synchronized DocumentBuilderFactory getDBF()117 getDBF() 118 { 119 // getting the factory involves a fait bit of work - cache it 120 121 if ( dbf_singleton == null ){ 122 123 dbf_singleton = DocumentBuilderFactory.newInstance(); 124 125 // Set namespaceAware to true to get a DOM Level 2 tree with nodes 126 // containing namesapce information. This is necessary because the 127 // default value from JAXP 1.0 was defined to be false. 128 129 dbf_singleton.setNamespaceAware(true); 130 131 // Set the validation mode to either: no validation, DTD 132 // validation, or XSD validation 133 134 dbf_singleton.setValidating( false ); 135 136 // Optional: set various configuration options 137 138 dbf_singleton.setIgnoringComments(true); 139 dbf_singleton.setIgnoringElementContentWhitespace(true); 140 dbf_singleton.setCoalescing(true); 141 142 // The opposite of creating entity ref nodes is expanding them inline 143 // NOTE that usage of, e.g. "&" in text results in an entity ref. e.g. 144 // if ("BUY".equals (type) " 145 // ENT_REF: nodeName="amp" 146 // TEXT: nodeName="#text" nodeValue="&" 147 148 dbf_singleton.setExpandEntityReferences(true); 149 } 150 151 return( dbf_singleton ); 152 } 153 154 private void create( InputStream _input_stream )155 create( 156 InputStream _input_stream ) 157 158 throws SimpleXMLParserDocumentException 159 { 160 // make sure we can mark the stream to permit later recovery if needed 161 162 if ( !_input_stream.markSupported()){ 163 164 _input_stream = new BufferedInputStream( _input_stream ); 165 } 166 167 _input_stream.mark( 100*1024 ); 168 169 // prevent the parser from screwing with our stream by closing it 170 171 UncloseableInputStream uc_is = new UncloseableInputStream( _input_stream ); 172 173 SimpleXMLParserDocumentException error = null; 174 175 try{ 176 createSupport( uc_is ); 177 178 }catch( SimpleXMLParserDocumentException e ){ 179 180 String msg = Debug.getNestedExceptionMessage( e ); 181 182 if ( ( msg.contains( "entity" ) && msg.contains( "was referenced" )) || 183 msg.contains( "entity reference" )){ 184 185 try{ 186 // nasty hack to try and handle HTML entities that some annoying feeds include :( 187 188 _input_stream.reset(); 189 190 createSupport( new EntityFudger( _input_stream )); 191 192 return; 193 194 }catch( Throwable f ){ 195 196 if ( f instanceof SimpleXMLParserDocumentException ){ 197 198 error = (SimpleXMLParserDocumentException)f; 199 } 200 } 201 } 202 203 if ( error == null ){ 204 205 error = e; 206 } 207 208 throw( error ); 209 210 }finally{ 211 212 if ( Constants.isCVSVersion() && error != null ){ 213 214 try{ 215 _input_stream.reset(); 216 217 String stuff = FileUtil.readInputStreamAsStringWithTruncation( _input_stream, 2014 ); 218 219 Debug.out( "RSS parsing failed for '" + stuff + "': " + Debug.getExceptionMessage( error )); 220 221 }catch( Throwable e ){ 222 } 223 } 224 try{ 225 _input_stream.close(); 226 227 }catch( Throwable e ){ 228 } 229 } 230 } 231 232 private void createSupport( InputStream input_stream )233 createSupport( 234 InputStream input_stream ) 235 236 throws SimpleXMLParserDocumentException 237 { 238 try{ 239 DocumentBuilderFactory dbf = getDBF(); 240 241 // Step 2: create a DocumentBuilder that satisfies the constraints 242 // specified by the DocumentBuilderFactory 243 244 DocumentBuilder db = dbf.newDocumentBuilder(); 245 246 // Set an ErrorHandler before parsing 247 248 OutputStreamWriter errorWriter = new OutputStreamWriter(System.err); 249 250 MyErrorHandler error_handler = new MyErrorHandler(new PrintWriter(errorWriter, true)); 251 252 db.setErrorHandler( error_handler ); 253 254 db.setEntityResolver( 255 new EntityResolver() 256 { 257 public InputSource 258 resolveEntity( 259 String publicId, String systemId ) 260 { 261 // System.out.println( publicId + ", " + systemId ); 262 263 // handle bad DTD external refs 264 265 try{ 266 URL url = new URL( systemId ); 267 268 if ( source_url != null ){ 269 270 String net = AENetworkClassifier.categoriseAddress( source_url.getHost()); 271 272 if ( net != AENetworkClassifier.AT_PUBLIC ){ 273 274 if ( AENetworkClassifier.categoriseAddress( url.getHost()) != net ){ 275 276 return new InputSource( new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes())); 277 } 278 } 279 } 280 281 String host = url.getHost(); 282 283 InetAddress.getByName( host ); 284 285 // try connecting too as connection-refused will also bork XML parsing 286 287 InputStream is = null; 288 289 try{ 290 URLConnection con = url.openConnection(); 291 292 con.setConnectTimeout( 15*1000 ); 293 con.setReadTimeout( 15*1000 ); 294 295 is = con.getInputStream(); 296 297 byte[] buffer = new byte[32]; 298 299 int pos = 0; 300 301 while( pos < buffer.length ){ 302 303 int len = is.read( buffer, pos, buffer.length - pos ); 304 305 if ( len <= 0 ){ 306 307 break; 308 } 309 310 pos += len; 311 } 312 313 String str = new String( buffer, "UTF-8" ).trim().toLowerCase( Locale.US ); 314 315 if ( !str.contains( "<?xml" )){ 316 317 // not straightforward to check for naked DTDs, could be lots of <!-- commentry preamble which of course can occur 318 // in HTML too 319 320 buffer = new byte[32000]; 321 322 pos = 0; 323 324 while( pos < buffer.length ){ 325 326 int len = is.read( buffer, pos, buffer.length - pos ); 327 328 if ( len <= 0 ){ 329 330 break; 331 } 332 333 pos += len; 334 } 335 336 str += new String( buffer, "UTF-8" ).trim().toLowerCase( Locale.US ); 337 338 if ( str.contains( "<html") && str.contains( "<head" )){ 339 340 throw( new Exception( "Bad DTD" )); 341 } 342 } 343 }catch( Throwable e ){ 344 345 return new InputSource( new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes())); 346 347 }finally{ 348 349 if ( is != null ){ 350 351 try{ 352 is.close(); 353 354 }catch( Throwable e){ 355 356 } 357 } 358 } 359 return( null ); 360 361 }catch( UnknownHostException e ){ 362 363 return new InputSource( new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes())); 364 365 }catch( Throwable e ){ 366 367 return( null ); 368 } 369 } 370 }); 371 372 // Step 3: parse the input file 373 374 document = db.parse( input_stream ); 375 376 SimpleXMLParserDocumentNodeImpl[] root_nodes = parseNode( document, false ); 377 378 int root_node_count = 0; 379 380 // remove any processing instructions such as <?xml-stylesheet 381 382 for (int i=0;i<root_nodes.length;i++){ 383 384 SimpleXMLParserDocumentNodeImpl node = root_nodes[i]; 385 386 if ( node.getNode().getNodeType() != Node.PROCESSING_INSTRUCTION_NODE ){ 387 388 root_node = node; 389 390 root_node_count++; 391 } 392 } 393 394 if ( root_node_count != 1 ){ 395 396 throw( new SimpleXMLParserDocumentException( "invalid document - " + root_nodes.length + " root elements" )); 397 } 398 399 }catch( Throwable e ){ 400 401 throw( new SimpleXMLParserDocumentException( e )); 402 } 403 } 404 405 public String getName()406 getName() 407 { 408 return( root_node.getName()); 409 } 410 411 public String getFullName()412 getFullName() 413 { 414 return( root_node.getFullName()); 415 } 416 417 public String getNameSpaceURI()418 getNameSpaceURI() 419 { 420 return( root_node.getNameSpaceURI()); 421 } 422 423 public String getValue()424 getValue() 425 { 426 return( root_node.getValue()); 427 } 428 429 public SimpleXMLParserDocumentNode[] getChildren()430 getChildren() 431 { 432 return( root_node.getChildren()); 433 } 434 public SimpleXMLParserDocumentNode getChild( String name )435 getChild( 436 String name ) 437 { 438 return( root_node.getChild(name)); 439 } 440 441 public SimpleXMLParserDocumentAttribute[] getAttributes()442 getAttributes() 443 { 444 return( root_node.getAttributes()); 445 } 446 public SimpleXMLParserDocumentAttribute getAttribute( String name )447 getAttribute( 448 String name ) 449 { 450 return( root_node.getAttribute(name)); 451 } 452 453 public void print()454 print() 455 { 456 PrintWriter pw = new PrintWriter( System.out ); 457 458 print( pw ); 459 460 pw.flush(); 461 } 462 463 public void print( PrintWriter pw )464 print( 465 PrintWriter pw ) 466 { 467 root_node.print( pw, "" ); 468 } 469 470 // idea is to flatten out any unwanted structure. We just want the resultant 471 // tree to have nodes for each nesting element and leaves denoting name/value bits 472 473 protected SimpleXMLParserDocumentNodeImpl[] parseNode( Node node, boolean skip_this_node )474 parseNode( 475 Node node, 476 boolean skip_this_node ) 477 { 478 int type = node.getNodeType(); 479 480 if ( ( type == Node.ELEMENT_NODE || 481 type == Node.PROCESSING_INSTRUCTION_NODE )&& !skip_this_node ){ 482 483 return( new SimpleXMLParserDocumentNodeImpl[]{ new SimpleXMLParserDocumentNodeImpl( this, node )}); 484 } 485 486 Vector v = new Vector(); 487 488 for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()){ 489 490 SimpleXMLParserDocumentNodeImpl[] kids = parseNode( child, false ); 491 492 for (int i=0;i<kids.length;i++){ 493 494 v.addElement(kids[i]); 495 } 496 } 497 498 SimpleXMLParserDocumentNodeImpl[] res = new SimpleXMLParserDocumentNodeImpl[v.size()]; 499 500 v.copyInto( res ); 501 502 return( res ); 503 } 504 505 private static class MyErrorHandler implements ErrorHandler { 506 /** Error handler output goes here */ 507 //private PrintWriter out; 508 MyErrorHandler(PrintWriter out)509 MyErrorHandler(PrintWriter out) { 510 //this.out = out; 511 } 512 513 /** 514 * Returns a string describing parse exception details 515 */ getParseExceptionInfo(SAXParseException spe)516 private String getParseExceptionInfo(SAXParseException spe) { 517 String systemId = spe.getSystemId(); 518 if (systemId == null) { 519 systemId = "null"; 520 } 521 String info = "URI=" + systemId + 522 " Line=" + spe.getLineNumber() + 523 ": " + spe.getMessage(); 524 return info; 525 } 526 527 // The following methods are standard SAX ErrorHandler methods. 528 // See SAX documentation for more info. 529 530 public void warning( SAXParseException spe )531 warning( 532 SAXParseException spe ) 533 534 throws SAXException 535 { 536 // out.println("Warning: " + getParseExceptionInfo(spe)); 537 } 538 539 public void error( SAXParseException spe )540 error( 541 SAXParseException spe ) 542 543 throws SAXException 544 { 545 String message = "Error: " + getParseExceptionInfo(spe); 546 547 throw new SAXException(message); 548 } 549 550 public void fatalError( SAXParseException spe )551 fatalError( 552 SAXParseException spe ) 553 554 throws SAXException 555 { 556 String message = "Fatal Error: " + getParseExceptionInfo(spe); 557 558 throw new SAXException(message,spe); 559 } 560 } 561 562 private static class 563 EntityFudger 564 extends InputStream 565 { 566 private InputStream is; 567 568 char[] buffer = new char[16]; 569 int buffer_pos = 0; 570 571 char[] insertion = new char[16]; 572 int insertion_pos = 0; 573 int insertion_len = 0; 574 575 public EntityFudger( InputStream _is )576 EntityFudger( 577 InputStream _is ) 578 { 579 is = _is; 580 } 581 582 @Override 583 public int read()584 read() 585 throws IOException 586 { 587 if ( insertion_len > 0 ){ 588 589 int result = insertion[ insertion_pos++ ]&0xff; 590 591 if ( insertion_pos == insertion_len ){ 592 593 insertion_pos = 0; 594 insertion_len = 0; 595 } 596 597 return( result ); 598 } 599 600 while( true ){ 601 602 int b = is.read(); 603 604 if ( b < 0 ){ 605 606 // end of file 607 608 if ( buffer_pos == 0 ){ 609 610 return( b ); 611 612 }else if ( buffer_pos == 1 ){ 613 614 buffer_pos = 0; 615 616 return( buffer[0]&0xff ); 617 618 }else{ 619 620 System.arraycopy( buffer, 1, insertion, 0, buffer_pos - 1 ); 621 622 insertion_len = buffer_pos - 1; 623 insertion_pos = 0; 624 625 buffer_pos = 0; 626 627 return( buffer[0]&0xff ); 628 } 629 } 630 631 // normal byte 632 633 if ( buffer_pos == 0 ){ 634 635 if ( b == '&' ){ 636 637 buffer[ buffer_pos++ ] = (char)b; 638 639 }else{ 640 641 return( b ); 642 } 643 644 }else{ 645 646 if ( buffer_pos == buffer.length-1 ){ 647 648 // buffer's full, give up 649 650 buffer[ buffer_pos++ ] = (char)b; 651 652 System.arraycopy( buffer, 0, insertion, 0, buffer_pos ); 653 654 buffer_pos = 0; 655 insertion_pos = 0; 656 insertion_len = buffer_pos; 657 658 return( insertion[insertion_pos++] ); 659 660 }else{ 661 662 if ( b == ';' ){ 663 664 // got some kind of reference mebe 665 666 buffer[ buffer_pos++ ] = (char)b; 667 668 String ref = new String( buffer, 1, buffer_pos-2 ).toLowerCase( Locale.US ); 669 670 String replacement; 671 672 if ( ref.equals( "amp") || 673 ref.equals( "lt" ) || 674 ref.equals( "gt" ) || 675 ref.equals( "quot" ) || 676 ref.equals( "apos" ) || 677 ref.startsWith( "#" )){ 678 679 replacement = new String( buffer, 0, buffer_pos ); 680 681 }else{ 682 683 int num = Entities.HTML40.entityValue( ref ); 684 685 if ( num != -1 ){ 686 687 replacement = "&#" + num + ";"; 688 689 }else{ 690 691 replacement = new String( buffer, 0, buffer_pos ); 692 } 693 } 694 695 char[] chars = replacement.toCharArray(); 696 697 System.arraycopy( chars, 0, insertion, 0, chars.length ); 698 699 buffer_pos = 0; 700 insertion_pos = 0; 701 insertion_len = chars.length; 702 703 return( insertion[insertion_pos++] ); 704 705 }else{ 706 707 buffer[ buffer_pos++ ] = (char)b; 708 709 char c = (char)b; 710 711 if ( !Character.isLetterOrDigit( c )){ 712 713 // handle naked & 714 715 if ( buffer_pos == 2 && buffer[0] == '&'){ 716 717 char[] chars = "&".toCharArray(); 718 719 System.arraycopy( chars, 0, insertion, 0, chars.length ); 720 721 buffer_pos = 0; 722 insertion_pos = 0; 723 insertion_len = chars.length; 724 725 // don't forget the char we just read 726 727 insertion[insertion_len++] = (char)b; 728 729 return( insertion[insertion_pos++] ); 730 731 }else{ 732 733 // not a valid entity reference 734 735 System.arraycopy( buffer, 0, insertion, 0, buffer_pos ); 736 737 buffer_pos = 0; 738 insertion_pos = 0; 739 insertion_len = buffer_pos; 740 741 return( insertion[insertion_pos++] ); 742 } 743 } 744 } 745 } 746 } 747 } 748 } 749 750 public void close()751 close() 752 753 throws IOException 754 { 755 is.close(); 756 } 757 758 public long skip( long n )759 skip( 760 long n ) 761 762 throws IOException 763 { 764 // meh, vague attempt here 765 766 if ( insertion_len > 0 ){ 767 768 // buffer is currently empty, shove remaining into buffer to unify processing 769 770 int rem = insertion_len - insertion_pos; 771 772 System.arraycopy( insertion, insertion_pos, buffer, 0, rem ); 773 774 insertion_pos = 0; 775 insertion_len = 0; 776 777 buffer_pos = rem; 778 } 779 780 if ( n <= buffer_pos ){ 781 782 // skip is <= buffer contents 783 784 int rem = buffer_pos - (int)n; 785 786 System.arraycopy( buffer, (int)n, insertion, 0, rem ); 787 788 insertion_pos = 0; 789 insertion_len = rem; 790 791 return( n ); 792 } 793 794 int to_skip = buffer_pos; 795 796 buffer_pos = 0; 797 798 return( is.skip( n - to_skip ) + to_skip ); 799 } 800 801 public int available()802 available() 803 804 throws IOException 805 { 806 return( buffer_pos + is.available()); 807 } 808 } 809 } 810