1 /*
2  * File    : SimpleXMLParserDocumentImpl.java
3  * Created : 5 Oct. 2003
4  * By      : Parg
5  *
6  * Azureus - a Java Bittorrent client
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details ( see the LICENSE file ).
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21  */
22 
23 package org.gudy.azureus2.pluginsimpl.local.utils.xml.simpleparser;
24 
25 import javax.xml.parsers.*;
26 
27 import org.xml.sax.*;
28 import org.apache.commons.lang.Entities;
29 import org.gudy.azureus2.core3.util.AENetworkClassifier;
30 import org.gudy.azureus2.core3.util.Constants;
31 import org.gudy.azureus2.core3.util.Debug;
32 import org.gudy.azureus2.core3.util.FileUtil;
33 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocument;
34 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentAttribute;
35 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentException;
36 import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentNode;
37 import org.w3c.dom.*;
38 
39 import com.aelitis.azureus.core.util.UncloseableInputStream;
40 
41 import java.io.*;
42 import java.net.InetAddress;
43 import java.net.URL;
44 import java.net.URLConnection;
45 import java.net.UnknownHostException;
46 import java.util.*;
47 
48 public class
49 SimpleXMLParserDocumentImpl
50 	implements SimpleXMLParserDocument
51 {
52 	private static DocumentBuilderFactory 		dbf_singleton;
53 
54 	private URL			source_url;
55 
56 	private Document						document;
57 	private SimpleXMLParserDocumentNodeImpl	root_node;
58 
59 
60 	public
SimpleXMLParserDocumentImpl( File file )61 	SimpleXMLParserDocumentImpl(
62 		File		file )
63 
64 		throws SimpleXMLParserDocumentException
65 	{
66 		try{
67 
68 			create( new FileInputStream( file ));
69 
70 		}catch( Throwable e ){
71 
72 			throw( new SimpleXMLParserDocumentException( e ));
73 		}
74 	}
75 
76 	public
SimpleXMLParserDocumentImpl( String data )77 	SimpleXMLParserDocumentImpl(
78 		String		data )
79 
80 		throws SimpleXMLParserDocumentException
81 	{
82 		try{
83 			create( new ByteArrayInputStream( data.getBytes( Constants.DEFAULT_ENCODING )));
84 
85 		}catch( UnsupportedEncodingException e ){
86 
87 		}
88 	}
89 
90 	/**
91 	 * @deprecated
92 	 * @param _input_stream
93 	 * @throws SimpleXMLParserDocumentException
94 	 */
95 	public
SimpleXMLParserDocumentImpl( InputStream _input_stream )96 	SimpleXMLParserDocumentImpl(
97 		InputStream		_input_stream )
98 
99 		throws SimpleXMLParserDocumentException
100 	{
101 		this( null, _input_stream );
102 	}
103 
104 	public
SimpleXMLParserDocumentImpl( URL _source_url, InputStream _input_stream )105 	SimpleXMLParserDocumentImpl(
106 		URL				_source_url,
107 		InputStream		_input_stream )
108 
109 		throws SimpleXMLParserDocumentException
110 	{
111 		source_url		= _source_url;
112 
113 		create( _input_stream );
114 	}
115 
116 	protected static synchronized DocumentBuilderFactory
getDBF()117 	getDBF()
118 	{
119 			// getting the factory involves a fait bit of work - cache it
120 
121 		if ( dbf_singleton == null ){
122 
123 			dbf_singleton = DocumentBuilderFactory.newInstance();
124 
125 			// Set namespaceAware to true to get a DOM Level 2 tree with nodes
126 			// containing namesapce information.  This is necessary because the
127 			// default value from JAXP 1.0 was defined to be false.
128 
129 			dbf_singleton.setNamespaceAware(true);
130 
131 			// Set the validation mode to either: no validation, DTD
132 			// validation, or XSD validation
133 
134 			dbf_singleton.setValidating( false );
135 
136 			// Optional: set various configuration options
137 
138 			dbf_singleton.setIgnoringComments(true);
139 			dbf_singleton.setIgnoringElementContentWhitespace(true);
140 			dbf_singleton.setCoalescing(true);
141 
142 			// The opposite of creating entity ref nodes is expanding them inline
143 			// NOTE that usage of, e.g. "&" in text results in an entity ref. e.g.
144 			//	if ("BUY".equals (type) "
145 			//		ENT_REF: nodeName="amp"
146 			//		TEXT: nodeName="#text" nodeValue="&"
147 
148 			dbf_singleton.setExpandEntityReferences(true);
149 		}
150 
151 		return( dbf_singleton );
152 	}
153 
154 	private void
create( InputStream _input_stream )155 	create(
156 		InputStream		_input_stream )
157 
158 		throws SimpleXMLParserDocumentException
159 	{
160 			// make sure we can mark the stream to permit later recovery if needed
161 
162 		if ( !_input_stream.markSupported()){
163 
164 			_input_stream = new BufferedInputStream( _input_stream );
165 		}
166 
167 		_input_stream.mark( 100*1024 );
168 
169 			// prevent the parser from screwing with our stream by closing it
170 
171 		UncloseableInputStream	uc_is = new UncloseableInputStream( _input_stream );
172 
173 		SimpleXMLParserDocumentException error = null;
174 
175 		try{
176 			createSupport( uc_is );
177 
178 		}catch( SimpleXMLParserDocumentException e ){
179 
180 			String msg = Debug.getNestedExceptionMessage( e );
181 
182 			if (	( msg.contains( "entity" ) && msg.contains( "was referenced" )) ||
183 					msg.contains( "entity reference" )){
184 
185 				try{
186 						// nasty hack to try and handle HTML entities that some annoying feeds include :(
187 
188 					_input_stream.reset();
189 
190 					createSupport( new EntityFudger( _input_stream ));
191 
192 					return;
193 
194 				}catch( Throwable f ){
195 
196 					if ( f instanceof SimpleXMLParserDocumentException ){
197 
198 						error = (SimpleXMLParserDocumentException)f;
199 					}
200 				}
201 			}
202 
203 			if ( error == null ){
204 
205 				error = e;
206 			}
207 
208 			throw( error );
209 
210 		}finally{
211 
212 			if ( Constants.isCVSVersion() && error != null ){
213 
214 				try{
215 					_input_stream.reset();
216 
217 					String stuff = FileUtil.readInputStreamAsStringWithTruncation( _input_stream, 2014 );
218 
219 					Debug.out( "RSS parsing failed for '" + stuff + "': " + Debug.getExceptionMessage( error ));
220 
221 				}catch( Throwable e ){
222 				}
223 			}
224 			try{
225 				_input_stream.close();
226 
227 			}catch( Throwable e ){
228 			}
229 		}
230 	}
231 
232 	private void
createSupport( InputStream input_stream )233 	createSupport(
234 		InputStream		input_stream )
235 
236 		throws SimpleXMLParserDocumentException
237 	{
238 		try{
239 			DocumentBuilderFactory dbf = getDBF();
240 
241 			// Step 2: create a DocumentBuilder that satisfies the constraints
242 			// specified by the DocumentBuilderFactory
243 
244 			DocumentBuilder db = dbf.newDocumentBuilder();
245 
246 			// Set an ErrorHandler before parsing
247 
248 			OutputStreamWriter errorWriter = new OutputStreamWriter(System.err);
249 
250 			MyErrorHandler error_handler = new MyErrorHandler(new PrintWriter(errorWriter, true));
251 
252 			db.setErrorHandler( error_handler );
253 
254 			db.setEntityResolver(
255 				new EntityResolver()
256 				{
257 					public InputSource
258 					resolveEntity(
259 						String publicId, String systemId )
260 					{
261 						// System.out.println( publicId + ", " + systemId );
262 
263 						// handle bad DTD external refs
264 
265 						try{
266 							URL url  = new URL( systemId );
267 
268 							if ( source_url != null ){
269 
270 								String net = AENetworkClassifier.categoriseAddress( source_url.getHost());
271 
272 								if ( net != AENetworkClassifier.AT_PUBLIC ){
273 
274 									if ( AENetworkClassifier.categoriseAddress( url.getHost()) != net ){
275 
276 										return new InputSource(	new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes()));
277 									}
278 								}
279 							}
280 
281 							String host = url.getHost();
282 
283 							InetAddress.getByName( host );
284 
285 								// try connecting too as connection-refused will also bork XML parsing
286 
287 							InputStream is = null;
288 
289 							try{
290 								URLConnection con = url.openConnection();
291 
292 								con.setConnectTimeout( 15*1000 );
293 								con.setReadTimeout( 15*1000 );
294 
295 								is = con.getInputStream();
296 
297 								byte[]	buffer = new byte[32];
298 
299 								int	pos = 0;
300 
301 								while( pos < buffer.length ){
302 
303 									int len = is.read( buffer, pos, buffer.length - pos );
304 
305 									if ( len <= 0 ){
306 
307 										break;
308 									}
309 
310 									pos += len;
311 								}
312 
313 								String str = new String( buffer, "UTF-8" ).trim().toLowerCase( Locale.US );
314 
315 								if ( !str.contains( "<?xml" )){
316 
317 										// not straightforward to check for naked DTDs, could be lots of <!-- commentry preamble which of course can occur
318 										// in HTML too
319 
320 									buffer = new byte[32000];
321 
322 									pos = 0;
323 
324 									while( pos < buffer.length ){
325 
326 										int len = is.read( buffer, pos, buffer.length - pos );
327 
328 										if ( len <= 0 ){
329 
330 											break;
331 										}
332 
333 										pos += len;
334 									}
335 
336 									str += new String( buffer, "UTF-8" ).trim().toLowerCase( Locale.US );
337 
338 									if ( str.contains( "<html") && str.contains( "<head" )){
339 
340 										throw( new Exception( "Bad DTD" ));
341 									}
342 								}
343 							}catch( Throwable e ){
344 
345 								return new InputSource(	new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes()));
346 
347 							}finally{
348 
349 								if ( is != null ){
350 
351 									try{
352 										is.close();
353 
354 									}catch( Throwable e){
355 
356 									}
357 								}
358 							}
359 							return( null );
360 
361 						}catch( UnknownHostException e ){
362 
363 							return new InputSource(	new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes()));
364 
365 						}catch( Throwable e ){
366 
367 							return( null );
368 						}
369 					}
370 				});
371 
372 			// Step 3: parse the input file
373 
374 			document = db.parse( input_stream );
375 
376 			SimpleXMLParserDocumentNodeImpl[] root_nodes = parseNode( document, false );
377 
378 			int	root_node_count	= 0;
379 
380 				// remove any processing instructions such as <?xml-stylesheet
381 
382 			for (int i=0;i<root_nodes.length;i++){
383 
384 				SimpleXMLParserDocumentNodeImpl	node = root_nodes[i];
385 
386 				if ( node.getNode().getNodeType() != Node.PROCESSING_INSTRUCTION_NODE ){
387 
388 					root_node	= node;
389 
390 					root_node_count++;
391 				}
392 			}
393 
394 			if ( root_node_count != 1 ){
395 
396 				throw( new SimpleXMLParserDocumentException( "invalid document - " + root_nodes.length + " root elements" ));
397 			}
398 
399 		}catch( Throwable e ){
400 
401 			throw( new SimpleXMLParserDocumentException( e ));
402 		}
403 	}
404 
405 	public String
getName()406 	getName()
407 	{
408 		return( root_node.getName());
409 	}
410 
411 	public String
getFullName()412 	getFullName()
413 	{
414 		return( root_node.getFullName());
415 	}
416 
417 	public String
getNameSpaceURI()418 	getNameSpaceURI()
419 	{
420 		return( root_node.getNameSpaceURI());
421 	}
422 
423 	public String
getValue()424 	getValue()
425 	{
426 		return( root_node.getValue());
427 	}
428 
429 	public SimpleXMLParserDocumentNode[]
getChildren()430 	getChildren()
431 	{
432 		return( root_node.getChildren());
433 	}
434 	public SimpleXMLParserDocumentNode
getChild( String name )435 	getChild(
436 		String	name )
437 	{
438 		return( root_node.getChild(name));
439 	}
440 
441 	public SimpleXMLParserDocumentAttribute[]
getAttributes()442 	getAttributes()
443 	{
444 		return( root_node.getAttributes());
445 	}
446 	public SimpleXMLParserDocumentAttribute
getAttribute( String name )447 	getAttribute(
448 		String		name )
449 	{
450 		return( root_node.getAttribute(name));
451 	}
452 
453 	public void
print()454 	print()
455 	{
456 		PrintWriter	pw = new PrintWriter( System.out );
457 
458 		print( pw );
459 
460 		pw.flush();
461 	}
462 
463 	public void
print( PrintWriter pw )464 	print(
465 		PrintWriter	pw )
466 	{
467 		root_node.print( pw, "" );
468 	}
469 
470 		// idea is to flatten out any unwanted structure. We just want the resultant
471 		// tree to have nodes for each nesting element and leaves denoting name/value bits
472 
473 	protected SimpleXMLParserDocumentNodeImpl[]
parseNode( Node node, boolean skip_this_node )474 	parseNode(
475 		Node		node,
476 		boolean		skip_this_node )
477 	{
478         int type = node.getNodeType();
479 
480 		if ( (	type == Node.ELEMENT_NODE ||
481 				type == Node.PROCESSING_INSTRUCTION_NODE )&& !skip_this_node ){
482 
483 			return( new SimpleXMLParserDocumentNodeImpl[]{ new SimpleXMLParserDocumentNodeImpl( this, node )});
484 		}
485 
486 		Vector	v = new Vector();
487 
488         for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()){
489 
490 			SimpleXMLParserDocumentNodeImpl[] kids = parseNode( child, false );
491 
492 			for (int i=0;i<kids.length;i++){
493 
494 				v.addElement(kids[i]);
495 			}
496         }
497 
498 		SimpleXMLParserDocumentNodeImpl[]	res = new SimpleXMLParserDocumentNodeImpl[v.size()];
499 
500 		v.copyInto( res );
501 
502 		return( res );
503 	}
504 
505     private static class MyErrorHandler implements ErrorHandler {
506         /** Error handler output goes here */
507         //private PrintWriter out;
508 
MyErrorHandler(PrintWriter out)509         MyErrorHandler(PrintWriter out) {
510             //this.out = out;
511         }
512 
513         /**
514          * Returns a string describing parse exception details
515          */
getParseExceptionInfo(SAXParseException spe)516         private String getParseExceptionInfo(SAXParseException spe) {
517             String systemId = spe.getSystemId();
518             if (systemId == null) {
519                 systemId = "null";
520             }
521             String info = "URI=" + systemId +
522                 " Line=" + spe.getLineNumber() +
523                 ": " + spe.getMessage();
524             return info;
525         }
526 
527         // The following methods are standard SAX ErrorHandler methods.
528         // See SAX documentation for more info.
529 
530         public void
warning( SAXParseException spe )531 		warning(
532 			SAXParseException spe )
533 
534 			throws SAXException
535 		{
536             // out.println("Warning: " + getParseExceptionInfo(spe));
537         }
538 
539         public void
error( SAXParseException spe )540 		error(
541 			SAXParseException spe )
542 
543 			throws SAXException
544 		{
545             String message = "Error: " + getParseExceptionInfo(spe);
546 
547             throw new SAXException(message);
548         }
549 
550         public void
fatalError( SAXParseException spe )551 		fatalError(
552 			SAXParseException spe )
553 
554 			throws SAXException
555 		{
556             String message = "Fatal Error: " + getParseExceptionInfo(spe);
557 
558             throw new SAXException(message,spe);
559         }
560     }
561 
562     private static class
563     EntityFudger
564     	extends InputStream
565     {
566     	private InputStream		is;
567 
568     	char[]	buffer		= new char[16];
569     	int		buffer_pos	= 0;
570 
571     	char[] 	insertion		= new char[16];
572     	int		insertion_pos	= 0;
573     	int		insertion_len	= 0;
574 
575     	public
EntityFudger( InputStream _is )576     	EntityFudger(
577     		InputStream		_is )
578     	{
579     		is		= _is;
580     	}
581 
582     	@Override
583     	public int
read()584     	read()
585     		throws IOException
586     	{
587     		if ( insertion_len > 0 ){
588 
589     			int	result = insertion[ insertion_pos++ ]&0xff;
590 
591     			if ( insertion_pos == insertion_len ){
592 
593      				insertion_pos	= 0;
594      				insertion_len	= 0;
595     			}
596 
597     			return( result );
598     		}
599 
600     		while( true ){
601 
602 	     		int	b = is.read();
603 
604 	     		if ( b < 0 ){
605 
606 	     				// end of file
607 
608 	     			if ( buffer_pos == 0 ){
609 
610 	     				return( b );
611 
612 	     			}else if ( buffer_pos == 1 ){
613 
614 	     				buffer_pos = 0;
615 
616 	     				return( buffer[0]&0xff );
617 
618 	     			}else{
619 
620 	     				System.arraycopy( buffer, 1, insertion, 0, buffer_pos - 1 );
621 
622 	     				insertion_len 	= buffer_pos - 1;
623 	     				insertion_pos	= 0;
624 
625 	     				buffer_pos = 0;
626 
627 	     				return( buffer[0]&0xff );
628 	     			}
629 	     		}
630 
631 	     			// normal byte
632 
633 	     		if ( buffer_pos == 0 ){
634 
635 	     			if ( b == '&' ){
636 
637 	     				buffer[ buffer_pos++ ] = (char)b;
638 
639 	     			}else{
640 
641 	     				return( b );
642 	     			}
643 
644 	     		}else{
645 
646 	     			if ( buffer_pos == buffer.length-1 ){
647 
648 	     					// buffer's full, give up
649 
650 	     				buffer[ buffer_pos++ ] = (char)b;
651 
652 	     				System.arraycopy( buffer, 0, insertion, 0, buffer_pos );
653 
654 	     				buffer_pos		= 0;
655 	     				insertion_pos	= 0;
656 	     				insertion_len	= buffer_pos;
657 
658 	     				return( insertion[insertion_pos++] );
659 
660 	     			}else{
661 
662 		     			if ( b == ';' ){
663 
664 		     					// got some kind of reference mebe
665 
666 		     				buffer[ buffer_pos++ ] = (char)b;
667 
668 		     				String	ref = new String( buffer, 1, buffer_pos-2 ).toLowerCase( Locale.US );
669 
670 		     				String	replacement;
671 
672 		     				if ( 	ref.equals( "amp") 		||
673 		     						ref.equals( "lt" ) 		||
674 		     						ref.equals( "gt" ) 		||
675 		     						ref.equals( "quot" )	||
676 		     						ref.equals( "apos" ) 	||
677 		     						ref.startsWith( "#" )){
678 
679 		     					replacement = new String( buffer, 0, buffer_pos );
680 
681 		     				}else{
682 
683 			     				int num = Entities.HTML40.entityValue( ref );
684 
685 		     					if ( num != -1 ){
686 
687 		     						replacement = "&#" + num + ";";
688 
689 		     					}else{
690 
691 		     						replacement = new String( buffer, 0, buffer_pos );
692 		     					}
693 		     				}
694 
695 		     				char[] chars = replacement.toCharArray();
696 
697 		     				System.arraycopy( chars, 0, insertion, 0, chars.length );
698 
699 		     				buffer_pos		= 0;
700 		     				insertion_pos	= 0;
701 		     				insertion_len	= chars.length;
702 
703 		     				return( insertion[insertion_pos++] );
704 
705 		     			}else{
706 
707 	     					buffer[ buffer_pos++ ] = (char)b;
708 
709 		     				char c = (char)b;
710 
711 		     				if ( !Character.isLetterOrDigit( c )){
712 
713 		     						// handle naked &
714 
715 		     					if ( buffer_pos == 2 && buffer[0] == '&'){
716 
717 		     						char[] chars = "&amp;".toCharArray();
718 
719 		     						System.arraycopy( chars, 0, insertion, 0, chars.length );
720 
721 		     						buffer_pos		= 0;
722 		     						insertion_pos	= 0;
723 		     						insertion_len	= chars.length;
724 
725 		     							// don't forget the char we just read
726 
727 		     						insertion[insertion_len++] = (char)b;
728 
729 		     						return( insertion[insertion_pos++] );
730 
731 		     					}else{
732 
733 		     							// not a valid entity reference
734 
735 		    	     				System.arraycopy( buffer, 0, insertion, 0, buffer_pos );
736 
737 		    	     				buffer_pos		= 0;
738 		    	     				insertion_pos	= 0;
739 		    	     				insertion_len	= buffer_pos;
740 
741 		    	     				return( insertion[insertion_pos++] );
742 		     					}
743 		     				}
744 		     			}
745 	     			}
746 	     		}
747     		}
748     	}
749 
750     	public void
close()751     	close()
752 
753     		throws IOException
754     	{
755     		is.close();
756     	}
757 
758     	public long
skip( long n )759     	skip(
760     		long n )
761 
762     		throws IOException
763     	{
764     			// meh, vague attempt here
765 
766     		if ( insertion_len > 0 ){
767 
768     				// buffer is currently empty, shove remaining into buffer to unify processing
769 
770     			int	rem = insertion_len - insertion_pos;
771 
772     			System.arraycopy( insertion, insertion_pos, buffer, 0, rem );
773 
774     			insertion_pos 	= 0;
775     			insertion_len	= 0;
776 
777     			buffer_pos = rem;
778     		}
779 
780     		if ( n <= buffer_pos ){
781 
782     				// skip is <= buffer contents
783 
784     			int	rem = buffer_pos - (int)n;
785 
786       			System.arraycopy( buffer, (int)n, insertion, 0, rem );
787 
788       			insertion_pos	= 0;
789       			insertion_len 	= rem;
790 
791       			return( n );
792     		}
793 
794     		int	to_skip = buffer_pos;
795 
796     		buffer_pos	= 0;
797 
798     		return( is.skip( n - to_skip ) + to_skip );
799     	}
800 
801     	public int
available()802     	available()
803 
804     		throws IOException
805     	{
806      		return( buffer_pos + is.available());
807     	}
808     }
809 }
810