1 /* LinkFilter.java --
2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 package gnu.xml.pipeline;
39 
40 import java.io.IOException;
41 import java.net.URL;
42 import java.util.Enumeration;
43 import java.util.Vector;
44 
45 import org.xml.sax.Attributes;
46 import org.xml.sax.SAXException;
47 
48 
49 /**
50  * Pipeline filter to remember XHTML links found in a document,
51  * so they can later be crawled.  Fragments are not counted, and duplicates
52  * are ignored.  Callers are responsible for filtering out URLs they aren't
53  * interested in.  Events are passed through unmodified.
54  *
55  * <p> Input MUST include a setDocumentLocator() call, as it's used to
56  * resolve relative links in the absence of a "base" element.  Input MUST
57  * also include namespace identifiers, since it is the XHTML namespace
58  * identifier which is used to identify the relevant elements.
59  *
60  * <p><em>FIXME:</em> handle xml:base attribute ... in association with
61  * a stack of base URIs.  Similarly, recognize/support XLink data.
62  *
63  * @author David Brownell
64  */
65 public class LinkFilter extends EventFilter
66 {
67     // for storing URIs
68     private Vector		vector = new Vector ();
69 
70 	// struct for "full" link record (tbd)
71 	// these for troubleshooting original source:
72 	//	original uri
73 	//	uri as resolved (base, relative, etc)
74 	//	URI of originating doc
75 	//	line #
76 	//	original element + attrs (img src, desc, etc)
77 
78 	// XLink model of the link ... for inter-site pairups ?
79 
80     private String		baseURI;
81 
82     private boolean		siteRestricted = false;
83 
84     //
85     // XXX leverage blacklist info (like robots.txt)
86     //
87     // XXX constructor w/param ... pipeline for sending link data
88     // probably XHTML --> XLink, providing info as sketched above
89     //
90 
91 
92     /**
93      * Constructs a new event filter, which collects links in private data
94      * structure for later enumeration.
95      */
96 	// constructor used by PipelineFactory
LinkFilter()97     public LinkFilter ()
98     {
99 	super.setContentHandler (this);
100     }
101 
102 
103     /**
104      * Constructs a new event filter, which collects links in private data
105      * structure for later enumeration and passes all events, unmodified,
106      * to the next consumer.
107      */
108 	// constructor used by PipelineFactory
LinkFilter(EventConsumer next)109     public LinkFilter (EventConsumer next)
110     {
111 	super (next);
112 	super.setContentHandler (this);
113     }
114 
115 
116     /**
117      * Returns an enumeration of the links found since the filter
118      * was constructed, or since removeAllLinks() was called.
119      *
120      * @return enumeration of strings.
121      */
getLinks()122     public Enumeration getLinks ()
123     {
124 	return vector.elements ();
125     }
126 
127     /**
128      * Removes records about all links reported to the event
129      * stream, as if the filter were newly created.
130      */
removeAllLinks()131     public void removeAllLinks ()
132     {
133 	vector = new Vector ();
134     }
135 
136 
137     /**
138      * Collects URIs for (X)HTML content from elements which hold them.
139      */
startElement( String uri, String localName, String qName, Attributes atts )140     public void startElement (
141 	String		uri,
142 	String		localName,
143 	String		qName,
144 	Attributes	atts
145     ) throws SAXException
146     {
147 	String	link;
148 
149 	// Recognize XHTML links.
150 	if ("http://www.w3.org/1999/xhtml".equals (uri)) {
151 
152 	    if ("a".equals (localName) || "base".equals (localName)
153 		    || "area".equals (localName))
154 		link = atts.getValue ("href");
155 	    else if ("iframe".equals (localName) || "frame".equals (localName))
156 		link = atts.getValue ("src");
157 	    else if ("blockquote".equals (localName) || "q".equals (localName)
158 		    || "ins".equals (localName) || "del".equals (localName))
159 		link = atts.getValue ("cite");
160 	    else
161 		link = null;
162 	    link = maybeAddLink (link);
163 
164 	    // "base" modifies designated baseURI
165 	    if ("base".equals (localName) && link != null)
166 		baseURI = link;
167 
168 	    if ("iframe".equals (localName) || "img".equals (localName))
169 		maybeAddLink (atts.getValue ("longdesc"));
170 	}
171 
172 	super.startElement (uri, localName, qName, atts);
173     }
174 
maybeAddLink(String link)175     private String maybeAddLink (String link)
176     {
177 	int		index;
178 
179 	// ignore empty links and fragments inside docs
180 	if (link == null)
181 	    return null;
182 	if ((index = link.indexOf ("#")) >= 0)
183 	    link = link.substring (0, index);
184 	if (link.equals (""))
185 	    return null;
186 
187 	try {
188 	    // get the real URI
189 	    URL		base = new URL ((baseURI != null)
190 				    ? baseURI
191 				    : getDocumentLocator ().getSystemId ());
192 	    URL		url = new URL (base, link);
193 
194 	    link = url.toString ();
195 
196 	    // ignore duplicates
197 	    if (vector.contains (link))
198 		return link;
199 
200 	    // other than what "base" does, stick to original site:
201 	    if (siteRestricted) {
202 		// don't switch protocols
203 		if (!base.getProtocol ().equals (url.getProtocol ()))
204 		    return link;
205 		// don't switch servers
206 		if (base.getHost () != null
207 			&& !base.getHost ().equals (url.getHost ()))
208 		    return link;
209 	    }
210 
211 	    vector.addElement (link);
212 
213 	    return link;
214 
215 	} catch (IOException e) {
216 	    // bad URLs we don't want
217 	}
218 	return null;
219     }
220 
221     /**
222      * Reports an error if no Locator has been made available.
223      */
startDocument()224     public void startDocument ()
225     throws SAXException
226     {
227 	if (getDocumentLocator () == null)
228 	    throw new SAXException ("no Locator!");
229     }
230 
231     /**
232      * Forgets about any base URI information that may be recorded.
233      * Applications will often want to call removeAllLinks(), likely
234      * after examining the links which were reported.
235      */
endDocument()236     public void endDocument ()
237     throws SAXException
238     {
239 	baseURI = null;
240 	super.endDocument ();
241     }
242 }
243