/* LinkFilter.java --
   Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version.
*/

package gnu.xml.pipeline;

import java.io.IOException;
import java.net.URL;
import java.util.Enumeration;
import java.util.Vector;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;


/**
 * Pipeline filter to remember XHTML links found in a document,
 * so they can later be crawled.  Fragments are not counted, and duplicates
 * are ignored.  Callers are responsible for filtering out URLs they aren't
 * interested in.  Events are passed through unmodified.
 *
 * <p> Input MUST include a setDocumentLocator() call, as it's used to
 * resolve relative links in the absence of a "base" element.  Input MUST
 * also include namespace identifiers, since it is the XHTML namespace
 * identifier which is used to identify the relevant elements.
 *
 * <p><em>FIXME:</em> handle xml:base attribute ... in association with
 * a stack of base URIs.  Similarly, recognize/support XLink data.
 *
 * @author David Brownell
 */
public class LinkFilter extends EventFilter
{
    // for storing URIs (each entry is a resolved URL, as a String)
    private Vector              vector = new Vector ();

        // struct for "full" link record (tbd)
        // these for troubleshooting original source:
        //      original uri
        //      uri as resolved (base, relative, etc)
        //      URI of originating doc
        //      line #
        //      original element + attrs (img src, desc, etc)

        // XLink model of the link ... for inter-site pairups ?

    // base URI set by the most recent XHTML "base" element in the current
    // document; null until one is seen, and reset by endDocument()
    private String              baseURI;

    // when true, links that change protocol or host relative to the base
    // are not recorded; nothing in this class ever sets it
    private boolean             siteRestricted = false;

    //
    // XXX leverage blacklist info (like robots.txt)
    //
    // XXX constructor w/param ... pipeline for sending link data
    // probably XHTML --> XLink, providing info as sketched above
    //


    /**
     * Constructs a new event filter, which collects links in private data
     * structure for later enumeration.
     */
        // constructor used by PipelineFactory
    public LinkFilter ()
    {
        // route content events through this object's overridden callbacks
        super.setContentHandler (this);
    }


    /**
     * Constructs a new event filter, which collects links in private data
     * structure for later enumeration and passes all events, unmodified,
     * to the next consumer.
     *
     * @param next the consumer handed to the superclass constructor
     */
        // constructor used by PipelineFactory
    public LinkFilter (EventConsumer next)
    {
        super (next);
        // route content events through this object's overridden callbacks
        super.setContentHandler (this);
    }


    /**
     * Returns an enumeration of the links found since the filter
     * was constructed, or since removeAllLinks() was called.
     *
     * @return enumeration of strings.
     */
    public Enumeration getLinks ()
    {
        return vector.elements ();
    }

    /**
     * Removes records about all links reported to the event
     * stream, as if the filter were newly created.  The previous
     * storage is replaced rather than cleared, so enumerations that
     * were already handed out keep iterating over the old contents.
     */
    public void removeAllLinks ()
    {
        vector = new Vector ();
    }


    /**
     * Collects URIs for (X)HTML content from elements which hold them,
     * then forwards the event to the superclass.
     *
     * <p> Only elements in the XHTML namespace are examined:
     * "href" on a/base/area, "src" on iframe/frame, "cite" on
     * blockquote/q/ins/del, and additionally "longdesc" on iframe/img.
     * A "base" element with a usable "href" also becomes the base URI
     * against which later links are resolved.
     *
     * @param uri namespace identifier of the element
     * @param localName local part of the element name
     * @param qName qualified element name; only forwarded
     * @param atts the element's attributes
     * @throws SAXException if thrown while forwarding the event
     */
    public void startElement (
        String          uri,
        String          localName,
        String          qName,
        Attributes      atts
    ) throws SAXException
    {
        String  link;

        // Recognize XHTML links.
        if ("http://www.w3.org/1999/xhtml".equals (uri)) {

            if ("a".equals (localName) || "base".equals (localName)
                    || "area".equals (localName))
                link = atts.getValue ("href");
            else if ("iframe".equals (localName) || "frame".equals (localName))
                link = atts.getValue ("src");
            else if ("blockquote".equals (localName) || "q".equals (localName)
                    || "ins".equals (localName) || "del".equals (localName))
                link = atts.getValue ("cite");
            else
                link = null;
            // from here on, "link" is the resolved form (or null)
            link = maybeAddLink (link);

            // "base" modifies designated baseURI
            if ("base".equals (localName) && link != null)
                baseURI = link;

            if ("iframe".equals (localName) || "img".equals (localName))
                maybeAddLink (atts.getValue ("longdesc"));
        }

        super.startElement (uri, localName, qName, atts);
    }

    /**
     * Returns the URI against which relative links are currently resolved:
     * the recorded "base" URI if there is one, else the system ID of the
     * document locator.
     *
     * @return the base URI, or null if none is available
     */
    private String currentBase ()
    {
        if (baseURI != null)
            return baseURI;
        if (getDocumentLocator () == null)
            return null;
        return getDocumentLocator ().getSystemId ();
    }

    /**
     * Resolves a link against the current base and records it unless it
     * is empty, a pure fragment reference, unresolvable, a duplicate, or
     * (when site restriction is on) on another protocol or host.
     *
     * @param link raw attribute value; may be null
     * @return the resolved link with any fragment removed, even when it
     *  was not newly recorded; null if the link was empty or could not
     *  be resolved
     */
    private String maybeAddLink (String link)
    {
        int             index;

        // ignore empty links and fragments inside docs
        if (link == null)
            return null;
        if ((index = link.indexOf ("#")) >= 0)
            link = link.substring (0, index);
        if (link.equals (""))
            return null;

        // with nothing to resolve against, treat the link like any other
        // unusable URL instead of failing on a null locator or system ID
        String          context = currentBase ();

        if (context == null)
            return null;

        try {
            // get the real URI
            URL         base = new URL (context);
            URL         url = new URL (base, link);

            link = url.toString ();

            // ignore duplicates
            if (vector.contains (link))
                return link;

            // other than what "base" does, stick to original site:
            if (siteRestricted) {
                // don't switch protocols
                if (!base.getProtocol ().equals (url.getProtocol ()))
                    return link;
                // don't switch servers
                if (base.getHost () != null
                        && !base.getHost ().equals (url.getHost ()))
                    return link;
            }

            vector.addElement (link);

            return link;

        } catch (IOException e) {
            // bad URLs we don't want
        }
        return null;
    }

    /**
     * Reports an error if no Locator has been made available, and
     * otherwise passes the event on like every other event.
     *
     * @throws SAXException if there is no document locator, or if thrown
     *  while forwarding the event
     */
    public void startDocument ()
    throws SAXException
    {
        if (getDocumentLocator () == null)
            throw new SAXException ("no Locator!");
        // this filter is its own content handler, so the event only
        // continues past this point if it is forwarded explicitly
        super.startDocument ();
    }

    /**
     * Forgets about any base URI information that may be recorded.
     * Applications will often want to call removeAllLinks(), likely
     * after examining the links which were reported.
     *
     * @throws SAXException if thrown while forwarding the event
     */
    public void endDocument ()
    throws SAXException
    {
        baseURI = null;
        super.endDocument ();
    }
}