1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML 2 // Version 3.2 3 // Copyright (C) 2004-2009 Martin Jericho 4 // http://jericho.htmlparser.net/ 5 // 6 // This library is free software; you can redistribute it and/or 7 // modify it under the terms of either one of the following licences: 8 // 9 // 1. The Eclipse Public License (EPL) version 1.0, 10 // included in this distribution in the file licence-epl-1.0.html 11 // or available at http://www.eclipse.org/legal/epl-v10.html 12 // 13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, 14 // included in this distribution in the file licence-lgpl-2.1.txt 15 // or available at http://www.gnu.org/licenses/lgpl.txt 16 // 17 // This library is distributed on an "AS IS" basis, 18 // WITHOUT WARRANTY OF ANY KIND, either express or implied. 19 // See the individual licence texts for more details. 20 21 package net.htmlparser.jericho; 22 23 import java.util.*; 24 25 /** 26 * Represents the <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-element-3">end tag</a> of an 27 * {@linkplain Element element} in a specific {@linkplain Source source} document. 28 * <p> 29 * An end tag always has a {@linkplain #getTagType() type} that is a subclass of {@link EndTagType}, meaning it 30 * always starts with the characters '<code></</code>'. 31 * <p> 32 * <code>EndTag</code> instances are obtained using one of the following methods: 33 * <ul> 34 * <li>{@link Element#getEndTag()} 35 * <li>{@link Tag#getNextTag()} 36 * <li>{@link Tag#getPreviousTag()} 37 * <li>{@link Source#getPreviousEndTag(int pos)} 38 * <li>{@link Source#getPreviousEndTag(int pos, String name)} 39 * <li>{@link Source#getPreviousTag(int pos)} 40 * <li>{@link Source#getPreviousTag(int pos, TagType)} 41 * <li>{@link Source#getNextEndTag(int pos)} 42 * <li>{@link Source#getNextEndTag(int pos, String name)} 43 * <li>{@link Source#getNextEndTag(int pos, String name, EndTagType)} 44 * <li>{@link Source#getNextTag(int pos)} 45 * <li>{@link Source#getNextTag(int pos, TagType)} 46 * <li>{@link Source#getEnclosingTag(int pos)} 47 * <li>{@link Source#getEnclosingTag(int pos, TagType)} 48 * <li>{@link Source#getTagAt(int pos)} 49 * <li>{@link Segment#getAllTags()} 50 * <li>{@link Segment#getAllTags(TagType)} 51 * </ul> 52 * <p> 53 * The {@link Tag} superclass defines the {@link Tag#getName() getName()} method used to get the name of this end tag. 54 * <p> 55 * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-etag">end tags</a>. 56 * 57 * @see Tag 58 * @see StartTag 59 * @see Element 60 */ 61 public final class EndTag extends Tag { 62 private final EndTagType endTagType; 63 64 /** 65 * Constructs a new <code>EndTag</code>. 66 * 67 * @param source the {@link Source} document. 68 * @param begin the character position in the source document where this tag {@linkplain Segment#getBegin() begins}. 69 * @param end the character position in the source document where this tag {@linkplain Segment#getEnd() ends}. 70 * @param endTagType the {@linkplain #getEndTagType() type} of the end tag. 71 * @param name the {@linkplain Tag#getName() name} of the tag. 72 */ EndTag(final Source source, final int begin, final int end, final EndTagType endTagType, final String name)73 EndTag(final Source source, final int begin, final int end, final EndTagType endTagType, final String name) { 74 super(source,begin,end,name); 75 this.endTagType=endTagType; 76 } 77 78 /** 79 * Returns the {@linkplain Element element} that is ended by this end tag. 80 * <p> 81 * Returns <code>null</code> if this end tag is not properly matched to any {@linkplain StartTag start tag} in the source document. 82 * <p> 83 * This method is much less efficient than the {@link StartTag#getElement()} method. 84 * <p> 85 * IMPLEMENTATION NOTE: The explanation for why this method is relatively inefficient lies in the fact that more than one 86 * {@linkplain StartTagType start tag type} can have the same 87 * {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}, so it is not possible to know for certain 88 * which type of start tag this end tag is matched to (see {@link EndTagType#getCorrespondingStartTagType()} for more explanation). 89 * Because of this uncertainty, the implementation of this method must check every start tag preceding this end tag, calling its 90 * {@link StartTag#getElement()} method to see whether it is terminated by this end tag. 91 * 92 * @return the {@linkplain Element element} that is ended by this end tag. 93 */ getElement()94 public Element getElement() { 95 if (element!=Element.NOT_CACHED) return element; 96 int pos=begin; 97 while (pos!=0) { 98 StartTag startTag=source.getPreviousStartTag(pos-1); 99 if (startTag==null) break; 100 Element foundElement=startTag.getElement(); // this automatically sets foundElement.getEndTag().element cache 101 if (foundElement.getEndTag()==this) return foundElement; // no need to set element as it was already done in previous statement 102 pos=startTag.begin; 103 } 104 return element=null; 105 } 106 107 /** 108 * Returns the {@linkplain EndTagType type} of this end tag. 109 * <p> 110 * This is equivalent to <code>(EndTagType)</code>{@link #getTagType()}. 111 * 112 * @return the {@linkplain EndTagType type} of this end tag. 113 */ getEndTagType()114 public EndTagType getEndTagType() { 115 return endTagType; 116 } 117 118 // Documentation inherited from Tag getTagType()119 public TagType getTagType() { 120 return endTagType; 121 } 122 123 // Documentation inherited from Tag isUnregistered()124 public boolean isUnregistered() { 125 return endTagType==EndTagType.UNREGISTERED; 126 } 127 128 /** 129 * Returns an XML representation of this end tag. 130 * <p> 131 * This method is included for symmetry with the {@link StartTag#tidy()} method and simply 132 * returns the {@linkplain Segment#toString() source text} of the tag. 133 * 134 * @return an XML representation of this end tag. 135 */ tidy()136 public String tidy() { 137 return toString(); 138 } 139 140 /** 141 * Generates the HTML text of a {@linkplain EndTagType#NORMAL normal} end tag with the specified tag {@linkplain #getName() name}. 142 * <p> 143 * <dl> 144 * <dt>Example:</dt> 145 * <dd> 146 * <p> 147 * The following method call: 148 * <blockquote class="code"> 149 * <code>EndTag.generateHTML("INPUT")</code> 150 * </blockquote> 151 * returns the following output: 152 * <blockquote class="code"> 153 * <code></INPUT></code> 154 * </blockquote> 155 * </dd> 156 * </dl> 157 * 158 * @param tagName the {@linkplain #getName() name} of the end tag. 159 * @return the HTML text of a {@linkplain EndTagType#NORMAL normal} end tag with the specified tag {@linkplain #getName() name}. 160 * @see StartTag#generateHTML(String tagName, Map attributesMap, boolean emptyElementTag) 161 */ generateHTML(final String tagName)162 public static String generateHTML(final String tagName) { 163 return EndTagType.NORMAL.generateHTML(tagName); 164 } 165 getDebugInfo()166 public String getDebugInfo() { 167 final StringBuilder sb=new StringBuilder(); 168 sb.append(this).append(' '); 169 if (endTagType!=EndTagType.NORMAL) sb.append('(').append(endTagType.getDescription()).append(") "); 170 sb.append(super.getDebugInfo()); 171 return sb.toString(); 172 } 173 174 /** 175 * Returns the previous end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position. 176 * <p> 177 * Called from {@link Source#getPreviousEndTag(int pos, String name)}. 178 * 179 * @param source the {@link Source} document. 180 * @param pos the position to search from. 181 * @param name the {@linkplain #getName() name} of the tag including its {@linkplain TagType#getNamePrefix() prefix} (must be lower case, may be null). 182 * @param endTagType the {@linkplain EndTagType type} of end tag to search for. 183 * @return the previous end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position, or null if none is found. 184 */ getPrevious(final Source source, final int pos, final String name, final EndTagType endTagType)185 static EndTag getPrevious(final Source source, final int pos, final String name, final EndTagType endTagType) { 186 if (name==null) return (EndTag)Tag.getPreviousTag(source,pos,endTagType); 187 if (name.length()==0) throw new IllegalArgumentException("name argument must not be zero length"); 188 final String searchString=endTagType.START_DELIMITER_PREFIX+name; 189 try { 190 final ParseText parseText=source.getParseText(); 191 int begin=pos; 192 do { 193 begin=parseText.lastIndexOf(searchString,begin); 194 if (begin==-1) return null; 195 final EndTag endTag=(EndTag)source.getTagAt(begin); 196 if (endTag!=null && endTag.getEndTagType()==endTagType && name.equals(endTag.getName())) return endTag; 197 } while ((begin-=1)>=0); 198 } catch (IndexOutOfBoundsException ex) { 199 // this should never happen during a get previous operation so rethrow it: 200 throw ex; 201 } 202 return null; 203 } 204 205 /** 206 * Returns the next end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position. 207 * <p> 208 * Called from {@link Source#getNextEndTag(int pos, String name, EndTagType endTagType)}. 209 * 210 * @param source the {@link Source} document. 211 * @param pos the position to search from. 212 * @param name the {@linkplain #getName() name} of the tag including its {@linkplain TagType#getNamePrefix() prefix} (must be lower case, may be null). 213 * @param endTagType the {@linkplain EndTagType type} of end tag to search for. 214 * @return the next end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position, or null if none is found. 215 */ getNext(final Source source, final int pos, final String name, final EndTagType endTagType)216 static EndTag getNext(final Source source, final int pos, final String name, final EndTagType endTagType) { 217 if (name==null) return (EndTag)Tag.getNextTag(source,pos,endTagType); 218 if (name.length()==0) throw new IllegalArgumentException("name argument must not be zero length"); 219 final String searchString=endTagType.START_DELIMITER_PREFIX+name; 220 try { 221 final ParseText parseText=source.getParseText(); 222 int begin=pos; 223 do { 224 begin=parseText.indexOf(searchString,begin); 225 if (begin==-1) return null; 226 final EndTag endTag=(EndTag)source.getTagAt(begin); 227 if (endTag!=null && endTag.getEndTagType()==endTagType && name.equals(endTag.getName())) return endTag; 228 } while ((begin+=1)<source.end); 229 } catch (IndexOutOfBoundsException ex) { 230 // this should only happen when the end of file is reached in the middle of a tag. 231 // we don't have to do anything to handle it as there will be no more tags anyway. 232 } 233 return null; 234 } 235 getPrevious(final Source source, int pos)236 static EndTag getPrevious(final Source source, int pos) { 237 while (true) { 238 final Tag tag=Tag.getPreviousTag(source,pos); 239 if (tag==null) return null; 240 if (tag instanceof EndTag) return (EndTag)tag; 241 pos-=1; 242 } 243 } 244 getNext(final Source source, int pos)245 static EndTag getNext(final Source source, int pos) { 246 while (true) { 247 final Tag tag=Tag.getNextTag(source,pos); 248 if (tag==null) return null; 249 if (tag instanceof EndTag) return (EndTag)tag; 250 pos+=1; 251 } 252 } 253 } 254 255