1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML 2 // Version 3.2 3 // Copyright (C) 2004-2009 Martin Jericho 4 // http://jericho.htmlparser.net/ 5 // 6 // This library is free software; you can redistribute it and/or 7 // modify it under the terms of either one of the following licences: 8 // 9 // 1. The Eclipse Public License (EPL) version 1.0, 10 // included in this distribution in the file licence-epl-1.0.html 11 // or available at http://www.eclipse.org/legal/epl-v10.html 12 // 13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, 14 // included in this distribution in the file licence-lgpl-2.1.txt 15 // or available at http://www.gnu.org/licenses/lgpl.txt 16 // 17 // This library is distributed on an "AS IS" basis, 18 // WITHOUT WARRANTY OF ANY KIND, either express or implied. 19 // See the individual licence texts for more details. 20 21 package net.htmlparser.jericho; 22 23 import java.util.*; 24 25 /** 26 * Represents either a {@link StartTag} or {@link EndTag} in a specific {@linkplain Source source} document. 27 * <p> 28 * Take the following HTML segment as an example: 29 * <p> 30 * <code><p>This is a sample paragraph.</p></code> 31 * <p> 32 * The "<code><p></code>" is represented by a {@link StartTag} object, and the "<code></p></code>" is represented by an {@link EndTag} object, 33 * both of which are subclasses of the <code>Tag</code> class. 34 * The whole segment, including the start tag, its corresponding end tag and all of the content in between, is represented by an {@link Element} object. 35 * 36 * <h3><a name="ParsingProcess">Tag Parsing Process</a></h3> 37 * The following process describes how each tag is identified by the parser: 38 * <ol class="Separated"> 39 * <li> 40 * Every '<code><</code>' character found in the source document is considered to be the start of a tag. 
41 * The characters following it are compared with the {@linkplain TagType#getStartDelimiter() start delimiters} 42 * of all the {@linkplain TagType#register() registered} {@linkplain TagType tag types}, and a list of matching tag types 43 * is determined. 44 * <li> 45 * A more detailed analysis of the source is performed according to the features of each matching tag type from the first step, 46 * in order of <a href="TagType.html#Precedence">precedence</a>, until a valid tag is able to be constructed. 47 * <p> 48 * The analysis performed in relation to each candidate tag type is a two-stage process: 49 * <ol> 50 * <li> 51 * The position of the tag is checked to determine whether it is {@linkplain TagType#isValidPosition(Source,int,int[]) valid}. 52 * In theory, a {@linkplain TagType#isServerTag() server tag} is valid in any position, but a non-server tag is not valid inside any other tag, 53 * nor inside elements with CDATA content such as {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements. 54 * Theory dictates therefore that {@linkplain StartTagType#COMMENT comments} and explicit {@linkplain StartTagType#CDATA_SECTION CDATA sections} 55 * inside script elements should not be recognised as tags. 56 * The behaviour of the parser however does not always strictly adhere to the theory, to maintain compatability with major browsers 57 * and also for efficiency reasons. 58 * <p> 59 * The {@link TagType#isValidPosition(Source, int pos, int[] fullSequentialParseData)} method is responsible for this check 60 * and has a common default implementation for all tag types 61 * (although <a href="TagType.html#custom">custom</a> tag types can override it if necessary). 62 * Its behaviour differs depending on whether or not a {@linkplain Source#fullSequentialParse() full sequential parse} is peformed. 63 * See the documentation of the {@link TagType#isValidPosition(Source,int,int[]) isValidPosition} method for full details. 
64 * <li> 65 * A final analysis is performed by the {@link TagType#constructTagAt(Source, int pos)} method of the candidate tag type. 66 * This method returns a valid {@link Tag} object if all conditions of the candidate tag type are met, otherwise it returns 67 * <code>null</code> and the process continues with the next candidate tag type. 68 * </ol> 69 * <li> 70 * If the source does not match the start delimiter or syntax of any registered tag type, the segment spanning it and the next 71 * '<code>></code>' character is taken to be an {@linkplain #isUnregistered() unregistered} tag. 72 * Some tag search methods ignore unregistered tags. See the {@link #isUnregistered()} method for more information. 73 * </ol> 74 * <p> 75 * See the documentation of the {@link TagType} class for more details on how tags are recognised. 76 * 77 * <h3><a name="TagSearchMethods">Tag Search Methods</a></h3> 78 * <p> 79 * Methods that get tags in a source document are collectively referred to as <i>Tag Search Methods</i>. 80 * They are found mostly in the {@link Source} and {@link Segment} classes, and can be generally categorised as follows: 81 * <dl class="Separated"> 82 * <dt><a name="OpenSearch">Open Search:</a> 83 * <dd>These methods search for tags of any {@linkplain #getName() name} and {@linkplain #getTagType() type}. 
84 * <ul class="Unseparated"> 85 * <li>{@link Tag#getNextTag()} 86 * <li>{@link Tag#getPreviousTag()} 87 * <li>{@link Segment#getAllElements()} 88 * <li>{@link Segment#getFirstElement()} 89 * <li>{@link Source#getTagAt(int pos)} 90 * <li>{@link Source#getPreviousTag(int pos)} 91 * <li>{@link Source#getNextTag(int pos)} 92 * <li>{@link Source#getEnclosingTag(int pos)} 93 * <li>{@link Segment#getAllTags()} 94 * <li>{@link Segment#getAllStartTags()} 95 * <li>{@link Segment#getFirstStartTag()} 96 * <li>{@link Source#getPreviousStartTag(int pos)} 97 * <li>{@link Source#getNextStartTag(int pos)} 98 * <li>{@link Source#getPreviousEndTag(int pos)} 99 * <li>{@link Source#getNextEndTag(int pos)} 100 * </ul> 101 * <dt><a name="NamedSearch">Named Search:</a> 102 * <dd>These methods include a parameter called <code>name</code> which is used to specify the {@linkplain #getName() name} of the tag to search for. 103 * Specifying a name that ends in a colon (<code>:</code>) searches for all elements or tags in the specified XML namespace. 104 * <ul class="Unseparated"> 105 * <li>{@link Segment#getAllElements(String name)} 106 * <li>{@link Segment#getFirstElement(String name)} 107 * <li>{@link Segment#getAllStartTags(String name)} 108 * <li>{@link Segment#getFirstStartTag(String name)} 109 * <li>{@link Source#getPreviousStartTag(int pos, String name)} 110 * <li>{@link Source#getNextStartTag(int pos, String name)} 111 * <li>{@link Source#getPreviousEndTag(int pos, String name)} 112 * <li>{@link Source#getNextEndTag(int pos, String name)} 113 * <li>{@link Source#getNextEndTag(int pos, String name, EndTagType)} 114 * </ul> 115 * <dt><a name="TagTypeSearch">Tag Type Search:</a> 116 * <dd>These methods typically include a parameter called <code>tagType</code> which is used to specify the {@linkplain #getTagType() type} of the tag to search for. 117 * In some methods the search parameter is restricted to the {@link StartTagType} or {@link EndTagType} subclass of <code>TagType</code>. 
118 * <ul class="Unseparated"> 119 * <li>{@link Segment#getAllElements(StartTagType)} 120 * <li>{@link Segment#getAllTags(TagType)} 121 * <li>{@link Segment#getAllStartTags(StartTagType)} 122 * <li>{@link Segment#getFirstStartTag(StartTagType)} 123 * <li>{@link Source#getPreviousTag(int pos, TagType)} 124 * <li>{@link Source#getPreviousStartTag(int pos, StartTagType)} 125 * <li>{@link Source#getPreviousEndTag(int pos, EndTagType)} 126 * <li>{@link Source#getNextTag(int pos, TagType)} 127 * <li>{@link Source#getNextStartTag(int pos, StartTagType)} 128 * <li>{@link Source#getNextEndTag(int pos, EndTagType)} 129 * <li>{@link Source#getEnclosingTag(int pos, TagType)} 130 * <li>{@link Source#getNextEndTag(int pos, String name, EndTagType)} 131 * </ul> 132 * <dt><a name="OtherSearch">Attribute Search:</a> 133 * <dd>These methods perform the search based on an attribute name and value. 134 * <ul class="Unseparated"> 135 * <li>{@link Segment#getAllElements(String attributeName, String value, boolean valueCaseSensitive)} 136 * <li>{@link Segment#getFirstElement(String attributeName, String value, boolean valueCaseSensitive)} 137 * <li>{@link Segment#getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} 138 * <li>{@link Segment#getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)} 139 * <li>{@link Segment#getAllElements(String attributeName, Pattern valueRegexPattern)} 140 * <li>{@link Segment#getFirstElement(String attributeName, Pattern valueRegexPattern)} 141 * <li>{@link Segment#getAllStartTags(String attributeName, Pattern valueRegexPattern)} 142 * <li>{@link Segment#getFirstStartTag(String attributeName, Pattern valueRegexPattern)} 143 * <li>{@link Segment#getAllElementsByClass(String className)} 144 * <li>{@link Segment#getFirstElementByClass(String className)} 145 * <li>{@link Segment#getAllStartTagsByClass(String className)} 146 * <li>{@link Segment#getFirstStartTagByClass(String className)} 147 * <li>{@link 
Source#getElementById(String id)} 148 * <li>{@link Source#getNextElement(int pos, String attributeName, Pattern valueRegexPattern)} 149 * <li>{@link Source#getNextElement(int pos, String attributeName, String value, boolean valueCaseSensitive)} 150 * <li>{@link Source#getNextElementByClass(int pos, String className)} 151 * <li>{@link Source#getNextStartTag(int pos, String attributeName, Pattern valueRegexPattern)} 152 * <li>{@link Source#getNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive)} 153 * <li>{@link Source#getNextStartTagByClass(int pos, String className)} 154 * </ul> 155 * </dl> 156 */ 157 public abstract class Tag extends Segment { 158 String name=null; // always lower case, can always use == operator to compare with constants in HTMLElementName interface 159 private Object userData=null; 160 // cached values: 161 Element element=Element.NOT_CACHED; 162 private Tag previousTag=NOT_CACHED; // does not include unregistered tags 163 private Tag nextTag=NOT_CACHED; // does not include unregistered tags 164 // A NOT_CACHED value in nextTag can also indicate that this tag is not in the cache. See isOrphaned() for details. 165 166 static final Tag NOT_CACHED=new StartTag(); 167 168 private static final boolean INCLUDE_UNREGISTERED_IN_SEARCH=false; // determines whether unregistered tags are included in searches 169 Tag(final Source source, final int begin, final int end, final String name)170 Tag(final Source source, final int begin, final int end, final String name) { 171 super(source,begin,end); 172 this.name=HTMLElements.getConstantElementName(name.toLowerCase()); 173 } 174 175 // only used to create Tag.NOT_CACHED Tag()176 Tag() {} 177 178 /** 179 * Returns the {@linkplain Element element} that is started or ended by this tag. 180 * <p> 181 * {@link StartTag#getElement()} is guaranteed not <code>null</code>. 
182 * <p> 183 * {@link EndTag#getElement()} can return <code>null</code> if the end tag is not properly matched to a start tag. 184 * 185 * @return the {@linkplain Element element} that is started or ended by this tag. 186 */ getElement()187 public abstract Element getElement(); 188 189 /** 190 * Returns the name of this tag, always in lower case. 191 * <p> 192 * The name always starts with the {@linkplain TagType#getNamePrefix() name prefix} defined in this tag's {@linkplain TagType type}. 193 * For some tag types, the name consists only of this prefix, while in others it must be followed by a valid 194 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML name</a> 195 * (see {@link StartTagType#isNameAfterPrefixRequired()}). 196 * <p> 197 * If the name is equal to one of the constants defined in the {@link HTMLElementName} interface, this method is guaranteed to return 198 * the constant itself. 199 * This allows comparisons to be performed using the <code>==</code> operator instead of the less efficient 200 * <code>String.equals(Object)</code> method. 201 * <p> 202 * For example, the following expression can be used to test whether a {@link StartTag} is from a 203 * <code><a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#edef-SELECT">SELECT</a></code> element: 204 * <br /><code>startTag.getName()==HTMLElementName.SELECT</code> 205 * <p> 206 * To get the name of this tag in its original case, use {@link #getNameSegment()}<code>.toString()</code>. 207 * 208 * @return the name of this tag, always in lower case. 209 */ getName()210 public final String getName() { 211 return name; 212 } 213 214 /** 215 * Returns the segment spanning the {@linkplain #getName() name} of this tag. 216 * <p> 217 * The code <code>getNameSegment().toString()</code> can be used to retrieve the name of this tag in its original case. 218 * <p> 219 * Every call to this method constructs a new <code>Segment</code> object. 
220 * 221 * @return the segment spanning the {@linkplain #getName() name} of this tag. 222 * @see #getName() 223 */ getNameSegment()224 public Segment getNameSegment() { 225 final int nameSegmentBegin=begin+getTagType().startDelimiterPrefix.length(); 226 return new Segment(source,nameSegmentBegin,nameSegmentBegin+name.length()); 227 } 228 229 /** 230 * Returns the {@linkplain TagType type} of this tag. 231 * @return the {@linkplain TagType type} of this tag. 232 */ getTagType()233 public abstract TagType getTagType(); 234 235 /** 236 * Returns the general purpose user data object that has previously been associated with this tag via the {@link #setUserData(Object)} method. 237 * <p> 238 * If {@link #setUserData(Object)} has not been called, this method returns <code>null</code>. 239 * 240 * @return the generic data object that has previously been associated with this tag via the {@link #setUserData(Object)} method. 241 */ getUserData()242 public Object getUserData() { 243 return userData; 244 } 245 246 /** 247 * Associates the specified general purpose user data object with this tag. 248 * <p> 249 * This property can be useful for applications that need to associate extra information with tags. 250 * The object can be retrieved later via the {@link #getUserData()} method. 251 * 252 * @param userData general purpose user data of any type. 253 */ setUserData(final Object userData)254 public void setUserData(final Object userData) { 255 this.userData=userData; 256 } 257 258 /** 259 * Returns the next tag in the source document. 260 * <p> 261 * This method also returns {@linkplain TagType#isServerTag() server tags}. 262 * <p> 263 * The result of a call to this method is cached. 264 * Performing a {@linkplain Source#fullSequentialParse() full sequential parse} prepopulates this cache. 
265 * <p> 266 * If the result is not cached, a call to this method is equivalent to <code>source.</code>{@link Source#getNextTag(int) getNextTag}<code>(</code>{@link #getBegin() getBegin()}<code>+1)</code>. 267 * <p> 268 * See the {@link Tag} class documentation for more details about the behaviour of this method. 269 * 270 * @return the next tag in the source document, or <code>null</code> if this is the last tag. 271 */ getNextTag()272 public Tag getNextTag() { 273 if (nextTag==NOT_CACHED) { 274 final Tag localNextTag=getNextTag(source,begin+1); 275 if (source.wasFullSequentialParseCalled()) return localNextTag; // Don't set nextTag if this is an orphaned tag. See isOrphaned() for details. 276 nextTag=localNextTag; 277 } 278 return nextTag; 279 } 280 281 /** 282 * Returns the previous tag in the source document. 283 * <p> 284 * This method also returns {@linkplain TagType#isServerTag() server tags}. 285 * <p> 286 * The result of a call to this method is cached. 287 * Performing a {@linkplain Source#fullSequentialParse() full sequential parse} prepopulates this cache. 288 * <p> 289 * If the result is not cached, a call to this method is equivalent to <code>source.</code>{@link Source#getPreviousTag(int) getPreviousTag}<code>(</code>{@link #getBegin() getBegin()}<code>-1)</code>. 290 * <p> 291 * See the {@link Tag} class documentation for more details about the behaviour of this method. 292 * 293 * @return the previous tag in the source document, or <code>null</code> if this is the first tag. 294 */ getPreviousTag()295 public Tag getPreviousTag() { 296 if (previousTag==NOT_CACHED) previousTag=getPreviousTag(source,begin-1); 297 return previousTag; 298 } 299 300 /** 301 * Indicates whether this tag has a syntax that does not match any of the {@linkplain TagType#register() registered} {@linkplain TagType tag types}. 
302 * <p> 303 * The only requirement of an unregistered tag type is that it {@linkplain TagType#getStartDelimiter() starts} with 304 * '<code><</code>' and there is a {@linkplain TagType#getClosingDelimiter() closing} '<code>></code>' character 305 * at some position after it in the source document. 306 * <p> 307 * The absence or presence of a '<code>/</code>' character after the initial '<code><</code>' determines whether an 308 * unregistered tag is respectively a 309 * {@link StartTag} with a {@linkplain #getTagType() type} of {@link StartTagType#UNREGISTERED} or an 310 * {@link EndTag} with a {@linkplain #getTagType() type} of {@link EndTagType#UNREGISTERED}. 311 * <p> 312 * There are no restrictions on the characters that might appear between these delimiters, including other '<code><</code>' 313 * characters. This may result in a '<code>></code>' character that is identified as the closing delimiter of two 314 * separate tags, one an unregistered tag, and the other a tag of any type that {@linkplain #getBegin() begins} in the middle 315 * of the unregistered tag. As explained below, unregistered tags are usually only found when specifically looking for them, 316 * so it is up to the user to detect and deal with any such nonsensical results. 317 * <p> 318 * Unregistered tags are only returned by the {@link Source#getTagAt(int pos)} method, 319 * <a href="Tag.html#NamedSearch">named search</a> methods, where the specified <code>name</code> 320 * matches the first characters inside the tag, and by <a href="Tag.html#TagTypeSearch">tag type search</a> methods, where the 321 * specified <code>tagType</code> is either {@link StartTagType#UNREGISTERED} or {@link EndTagType#UNREGISTERED}. 322 * <p> 323 * <a href="Tag.html#OpenSearch">Open</a> tag searches and <a href="Tag.html#OtherSearch">other</a> searches always ignore 324 * unregistered tags, although every discovery of an unregistered tag is {@linkplain Source#getLogger() logged} by the parser. 
325 * <p> 326 * The logic behind this design is that unregistered tag types are usually the result of a '<code><</code>' character 327 * in the text that was mistakenly left {@linkplain CharacterReference#encode(CharSequence) unencoded}, or a less-than 328 * operator inside a script, or some other occurrence which is of no interest to the user. 329 * By returning unregistered tags in <a href="Tag.html#NamedSearch">named</a> and <a href="Tag.html#TagTypeSearch">tag type</a> 330 * search methods, the library allows the user to specifically search for tags with a certain syntax that does not match any 331 * existing {@link TagType}. This expediency feature avoids the need for the user to create a 332 * <a href="TagType.html#Custom">custom tag type</a> to define the syntax before searching for these tags. 333 * By not returning unregistered tags in the less specific search methods, it is providing only the information that 334 * most users are interested in. 335 * 336 * @return <code>true</code> if this tag has a syntax that does not match any of the {@linkplain TagType#register() registered} {@linkplain TagType tag types}, otherwise <code>false</code>. 337 */ isUnregistered()338 public abstract boolean isUnregistered(); 339 340 /** 341 * Returns an XML representation of this tag. 342 * <p> 343 * This is an abstract method which is implemented in the {@link StartTag} and {@link EndTag} subclasses. 344 * See the documentation of the {@link StartTag#tidy()} and {@link EndTag#tidy()} methods for details. 345 * 346 * @return an XML representation of this tag. 347 */ tidy()348 public abstract String tidy(); 349 350 /** 351 * Indicates whether the specified text is a valid <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>. 
352 * <p> 353 * This implementation first checks that the first character of the specified text is a valid XML Name start character 354 * as defined by the {@link #isXMLNameStartChar(char)} method, and then checks that the rest of the characters are valid 355 * XML Name characters as defined by the {@link #isXMLNameChar(char)} method. 356 * <p> 357 * Note that this implementation does not exactly adhere to the 358 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">formal definition of an XML Name</a>, 359 * but the differences are unlikely to be significant in real-world XML or HTML documents. 360 * 361 * @param text the text to test. 362 * @return <code>true</code> if the specified text is a valid <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>. 363 * @see Source#getNameEnd(int pos) 364 */ isXMLName(final CharSequence text)365 public static final boolean isXMLName(final CharSequence text) { 366 if (text==null || text.length()==0 || !isXMLNameStartChar(text.charAt(0))) return false; 367 for (int i=1; i<text.length(); i++) 368 if (!isXMLNameChar(text.charAt(i))) return false; 369 return true; 370 } 371 372 /** 373 * Indicates whether the specified character is valid at the start of an 374 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>. 375 * <p> 376 * The <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a> defines a 377 * <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">Name</a></code> as starting with one of the characters 378 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a> | '_' | ':')</code>. 379 * <p> 380 * This method uses the expression 381 * <br /><code>Character.isLetter(ch) || ch=='_' || ch==':'</code>. 
382 * <p> 383 * Note that there are many differences between the <code>Character.isLetter()</code> definition of a Letter and the 384 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">XML definition of a Letter</a>, 385 * but these differences are unlikely to be significant in real-world XML or HTML documents. 386 * 387 * @param ch the character to test. 388 * @return <code>true</code> if the specified character is valid at the start of an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>. 389 * @see Source#getNameEnd(int pos) 390 */ isXMLNameStartChar(final char ch)391 public static final boolean isXMLNameStartChar(final char ch) { 392 return Character.isLetter(ch) || ch=='_' || ch==':'; 393 } 394 395 /** 396 * Indicates whether the specified character is valid anywhere in an 397 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>. 398 * <p> 399 * The <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a> uses the 400 * entity <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-NameChar">NameChar</a></code> to represent this set of 401 * characters, which is defined as 402 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a> 403 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Digit">Digit</a> | '.' | '-' | '_' | ':' 404 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-CombiningChar">CombiningChar</a> 405 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Extender">Extender</a>)</code>. 406 * <p> 407 * This method uses the expression 408 * <br /><code>Character.isLetterOrDigit(ch) || ch=='.' || ch=='-' || ch=='_' || ch==':'</code>. 409 * <p> 410 * Note that there are many differences between these definitions, 411 * but these differences are unlikely to be significant in real-world XML or HTML documents. 
412 * 413 * @param ch the character to test. 414 * @return <code>true</code> if the specified character is valid anywhere in an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>. 415 * @see Source#getNameEnd(int pos) 416 */ isXMLNameChar(final char ch)417 public static final boolean isXMLNameChar(final char ch) { 418 return Character.isLetterOrDigit(ch) || ch=='.' || ch=='-' || ch=='_' || ch==':'; 419 } 420 421 // *** consider making public getNextStartTag()422 StartTag getNextStartTag() { 423 Tag tag=this; 424 while (true) { 425 tag=tag.getNextTag(); 426 if (tag==null) return null; 427 if (tag instanceof StartTag) return (StartTag)tag; 428 } 429 } 430 431 // *** consider making public getPreviousStartTag()432 StartTag getPreviousStartTag() { 433 Tag tag=this; 434 while (true) { 435 tag=tag.getPreviousTag(); 436 if (tag==null) return null; 437 if (tag instanceof StartTag) return (StartTag)tag; 438 } 439 } 440 441 // *** consider making public getNextTag(final TagType tagType)442 Tag getNextTag(final TagType tagType) { 443 if (tagType==null) return getNextTag(); 444 if (tagType==StartTagType.UNREGISTERED || tagType==EndTagType.UNREGISTERED) return getNextTag(source,begin+1,tagType); 445 Tag tag=this; 446 while (true) { 447 if (tag.nextTag==NOT_CACHED) return getNextTag(source,tag.begin+1,tagType); 448 tag=tag.nextTag; 449 if (tag==null) return null; 450 if (tag.getTagType()==tagType) return tag; 451 } 452 } 453 454 // *** consider making public getPreviousTag(final TagType tagType)455 Tag getPreviousTag(final TagType tagType) { 456 if (tagType==null) return getPreviousTag(); 457 if (tagType==StartTagType.UNREGISTERED || tagType==EndTagType.UNREGISTERED) return getPreviousTag(source,begin-1,tagType); 458 Tag tag=this; 459 while (true) { 460 if (tag.previousTag==NOT_CACHED) return getPreviousTag(source,tag.begin-1,tagType); 461 tag=tag.previousTag; 462 if (tag==null) return null; 463 if 
(tag.getTagType()==tagType) return tag; 464 } 465 } 466 includeInSearch()467 final boolean includeInSearch() { 468 return INCLUDE_UNREGISTERED_IN_SEARCH || !isUnregistered(); 469 } 470 getPreviousTag(final Source source, final int pos)471 static final Tag getPreviousTag(final Source source, final int pos) { 472 // returns null if pos is out of range. 473 return source.useAllTypesCache 474 ? source.cache.getPreviousTag(pos) 475 : getPreviousTagUncached(source,pos,ParseText.NO_BREAK); 476 } 477 getNextTag(final Source source, final int pos)478 static final Tag getNextTag(final Source source, final int pos) { 479 // returns null if pos is out of range. 480 return source.useAllTypesCache 481 ? source.cache.getNextTag(pos) 482 : getNextTagUncached(source,pos,ParseText.NO_BREAK); 483 } 484 getPreviousTagUncached(final Source source, final int pos, final int breakAtPos)485 static final Tag getPreviousTagUncached(final Source source, final int pos, final int breakAtPos) { 486 // returns null if pos is out of range. 487 try { 488 final ParseText parseText=source.getParseText(); 489 int begin=pos; 490 do { 491 begin=parseText.lastIndexOf('<',begin,breakAtPos); // this assumes that all tags start with '<' 492 // parseText.lastIndexOf and indexOf return -1 if pos is out of range. 493 if (begin==-1) return null; 494 final Tag tag=getTagAt(source,begin,false); 495 if (tag!=null && tag.includeInSearch()) return tag; 496 } while ((begin-=1)>=0); 497 } catch (IndexOutOfBoundsException ex) { 498 throw new AssertionError("Unexpected internal exception"); 499 } 500 return null; 501 } 502 getNextTagUncached(final Source source, final int pos, final int breakAtPos)503 static final Tag getNextTagUncached(final Source source, final int pos, final int breakAtPos) { 504 // returns null if pos is out of range. 
505 try { 506 final ParseText parseText=source.getParseText(); 507 int begin=pos; 508 do { 509 begin=parseText.indexOf('<',begin,breakAtPos); // this assumes that all tags start with '<' 510 // parseText.lastIndexOf and indexOf return -1 if pos is out of range. 511 if (begin==-1) return null; 512 final Tag tag=getTagAt(source,begin,false); 513 if (tag!=null && tag.includeInSearch()) return tag; 514 } while ((begin+=1)<source.end); 515 } catch (IndexOutOfBoundsException ex) { 516 // this should only happen when the end of file is reached in the middle of a tag. 517 // we don't have to do anything to handle it as there are no more tags anyway. 518 } 519 return null; 520 } 521 getPreviousTag(final Source source, final int pos, final TagType tagType)522 static final Tag getPreviousTag(final Source source, final int pos, final TagType tagType) { 523 // returns null if pos is out of range. 524 if (source.useSpecialTypesCache) return source.cache.getPreviousTag(pos,tagType); 525 return getPreviousTagUncached(source,pos,tagType,ParseText.NO_BREAK); 526 } 527 getNextTag(final Source source, final int pos, final TagType tagType)528 static final Tag getNextTag(final Source source, final int pos, final TagType tagType) { 529 // returns null if pos is out of range. 530 if (source.useSpecialTypesCache) return source.cache.getNextTag(pos,tagType); 531 return getNextTagUncached(source,pos,tagType,ParseText.NO_BREAK); 532 } 533 getPreviousTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos)534 static final Tag getPreviousTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) { 535 // returns null if pos is out of range. 
if (tagType==null) return getPreviousTagUncached(source,pos,breakAtPos);
		final String startDelimiter=tagType.getStartDelimiter();
		try {
			final ParseText parseText=source.getParseText();
			int begin=pos;
			do {
				begin=parseText.lastIndexOf(startDelimiter,begin,breakAtPos);
				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
				if (begin==-1) return null;
				final Tag tag=getTagAt(source,begin,false);
				if (tag!=null && tag.getTagType()==tagType) return tag;
			} while ((begin-=1)>=0);
		} catch (IndexOutOfBoundsException ex) {
			// this should never happen during a get previous operation so rethrow it:
			throw ex;
		}
		return null;
	}

	static final Tag getNextTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
		// Forward search for the first tag of the given type, bypassing the next-tag cache.
		// A null tagType means "any type" and is handled by the three-argument overload.
		// Yields null when nothing matches, which includes the case of pos being out of range.
		if (tagType==null) return getNextTagUncached(source,pos,breakAtPos);
		final String delimiter=tagType.getStartDelimiter();
		try {
			final ParseText text=source.getParseText();
			int searchFrom=pos;
			while (true) {
				// breakAtPos is handed unchanged to ParseText.indexOf, which reports -1 when no delimiter is found
				// (or when the starting position is out of range).
				final int candidate=text.indexOf(delimiter,searchFrom,breakAtPos);
				if (candidate==-1) return null;
				final Tag found=getTagAt(source,candidate,false);
				// The delimiter alone is not proof of a match: a tag must exist here and be of exactly the requested type.
				if (found!=null && found.getTagType()==tagType) return found;
				// Otherwise resume the search one character further on, unless that is the end of the source.
				searchFrom=candidate+1;
				if (searchFrom>=source.end) break;
			}
		} catch (IndexOutOfBoundsException ex) {
			// Only expected when the end of file is reached in the middle of a tag.
			// Nothing needs to be done about it because there can be no further tags.
		}
		return null;
	}

	static final Tag getTagAt(final Source source, final int pos, final boolean serverTagOnly) {
		// Fetches the tag at pos, going through the source's cache when the all-types cache is enabled
		// and parsing directly otherwise.
		// Yields null if pos is out of range.
		if (source.useAllTypesCache) {
			return source.cache.getTagAt(pos,serverTagOnly);
		}
		return getTagAtUncached(source,pos,serverTagOnly);
	}

	static final Tag getTagAtUncached(final Source source, final int pos, final boolean serverTagOnly) {
		// Parses the tag at pos without consulting the cache.
		// The last argument to TagType.getTagAt is fixed at false here; parseAllgetNextTag is the caller that passes assumeNoNestedTags in that position.
		// Yields null if pos is out of range.
		final Tag parsed=TagType.getTagAt(source,pos,serverTagOnly,false);
		return parsed;
	}

	static final Tag[] parseAll(final Source source, final boolean assumeNoNestedTags) {
		// Walks the whole source from position 0 collecting every tag, loads the results into the source's cache,
		// publishes the registered tags on the source and links each registered tag to its neighbours.
		// The returned array holds the registered tags only.
		int registeredTotal=0;
		int registeredStartTotal=0;
		final ArrayList<Tag> everyTag=new ArrayList<Tag>();
		// fullSequentialParseData is simply a holder for a single mutable integer.
		// It holds the end position of the last normal tag (ie one that ignores enclosed markup), or MAX_VALUE if we are in a SCRIPT element.
		source.fullSequentialParseData=new int[1];
		if (source.end!=0) {
			final ParseText text=source.getParseText();
			Tag current=parseAllgetNextTag(source,text,0,assumeNoNestedTags);
			while (current!=null) {
				everyTag.add(current);
				if (!current.isUnregistered()) {
					registeredTotal++;
					if (current instanceof StartTag) registeredStartTotal++;
				}
				// Resume after the end of this tag if either:
				// - this is a server comment (which doesn't allow any other tags within it)
				// - or we're assuming tags don't appear inside other tags, as long as this tag is not an unregistered tag.
				// In every other case resume one character past the start of this tag.
				final int resumeAt;
				if (current.getTagType()==StartTagType.SERVER_COMMON_COMMENT || (assumeNoNestedTags && !current.isUnregistered())) {
					resumeAt=current.end;
				} else {
					resumeAt=current.begin+1;
				}
				if (resumeAt==source.end) break;
				current=parseAllgetNextTag(source,text,resumeAt,assumeNoNestedTags);
			}
		}
		final Tag[] registeredTags=new Tag[registeredTotal];
		final StartTag[] registeredStartTags=new StartTag[registeredStartTotal];
		source.cache.loadAllTags(everyTag,registeredTags,registeredStartTags);
		source.allTagsArray=registeredTags;
		source.allTags=Arrays.asList(registeredTags);
		source.allStartTags=Arrays.asList(registeredStartTags);
		// Chain the registered tags together; the first has no previous tag and the last has no next tag.
		final int count=registeredTags.length;
		for (int index=0; index<count; index++) {
			final Tag linked=registeredTags[index];
			linked.previousTag=(index==0) ? null : registeredTags[index-1];
			linked.nextTag=(index+1<count) ? registeredTags[index+1] : null;
		}
		return registeredTags;
	}

	private static final Tag parseAllgetNextTag(final Source source, final ParseText parseText, final int pos, final boolean assumeNoNestedTags) {
		// Helper for parseAll: finds the next tag of any type at or after pos and keeps
		// source.fullSequentialParseData[0] up to date while doing so.
		// Yields null when there are no more tags.
		try {
			int searchFrom=pos;
			do {
				final int candidate=parseText.indexOf('<',searchFrom); // this assumes that all tags start with '<'
				if (candidate==-1) return null;
				searchFrom=candidate+1;
				final Tag found=TagType.getTagAt(source,candidate,false,assumeNoNestedTags);
				if (found==null) continue; // no tag at this '<', so try the next one
				if (!assumeNoNestedTags) {
					// POSSIBLE BUG:
					// It appears that this code should be executed even if assumeNoNestedTags is true.
					// This was originally not the case when first created, but the subsequent addition of the SCRIPT element handling means it should always be executed.
					// This should be proven and fixed if assumeNoNestedTags is ever allowed to be true (at present it is hard coded to false).
					final TagType type=found.getTagType();
					// Only a tag ending beyond the recorded position can move it, and DOCTYPE declarations and unregistered tags never do.
					final boolean movesBoundary=found.end>source.fullSequentialParseData[0]
						&& type!=StartTagType.DOCTYPE_DECLARATION
						&& type!=StartTagType.UNREGISTERED
						&& type!=EndTagType.UNREGISTERED;
					if (movesBoundary) {
						// A normal, non-empty SCRIPT start tag sets the value to MAX_VALUE; any other tag records its own end position.
						final boolean opensScript=type==StartTagType.NORMAL
							&& found.name==HTMLElementName.SCRIPT
							&& !((StartTag)found).isEmptyElementTag();
						source.fullSequentialParseData[0]=opensScript ? Integer.MAX_VALUE : found.end;
					}
				}
				return found;
			} while (searchFrom<source.end);
		} catch (IndexOutOfBoundsException ex) {
			// Only expected when the end of file is reached in the middle of a tag.
			// Nothing needs to be done about it because there can be no further tags.
		}
		return null;
	}

	void orphan() {
		// Marks this tag as orphaned by resetting nextTag to NOT_CACHED; see isOrphaned() for details.
		nextTag=NOT_CACHED;
	}

	boolean isOrphaned() {
		// Indicates whether this tag has been orphaned after being cleared from the cache by a full sequential parse after it was constructed.
		// The nextTag field doubles as the flag so that no extra memory is allocated for such a rare issue.
		// This means that getNextTag() shouldn't set the nextTag field if this tag is orphaned.
		if (!source.wasFullSequentialParseCalled()) return false;
		return nextTag==NOT_CACHED;
	}
}