1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20 
21 package net.htmlparser.jericho;
22 
23 import java.util.*;
24 
25 /**
26  * Represents either a {@link StartTag} or {@link EndTag} in a specific {@linkplain Source source} document.
27  * <p>
28  * Take the following HTML segment as an example:
29  * <p>
30  * <code>&lt;p&gt;This is a sample paragraph.&lt;/p&gt;</code>
31  * <p>
32  * The "<code>&lt;p&gt;</code>" is represented by a {@link StartTag} object, and the "<code>&lt;/p&gt;</code>" is represented by an {@link EndTag} object,
33  * both of which are subclasses of the <code>Tag</code> class.
34  * The whole segment, including the start tag, its corresponding end tag and all of the content in between, is represented by an {@link Element} object.
35  *
36  * <h3><a name="ParsingProcess">Tag Parsing Process</a></h3>
37  * The following process describes how each tag is identified by the parser:
38  * <ol class="Separated">
39  *  <li>
40  *   Every '<code>&lt;</code>' character found in the source document is considered to be the start of a tag.
41  *   The characters following it are compared with the {@linkplain TagType#getStartDelimiter() start delimiters}
42  *   of all the {@linkplain TagType#register() registered} {@linkplain TagType tag types}, and a list of matching tag types
43  *   is determined.
44  *  <li>
45  *   A more detailed analysis of the source is performed according to the features of each matching tag type from the first step,
46  *   in order of <a href="TagType.html#Precedence">precedence</a>, until a valid tag is able to be constructed.
47  *   <p>
48  *   The analysis performed in relation to each candidate tag type is a two-stage process:
49  *   <ol>
50  *    <li>
51  *     The position of the tag is checked to determine whether it is {@linkplain TagType#isValidPosition(Source,int,int[]) valid}.
52  *     In theory, a {@linkplain TagType#isServerTag() server tag} is valid in any position, but a non-server tag is not valid inside any other tag,
53  *     nor inside elements with CDATA content such as {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
54  *     Theory dictates therefore that {@linkplain StartTagType#COMMENT comments} and explicit {@linkplain StartTagType#CDATA_SECTION CDATA sections}
55  *     inside script elements should not be recognised as tags.
 *     The behaviour of the parser however does not always strictly adhere to the theory, to maintain compatibility with major browsers
57  *     and also for efficiency reasons.
58  *     <p>
59  *     The {@link TagType#isValidPosition(Source, int pos, int[] fullSequentialParseData)} method is responsible for this check
60  *     and has a common default implementation for all tag types
61  *     (although <a href="TagType.html#custom">custom</a> tag types can override it if necessary).
 *     Its behaviour differs depending on whether or not a {@linkplain Source#fullSequentialParse() full sequential parse} is performed.
63  *     See the documentation of the {@link TagType#isValidPosition(Source,int,int[]) isValidPosition} method for full details.
64  *    <li>
65  *     A final analysis is performed by the {@link TagType#constructTagAt(Source, int pos)} method of the candidate tag type.
66  *     This method returns a valid {@link Tag} object if all conditions of the candidate tag type are met, otherwise it returns
67  *     <code>null</code> and the process continues with the next candidate tag type.
68  *   </ol>
69  *  <li>
70  *   If the source does not match the start delimiter or syntax of any registered tag type, the segment spanning it and the next
71  *   '<code>&gt;</code>' character is taken to be an {@linkplain #isUnregistered() unregistered} tag.
72  *   Some tag search methods ignore unregistered tags.  See the {@link #isUnregistered()} method for more information.
73  * </ol>
74  * <p>
75  * See the documentation of the {@link TagType} class for more details on how tags are recognised.
76  *
77  * <h3><a name="TagSearchMethods">Tag Search Methods</a></h3>
78  * <p>
79  * Methods that get tags in a source document are collectively referred to as <i>Tag Search Methods</i>.
80  * They are found mostly in the {@link Source} and {@link Segment} classes, and can be generally categorised as follows:
81  * <dl class="Separated">
82  *  <dt><a name="OpenSearch">Open Search:</a>
83  *   <dd>These methods search for tags of any {@linkplain #getName() name} and {@linkplain #getTagType() type}.
84  *    <ul class="Unseparated">
85  *     <li>{@link Tag#getNextTag()}
86  *     <li>{@link Tag#getPreviousTag()}
87  *     <li>{@link Segment#getAllElements()}
88  *     <li>{@link Segment#getFirstElement()}
89  *     <li>{@link Source#getTagAt(int pos)}
90  *     <li>{@link Source#getPreviousTag(int pos)}
91  *     <li>{@link Source#getNextTag(int pos)}
92  *     <li>{@link Source#getEnclosingTag(int pos)}
93  *     <li>{@link Segment#getAllTags()}
94  *     <li>{@link Segment#getAllStartTags()}
95  *     <li>{@link Segment#getFirstStartTag()}
96  *     <li>{@link Source#getPreviousStartTag(int pos)}
97  *     <li>{@link Source#getNextStartTag(int pos)}
98  *     <li>{@link Source#getPreviousEndTag(int pos)}
99  *     <li>{@link Source#getNextEndTag(int pos)}
100  *    </ul>
101  *  <dt><a name="NamedSearch">Named Search:</a>
102  *   <dd>These methods include a parameter called <code>name</code> which is used to specify the {@linkplain #getName() name} of the tag to search for.
103  *    Specifying a name that ends in a colon (<code>:</code>) searches for all elements or tags in the specified XML namespace.
104  *    <ul class="Unseparated">
105  *     <li>{@link Segment#getAllElements(String name)}
106  *     <li>{@link Segment#getFirstElement(String name)}
107  *     <li>{@link Segment#getAllStartTags(String name)}
108  *     <li>{@link Segment#getFirstStartTag(String name)}
109  *     <li>{@link Source#getPreviousStartTag(int pos, String name)}
110  *     <li>{@link Source#getNextStartTag(int pos, String name)}
111  *     <li>{@link Source#getPreviousEndTag(int pos, String name)}
112  *     <li>{@link Source#getNextEndTag(int pos, String name)}
113  *     <li>{@link Source#getNextEndTag(int pos, String name, EndTagType)}
114  *    </ul>
115  *  <dt><a name="TagTypeSearch">Tag Type Search:</a>
116  *   <dd>These methods typically include a parameter called <code>tagType</code> which is used to specify the {@linkplain #getTagType() type} of the tag to search for.
117  *    In some methods the search parameter is restricted to the {@link StartTagType} or {@link EndTagType} subclass of <code>TagType</code>.
118  *    <ul class="Unseparated">
119  *     <li>{@link Segment#getAllElements(StartTagType)}
120  *     <li>{@link Segment#getAllTags(TagType)}
121  *     <li>{@link Segment#getAllStartTags(StartTagType)}
122  *     <li>{@link Segment#getFirstStartTag(StartTagType)}
123  *     <li>{@link Source#getPreviousTag(int pos, TagType)}
124  *     <li>{@link Source#getPreviousStartTag(int pos, StartTagType)}
125  *     <li>{@link Source#getPreviousEndTag(int pos, EndTagType)}
126  *     <li>{@link Source#getNextTag(int pos, TagType)}
127  *     <li>{@link Source#getNextStartTag(int pos, StartTagType)}
128  *     <li>{@link Source#getNextEndTag(int pos, EndTagType)}
129  *     <li>{@link Source#getEnclosingTag(int pos, TagType)}
130  *     <li>{@link Source#getNextEndTag(int pos, String name, EndTagType)}
131  *    </ul>
132  *  <dt><a name="OtherSearch">Attribute Search:</a>
133  *   <dd>These methods perform the search based on an attribute name and value.
134  *    <ul class="Unseparated">
135  *     <li>{@link Segment#getAllElements(String attributeName, String value, boolean valueCaseSensitive)}
136  *     <li>{@link Segment#getFirstElement(String attributeName, String value, boolean valueCaseSensitive)}
137  *     <li>{@link Segment#getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)}
138  *     <li>{@link Segment#getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)}
139  *     <li>{@link Segment#getAllElements(String attributeName, Pattern valueRegexPattern)}
140  *     <li>{@link Segment#getFirstElement(String attributeName, Pattern valueRegexPattern)}
141  *     <li>{@link Segment#getAllStartTags(String attributeName, Pattern valueRegexPattern)}
142  *     <li>{@link Segment#getFirstStartTag(String attributeName, Pattern valueRegexPattern)}
143  *     <li>{@link Segment#getAllElementsByClass(String className)}
144  *     <li>{@link Segment#getFirstElementByClass(String className)}
145  *     <li>{@link Segment#getAllStartTagsByClass(String className)}
146  *     <li>{@link Segment#getFirstStartTagByClass(String className)}
147  *     <li>{@link Source#getElementById(String id)}
148  *     <li>{@link Source#getNextElement(int pos, String attributeName, Pattern valueRegexPattern)}
149  *     <li>{@link Source#getNextElement(int pos, String attributeName, String value, boolean valueCaseSensitive)}
150  *     <li>{@link Source#getNextElementByClass(int pos, String className)}
151  *     <li>{@link Source#getNextStartTag(int pos, String attributeName, Pattern valueRegexPattern)}
152  *     <li>{@link Source#getNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive)}
153  *     <li>{@link Source#getNextStartTagByClass(int pos, String className)}
154  *    </ul>
155  * </dl>
156  */
157 public abstract class Tag extends Segment {
	// Tag name as stored by the four-argument constructor; the no-argument constructor leaves it null.
	String name=null; // always lower case, can always use == operator to compare with constants in HTMLElementName interface
	// Application-defined object exposed through getUserData() and setUserData(Object).
	private Object userData=null;
	// cached values:
	Element element=Element.NOT_CACHED;
	// previousTag and nextTag hold the NOT_CACHED sentinel until getPreviousTag() / getNextTag() store a result in them.
	private Tag previousTag=NOT_CACHED; // does not include unregistered tags
	private Tag nextTag=NOT_CACHED; // does not include unregistered tags
	// A NOT_CACHED value in nextTag can also indicate that this tag is not in the cache. See isOrphaned() for details.

	// Sentinel meaning "not looked up yet". It is compared by identity (==) and is distinct from null,
	// which the tag getters use to report that there is no such tag.
	// NOTE(review): the instance initialisers above read NOT_CACHED while this very instance is still being
	// constructed, so the sentinel's own previousTag/nextTag presumably remain null - confirm nothing relies on them.
	static final Tag NOT_CACHED=new StartTag();

	private static final boolean INCLUDE_UNREGISTERED_IN_SEARCH=false; // determines whether unregistered tags are included in searches
169 
Tag(final Source source, final int begin, final int end, final String name)170 	Tag(final Source source, final int begin, final int end, final String name) {
171 		super(source,begin,end);
172 		this.name=HTMLElements.getConstantElementName(name.toLowerCase());
173 	}
174 
175 	// only used to create Tag.NOT_CACHED
Tag()176 	Tag() {}
177 
178 	/**
179 	 * Returns the {@linkplain Element element} that is started or ended by this tag.
180 	 * <p>
181 	 * {@link StartTag#getElement()} is guaranteed not <code>null</code>.
182 	 * <p>
183 	 * {@link EndTag#getElement()} can return <code>null</code> if the end tag is not properly matched to a start tag.
184 	 *
185 	 * @return the {@linkplain Element element} that is started or ended by this tag.
186 	 */
	public abstract Element getElement(); // abstract: each concrete tag class supplies its own implementation
188 
189 	/**
190 	 * Returns the name of this tag, always in lower case.
191 	 * <p>
192 	 * The name always starts with the {@linkplain TagType#getNamePrefix() name prefix} defined in this tag's {@linkplain TagType type}.
193 	 * For some tag types, the name consists only of this prefix, while in others it must be followed by a valid
194 	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML name</a>
195 	 * (see {@link StartTagType#isNameAfterPrefixRequired()}).
196 	 * <p>
197 	 * If the name is equal to one of the constants defined in the {@link HTMLElementName} interface, this method is guaranteed to return
198 	 * the constant itself.
199 	 * This allows comparisons to be performed using the <code>==</code> operator instead of the less efficient
200 	 * <code>String.equals(Object)</code> method.
201 	 * <p>
202 	 * For example, the following expression can be used to test whether a {@link StartTag} is from a
203 	 * <code><a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#edef-SELECT">SELECT</a></code> element:
204 	 * <br /><code>startTag.getName()==HTMLElementName.SELECT</code>
205 	 * <p>
206 	 * To get the name of this tag in its original case, use {@link #getNameSegment()}<code>.toString()</code>.
207 	 *
208 	 * @return the name of this tag, always in lower case.
209 	 */
getName()210 	public final String getName() {
211 		return name;
212 	}
213 
214 	/**
215 	 * Returns the segment spanning the {@linkplain #getName() name} of this tag.
216 	 * <p>
217 	 * The code <code>getNameSegment().toString()</code> can be used to retrieve the name of this tag in its original case.
218 	 * <p>
219 	 * Every call to this method constructs a new <code>Segment</code> object.
220 	 *
221 	 * @return the segment spanning the {@linkplain #getName() name} of this tag.
222 	 * @see #getName()
223 	 */
getNameSegment()224 	public Segment getNameSegment() {
225 		final int nameSegmentBegin=begin+getTagType().startDelimiterPrefix.length();
226 		return new Segment(source,nameSegmentBegin,nameSegmentBegin+name.length());
227 	}
228 
229 	/**
230 	 * Returns the {@linkplain TagType type} of this tag.
231 	 * @return the {@linkplain TagType type} of this tag.
232 	 */
	public abstract TagType getTagType(); // abstract: each concrete tag class supplies its own implementation
234 
235 	/**
236 	 * Returns the general purpose user data object that has previously been associated with this tag via the {@link #setUserData(Object)} method.
237 	 * <p>
238 	 * If {@link #setUserData(Object)} has not been called, this method returns <code>null</code>.
239 	 *
240 	 * @return the generic data object that has previously been associated with this tag via the {@link #setUserData(Object)} method.
241 	 */
getUserData()242 	public Object getUserData() {
243 		return userData;
244 	}
245 
246 	/**
247 	 * Associates the specified general purpose user data object with this tag.
248 	 * <p>
249 	 * This property can be useful for applications that need to associate extra information with tags.
250 	 * The object can be retrieved later via the {@link #getUserData()} method.
251 	 *
252 	 * @param userData  general purpose user data of any type.
253 	 */
	public void setUserData(final Object userData) {
		// Stored as given (null included) and handed back unchanged by getUserData().
		this.userData=userData;
	}
257 
258 	/**
259 	 * Returns the next tag in the source document.
260 	 * <p>
261 	 * This method also returns {@linkplain TagType#isServerTag() server tags}.
262 	 * <p>
263 	 * The result of a call to this method is cached.
264 	 * Performing a {@linkplain Source#fullSequentialParse() full sequential parse} prepopulates this cache.
265 	 * <p>
266 	 * If the result is not cached, a call to this method is equivalent to <code>source.</code>{@link Source#getNextTag(int) getNextTag}<code>(</code>{@link #getBegin() getBegin()}<code>+1)</code>.
267 	 * <p>
268 	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
269 	 *
270 	 * @return the next tag in the source document, or <code>null</code> if this is the last tag.
271 	 */
getNextTag()272 	public Tag getNextTag() {
273 		if (nextTag==NOT_CACHED) {
274 			final Tag localNextTag=getNextTag(source,begin+1);
275 			if (source.wasFullSequentialParseCalled()) return localNextTag; // Don't set nextTag if this is an orphaned tag. See isOrphaned() for details.
276 			nextTag=localNextTag;
277 		}
278 		return nextTag;
279 	}
280 
281 	/**
282 	 * Returns the previous tag in the source document.
283 	 * <p>
284 	 * This method also returns {@linkplain TagType#isServerTag() server tags}.
285 	 * <p>
286 	 * The result of a call to this method is cached.
287 	 * Performing a {@linkplain Source#fullSequentialParse() full sequential parse} prepopulates this cache.
288 	 * <p>
289 	 * If the result is not cached, a call to this method is equivalent to <code>source.</code>{@link Source#getPreviousTag(int) getPreviousTag}<code>(</code>{@link #getBegin() getBegin()}<code>-1)</code>.
290 	 * <p>
291 	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
292 	 *
293 	 * @return the previous tag in the source document, or <code>null</code> if this is the first tag.
294 	 */
getPreviousTag()295 	public Tag getPreviousTag() {
296 		if (previousTag==NOT_CACHED) previousTag=getPreviousTag(source,begin-1);
297 		return previousTag;
298 	}
299 
300 	/**
301 	 * Indicates whether this tag has a syntax that does not match any of the {@linkplain TagType#register() registered} {@linkplain TagType tag types}.
302 	 * <p>
303  	 * The only requirement of an unregistered tag type is that it {@linkplain TagType#getStartDelimiter() starts} with
304  	 * '<code>&lt;</code>' and there is a {@linkplain TagType#getClosingDelimiter() closing} '<code>&gt;</code>' character
305  	 * at some position after it in the source document.
306 	 * <p>
307 	 * The absence or presence of a '<code>/</code>' character after the initial '<code>&lt;</code>' determines whether an
308 	 * unregistered tag is respectively a
309 	 * {@link StartTag} with a {@linkplain #getTagType() type} of {@link StartTagType#UNREGISTERED} or an
310 	 * {@link EndTag} with a {@linkplain #getTagType() type} of {@link EndTagType#UNREGISTERED}.
311 	 * <p>
312 	 * There are no restrictions on the characters that might appear between these delimiters, including other '<code>&lt;</code>'
313 	 * characters.  This may result in a '<code>&gt;</code>' character that is identified as the closing delimiter of two
314 	 * separate tags, one an unregistered tag, and the other a tag of any type that {@linkplain #getBegin() begins} in the middle
315 	 * of the unregistered tag.  As explained below, unregistered tags are usually only found when specifically looking for them,
316 	 * so it is up to the user to detect and deal with any such nonsensical results.
317 	 * <p>
318 	 * Unregistered tags are only returned by the {@link Source#getTagAt(int pos)} method,
319 	 * <a href="Tag.html#NamedSearch">named search</a> methods, where the specified <code>name</code>
320 	 * matches the first characters inside the tag, and by <a href="Tag.html#TagTypeSearch">tag type search</a> methods, where the
321 	 * specified <code>tagType</code> is either {@link StartTagType#UNREGISTERED} or {@link EndTagType#UNREGISTERED}.
322 	 * <p>
323 	 * <a href="Tag.html#OpenSearch">Open</a> tag searches and <a href="Tag.html#OtherSearch">other</a> searches always ignore
324 	 * unregistered tags, although every discovery of an unregistered tag is {@linkplain Source#getLogger() logged} by the parser.
325 	 * <p>
326 	 * The logic behind this design is that unregistered tag types are usually the result of a '<code>&lt;</code>' character
327 	 * in the text that was mistakenly left {@linkplain CharacterReference#encode(CharSequence) unencoded}, or a less-than
328 	 * operator inside a script, or some other occurrence which is of no interest to the user.
329 	 * By returning unregistered tags in <a href="Tag.html#NamedSearch">named</a> and <a href="Tag.html#TagTypeSearch">tag type</a>
330 	 * search methods, the library allows the user to specifically search for tags with a certain syntax that does not match any
331 	 * existing {@link TagType}.  This expediency feature avoids the need for the user to create a
332 	 * <a href="TagType.html#Custom">custom tag type</a> to define the syntax before searching for these tags.
333 	 * By not returning unregistered tags in the less specific search methods, it is providing only the information that
334 	 * most users are interested in.
335 	 *
336 	 * @return <code>true</code> if this tag has a syntax that does not match any of the {@linkplain TagType#register() registered} {@linkplain TagType tag types}, otherwise <code>false</code>.
337 	 */
	public abstract boolean isUnregistered(); // abstract: each concrete tag class supplies its own implementation
339 
340 	/**
341 	 * Returns an XML representation of this tag.
342 	 * <p>
343 	 * This is an abstract method which is implemented in the {@link StartTag} and {@link EndTag} subclasses.
344 	 * See the documentation of the {@link StartTag#tidy()} and {@link EndTag#tidy()} methods for details.
345 	 *
346 	 * @return an XML representation of this tag.
347 	 */
	public abstract String tidy(); // abstract: each concrete tag class supplies its own implementation
349 
350 	/**
351 	 * Indicates whether the specified text is a valid <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
352 	 * <p>
353 	 * This implementation first checks that the first character of the specified text is a valid XML Name start character
354 	 * as defined by the {@link #isXMLNameStartChar(char)} method, and then checks that the rest of the characters are valid
355 	 * XML Name characters as defined by the {@link #isXMLNameChar(char)} method.
356 	 * <p>
357 	 * Note that this implementation does not exactly adhere to the
358 	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">formal definition of an XML Name</a>,
359 	 * but the differences are unlikely to be significant in real-world XML or HTML documents.
360 	 *
361 	 * @param text  the text to test.
362 	 * @return <code>true</code> if the specified text is a valid <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>.
363 	 * @see Source#getNameEnd(int pos)
364 	 */
isXMLName(final CharSequence text)365 	public static final boolean isXMLName(final CharSequence text) {
366 		if (text==null || text.length()==0 || !isXMLNameStartChar(text.charAt(0))) return false;
367 		for (int i=1; i<text.length(); i++)
368 			if (!isXMLNameChar(text.charAt(i))) return false;
369 		return true;
370 	}
371 
372 	/**
373 	 * Indicates whether the specified character is valid at the start of an
374 	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
375 	 * <p>
376 	 * The <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a> defines a
377 	 * <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">Name</a></code> as starting with one of the characters
378 	 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a> | '_' | ':')</code>.
379 	 * <p>
380 	 * This method uses the expression
381 	 * <br /><code>Character.isLetter(ch) || ch=='_' || ch==':'</code>.
382 	 * <p>
383 	 * Note that there are many differences between the <code>Character.isLetter()</code> definition of a Letter and the
384 	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">XML definition of a Letter</a>,
385 	 * but these differences are unlikely to be significant in real-world XML or HTML documents.
386 	 *
387 	 * @param ch  the character to test.
388 	 * @return <code>true</code> if the specified character is valid at the start of an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>.
389 	 * @see Source#getNameEnd(int pos)
390 	 */
isXMLNameStartChar(final char ch)391 	public static final boolean isXMLNameStartChar(final char ch) {
392 		return Character.isLetter(ch) || ch=='_' || ch==':';
393 	}
394 
395 	/**
396 	 * Indicates whether the specified character is valid anywhere in an
397 	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
398 	 * <p>
399 	 * The <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a> uses the
400 	 * entity <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-NameChar">NameChar</a></code> to represent this set of
401 	 * characters, which is defined as
402 	 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a>
403 	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Digit">Digit</a> | '.' | '-' | '_' | ':'
404 	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-CombiningChar">CombiningChar</a>
405 	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Extender">Extender</a>)</code>.
406 	 * <p>
407 	 * This method uses the expression
408 	 * <br /><code>Character.isLetterOrDigit(ch) || ch=='.' || ch=='-' || ch=='_' || ch==':'</code>.
409 	 * <p>
410 	 * Note that there are many differences between these definitions,
411 	 * but these differences are unlikely to be significant in real-world XML or HTML documents.
412 	 *
413 	 * @param ch  the character to test.
414 	 * @return <code>true</code> if the specified character is valid anywhere in an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>.
415 	 * @see Source#getNameEnd(int pos)
416 	 */
isXMLNameChar(final char ch)417 	public static final boolean isXMLNameChar(final char ch) {
418 		return Character.isLetterOrDigit(ch) || ch=='.' || ch=='-' || ch=='_' || ch==':';
419 	}
420 
421 	// *** consider making public
getNextStartTag()422 	StartTag getNextStartTag() {
423 		Tag tag=this;
424 		while (true) {
425 			tag=tag.getNextTag();
426 			if (tag==null) return null;
427 			if (tag instanceof StartTag) return (StartTag)tag;
428 		}
429 	}
430 
431 	// *** consider making public
getPreviousStartTag()432 	StartTag getPreviousStartTag() {
433 		Tag tag=this;
434 		while (true) {
435 			tag=tag.getPreviousTag();
436 			if (tag==null) return null;
437 			if (tag instanceof StartTag) return (StartTag)tag;
438 		}
439 	}
440 
441 	// *** consider making public
getNextTag(final TagType tagType)442 	Tag getNextTag(final TagType tagType) {
443 		if (tagType==null) return getNextTag();
444 		if (tagType==StartTagType.UNREGISTERED || tagType==EndTagType.UNREGISTERED) return getNextTag(source,begin+1,tagType);
445 		Tag tag=this;
446 		while (true) {
447 			if (tag.nextTag==NOT_CACHED) return getNextTag(source,tag.begin+1,tagType);
448 			tag=tag.nextTag;
449 			if (tag==null) return null;
450 			if (tag.getTagType()==tagType) return tag;
451 		}
452 	}
453 
454 	// *** consider making public
getPreviousTag(final TagType tagType)455 	Tag getPreviousTag(final TagType tagType) {
456 		if (tagType==null) return getPreviousTag();
457 		if (tagType==StartTagType.UNREGISTERED || tagType==EndTagType.UNREGISTERED) return getPreviousTag(source,begin-1,tagType);
458 		Tag tag=this;
459 		while (true) {
460 			if (tag.previousTag==NOT_CACHED) return getPreviousTag(source,tag.begin-1,tagType);
461 			tag=tag.previousTag;
462 			if (tag==null) return null;
463 			if (tag.getTagType()==tagType) return tag;
464 		}
465 	}
466 
includeInSearch()467 	final boolean includeInSearch() {
468 		return INCLUDE_UNREGISTERED_IN_SEARCH || !isUnregistered();
469 	}
470 
getPreviousTag(final Source source, final int pos)471 	static final Tag getPreviousTag(final Source source, final int pos) {
472 		// returns null if pos is out of range.
473 		return source.useAllTypesCache
474 			? source.cache.getPreviousTag(pos)
475 			: getPreviousTagUncached(source,pos,ParseText.NO_BREAK);
476 	}
477 
getNextTag(final Source source, final int pos)478 	static final Tag getNextTag(final Source source, final int pos) {
479 		// returns null if pos is out of range.
480 		return source.useAllTypesCache
481 			? source.cache.getNextTag(pos)
482 			: getNextTagUncached(source,pos,ParseText.NO_BREAK);
483 	}
484 
getPreviousTagUncached(final Source source, final int pos, final int breakAtPos)485 	static final Tag getPreviousTagUncached(final Source source, final int pos, final int breakAtPos) {
486 		// returns null if pos is out of range.
487 		try {
488 			final ParseText parseText=source.getParseText();
489 			int begin=pos;
490 			do {
491 				begin=parseText.lastIndexOf('<',begin,breakAtPos); // this assumes that all tags start with '<'
492 				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
493 				if (begin==-1) return null;
494 				final Tag tag=getTagAt(source,begin,false);
495 				if (tag!=null && tag.includeInSearch()) return tag;
496 			} while ((begin-=1)>=0);
497 		} catch (IndexOutOfBoundsException ex) {
498 			throw new AssertionError("Unexpected internal exception");
499 		}
500 		return null;
501 	}
502 
getNextTagUncached(final Source source, final int pos, final int breakAtPos)503 	static final Tag getNextTagUncached(final Source source, final int pos, final int breakAtPos) {
504 		// returns null if pos is out of range.
505 		try {
506 			final ParseText parseText=source.getParseText();
507 			int begin=pos;
508 			do {
509 				begin=parseText.indexOf('<',begin,breakAtPos); // this assumes that all tags start with '<'
510 				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
511 				if (begin==-1) return null;
512 				final Tag tag=getTagAt(source,begin,false);
513 				if (tag!=null && tag.includeInSearch()) return tag;
514 			} while ((begin+=1)<source.end);
515 		} catch (IndexOutOfBoundsException ex) {
516 			// this should only happen when the end of file is reached in the middle of a tag.
517 			// we don't have to do anything to handle it as there are no more tags anyway.
518 		}
519 		return null;
520 	}
521 
getPreviousTag(final Source source, final int pos, final TagType tagType)522 	static final Tag getPreviousTag(final Source source, final int pos, final TagType tagType) {
523 		// returns null if pos is out of range.
524 		if (source.useSpecialTypesCache) return source.cache.getPreviousTag(pos,tagType);
525 		return getPreviousTagUncached(source,pos,tagType,ParseText.NO_BREAK);
526 	}
527 
getNextTag(final Source source, final int pos, final TagType tagType)528 	static final Tag getNextTag(final Source source, final int pos, final TagType tagType) {
529 		// returns null if pos is out of range.
530 		if (source.useSpecialTypesCache) return source.cache.getNextTag(pos,tagType);
531 		return getNextTagUncached(source,pos,tagType,ParseText.NO_BREAK);
532 	}
533 
getPreviousTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos)534 	static final Tag getPreviousTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
535 		// returns null if pos is out of range.
536 		if (tagType==null) return getPreviousTagUncached(source,pos,breakAtPos);
537 		final String startDelimiter=tagType.getStartDelimiter();
538 		try {
539 			final ParseText parseText=source.getParseText();
540 			int begin=pos;
541 			do {
542 				begin=parseText.lastIndexOf(startDelimiter,begin,breakAtPos);
543 				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
544 				if (begin==-1) return null;
545 				final Tag tag=getTagAt(source,begin,false);
546 				if (tag!=null && tag.getTagType()==tagType) return tag;
547 			} while ((begin-=1)>=0);
548 		} catch (IndexOutOfBoundsException ex) {
549 			// this should never happen during a get previous operation so rethrow it:
550 			throw ex;
551 		}
552 		return null;
553 	}
554 
getNextTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos)555 	static final Tag getNextTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
556 		// returns null if pos is out of range.
557 		if (tagType==null) return getNextTagUncached(source,pos,breakAtPos);
558 		final String startDelimiter=tagType.getStartDelimiter();
559 		try {
560 			final ParseText parseText=source.getParseText();
561 			int begin=pos;
562 			do {
563 				begin=parseText.indexOf(startDelimiter,begin,breakAtPos);
564 				// parseText.lastIndexOf and indexOf return -1 if pos is out of range.
565 				if (begin==-1) return null;
566 				final Tag tag=getTagAt(source,begin,false);
567 				if (tag!=null && tag.getTagType()==tagType) return tag;
568 			} while ((begin+=1)<source.end);
569 		} catch (IndexOutOfBoundsException ex) {
570 			// this should only happen when the end of file is reached in the middle of a tag.
571 			// we don't have to do anything to handle it as there are no more tags anyway.
572 		}
573 		return null;
574 	}
575 
getTagAt(final Source source, final int pos, final boolean serverTagOnly)576 	static final Tag getTagAt(final Source source, final int pos, final boolean serverTagOnly) {
577 		// returns null if pos is out of range.
578 		return source.useAllTypesCache
579 			? source.cache.getTagAt(pos,serverTagOnly)
580 			: getTagAtUncached(source,pos,serverTagOnly);
581 	}
582 
getTagAtUncached(final Source source, final int pos, final boolean serverTagOnly)583 	static final Tag getTagAtUncached(final Source source, final int pos, final boolean serverTagOnly) {
584 		// returns null if pos is out of range.
585 		return TagType.getTagAt(source,pos,serverTagOnly,false);
586 	}
587 
parseAll(final Source source, final boolean assumeNoNestedTags)588 	static final Tag[] parseAll(final Source source, final boolean assumeNoNestedTags) {
589 		int registeredTagCount=0;
590 		int registeredStartTagCount=0;
591 		final ArrayList<Tag> list=new ArrayList<Tag>();
592 		source.fullSequentialParseData=new int[1]; // fullSequentialParseData is simply a holder for a single mutable integer. It holds the end position of the last normal tag (ie one that ignores enclosed markup), or MAX_VALUE if we are in a SCRIPT element.
593 		if (source.end!=0) {
594 			final ParseText parseText=source.getParseText();
595 			Tag tag=parseAllgetNextTag(source,parseText,0,assumeNoNestedTags);
596 			while (tag!=null) {
597 				list.add(tag);
598 				if (!tag.isUnregistered()) {
599 					registeredTagCount++;
600 					if (tag instanceof StartTag) registeredStartTagCount++;
601 				}
602 				// Look for next tag after end of next tag if either:
603 				//   - this is a server comment (which doesn't allow any other tags within it)
604 				//   - or we're assuming tags don't appear inside other tags, as long as the last tag found was not an unregistered tag:
605 				final int pos=(tag.getTagType()==StartTagType.SERVER_COMMON_COMMENT || (assumeNoNestedTags && !tag.isUnregistered())) ? tag.end : tag.begin+1;
606 				if (pos==source.end) break;
607 				tag=parseAllgetNextTag(source,parseText,pos,assumeNoNestedTags);
608 			}
609 		}
610 		final Tag[] allRegisteredTags=new Tag[registeredTagCount];
611 		final StartTag[] allRegisteredStartTags=new StartTag[registeredStartTagCount];
612 		source.cache.loadAllTags(list,allRegisteredTags,allRegisteredStartTags);
613 		source.allTagsArray=allRegisteredTags;
614 		source.allTags=Arrays.asList(allRegisteredTags);
615 		source.allStartTags=Arrays.asList(allRegisteredStartTags);
616 		final int lastIndex=allRegisteredTags.length-1;
617 		for (int i=0; i<allRegisteredTags.length; i++) {
618 			final Tag tag=allRegisteredTags[i];
619 			tag.previousTag=i>0 ? allRegisteredTags[i-1] : null;
620 			tag.nextTag=i<lastIndex ? allRegisteredTags[i+1] : null;
621 		}
622 		return allRegisteredTags;
623 	}
624 
625 	private static final Tag parseAllgetNextTag(final Source source, final ParseText parseText, final int pos, final boolean assumeNoNestedTags) {
626 		try {
627 			int begin=pos;
628 			do {
629 				begin=parseText.indexOf('<',begin); // this assumes that all tags start with '<'
630 				if (begin==-1) return null;
631 				final Tag tag=TagType.getTagAt(source,begin,false,assumeNoNestedTags);
632 				if (tag!=null) {
633 					if (!assumeNoNestedTags) {
634 						// POSSIBLE BUG:
635 						// It appears that this code should be executed even if assumeNoNestedTags is true.
636 						// This was originally not the case when first created, but the subsequent addition of the SCRIPT element handling means it should always be executed.
637 						// This should be proven and fixed if assumeNoNestedTags is ever allowed to be true (at present it is hard coded to false).
638 						final TagType tagType=tag.getTagType();
639 						if (tag.end>source.fullSequentialParseData[0]
640 								&& tagType!=StartTagType.DOCTYPE_DECLARATION
641 								&& tagType!=StartTagType.UNREGISTERED && tagType!=EndTagType.UNREGISTERED) {
642 							source.fullSequentialParseData[0]=(tagType==StartTagType.NORMAL && tag.name==HTMLElementName.SCRIPT && !((StartTag)tag).isEmptyElementTag()) ? Integer.MAX_VALUE : tag.end;
643 						}
644 					}
645 					return tag;
646 				}
647 			} while ((begin+=1)<source.end);
648 		} catch (IndexOutOfBoundsException ex) {
649 			// this should only happen when the end of file is reached in the middle of a tag.
650 			// we don't have to do anything to handle it as there are no more tags anyway.
651 		}
652 		return null;
653 	}
654 
655 	void orphan() {
656 		// see isOrphaned() for details
657 		nextTag=NOT_CACHED;
658 	}
659 
660 	boolean isOrphaned() {
661 		// Indicates whether this tag has been orphaned after being cleared from the cache by a full sequential parse after it was constructed.
662 		// Use nextTag as a flag to avoid using the extra memory allocation for such a rare issue.
663 		// This means that getNextTag() shouldn't set the nextTag field if this tag is orphaned.
664 		return source.wasFullSequentialParseCalled() && nextTag==NOT_CACHED;
665 	}
666 }
667