1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20 
21 package net.htmlparser.jericho;
22 
23 import java.util.*;
24 import java.io.*;
25 
26 /**
27  * Represents an HTML <a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">Character Reference</a>,
28  * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
29  * <p>
30  * This class, together with its subclasses, contains static methods to perform most required operations
31  * without having to instantiate an object.
32  * <p>
33  * Instances of this class are useful when the positions of character references in a source document are required,
34  * or to replace the found character references with customised text.
35  * <p>
36  * <code>CharacterReference</code> instances are obtained using one of the following methods:
37  * <ul>
38  *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
39  *  <li>{@link Source#getNextCharacterReference(int pos)}
40  *  <li>{@link Source#getPreviousCharacterReference(int pos)}
41  *  <li>{@link Segment#getAllCharacterReferences()}
42  * </ul>
43  */
44 public abstract class CharacterReference extends Segment {
45 	int codePoint;
46 
47 	/**
48 	 * Represents an invalid unicode code point.
49 	 * <p>
50 	 * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF, or any other invalid character reference.
51 	 */
52 	public static final int INVALID_CODE_POINT=-1;
53 
54 	static int MAX_ENTITY_REFERENCE_LENGTH; // set in CharacterEntityReference static class initialisation
55 
56 	/** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
57 	private static final int TAB_LENGTH=4;
58 
CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
	// The segment bounds are handed straight to the Segment superclass.
	super(source, begin, end);
	// Keep the code point so that getCodePoint() and getChar() can serve it later.
	this.codePoint = codePoint;
}
63 
64 	/**
65 	 * Returns the <a target="_blank" href="http://www.unicode.org">unicode</a> code point represented by this character reference.
66 	 * @return the unicode code point represented by this character reference.
67 	 * @see #appendCharTo(Appendable)
68 	 */
getCodePoint()69 	public int getCodePoint() {
70 		return codePoint;
71 	}
72 
73 	/**
74 	 * Returns the character represented by this character reference.
75 	 * <p>
76 	 * If this character reference represents a unicode
77 	 * <a target="_blank" href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary code point</a>,
78 	 * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result.
79 	 * <p>
80 	 * To ensure that the character is correctly appended to an <code>Appendable</code> object such as a <code>Writer</code>, use the code:
81 	 * <br /><code>characterReference.</code>{@link #appendCharTo(Appendable) appendCharTo}<code>(appendable)</code><br />
82 	 * instead of:
83 	 * <br /><code>appendable.append(characterReference.getChar())</code>
84 	 *
85 	 * @return the character represented by this character reference.
86 	 * @see #appendCharTo(Appendable)
87 	 * @see #getCodePoint()
88 	 */
getChar()89 	public char getChar() {
90 		return (char)codePoint;
91 	}
92 
93 	/**
94 	 * Appends the character represented by this character reference to the specified appendable object.
95 	 * <p>
96 	 * If this character is a unicode <a target="_blank" href="http://unicode.org/glossary/#supplementary_character">supplementary character</a>,
97 	 * then both the UTF-16 high/low surrogate <code>char</code> values of the character are appended, as described in the
98 	 * <a target="_blank" href="http://java.sun.com/javase/6/docs/api/java/lang/Character.html#unicode">Unicode character representations</a> section of the
99 	 * <code>java.lang.Character</code> class.
100 	 * <p>
101 	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
102 	 * then calling this method on a non-breaking space character reference ({@link CharacterEntityReference#_nbsp &amp;nbsp;})
103 	 * results in a normal space being appended.
104 	 *
105 	 * @param appendable  the object to append this character reference to.
106 	 */
appendCharTo(Appendable appendable)107 	public final void appendCharTo(Appendable appendable) throws IOException {
108 		appendCharTo(appendable,Config.ConvertNonBreakingSpaces);
109 	}
110 
appendCharTo(Appendable appendable, final boolean convertNonBreakingSpaces)111 	private void appendCharTo(Appendable appendable, final boolean convertNonBreakingSpaces) throws IOException {
112 		if (Character.isSupplementaryCodePoint(codePoint)) {
113 			appendable.append(getHighSurrogate(codePoint));
114 			appendable.append(getLowSurrogate(codePoint));
115 		} else {
116 			final char ch=getChar();
117 			if (ch==CharacterEntityReference._nbsp && convertNonBreakingSpaces) {
118 				appendable.append(' ');
119 			} else {
120 				appendable.append(ch);
121 			}
122 		}
123 	}
124 
125 	/**
126 	 * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
127 	 * <p>
128 	 * Conversely, this library defines an <i><a name="Unterminated">unterminated</a></i> character reference as one which does
129 	 * not end with a semicolon.
130 	 * <p>
131 	 * The SGML specification allows unterminated character references in some circumstances, and because the
132 	 * HTML 4.01 specification states simply that
133 	 * "<a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">authors may use SGML character references</a>",
134 	 * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
135 	 * <p>
136 	 * Unterminated character references are not allowed in <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> documents.
137 	 *
138 	 * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
139 	 * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
140 	 */
isTerminated()141 	public boolean isTerminated() {
142 		return source.charAt(end-1)==';';
143 	}
144 
145 	/**
146 	 * Encodes the specified text, escaping special characters into character references.
147 	 * <p>
148 	 * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character,
149 	 * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode
150 	 * code point is greater than U+007F.
151 	 * <p>
152 	 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
153 	 * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property,
154 	 * is either left unencoded (default setting), or encoded as the numeric character reference "<code>&amp;#39;</code>".
155 	 * <p>
156 	 * This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
157 	 * as this entity is not defined for use in HTML.  See the comments in the {@link CharacterEntityReference} class for more information.
158 	 * <p>
159 	 * To encode text using only numeric character references, use the<br />
160 	 * {@link NumericCharacterReference#encode(CharSequence)} method instead.
161 	 *
162 	 * @param unencodedText  the text to encode.
163 	 * @return the encoded string.
164 	 * @see #decode(CharSequence)
165 	 */
encode(final CharSequence unencodedText)166 	public static String encode(final CharSequence unencodedText) {
167 		if (unencodedText==null) return null;
168 		try {
169 			return appendEncode(new StringBuilder(unencodedText.length()*2),unencodedText,false).toString();
170 		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
171 	}
172 
173 	/**
174 	 * Encodes the specified character into a character reference if {@linkplain #requiresEncoding(char) required}.
175 	 * <p>
176 	 * The encoding of the character follows the same rules as for each character in the {@link #encode(CharSequence unencodedText)} method.
177 	 *
178 	 * @param ch  the character to encode.
179 	 * @return a character reference if appropriate, otherwise a string containing the original character.
180 	 */
encode(final char ch)181 	public static String encode(final char ch) {
182 		try {
183 			return appendEncode(new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH),ch).toString();
184 		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
185 	}
186 
187 	/**
188 	 * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
189 	 * <p>
190 	 * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions:
191 	 * <ul>
192 	 *  <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
193 	 *   are converted to "<code>&lt;br /&gt;</code>".  CR/LF pairs are treated as a single line break.
194 	 *  <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&amp;nbsp;</code>"
195 	 *   while ensuring the last is always a normal space.
196 	 *  <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
197 	 * </ul>
198 	 * <p>
199 	 * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
200 	 * spaces to be rendered, but also allows the line to wrap in the middle of it.
201 	 * <p>
202 	 * Note that zero-width spaces (U+200B) are converted to the numeric character reference
203 	 * "<code>&amp;#x200B;</code>" through the normal encoding process, but IE6 does not render them properly
204 	 * either encoded or unencoded.
205 	 * <p>
206 	 * There is no method provided to reverse this encoding.
207 	 *
208 	 * @param unencodedText  the text to encode.
209 	 * @return the encoded string with white space formatting converted to markup.
210 	 * @see #encode(CharSequence)
211 	 */
encodeWithWhiteSpaceFormatting(final CharSequence unencodedText)212 	public static String encodeWithWhiteSpaceFormatting(final CharSequence unencodedText) {
213 		if (unencodedText==null) return null;
214 		try {
215 			return appendEncode(new StringBuilder(unencodedText.length()*2),unencodedText,true).toString();
216 		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
217 	}
218 
219 	/**
220 	 * Decodes the specified HTML encoded text into normal text.
221 	 * <p>
222 	 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
223 	 * are converted to their respective characters.
224 	 * <p>
225 	 * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
226 	 * <p>
227 	 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
228 	 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
229 	 * <p>
230 	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
231 	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
232 	 * <p>
233 	 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
234 	 * some browsers also recognise them in a case-insensitive way.
235 	 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
236 	 *
237 	 * @param encodedText  the text to decode.
238 	 * @return the decoded string.
239 	 * @see #encode(CharSequence)
240 	 */
decode(final CharSequence encodedText)241 	public static String decode(final CharSequence encodedText) {
242 		return decode(encodedText,false,Config.ConvertNonBreakingSpaces);
243 	}
244 
245 	/**
246 	 * Decodes the specified HTML encoded text into normal text.
247 	 * <p>
248 	 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
249 	 * are converted to their respective characters.
250 	 * <p>
251 	 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the
252 	 * value of the <code>insideAttributeValue</code> parameter and the
253 	 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
254 	 * <p>
255 	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
256 	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
257 	 * <p>
258 	 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
259 	 * some browsers also recognise them in a case-insensitive way.
260 	 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
261 	 *
262 	 * @param encodedText  the text to decode.
263 	 * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
264 	 * @return the decoded string.
265 	 * @see #decode(CharSequence)
266 	 * @see #encode(CharSequence)
267 	 */
decode(final CharSequence encodedText, final boolean insideAttributeValue)268 	public static String decode(final CharSequence encodedText, final boolean insideAttributeValue) {
269 		return decode(encodedText,insideAttributeValue,Config.ConvertNonBreakingSpaces);
270 	}
271 
decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces)272 	static String decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) {
273 		if (encodedText==null) return null;
274 		for (int i=0; i<encodedText.length(); i++) {
275 			if (encodedText.charAt(i)=='&') {
276 				try {
277 					return appendDecode(new StringBuilder(encodedText.length()),encodedText,i,insideAttributeValue,convertNonBreakingSpaces).toString();
278 				} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
279 			}
280 		}
281 		return encodedText.toString();
282 	}
283 
284 	/**
285 	 * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
286 	 * <p>
287 	 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
288 	 * <p>
289 	 * The result is how the text would normally be rendered by a
290 	 * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>,
291 	 * assuming it does not contain any tags.
292 	 * <p>
293 	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
294 	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
295 	 * <p>
296 	 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for
297 	 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
298 	 * See the discussion of the <code>insideAttributeValue</code> parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)}
299 	 * method for a more detailed explanation of this topic.
300 	 *
301 	 * @param text  the source text
302 	 * @return the decoded text with collapsed white space.
303 	 * @see FormControl#getPredefinedValues()
304 	 */
decodeCollapseWhiteSpace(final CharSequence text)305 	public static String decodeCollapseWhiteSpace(final CharSequence text) {
306 		return decodeCollapseWhiteSpace(text,Config.ConvertNonBreakingSpaces);
307 	}
308 
decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces)309 	static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) {
310 		return decode(appendCollapseWhiteSpace(new StringBuilder(text.length()),text),false,convertNonBreakingSpaces);
311 	}
312 
313 	/**
314 	 * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
315 	 * <p>
316 	 * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
317 	 * <p>
318 	 * IMPLEMENTATION NOTE: At present this method simply calls {@link #decode(CharSequence,boolean) decode(encodedText,true)}
319 	 * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation
320 	 * may be used in future.
321 	 *
322 	 * @param encodedText  the text to re-encode.
323 	 * @return the re-encoded string.
324 	 */
reencode(final CharSequence encodedText)325 	public static String reencode(final CharSequence encodedText) {
326 		return encode(decode(encodedText,true));
327 	}
328 
329 	/**
330 	 * Returns the encoded form of this character reference.
331 	 * <p>
332 	 * The exact behaviour of this method depends on the class of this object.
333 	 * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
334 	 * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
335 	 * <p>
336 	 * <dl>
337 	 *  <dt>Examples:</dt>
338 	 *   <dd><code>CharacterReference.parse("&amp;GT;").getCharacterReferenceString()</code> returns "<code>&amp;gt;</code>"</dd>
339 	 *   <dd><code>CharacterReference.parse("&amp;#x3E;").getCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
340 	 * </dl>
341 	 *
342 	 * @return the encoded form of this character reference.
343 	 * @see #getCharacterReferenceString(int codePoint)
344 	 * @see #getDecimalCharacterReferenceString()
345 	 */
getCharacterReferenceString()346 	public abstract String getCharacterReferenceString();
347 
348 	/**
349 	 * Returns the encoded form of the specified unicode code point.
350 	 * <p>
351 	 * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
352 	 * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
353 	 * <p>
354 	 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
355 	 * which is encoded as the numeric character reference "<code>&amp;#39;</code>" instead of its character entity reference
356 	 * "<code>&amp;apos;</code>".
357 	 * <p>
358 	 * <dl>
359 	 *  <dt>Examples:</dt>
360 	 *   <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;gt;</code>"</dd>
361 	 *   <dd><code>CharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;gt;</code>"</dd>
362 	 *   <dd><code>CharacterReference.getCharacterReferenceString('&#9786;')</code> returns "<code>&amp;#9786;</code>"</dd>
363 	 * </dl>
364 	 *
365 	 * @param codePoint  the unicode code point to encode.
366 	 * @return the encoded form of the specified unicode code point.
367 	 * @see #getHexadecimalCharacterReferenceString(int codePoint)
368 	 */
getCharacterReferenceString(final int codePoint)369 	public static String getCharacterReferenceString(final int codePoint) {
370 		String characterReferenceString=null;
371 		if (codePoint!=CharacterEntityReference._apos) characterReferenceString=CharacterEntityReference.getCharacterReferenceString(codePoint);
372 		if (characterReferenceString==null) characterReferenceString=NumericCharacterReference.getCharacterReferenceString(codePoint);
373 		return characterReferenceString;
374 	}
375 
376 	/**
377 	 * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference.
378 	 * <p>
379 	 * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
380 	 * <p>
381 	 * <dl>
382 	 *  <dt>Example:</dt>
383 	 *  <dd><code>CharacterReference.parse("&amp;gt;").getDecimalCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
384 	 * </dl>
385 	 *
386 	 * @return the decimal encoded form of this character reference.
387 	 * @see #getCharacterReferenceString()
388 	 * @see #getHexadecimalCharacterReferenceString()
389 	 */
getDecimalCharacterReferenceString()390 	public String getDecimalCharacterReferenceString() {
391 		return getDecimalCharacterReferenceString(codePoint);
392 	}
393 
394 	/**
395 	 * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point.
396 	 * <p>
397 	 * <dl>
398 	 *  <dt>Example:</dt>
399 	 *  <dd><code>CharacterReference.getDecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
400 	 * </dl>
401 	 *
402 	 * @param codePoint  the unicode code point to encode.
403 	 * @return the decimal encoded form of the specified unicode code point.
404 	 * @see #getCharacterReferenceString(int codePoint)
405 	 * @see #getHexadecimalCharacterReferenceString(int codePoint)
406 	 */
getDecimalCharacterReferenceString(final int codePoint)407 	public static String getDecimalCharacterReferenceString(final int codePoint) {
408 		try {
409 			return appendDecimalCharacterReferenceString(new StringBuilder(),codePoint).toString();
410 		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
411 	}
412 
413 	/**
414 	 * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference.
415 	 * <p>
416 	 * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
417 	 * <p>
418 	 * <dl>
419 	 *  <dt>Example:</dt>
420 	 *  <dd><code>CharacterReference.parse("&amp;gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&amp;#x3e;</code>"</dd>
421 	 * </dl>
422 	 *
423 	 * @return the hexadecimal encoded form of this character reference.
424 	 * @see #getCharacterReferenceString()
425 	 * @see #getDecimalCharacterReferenceString()
426 	 */
getHexadecimalCharacterReferenceString()427 	public String getHexadecimalCharacterReferenceString() {
428 		return getHexadecimalCharacterReferenceString(codePoint);
429 	}
430 
431 	/**
432 	 * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point.
433 	 * <p>
434 	 * <dl>
435 	 *  <dt>Example:</dt>
436 	 *  <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('&gt;')</code> returns "<code>&amp;#x3e;</code>"</dd>
437 	 * </dl>
438 	 *
439 	 * @param codePoint  the unicode code point to encode.
440 	 * @return the hexadecimal encoded form of the specified unicode code point.
441 	 * @see #getCharacterReferenceString(int codePoint)
442 	 * @see #getDecimalCharacterReferenceString(int codePoint)
443 	 */
getHexadecimalCharacterReferenceString(final int codePoint)444 	public static String getHexadecimalCharacterReferenceString(final int codePoint) {
445 		try {
446 			return appendHexadecimalCharacterReferenceString(new StringBuilder(),codePoint).toString();
447 		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
448 	}
449 
450 	/**
451 	 * Returns the unicode code point of this character reference in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
452 	 * <p>
453 	 * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
454 	 * <p>
455 	 * <dl>
456 	 *  <dt>Example:</dt>
457 	 *  <dd><code>CharacterReference.parse("&amp;gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd>
458 	 * </dl>
459 	 *
460 	 * @return the unicode code point of this character reference in U+ notation.
461 	 * @see #getUnicodeText(int codePoint)
462 	 */
getUnicodeText()463 	public String getUnicodeText() {
464 		return getUnicodeText(codePoint);
465 	}
466 
467 	/**
468 	 * Returns the specified unicode code point in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>.
469 	 * <p>
470 	 * <dl>
471 	 *  <dt>Example:</dt>
472 	 *  <dd><code>CharacterReference.getUnicodeText('&gt;')</code> returns "<code>U+003E</code>"</dd>
473 	 * </dl>
474 	 *
475 	 * @param codePoint  the unicode code point.
476 	 * @return the specified unicode code point in U+ notation.
477 	 */
getUnicodeText(final int codePoint)478 	public static String getUnicodeText(final int codePoint) {
479 		try {
480 			return appendUnicodeText(new StringBuilder(),codePoint).toString();
481 		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
482 	}
483 
appendUnicodeText(final Appendable appendable, final int codePoint)484 	static final Appendable appendUnicodeText(final Appendable appendable, final int codePoint) throws IOException {
485 		appendable.append("U+");
486 		final String hex=Integer.toString(codePoint,16).toUpperCase();
487 		for (int i=4-hex.length(); i>0; i--) appendable.append('0');
488 		appendable.append(hex);
489 		return appendable;
490 	}
491 
492 	/**
493 	 * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
494 	 * <p>
495 	 * The character reference must be at the start of the given text, but may contain other characters at the end.
496 	 * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
497 	 * <p>
498 	 * If the text does not represent a valid character reference, this method returns <code>null</code>.
499 	 * <p>
500  	 * <a href="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the
501 	 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
502 	 * <p>
503 	 * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
504 	 * <p>
505 	 * <dl>
506 	 *  <dt>Example:</dt>
507 	 *  <dd><code>CharacterReference.parse("&amp;gt;").getChar()</code> returns '<code>&gt;</code>'</dd>
508 	 * </dl>
509 	 *
510 	 * @param characterReferenceText  the text containing a single encoded character reference.
511 	 * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
512 	 * @see #decode(CharSequence)
513 	 */
parse(final CharSequence characterReferenceText)514 	public static CharacterReference parse(final CharSequence characterReferenceText) {
515 		return construct(new Source(characterReferenceText,true),0,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
516 	}
517 
518 	/**
519 	 * Parses a single encoded character reference text into a unicode code point.
520 	 * <p>
521 	 * The character reference must be at the start of the given text, but may contain other characters at the end.
522 	 * <p>
523 	 * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
524 	 * <p>
525 	 * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()},
526 	 * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
527 	 * <code>NullPointerException</code>.
528 	 * <p>
529 	 * <dl>
530 	 *  <dt>Example:</dt>
531 	 *  <dd><code>CharacterReference.getCodePointFromCharacterReferenceString("&amp;gt;")</code> returns <code>62</code></dd>
532 	 * </dl>
533 	 *
534 	 * @param characterReferenceText  the text containing a single encoded character reference.
535 	 * @return the unicode code point representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
536 	 */
getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText)537 	public static int getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText) {
538 		final CharacterReference characterReference=parse(characterReferenceText);
539 		return (characterReference!=null) ? characterReference.getCodePoint() : INVALID_CODE_POINT;
540 	}
541 
542 	/**
543 	 * Indicates whether the specified character would need to be encoded in HTML text.
544 	 * <p>
545 	 * This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character, or the unicode code point is greater than U+007F.
546 	 * <p>
547 	 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
548 	 * which only returns <code>true</code> if the static {@link Config#IsApostropheEncoded} property
549 	 * is currently set to <code>true</code>.
550 	 *
551 	 * @param ch  the character to test.
552 	 * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>.
553 	 */
requiresEncoding(final char ch)554 	public static final boolean requiresEncoding(final char ch) {
555 		return ch>127 || (CharacterEntityReference.getName(ch)!=null && (ch!='\'' || Config.IsApostropheEncoded));
556 	}
557 
558 	/**
559 	 * Returns a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
560 	 *
561 	 * @param writer  the destination for the encoded text
562 	 * @return a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
563 	 * @see #encode(CharSequence unencodedText)
564 	 */
getEncodingFilterWriter(final Writer writer)565 	public static Writer getEncodingFilterWriter(final Writer writer) {
566 		return new EncodingFilterWriter(writer);
567 	}
568 
569 	private static final class EncodingFilterWriter extends FilterWriter {
570 		StringBuilder sb=new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH);
EncodingFilterWriter(final Writer writer)571 		public EncodingFilterWriter(final Writer writer) {
572 			super(writer);
573 		}
write(final char ch)574 		public void write(final char ch) throws IOException {
575 			sb.setLength(0);
576 			appendEncode(sb,ch);
577 			if (sb.length()==1)
578 				out.write(sb.charAt(0));
579 			else
580 				out.append(sb);
581 		}
write(final int chInt)582 		public void write(final int chInt) throws IOException {
583 			write((char)chInt);
584 		}
write(final char[] cbuf, final int off, final int len)585 		public void write(final char[] cbuf, final int off, final int len) throws IOException {
586 			final int end=off+len;
587 			for (int i=off; i<end; i++) write(cbuf[i]);
588 		}
write(final String str, final int off, final int len)589 		public void write(final String str, final int off, final int len) throws IOException {
590 			final int end=off+len;
591 			for (int i=off; i<end; i++) write(str.charAt(i));
592 		}
593 	}
594 
appendEncode(final Appendable appendable, char ch)595 	private static Appendable appendEncode(final Appendable appendable, char ch) throws IOException {
596 		if (appendEncodeCheckForWhiteSpaceFormatting(appendable,ch,false)) return appendable;
597 		return appendable.append(ch);
598 	}
599 
appendEncode(final Appendable appendable, CharSequence unencodedText, final boolean whiteSpaceFormatting)600 	static Appendable appendEncode(final Appendable appendable, CharSequence unencodedText, final boolean whiteSpaceFormatting) throws IOException {
601 		if (unencodedText==null) return appendable;
602 		int beginPos=0;
603 		int endPos=unencodedText.length();
604 		if (unencodedText instanceof Segment) {
605 			// this might improve performance slightly
606 			final Segment segment=(Segment)unencodedText;
607 			final int segmentOffset=segment.getBegin();
608 			beginPos=segmentOffset;
609 			endPos+=segmentOffset;
610 			unencodedText=segment.source;
611 		}
612 		final boolean isApostropheEncoded=Config.IsApostropheEncoded;
613 		for (int i=beginPos; i<endPos; i++) {
614 			char ch=unencodedText.charAt(i);
615 			if (appendEncodeCheckForWhiteSpaceFormatting(appendable,ch,whiteSpaceFormatting)) continue;
616 			// need to process white space
617 			// whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup
618 			int spaceCount;
619 			int nexti=i+1;
620 			if (ch!=' ') {
621 				if (ch!='\t') {
622 					// must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string
623 					if (ch=='\r' && nexti<endPos && unencodedText.charAt(nexti)=='\n') i++; // process cr/lf pair as one line break
624 					appendable.append("<br />"); // add line break
625 					continue;
626 				} else {
627 					spaceCount=TAB_LENGTH;
628 				}
629 			} else {
630 				spaceCount=1;
631 			}
632 			while (nexti<endPos) {
633 				ch=unencodedText.charAt(nexti);
634 				if (ch==' ')
635 					spaceCount+=1;
636 				else if (ch=='\t')
637 					spaceCount+=TAB_LENGTH;
638 				else
639 					break;
640 				nexti++;
641 			}
642 			if (spaceCount==1) {
643 				// handle the very common case of a single character to improve efficiency slightly
644 				appendable.append(' ');
645 				continue;
646 			}
647 			if (spaceCount%2==1) appendable.append(' '); // fist character is a space if we have an odd number of spaces
648 			while (spaceCount>=2) {
649 				appendable.append("&nbsp; "); // use alternating &nbsp; and spaces to keep original number of spaces
650 				spaceCount-=2;
651 			}
652 			// note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
653 			i=nexti-1; // minus 1 because top level for loop will add it again
654 		}
655 		return appendable;
656 	}
657 
appendEncodeCheckForWhiteSpaceFormatting(final Appendable appendable, char ch, final boolean whiteSpaceFormatting)658 	private static final boolean appendEncodeCheckForWhiteSpaceFormatting(final Appendable appendable, char ch, final boolean whiteSpaceFormatting) throws IOException {
659 		final String characterEntityReferenceName=CharacterEntityReference.getName(ch);
660 		if (characterEntityReferenceName!=null) {
661 			if (ch=='\'') {
662 				if (Config.IsApostropheEncoded)
663 					appendable.append("&#39;");
664 				else
665 					appendable.append(ch);
666 			} else {
667 				CharacterEntityReference.appendCharacterReferenceString(appendable,characterEntityReferenceName);
668 			}
669 		} else if (ch>127) {
670 			appendDecimalCharacterReferenceString(appendable,ch);
671 		} else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) {
672 			appendable.append(ch);
673 		} else {
674 			return false;
675 		}
676 		return true;
677 	}
678 
getPrevious(final Source source, final int pos)679 	static CharacterReference getPrevious(final Source source, final int pos) {
680 		return getPrevious(source,pos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
681 	}
682 
getNext(final Source source, final int pos)683 	static CharacterReference getNext(final Source source, final int pos) {
684 		return getNext(source,pos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
685 	}
686 
getPrevious(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings)687 	private static CharacterReference getPrevious(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
688 		final ParseText parseText=source.getParseText();
689 		pos=parseText.lastIndexOf('&',pos);
690 		while (pos!=-1) {
691 			final CharacterReference characterReference=construct(source,pos,unterminatedCharacterReferenceSettings);
692 			if (characterReference!=null) return characterReference;
693 			pos=parseText.lastIndexOf('&',pos-1);
694 		}
695 		return null;
696 	}
697 
getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings)698 	private static CharacterReference getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
699 		final ParseText parseText=source.getParseText();
700 		pos=parseText.indexOf('&',pos);
701 		while (pos!=-1) {
702 			final CharacterReference characterReference=construct(source,pos,unterminatedCharacterReferenceSettings);
703 			if (characterReference!=null) return characterReference;
704 			pos=parseText.indexOf('&',pos+1);
705 		}
706 		return null;
707 	}
708 
appendHexadecimalCharacterReferenceString(final Appendable appendable, final int codePoint)709 	static final Appendable appendHexadecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException {
710 		return appendable.append("&#x").append(Integer.toString(codePoint,16)).append(';');
711 	}
712 
appendDecimalCharacterReferenceString(final Appendable appendable, final int codePoint)713 	static final Appendable appendDecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException {
714 		return appendable.append("&#").append(Integer.toString(codePoint)).append(';');
715 	}
716 
construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings)717 	static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
718 		try {
719 			if (source.getParseText().charAt(begin)!='&') return null;
720 			return (source.getParseText().charAt(begin+1)=='#')
721 				? NumericCharacterReference.construct(source,begin,unterminatedCharacterReferenceSettings)
722 				: CharacterEntityReference.construct(source,begin,unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
723 		} catch (IndexOutOfBoundsException ex) {
724 			return null;
725 		}
726 	}
727 
appendDecode(final Appendable appendable, final CharSequence encodedText, int pos, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces)728 	private static Appendable appendDecode(final Appendable appendable, final CharSequence encodedText, int pos, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException {
729 		final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
730 		int lastEnd=0;
731 		final StreamedSource streamedSource=new StreamedSource(encodedText).setHandleTags(false).setUnterminatedCharacterReferenceSettings(unterminatedCharacterReferenceSettings).setSearchBegin(pos);
732 		for (Segment segment : streamedSource) {
733 			if (segment instanceof CharacterReference) {
734 				((CharacterReference)segment).appendCharTo(appendable,convertNonBreakingSpaces);
735 			} else {
736 				appendable.append(segment.toString()); // benchmark tests reveal (surprisingly) that converting to a string before appending is faster than appending the specified section of the encodedText or segment directly.
737 //				appendable.append(encodedText,segment.begin,segment.end);
738 //				appendable.append(segment);
739 			}
740 		}
741 		return appendable;
742 	}
743 
744 	// pinched from http://svn.apache.org/repos/asf/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/text/CharUtils.java
getHighSurrogate(int codePoint)745 	private static char getHighSurrogate(int codePoint) {
746 		return (char)((0xD800 - (0x10000 >> 10)) + (codePoint >> 10));
747 	}
getLowSurrogate(int codePoint)748 	private static char getLowSurrogate(int codePoint) {
749 		return (char)(0xDC00 + (codePoint & 0x3FF));
750 	}
751 }
752