1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML 2 // Version 3.2 3 // Copyright (C) 2004-2009 Martin Jericho 4 // http://jericho.htmlparser.net/ 5 // 6 // This library is free software; you can redistribute it and/or 7 // modify it under the terms of either one of the following licences: 8 // 9 // 1. The Eclipse Public License (EPL) version 1.0, 10 // included in this distribution in the file licence-epl-1.0.html 11 // or available at http://www.eclipse.org/legal/epl-v10.html 12 // 13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, 14 // included in this distribution in the file licence-lgpl-2.1.txt 15 // or available at http://www.gnu.org/licenses/lgpl.txt 16 // 17 // This library is distributed on an "AS IS" basis, 18 // WITHOUT WARRANTY OF ANY KIND, either express or implied. 19 // See the individual licence texts for more details. 20 21 package net.htmlparser.jericho; 22 23 import java.util.*; 24 import java.io.*; 25 26 /** 27 * Represents an HTML <a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">Character Reference</a>, 28 * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}. 29 * <p> 30 * This class, together with its subclasses, contains static methods to perform most required operations 31 * without having to instantiate an object. 32 * <p> 33 * Instances of this class are useful when the positions of character references in a source document are required, 34 * or to replace the found character references with customised text. 35 * <p> 36 * <code>CharacterReference</code> instances are obtained using one of the following methods: 37 * <ul> 38 * <li>{@link CharacterReference#parse(CharSequence characterReferenceText)} 39 * <li>{@link Source#getNextCharacterReference(int pos)} 40 * <li>{@link Source#getPreviousCharacterReference(int pos)} 41 * <li>{@link Segment#getAllCharacterReferences()} 42 * </ul> 43 */ 44 public abstract class CharacterReference extends Segment { 45 int codePoint; 46 47 /** 48 * Represents an invalid unicode code point. 49 * <p> 50 * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF, or any other invalid character reference. 51 */ 52 public static final int INVALID_CODE_POINT=-1; 53 54 static int MAX_ENTITY_REFERENCE_LENGTH; // set in CharacterEntityReference static class initialisation 55 56 /** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */ 57 private static final int TAB_LENGTH=4; 58 CharacterReference(final Source source, final int begin, final int end, final int codePoint)59 CharacterReference(final Source source, final int begin, final int end, final int codePoint) { 60 super(source,begin,end); 61 this.codePoint=codePoint; 62 } 63 64 /** 65 * Returns the <a target="_blank" href="http://www.unicode.org">unicode</a> code point represented by this character reference. 66 * @return the unicode code point represented by this character reference. 67 * @see #appendCharTo(Appendable) 68 */ getCodePoint()69 public int getCodePoint() { 70 return codePoint; 71 } 72 73 /** 74 * Returns the character represented by this character reference. 75 * <p> 76 * If this character reference represents a unicode 77 * <a target="_blank" href="http://www.unicode.org/glossary/#supplementary_code_point">supplimentary code point</a>, 78 * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result. 79 * <p> 80 * To ensure that the character is correctly appended to an <code>Appendable</code> object such as a <code>Writer</code>, use the code: 81 * <br /><code>characterReference.</code>{@link #appendCharTo(Appendable) appendCharTo}<code>(appendable)</code><br /> 82 * instead of: 83 * <br /><code>appendable.append(characterReference.getChar())</code> 84 * 85 * @return the character represented by this character reference. 86 * @see #appendCharTo(Appendable) 87 * @see #getCodePoint() 88 */ getChar()89 public char getChar() { 90 return (char)codePoint; 91 } 92 93 /** 94 * Appends the character represented by this character reference to the specified appendable object. 95 * <p> 96 * If this character is a unicode <a target="_blank" href="http://unicode.org/glossary/#supplementary_character">supplementary character</a>, 97 * then both the UTF-16 high/low surrogate <code>char</code> values of the of the character are appended, as described in the 98 * <a target="_blank" href="http://java.sun.com/javase/6/docs/api/java/lang/Character.html#unicode">Unicode character representations</a> section of the 99 * <code>java.lang.Character</code> class. 100 * <p> 101 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default), 102 * then calling this method on a non-breaking space character reference ({@link CharacterEntityReference#_nbsp &nbsp;}) 103 * results in a normal space being appended. 104 * 105 * @param appendable the object to append this character reference to. 106 */ appendCharTo(Appendable appendable)107 public final void appendCharTo(Appendable appendable) throws IOException { 108 appendCharTo(appendable,Config.ConvertNonBreakingSpaces); 109 } 110 appendCharTo(Appendable appendable, final boolean convertNonBreakingSpaces)111 private void appendCharTo(Appendable appendable, final boolean convertNonBreakingSpaces) throws IOException { 112 if (Character.isSupplementaryCodePoint(codePoint)) { 113 appendable.append(getHighSurrogate(codePoint)); 114 appendable.append(getLowSurrogate(codePoint)); 115 } else { 116 final char ch=getChar(); 117 if (ch==CharacterEntityReference._nbsp && convertNonBreakingSpaces) { 118 appendable.append(' '); 119 } else { 120 appendable.append(ch); 121 } 122 } 123 } 124 125 /** 126 * Indicates whether this character reference is terminated by a semicolon (<code>;</code>). 127 * <p> 128 * Conversely, this library defines an <i><a name="Unterminated">unterminated</a></i> character reference as one which does 129 * not end with a semicolon. 130 * <p> 131 * The SGML specification allows unterminated character references in some circumstances, and because the 132 * HTML 4.01 specification states simply that 133 * "<a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#entities">authors may use SGML character references</a>", 134 * it follows that they are also valid in HTML documents, although their use is strongly discouraged. 135 * <p> 136 * Unterminated character references are not allowed in <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> documents. 137 * 138 * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>. 139 * @see #decode(CharSequence encodedText, boolean insideAttributeValue) 140 */ isTerminated()141 public boolean isTerminated() { 142 return source.charAt(end-1)==';'; 143 } 144 145 /** 146 * Encodes the specified text, escaping special characters into character references. 147 * <p> 148 * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character, 149 * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode 150 * code point is greater than U+007F. 151 * <p> 152 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027), 153 * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property, 154 * is either left unencoded (default setting), or encoded as the numeric character reference "<code>&#39;</code>". 155 * <p> 156 * This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &apos;} 157 * as this entity is not defined for use in HTML. See the comments in the {@link CharacterEntityReference} class for more information. 158 * <p> 159 * To encode text using only numeric character references, use the<br /> 160 * {@link NumericCharacterReference#encode(CharSequence)} method instead. 161 * 162 * @param unencodedText the text to encode. 163 * @return the encoded string. 164 * @see #decode(CharSequence) 165 */ encode(final CharSequence unencodedText)166 public static String encode(final CharSequence unencodedText) { 167 if (unencodedText==null) return null; 168 try { 169 return appendEncode(new StringBuilder(unencodedText.length()*2),unencodedText,false).toString(); 170 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens 171 } 172 173 /** 174 * Encodes the specified character into a character reference if {@linkplain #requiresEncoding(char) required}. 175 * <p> 176 * The encoding of the character follows the same rules as for each character in the {@link #encode(CharSequence unencodedText)} method. 177 * 178 * @param ch the character to encode. 179 * @return a character reference if appropriate, otherwise a string containing the original character. 180 */ encode(final char ch)181 public static String encode(final char ch) { 182 try { 183 return appendEncode(new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH),ch).toString(); 184 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens 185 } 186 187 /** 188 * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup. 189 * <p> 190 * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions: 191 * <ul> 192 * <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C) 193 * are converted to "<code><br /></code>". CR/LF pairs are treated as a single line break. 194 * <li>Multiple consecutive spaces are converted so that every second space is converted to "<code>&nbsp;</code>" 195 * while ensuring the last is always a normal space. 196 * <li>Tab characters (U+0009) are converted as if they were four consecutive spaces. 197 * </ul> 198 * <p> 199 * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of 200 * spaces to be rendered, but also allows the line to wrap in the middle of it. 201 * <p> 202 * Note that zero-width spaces (U+200B) are converted to the numeric character reference 203 * "<code>&#x200B;</code>" through the normal encoding process, but IE6 does not render them properly 204 * either encoded or unencoded. 205 * <p> 206 * There is no method provided to reverse this encoding. 207 * 208 * @param unencodedText the text to encode. 209 * @return the encoded string with white space formatting converted to markup. 210 * @see #encode(CharSequence) 211 */ encodeWithWhiteSpaceFormatting(final CharSequence unencodedText)212 public static String encodeWithWhiteSpaceFormatting(final CharSequence unencodedText) { 213 if (unencodedText==null) return null; 214 try { 215 return appendEncode(new StringBuilder(unencodedText.length()*2),unencodedText,true).toString(); 216 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens 217 } 218 219 /** 220 * Decodes the specified HTML encoded text into normal text. 221 * <p> 222 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references} 223 * are converted to their respective characters. 224 * <p> 225 * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}. 226 * <p> 227 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for 228 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}. 229 * <p> 230 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default), 231 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to normal spaces. 232 * <p> 233 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case, 234 * some browsers also recognise them in a case-insensitive way. 235 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case. 236 * 237 * @param encodedText the text to decode. 238 * @return the decoded string. 239 * @see #encode(CharSequence) 240 */ decode(final CharSequence encodedText)241 public static String decode(final CharSequence encodedText) { 242 return decode(encodedText,false,Config.ConvertNonBreakingSpaces); 243 } 244 245 /** 246 * Decodes the specified HTML encoded text into normal text. 247 * <p> 248 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references} 249 * are converted to their respective characters. 250 * <p> 251 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the 252 * value of the <code>insideAttributeValue</code> parameter and the 253 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}. 254 * <p> 255 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default), 256 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to normal spaces. 257 * <p> 258 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case, 259 * some browsers also recognise them in a case-insensitive way. 260 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case. 261 * 262 * @param encodedText the text to decode. 263 * @param insideAttributeValue specifies whether the encoded text is inside an attribute value. 264 * @return the decoded string. 265 * @see #decode(CharSequence) 266 * @see #encode(CharSequence) 267 */ decode(final CharSequence encodedText, final boolean insideAttributeValue)268 public static String decode(final CharSequence encodedText, final boolean insideAttributeValue) { 269 return decode(encodedText,insideAttributeValue,Config.ConvertNonBreakingSpaces); 270 } 271 decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces)272 static String decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) { 273 if (encodedText==null) return null; 274 for (int i=0; i<encodedText.length(); i++) { 275 if (encodedText.charAt(i)=='&') { 276 try { 277 return appendDecode(new StringBuilder(encodedText.length()),encodedText,i,insideAttributeValue,convertNonBreakingSpaces).toString(); 278 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens 279 } 280 } 281 return encodedText.toString(); 282 } 283 284 /** 285 * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}. 286 * <p> 287 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space. 288 * <p> 289 * The result is how the text would normally be rendered by a 290 * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>, 291 * assuming it does not contain any tags. 292 * <p> 293 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default), 294 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to normal spaces. 295 * <p> 296 * <a href="#Unterminated">Unterminated</a> character references are dealt with according to the rules for 297 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}. 298 * See the discussion of the <code>insideAttributeValue</code> parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)} 299 * method for a more detailed explanation of this topic. 300 * 301 * @param text the source text 302 * @return the decoded text with collapsed white space. 303 * @see FormControl#getPredefinedValues() 304 */ decodeCollapseWhiteSpace(final CharSequence text)305 public static String decodeCollapseWhiteSpace(final CharSequence text) { 306 return decodeCollapseWhiteSpace(text,Config.ConvertNonBreakingSpaces); 307 } 308 decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces)309 static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) { 310 return decode(appendCollapseWhiteSpace(new StringBuilder(text.length()),text),false,convertNonBreakingSpaces); 311 } 312 313 /** 314 * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again. 315 * <p> 316 * This process ensures that the specified encoded text does not contain any remaining unencoded characters. 317 * <p> 318 * IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence) decode} method 319 * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation 320 * may be used in future. 321 * 322 * @param encodedText the text to re-encode. 323 * @return the re-encoded string. 324 */ reencode(final CharSequence encodedText)325 public static String reencode(final CharSequence encodedText) { 326 return encode(decode(encodedText,true)); 327 } 328 329 /** 330 * Returns the encoded form of this character reference. 331 * <p> 332 * The exact behaviour of this method depends on the class of this object. 333 * See the {@link CharacterEntityReference#getCharacterReferenceString()} and 334 * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details. 335 * <p> 336 * <dl> 337 * <dt>Examples:</dt> 338 * <dd><code>CharacterReference.parse("&GT;").getCharacterReferenceString()</code> returns "<code>&gt;</code>"</dd> 339 * <dd><code>CharacterReference.parse("&#x3E;").getCharacterReferenceString()</code> returns "<code>&#3e;</code>"</dd> 340 * </dl> 341 * 342 * @return the encoded form of this character reference. 343 * @see #getCharacterReferenceString(int codePoint) 344 * @see #getDecimalCharacterReferenceString() 345 */ getCharacterReferenceString()346 public abstract String getCharacterReferenceString(); 347 348 /** 349 * Returns the encoded form of the specified unicode code point. 350 * <p> 351 * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point 352 * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form. 353 * <p> 354 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027), 355 * which is encoded as the numeric character reference "<code>&#39;</code>" instead of its character entity reference 356 * "<code>&apos;</code>". 357 * <p> 358 * <dl> 359 * <dt>Examples:</dt> 360 * <dd><code>CharacterReference.getCharacterReferenceString(62)</code> returns "<code>&gt;</code>"</dd> 361 * <dd><code>CharacterReference.getCharacterReferenceString('>')</code> returns "<code>&gt;</code>"</dd> 362 * <dd><code>CharacterReference.getCharacterReferenceString('☺')</code> returns "<code>&#9786;</code>"</dd> 363 * </dl> 364 * 365 * @param codePoint the unicode code point to encode. 366 * @return the encoded form of the specified unicode code point. 367 * @see #getHexadecimalCharacterReferenceString(int codePoint) 368 */ getCharacterReferenceString(final int codePoint)369 public static String getCharacterReferenceString(final int codePoint) { 370 String characterReferenceString=null; 371 if (codePoint!=CharacterEntityReference._apos) characterReferenceString=CharacterEntityReference.getCharacterReferenceString(codePoint); 372 if (characterReferenceString==null) characterReferenceString=NumericCharacterReference.getCharacterReferenceString(codePoint); 373 return characterReferenceString; 374 } 375 376 /** 377 * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of this character reference. 378 * <p> 379 * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>. 380 * <p> 381 * <dl> 382 * <dt>Example:</dt> 383 * <dd><code>CharacterReference.parse("&gt;").getDecimalCharacterReferenceString()</code> returns "<code>&#62;</code>"</dd> 384 * </dl> 385 * 386 * @return the decimal encoded form of this character reference. 387 * @see #getCharacterReferenceString() 388 * @see #getHexadecimalCharacterReferenceString() 389 */ getDecimalCharacterReferenceString()390 public String getDecimalCharacterReferenceString() { 391 return getDecimalCharacterReferenceString(codePoint); 392 } 393 394 /** 395 * Returns the <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal encoded form</a> of the specified unicode code point. 396 * <p> 397 * <dl> 398 * <dt>Example:</dt> 399 * <dd><code>CharacterReference.getDecimalCharacterReferenceString('>')</code> returns "<code>&#62;</code>"</dd> 400 * </dl> 401 * 402 * @param codePoint the unicode code point to encode. 403 * @return the decimal encoded form of the specified unicode code point. 404 * @see #getCharacterReferenceString(int codePoint) 405 * @see #getHexadecimalCharacterReferenceString(int codePoint) 406 */ getDecimalCharacterReferenceString(final int codePoint)407 public static String getDecimalCharacterReferenceString(final int codePoint) { 408 try { 409 return appendDecimalCharacterReferenceString(new StringBuilder(),codePoint).toString(); 410 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens 411 } 412 413 /** 414 * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of this character reference. 415 * <p> 416 * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>. 417 * <p> 418 * <dl> 419 * <dt>Example:</dt> 420 * <dd><code>CharacterReference.parse("&gt;").getHexadecimalCharacterReferenceString()</code> returns "<code>&#x3e;</code>"</dd> 421 * </dl> 422 * 423 * @return the hexadecimal encoded form of this character reference. 424 * @see #getCharacterReferenceString() 425 * @see #getDecimalCharacterReferenceString() 426 */ getHexadecimalCharacterReferenceString()427 public String getHexadecimalCharacterReferenceString() { 428 return getHexadecimalCharacterReferenceString(codePoint); 429 } 430 431 /** 432 * Returns the <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal encoded form</a> of the specified unicode code point. 433 * <p> 434 * <dl> 435 * <dt>Example:</dt> 436 * <dd><code>CharacterReference.getHexadecimalCharacterReferenceString('>')</code> returns "<code>&#x3e;</code>"</dd> 437 * </dl> 438 * 439 * @param codePoint the unicode code point to encode. 440 * @return the hexadecimal encoded form of the specified unicode code point. 441 * @see #getCharacterReferenceString(int codePoint) 442 * @see #getDecimalCharacterReferenceString(int codePoint) 443 */ getHexadecimalCharacterReferenceString(final int codePoint)444 public static String getHexadecimalCharacterReferenceString(final int codePoint) { 445 try { 446 return appendHexadecimalCharacterReferenceString(new StringBuilder(),codePoint).toString(); 447 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens 448 } 449 450 /** 451 * Returns the unicode code point of this character reference in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>. 452 * <p> 453 * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}. 454 * <p> 455 * <dl> 456 * <dt>Example:</dt> 457 * <dd><code>CharacterReference.parse("&gt;").getUnicodeText()</code> returns "<code>U+003E</code>"</dd> 458 * </dl> 459 * 460 * @return the unicode code point of this character reference in U+ notation. 461 * @see #getUnicodeText(int codePoint) 462 */ getUnicodeText()463 public String getUnicodeText() { 464 return getUnicodeText(codePoint); 465 } 466 467 /** 468 * Returns the specified unicode code point in <a target="_blank" href="http://www.unicode.org/reports/tr27/#notation">U+ notation</a>. 469 * <p> 470 * <dl> 471 * <dt>Example:</dt> 472 * <dd><code>CharacterReference.getUnicodeText('>')</code> returns "<code>U+003E</code>"</dd> 473 * </dl> 474 * 475 * @param codePoint the unicode code point. 476 * @return the specified unicode code point in U+ notation. 477 */ getUnicodeText(final int codePoint)478 public static String getUnicodeText(final int codePoint) { 479 try { 480 return appendUnicodeText(new StringBuilder(),codePoint).toString(); 481 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens 482 } 483 appendUnicodeText(final Appendable appendable, final int codePoint)484 static final Appendable appendUnicodeText(final Appendable appendable, final int codePoint) throws IOException { 485 appendable.append("U+"); 486 final String hex=Integer.toString(codePoint,16).toUpperCase(); 487 for (int i=4-hex.length(); i>0; i--) appendable.append('0'); 488 appendable.append(hex); 489 return appendable; 490 } 491 492 /** 493 * Parses a single encoded character reference text into a <code>CharacterReference</code> object. 494 * <p> 495 * The character reference must be at the start of the given text, but may contain other characters at the end. 496 * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended. 497 * <p> 498 * If the text does not represent a valid character reference, this method returns <code>null</code>. 499 * <p> 500 * <a href="#Unterminated">Unterminated</a> character references are always accepted, regardless of the settings in the 501 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}. 502 * <p> 503 * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead. 504 * <p> 505 * <dl> 506 * <dt>Example:</dt> 507 * <dd><code>CharacterReference.parse("&gt;").getChar()</code> returns '<code>></code>'</dd> 508 * </dl> 509 * 510 * @param characterReferenceText the text containing a single encoded character reference. 511 * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference. 512 * @see #decode(CharSequence) 513 */ parse(final CharSequence characterReferenceText)514 public static CharacterReference parse(final CharSequence characterReferenceText) { 515 return construct(new Source(characterReferenceText,true),0,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL); 516 } 517 518 /** 519 * Parses a single encoded character reference text into a unicode code point. 520 * <p> 521 * The character reference must be at the start of the given text, but may contain other characters at the end. 522 * <p> 523 * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}. 524 * <p> 525 * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()}, 526 * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a 527 * <code>NullPointerException</code>. 528 * <p> 529 * <dl> 530 * <dt>Example:</dt> 531 * <dd><code>CharacterReference.getCodePointFromCharacterReferenceString("&gt;")</code> returns <code>38</code></dd> 532 * </dl> 533 * 534 * @param characterReferenceText the text containing a single encoded character reference. 535 * @return the unicode code point representing representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference. 536 */ getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText)537 public static int getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText) { 538 final CharacterReference characterReference=parse(characterReferenceText); 539 return (characterReference!=null) ? characterReference.getCodePoint() : INVALID_CODE_POINT; 540 } 541 542 /** 543 * Indicates whether the specified character would need to be encoded in HTML text. 544 * <p> 545 * This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character, or the unicode code point is greater than U+007F. 546 * <p> 547 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027), 548 * which only returns <code>true</code> if the static {@link Config#IsApostropheEncoded} property 549 * is currently set to <code>true</code>. 550 * 551 * @param ch the character to test. 552 * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>. 553 */ requiresEncoding(final char ch)554 public static final boolean requiresEncoding(final char ch) { 555 return ch>127 || (CharacterEntityReference.getName(ch)!=null && (ch!='\'' || Config.IsApostropheEncoded)); 556 } 557 558 /** 559 * Returns a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>. 560 * 561 * @param writer the destination for the encoded text 562 * @return a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>. 563 * @see #encode(CharSequence unencodedText) 564 */ getEncodingFilterWriter(final Writer writer)565 public static Writer getEncodingFilterWriter(final Writer writer) { 566 return new EncodingFilterWriter(writer); 567 } 568 569 private static final class EncodingFilterWriter extends FilterWriter { 570 StringBuilder sb=new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH); EncodingFilterWriter(final Writer writer)571 public EncodingFilterWriter(final Writer writer) { 572 super(writer); 573 } write(final char ch)574 public void write(final char ch) throws IOException { 575 sb.setLength(0); 576 appendEncode(sb,ch); 577 if (sb.length()==1) 578 out.write(sb.charAt(0)); 579 else 580 out.append(sb); 581 } write(final int chInt)582 public void write(final int chInt) throws IOException { 583 write((char)chInt); 584 } write(final char[] cbuf, final int off, final int len)585 public void write(final char[] cbuf, final int off, final int len) throws IOException { 586 final int end=off+len; 587 for (int i=off; i<end; i++) write(cbuf[i]); 588 } write(final String str, final int off, final int len)589 public void write(final String str, final int off, final int len) throws IOException { 590 final int end=off+len; 591 for (int i=off; i<end; i++) write(str.charAt(i)); 592 } 593 } 594 appendEncode(final Appendable appendable, char ch)595 private static Appendable appendEncode(final Appendable appendable, char ch) throws IOException { 596 if (appendEncodeCheckForWhiteSpaceFormatting(appendable,ch,false)) return appendable; 597 return appendable.append(ch); 598 } 599 appendEncode(final Appendable appendable, CharSequence unencodedText, final boolean whiteSpaceFormatting)600 static Appendable appendEncode(final Appendable appendable, CharSequence unencodedText, final boolean whiteSpaceFormatting) throws IOException { 601 if (unencodedText==null) return appendable; 602 int beginPos=0; 603 int endPos=unencodedText.length(); 604 if (unencodedText instanceof Segment) { 605 // this might improve performance slightly 606 final Segment segment=(Segment)unencodedText; 607 final int segmentOffset=segment.getBegin(); 608 beginPos=segmentOffset; 609 endPos+=segmentOffset; 610 unencodedText=segment.source; 611 } 612 final boolean isApostropheEncoded=Config.IsApostropheEncoded; 613 for (int i=beginPos; i<endPos; i++) { 614 char ch=unencodedText.charAt(i); 615 if (appendEncodeCheckForWhiteSpaceFormatting(appendable,ch,whiteSpaceFormatting)) continue; 616 // need to process white space 617 // whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup 618 int spaceCount; 619 int nexti=i+1; 620 if (ch!=' ') { 621 if (ch!='\t') { 622 // must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string 623 if (ch=='\r' && nexti<endPos && unencodedText.charAt(nexti)=='\n') i++; // process cr/lf pair as one line break 624 appendable.append("<br />"); // add line break 625 continue; 626 } else { 627 spaceCount=TAB_LENGTH; 628 } 629 } else { 630 spaceCount=1; 631 } 632 while (nexti<endPos) { 633 ch=unencodedText.charAt(nexti); 634 if (ch==' ') 635 spaceCount+=1; 636 else if (ch=='\t') 637 spaceCount+=TAB_LENGTH; 638 else 639 break; 640 nexti++; 641 } 642 if (spaceCount==1) { 643 // handle the very common case of a single character to improve efficiency slightly 644 appendable.append(' '); 645 continue; 646 } 647 if (spaceCount%2==1) appendable.append(' '); // fist character is a space if we have an odd number of spaces 648 while (spaceCount>=2) { 649 appendable.append(" "); // use alternating and spaces to keep original number of spaces 650 spaceCount-=2; 651 } 652 // note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line 653 i=nexti-1; // minus 1 because top level for loop will add it again 654 } 655 return appendable; 656 } 657 appendEncodeCheckForWhiteSpaceFormatting(final Appendable appendable, char ch, final boolean whiteSpaceFormatting)658 private static final boolean appendEncodeCheckForWhiteSpaceFormatting(final Appendable appendable, char ch, final boolean whiteSpaceFormatting) throws IOException { 659 final String characterEntityReferenceName=CharacterEntityReference.getName(ch); 660 if (characterEntityReferenceName!=null) { 661 if (ch=='\'') { 662 if (Config.IsApostropheEncoded) 663 appendable.append("'"); 664 else 665 appendable.append(ch); 666 } else { 667 CharacterEntityReference.appendCharacterReferenceString(appendable,characterEntityReferenceName); 668 } 669 } else if (ch>127) { 670 appendDecimalCharacterReferenceString(appendable,ch); 671 } else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) { 672 appendable.append(ch); 673 } else { 674 return false; 675 } 676 return true; 677 } 678 getPrevious(final Source source, final int pos)679 static CharacterReference getPrevious(final Source source, final int pos) { 680 return getPrevious(source,pos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL); 681 } 682 getNext(final Source source, final int pos)683 static CharacterReference getNext(final Source source, final int pos) { 684 return getNext(source,pos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL); 685 } 686 getPrevious(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings)687 private static CharacterReference getPrevious(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) { 688 final ParseText parseText=source.getParseText(); 689 pos=parseText.lastIndexOf('&',pos); 690 while (pos!=-1) { 691 final CharacterReference characterReference=construct(source,pos,unterminatedCharacterReferenceSettings); 692 if (characterReference!=null) return characterReference; 693 pos=parseText.lastIndexOf('&',pos-1); 694 } 695 return null; 696 } 697 getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings)698 private static CharacterReference getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) { 699 final ParseText parseText=source.getParseText(); 700 pos=parseText.indexOf('&',pos); 701 while (pos!=-1) { 702 final CharacterReference characterReference=construct(source,pos,unterminatedCharacterReferenceSettings); 703 if (characterReference!=null) return characterReference; 704 pos=parseText.indexOf('&',pos+1); 705 } 706 return null; 707 } 708 appendHexadecimalCharacterReferenceString(final Appendable appendable, final int codePoint)709 static final Appendable appendHexadecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException { 710 return appendable.append("&#x").append(Integer.toString(codePoint,16)).append(';'); 711 } 712 appendDecimalCharacterReferenceString(final Appendable appendable, final int codePoint)713 static final Appendable appendDecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException { 714 return appendable.append("&#").append(Integer.toString(codePoint)).append(';'); 715 } 716 construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings)717 static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) { 718 try { 719 if (source.getParseText().charAt(begin)!='&') return null; 720 return (source.getParseText().charAt(begin+1)=='#') 721 ? NumericCharacterReference.construct(source,begin,unterminatedCharacterReferenceSettings) 722 : CharacterEntityReference.construct(source,begin,unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint); 723 } catch (IndexOutOfBoundsException ex) { 724 return null; 725 } 726 } 727 appendDecode(final Appendable appendable, final CharSequence encodedText, int pos, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces)728 private static Appendable appendDecode(final Appendable appendable, final CharSequence encodedText, int pos, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException { 729 final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue); 730 int lastEnd=0; 731 final StreamedSource streamedSource=new StreamedSource(encodedText).setHandleTags(false).setUnterminatedCharacterReferenceSettings(unterminatedCharacterReferenceSettings).setSearchBegin(pos); 732 for (Segment segment : streamedSource) { 733 if (segment instanceof CharacterReference) { 734 ((CharacterReference)segment).appendCharTo(appendable,convertNonBreakingSpaces); 735 } else { 736 appendable.append(segment.toString()); // benchmark tests reveal (surprisingly) that converting to a string before appending is faster than appending the specified section of the encodedText or segment directly. 737 // appendable.append(encodedText,segment.begin,segment.end); 738 // appendable.append(segment); 739 } 740 } 741 return appendable; 742 } 743 744 // pinched from http://svn.apache.org/repos/asf/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/text/CharUtils.java getHighSurrogate(int codePoint)745 private static char getHighSurrogate(int codePoint) { 746 return (char)((0xD800 - (0x10000 >> 10)) + (codePoint >> 10)); 747 } getLowSurrogate(int codePoint)748 private static char getLowSurrogate(int codePoint) { 749 return (char)(0xDC00 + (codePoint & 0x3FF)); 750 } 751 } 752