1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML 2 // Version 3.2 3 // Copyright (C) 2004-2009 Martin Jericho 4 // http://jericho.htmlparser.net/ 5 // 6 // This library is free software; you can redistribute it and/or 7 // modify it under the terms of either one of the following licences: 8 // 9 // 1. The Eclipse Public License (EPL) version 1.0, 10 // included in this distribution in the file licence-epl-1.0.html 11 // or available at http://www.eclipse.org/legal/epl-v10.html 12 // 13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, 14 // included in this distribution in the file licence-lgpl-2.1.txt 15 // or available at http://www.gnu.org/licenses/lgpl.txt 16 // 17 // This library is distributed on an "AS IS" basis, 18 // WITHOUT WARRANTY OF ANY KIND, either express or implied. 19 // See the individual licence texts for more details. 20 21 package net.htmlparser.jericho; 22 23 import java.util.*; 24 25 /** 26 * Encapsulates global configuration properties which determine the behaviour of various functions. 27 * <p> 28 * All of the properties in this class are static, affecting all objects and threads. 29 * Multiple concurrent configurations are not possible. 30 * <p> 31 * Properties that relate to <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a> 32 * compatibility issues are stored in instances of the {@link Config.CompatibilityMode} class. 33 * This allows all of the properties in the compatibility mode to be set as a block by setting the static 34 * {@link #CurrentCompatibilityMode} property to a different instance. 35 * 36 * @see Config.CompatibilityMode 37 */ 38 public final class Config { Config()39 private Config() {} 40 41 /** 42 * Determines the string used to separate a single column's multiple values in the output of the {@link FormFields#getColumnValues(Map)} method. 43 * <p> 44 * The situation where a single column has multiple values only arises if {@link FormField#getUserValueCount()}<code>>1</code> 45 * on the relevant form field, which usually indicates a poorly designed form. 46 * <p> 47 * The default value is "<code>,</code>" (a comma, not including the quotes). 48 * <p> 49 * Must not be <code>null</code>. 50 */ 51 public static String ColumnMultipleValueSeparator=","; 52 53 /** 54 * Determines the string that represents the value <code>true</code> in the output of the {@link FormFields#getColumnValues(Map)} method. 55 * <p> 56 * The default value is "<code>true</code>" (without the quotes). 57 * <p> 58 * Must not be <code>null</code>. 59 */ 60 public static String ColumnValueTrue=Boolean.toString(true); 61 62 /** 63 * Determines the string that represents the value <code>false</code> in the output of the {@link FormFields#getColumnValues(Map)} method. 64 * <p> 65 * The default value is <code>null</code>, which represents no output at all. 66 */ 67 public static String ColumnValueFalse=null; 68 69 /** 70 * Determines whether the {@link CharacterReference#decode(CharSequence)} and similar methods convert non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character references to normal spaces. 71 * <p> 72 * The default value is <code>true</code>. 73 * <p> 74 * When this property is set to <code>false</code>, non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) 75 * character references are decoded as non-breaking space characters (U+00A0) instead of being converted to normal spaces (U+0020). 76 * <p> 77 * The default behaviour of the library reflects the fact that non-breaking space character references are almost always used in HTML documents 78 * as a <a target="_blank" href="http://en.wikipedia.org/wiki/Non-breaking_space#Use_as_non-collapsing_whitespace">non-collapsing white space</a> character. 79 * Converting them to the correct character code U+00A0, which is represented by a visible character in many older character sets, was confusing to most users 80 * who expected to see only normal spaces. 81 * The most common example of this is its visualisation as the character <b>á</b> in the MS-DOS <a target="_blank" href="http://en.wikipedia.org/wiki/Code_page_437">CP437</a> character set. 82 * <p> 83 * The functionality of the following methods is affected: 84 * <ul> 85 * <li>{@link CharacterReference#appendCharTo(Appendable)} 86 * <li>{@link CharacterReference#decode(CharSequence)} 87 * <li>{@link CharacterReference#decode(CharSequence, boolean insideAttributeValue)} 88 * <li>{@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} 89 * <li>{@link CharacterReference#reencode(CharSequence)} 90 * <li>{@link Attribute#getValue()} 91 * <li>{@link Attributes#getValue(String name)} 92 * <li>{@link Attributes#populateMap(Map, boolean convertNamesToLowerCase)} 93 * <li>{@link StartTag#getAttributeValue(String attributeName)} 94 * <li>{@link Element#getAttributeValue(String attributeName)} 95 * <li>{@link FormControl#getPredefinedValues()} 96 * <li>{@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} 97 * <li>{@link Renderer#getConvertNonBreakingSpaces()} 98 * <li>{@link TextExtractor#getConvertNonBreakingSpaces()} 99 * </ul> 100 */ 101 public static boolean ConvertNonBreakingSpaces=true; 102 103 104 /** 105 * Determines the currently active {@linkplain Config.CompatibilityMode compatibility mode}. 106 * <p> 107 * The default setting is {@link Config.CompatibilityMode#IE} (MS Internet Explorer 6.0). 108 * <p> 109 * Must not be <code>null</code>. 110 */ 111 public static CompatibilityMode CurrentCompatibilityMode=CompatibilityMode.IE; 112 113 /** 114 * Determines whether apostrophes are encoded when calling the {@link CharacterReference#encode(CharSequence)} method. 115 * <p> 116 * A value of <code>false</code> means {@linkplain CharacterEntityReference#_apos apostrophe} 117 * (U+0027) characters are not encoded. 118 * The only time apostrophes need to be encoded is within an attribute value delimited by 119 * single quotes (apostrophes), so in most cases ignoring apostrophes is perfectly safe and 120 * enhances the readability of the source document. 121 * <p> 122 * Note that apostrophes are always encoded as a {@linkplain NumericCharacterReference numeric character reference}, never as the 123 * character entity reference {@link CharacterEntityReference#_apos &apos;}. 124 * <p> 125 * The default value is <code>false</code>. 126 */ 127 public static boolean IsApostropheEncoded=false; 128 129 /** 130 * Determines whether all {@linkplain StartTag#isEmptyElementTag() empty-element tags} are recognised. 131 * <p> 132 * The major browsers do not recognise empty-element tags (those having the characters "/>" at the end of the start tag) if the element is defined by the 133 * HTML specification to have a {@linkplain HTMLElements#getEndTagRequiredElementNames() required} or an {@linkplain HTMLElements#getEndTagOptionalElementNames() optional} end tag. 134 * This is the case even in <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> documents, which can cause a lot of confusion. 135 * <p> 136 * Setting this property to <code>true</code> forces the parser to recognise all {@linkplain StartTag#isSyntacticalEmptyElementTag() syntactical empty-element tags}, 137 * regardless of whether the element is defined by the HTML specification to have a required or optional end tag. 138 * <p> 139 * Use of this feature is however not recommended as it makes the parser behaviour inconsistent with that of most browsers. 140 * <p> 141 * The default value is <code>false</code>. 142 * 143 * @see StartTag#isEmptyElementTag() 144 */ 145 public static boolean IsHTMLEmptyElementTagRecognised=false; 146 147 /** 148 * Determines the {@link LoggerProvider} that is used to create the default {@link Logger} object for each new {@link Source} object. 149 * <p> 150 * The {@link LoggerProvider} interface contains several predefined <code>LoggerProvider</code> instances which this property can be set to, 151 * mostly representing wrappers to common logging frameworks. 152 * <p> 153 * The default value is <code>null</code>, which results in the auto-detection of the most appropriate logging mechanism according to the following algorithm: 154 * <p> 155 * <ol> 156 * <li>If the class <code>org.slf4j.impl.StaticLoggerBinder</code> is detected: 157 * <ul> 158 * <li>If the class <code>org.slf4j.impl.JDK14LoggerFactory</code> is detected, use {@link LoggerProvider#JAVA}. 159 * <li>If the class <code>org.slf4j.impl.Log4jLoggerFactory</code> is detected, use {@link LoggerProvider#LOG4J}. 160 * <li>If the class <code>org.slf4j.impl.JCLLoggerFactory</code> is NOT detected, use {@link LoggerProvider#SLF4J}. 161 * </ul> 162 * <li>If the class <code>org.apache.commons.logging.Log</code> is detected: 163 * <blockquote> 164 * Create an instance of it using the commons-logging <code>LogFactory</code> class. 165 * <ul> 166 * <li>If the created <code>Log</code> is of type <code>org.apache.commons.logging.impl.Jdk14Logger</code>, use {@link LoggerProvider#JAVA}. 167 * <li>If the created <code>Log</code> is of type <code>org.apache.commons.logging.impl.Log4JLogger</code>, use {@link LoggerProvider#LOG4J}. 168 * <li>otherwise, use {@link LoggerProvider#JCL}. 169 * </ul> 170 * </blockquote> 171 * <li>If the class <code>org.apache.log4j.Logger</code> is detected, use {@link LoggerProvider#LOG4J}. 172 * <li>otherwise, use {@link LoggerProvider#JAVA}. 173 * </ol> 174 * 175 * @see Source#setLogger(Logger) 176 */ 177 public static LoggerProvider LoggerProvider=null; 178 179 /** 180 * Determines the string used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in text output throughout the library. 181 * <p> 182 * The default value is the standard new line character sequence of the host platform, determined by <code>System.getProperty("line.separator")</code>. 183 */ 184 public static String NewLine=System.getProperty("line.separator"); 185 186 /** 187 * Used in Element.getChildElements. 188 * Server elements containing markup should be included in the hierarchy, so consider making this option public in future. 189 */ 190 static final boolean IncludeServerTagsInElementHierarchy=false; 191 192 /** 193 * Represents a set of maximum unicode code points to be recognised for the three types of 194 * <a href="CharacterReference.html#Unterminated">unterminated</a> character reference in a given context. 195 * <p> 196 * The three types of character reference are: 197 * <ul> 198 * <li>{@linkplain CharacterEntityReference Character entity reference} 199 * <li><a href="NumericCharacterReference.html#DecimalCharacterReference">Decimal character reference</a> 200 * <li><a href="NumericCharacterReference.html#HexadecimalCharacterReference">Hexadecimal character reference</a> 201 * </ul> 202 * <p> 203 * The two types of contexts used in this library are: 204 * <ul> 205 * <li>Inside an attribute value 206 * <li>Outside an attribute value 207 * </ul> 208 */ 209 static class UnterminatedCharacterReferenceSettings { 210 // use volatile fields to make them thread safe 211 public volatile int characterEntityReferenceMaxCodePoint; 212 public volatile int decimalCharacterReferenceMaxCodePoint; 213 public volatile int hexadecimalCharacterReferenceMaxCodePoint; 214 215 public static UnterminatedCharacterReferenceSettings ACCEPT_ALL=new UnterminatedCharacterReferenceSettings(CompatibilityMode.CODE_POINTS_ALL,CompatibilityMode.CODE_POINTS_ALL,CompatibilityMode.CODE_POINTS_ALL); 216 UnterminatedCharacterReferenceSettings()217 public UnterminatedCharacterReferenceSettings() { 218 this(CompatibilityMode.CODE_POINTS_NONE,CompatibilityMode.CODE_POINTS_NONE,CompatibilityMode.CODE_POINTS_NONE); 219 } 220 UnterminatedCharacterReferenceSettings(final int characterEntityReferenceMaxCodePoint, final int decimalCharacterReferenceMaxCodePoint, final int hexadecimalCharacterReferenceMaxCodePoint)221 public UnterminatedCharacterReferenceSettings(final int characterEntityReferenceMaxCodePoint, final int decimalCharacterReferenceMaxCodePoint, final int hexadecimalCharacterReferenceMaxCodePoint) { 222 this.characterEntityReferenceMaxCodePoint=characterEntityReferenceMaxCodePoint; 223 this.decimalCharacterReferenceMaxCodePoint=decimalCharacterReferenceMaxCodePoint; 224 this.hexadecimalCharacterReferenceMaxCodePoint=hexadecimalCharacterReferenceMaxCodePoint; 225 } 226 toString()227 public String toString() { 228 return Config.NewLine+" Character entity reference: "+getDescription(characterEntityReferenceMaxCodePoint) 229 +Config.NewLine+" Decimal character reference: "+getDescription(decimalCharacterReferenceMaxCodePoint) 230 +Config.NewLine+" Haxadecimal character reference: "+getDescription(hexadecimalCharacterReferenceMaxCodePoint); 231 } 232 getDescription(final int codePoint)233 private String getDescription(final int codePoint) { 234 if (codePoint==CompatibilityMode.CODE_POINTS_NONE) return "None"; 235 if (codePoint==CompatibilityMode.CODE_POINTS_ALL) return "All"; 236 return "0x"+Integer.toString(codePoint,16); 237 } 238 } 239 240 /** 241 * Represents a set of configuration parameters that relate to 242 * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a> compatibility issues. 243 * <p> 244 * The predefined compatibility modes {@link #IE}, {@link #MOZILLA}, {@link #OPERA} and {@link #XHTML} provide an easy means of 245 * ensuring the library interprets the markup in a way consistent with some of the most commonly used browsers, 246 * at least in relation to the behaviour described by the properties in this class. 247 * <p> 248 * The properties of any <code>CompatibilityMode</code> object can be modified individually, including those in 249 * the predefined instances as well as newly constructed instances. 250 * Take note however that modifying the properties of the predefined instances has a global affect. 251 * <p> 252 * The currently active compatibility mode is stored in the static {@link Config#CurrentCompatibilityMode} property. 253 * <p> 254 */ 255 public static final class CompatibilityMode { 256 private String name; 257 private volatile boolean formFieldNameCaseInsensitive; 258 volatile UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsInsideAttributeValue; 259 volatile UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsOutsideAttributeValue; 260 261 /** 262 * Indicates the recognition of all unicode code points. 263 * <p> 264 * This value is used in properties which specify a maximum unicode code point to be recognised by the parser. 265 * 266 * @see #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue) 267 * @see #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue) 268 * @see #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue) 269 */ 270 public static final int CODE_POINTS_ALL=Character.MAX_CODE_POINT; // 0x10FFFF (decimal 1114111) 271 272 /** 273 * Indicates the recognition of no unicode code points. 274 * <p> 275 * This value is used in properties which specify a maximum unicode code point to be recognised by the parser. 276 * 277 * @see #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue) 278 * @see #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue) 279 * @see #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue) 280 */ 281 public static final int CODE_POINTS_NONE=CharacterReference.INVALID_CODE_POINT; 282 283 /** 284 * <a target="_blank" href="http://www.microsoft.com/windows/ie/">Microsoft Internet Explorer</a> compatibility mode. 285 * <p> 286 * <code>{@link #getName() Name} = IE</code><br /> 287 * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = true</code><br /> 288 * <table cellspacing="0" cellpadding="0"> 289 * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute) 290 * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+00FF<td align="center">U+00FF 291 * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All} 292 * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_NONE None} 293 * </table> 294 */ 295 public static final CompatibilityMode IE=new CompatibilityMode("IE",true, 296 new UnterminatedCharacterReferenceSettings(0xFF, CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes 297 new UnterminatedCharacterReferenceSettings(0xFF, CODE_POINTS_ALL, CODE_POINTS_NONE) // outside attributes 298 ); 299 300 /** 301 * <a target="_blank" href="http://www.mozilla.org/products/mozilla1.x/">Mozilla</a> / 302 * <a target="_blank" href="http://www.mozilla.org/products/firefox/">Firefox</a> / 303 * <a target="_blank" href="http://browser.netscape.com/">Netscape</a> compatibility mode. 304 * <p> 305 * <code>{@link #getName() Name} = Mozilla</code><br /> 306 * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = false</code><br /> 307 * <table cellspacing="0" cellpadding="0"> 308 * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute) 309 * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+00FF<td align="center">{@linkplain #CODE_POINTS_ALL All} 310 * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All} 311 * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All} 312 * </table> 313 */ 314 public static final CompatibilityMode MOZILLA=new CompatibilityMode("Mozilla",false, 315 new UnterminatedCharacterReferenceSettings(0xFF, CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes 316 new UnterminatedCharacterReferenceSettings(CODE_POINTS_ALL, CODE_POINTS_ALL, CODE_POINTS_ALL) // outside attributes 317 ); 318 319 /** 320 * Opera compatibility mode. 321 * <p> 322 * <code>{@link #getName() Name} = Opera</code><br /> 323 * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = true</code><br /> 324 * <table cellspacing="0" cellpadding="0"> 325 * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute) 326 * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+003E<td align="center">{@linkplain #CODE_POINTS_ALL All} 327 * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All} 328 * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All} 329 * </table> 330 */ 331 public static final CompatibilityMode OPERA=new CompatibilityMode("Opera",true, 332 new UnterminatedCharacterReferenceSettings(0x3E, CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes 333 new UnterminatedCharacterReferenceSettings(CODE_POINTS_ALL, CODE_POINTS_ALL, CODE_POINTS_ALL) // outside attributes 334 ); 335 336 /** 337 * <a target="_blank" href="http://www.w3.org/TR/xhtml1/#xhtml">XHTML</a> compatibility mode. 338 * <p> 339 * <code>{@link #getName() Name} = XHTML</code><br /> 340 * <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = false</code><br /> 341 * <table cellspacing="0" cellpadding="0"> 342 * <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute) 343 * <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None} 344 * <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None} 345 * <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None} 346 * </table> 347 */ 348 public static final CompatibilityMode XHTML=new CompatibilityMode("XHTML"); 349 350 /** 351 * Constructs a new <code>CompatibilityMode</code> with the given {@linkplain #getName() name}. 352 * <p> 353 * All properties in the new instance are initially assigned their default values, which are the same as the strict 354 * rules of the {@link #XHTML} compatibility mode. 355 * 356 * @param name the {@linkplain #getName() name} of the new compatibility mode 357 */ CompatibilityMode(final String name)358 public CompatibilityMode(final String name) { 359 this(name,false,new UnterminatedCharacterReferenceSettings(),new UnterminatedCharacterReferenceSettings()); 360 } 361 CompatibilityMode(final String name, final boolean formFieldNameCaseInsensitive, final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsInsideAttributeValue, final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsOutsideAttributeValue)362 private CompatibilityMode(final String name, final boolean formFieldNameCaseInsensitive, final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsInsideAttributeValue, final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsOutsideAttributeValue) { 363 this.name=name; 364 this.formFieldNameCaseInsensitive=formFieldNameCaseInsensitive; 365 this.unterminatedCharacterReferenceSettingsInsideAttributeValue=unterminatedCharacterReferenceSettingsInsideAttributeValue; 366 this.unterminatedCharacterReferenceSettingsOutsideAttributeValue=unterminatedCharacterReferenceSettingsOutsideAttributeValue; 367 } 368 369 /** 370 * Returns the name of this compatibility mode. 371 * @return the name of this compatibility mode. 372 */ getName()373 public String getName() { 374 return name; 375 } 376 377 /** 378 * Indicates whether {@linkplain FormField#getName() form field names} are treated as case insensitive. 379 * <p> 380 * Microsoft Internet Explorer treats field names as case insensitive, 381 * while Mozilla treats them as case sensitive. 382 * <p> 383 * The value of this property in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode} 384 * affects all instances of the {@link FormFields} class. 385 * It should be set to the desired configuration before any instances of <code>FormFields</code> are created. 386 * 387 * @return <code>true</code> if {@linkplain FormField#getName() form field names} are treated as case insensitive, otherwise <code>false</code>. 388 * @see #setFormFieldNameCaseInsensitive(boolean) 389 */ isFormFieldNameCaseInsensitive()390 public boolean isFormFieldNameCaseInsensitive() { 391 return formFieldNameCaseInsensitive; 392 } 393 394 /** 395 * Sets whether {@linkplain FormField#getName() form field names} are treated as case insensitive. 396 * <p> 397 * See {@link #isFormFieldNameCaseInsensitive()} for the documentation of this property. 398 * 399 * @param value the new value of the property 400 */ setFormFieldNameCaseInsensitive(final boolean value)401 public void setFormFieldNameCaseInsensitive(final boolean value) { 402 formFieldNameCaseInsensitive=value; 403 } 404 405 /** 406 * Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> 407 * {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context. 408 * <p> 409 * For example, if <code>getUnterminatedCharacterEntityReferenceMaxCodePoint(true)</code> has the value <code>0xFF</code> (U+00FF) 410 * in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then: 411 * <ul> 412 * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&gt",true)} 413 * returns "<code>></code>".<br /> 414 * The string is recognised as the character entity reference {@link CharacterEntityReference#_gt &gt;} 415 * despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>, 416 * because its unicode code point U+003E is below the maximum of U+00FF set by this property. 417 * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&euro",true)} 418 * returns "<code>&euro</code>".<br /> 419 * The string is not recognised as the character entity reference {@link CharacterEntityReference#_euro &euro;} 420 * because it is <a href="CharacterReference.html#Unterminated">unterminated</a> 421 * and its unicode code point U+20AC is above the maximum of U+00FF set by this property. 422 * </ul> 423 * <p> 424 * See the documentation of the {@link Attribute#getValue()} method for further discussion. 425 * 426 * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value. 427 * @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context. 428 * @see #setUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint) 429 */ getUnterminatedCharacterEntityReferenceMaxCodePoint(final boolean insideAttributeValue)430 public int getUnterminatedCharacterEntityReferenceMaxCodePoint(final boolean insideAttributeValue) { 431 return getUnterminatedCharacterReferenceSettings(insideAttributeValue).characterEntityReferenceMaxCodePoint; 432 } 433 434 /** 435 * Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> 436 * {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context. 437 * <p> 438 * See {@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property. 439 * 440 * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value. 441 * @param maxCodePoint the maximum unicode code point. 442 */ setUnterminatedCharacterEntityReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint)443 public void setUnterminatedCharacterEntityReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint) { 444 getUnterminatedCharacterReferenceSettings(insideAttributeValue).characterEntityReferenceMaxCodePoint=maxCodePoint; 445 } 446 447 /** 448 * Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> 449 * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context. 450 * <p> 451 * For example, if <code>getUnterminatedDecimalCharacterReferenceMaxCodePoint(true)</code> had the hypothetical value <code>0xFF</code> (U+00FF) 452 * in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then: 453 * <ul> 454 * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#62",true)} 455 * returns "<code>></code>".<br /> 456 * The string is recognised as the numeric character reference <code>&#62;</code> 457 * despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>, 458 * because its unicode code point U+003E is below the maximum of U+00FF set by this property. 459 * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#8364",true)} 460 * returns "<code>&#8364</code>".<br /> 461 * The string is not recognised as the numeric character reference <code>&#8364;</code> 462 * because it is <a href="CharacterReference.html#Unterminated">unterminated</a> 463 * and its unicode code point U+20AC is above the maximum of U+00FF set by this property. 464 * </ul> 465 * 466 * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value. 467 * @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context. 468 * @see #setUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint) 469 */ getUnterminatedDecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue)470 public int getUnterminatedDecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue) { 471 return getUnterminatedCharacterReferenceSettings(insideAttributeValue).decimalCharacterReferenceMaxCodePoint; 472 } 473 474 /** 475 * Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> 476 * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context. 477 * <p> 478 * See {@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property. 479 * 480 * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value. 481 * @param maxCodePoint the maximum unicode code point. 482 */ setUnterminatedDecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint)483 public void setUnterminatedDecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint) { 484 getUnterminatedCharacterReferenceSettings(insideAttributeValue).decimalCharacterReferenceMaxCodePoint=maxCodePoint; 485 } 486 487 /** 488 * Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> 489 * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character reference</a> which is to be recognised in the specified context. 490 * <p> 491 * For example, if <code>getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(true)</code> had the hypothetical value <code>0xFF</code> (U+00FF) 492 * in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then: 493 * <ul> 494 * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#x3e",true)} 495 * returns "<code>></code>".<br /> 496 * The string is recognised as the numeric character reference <code>&#x3e;</code> 497 * despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>, 498 * because its unicode code point U+003E is below the maximum of U+00FF set by this property. 499 * <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#x20ac",true)} 500 * returns "<code>&#x20ac</code>".<br /> 501 * The string is not recognised as the numeric character reference <code>&#20ac;</code> 502 * because it is <a href="CharacterReference.html#Unterminated">unterminated</a> 503 * and its unicode code point U+20AC is above the maximum of U+00FF set by this property. 504 * </ul> 505 * 506 * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value. 507 * @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character reference</a> which is to be recognised in the specified context. 508 * @see #setUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint) 509 */ getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue)510 public int getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue) { 511 return getUnterminatedCharacterReferenceSettings(insideAttributeValue).hexadecimalCharacterReferenceMaxCodePoint; 512 } 513 514 /** 515 * Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> 516 * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">headecimal character reference</a> which is to be recognised in the specified context. 517 * <p> 518 * See {@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property. 519 * 520 * @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value. 521 * @param maxCodePoint the maximum unicode code point. 522 */ setUnterminatedHexadecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint)523 public void setUnterminatedHexadecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint) { 524 getUnterminatedCharacterReferenceSettings(insideAttributeValue).hexadecimalCharacterReferenceMaxCodePoint=maxCodePoint; 525 } 526 527 /** 528 * Returns a string representation of this object useful for debugging purposes. 529 * @return a string representation of this object useful for debugging purposes. 530 */ getDebugInfo()531 public String getDebugInfo() { 532 return "Form field name case insensitive: "+formFieldNameCaseInsensitive 533 +Config.NewLine+"Maximum codepoints in unterminated character references:" 534 +Config.NewLine+" Inside attribute values:" 535 +unterminatedCharacterReferenceSettingsInsideAttributeValue 536 +Config.NewLine+" Outside attribute values:" 537 +unterminatedCharacterReferenceSettingsOutsideAttributeValue; 538 } 539 540 /** 541 * Returns the {@linkplain #getName() name} of this compatibility mode. 542 * @return the {@linkplain #getName() name} of this compatibility mode. 543 */ toString()544 public String toString() { 545 return getName(); 546 } 547 getUnterminatedCharacterReferenceSettings(final boolean insideAttributeValue)548 UnterminatedCharacterReferenceSettings getUnterminatedCharacterReferenceSettings(final boolean insideAttributeValue) { 549 return insideAttributeValue ? unterminatedCharacterReferenceSettingsInsideAttributeValue : unterminatedCharacterReferenceSettingsOutsideAttributeValue; 550 } 551 } 552 } 553