1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML 2 // Version 3.2 3 // Copyright (C) 2004-2009 Martin Jericho 4 // http://jericho.htmlparser.net/ 5 // 6 // This library is free software; you can redistribute it and/or 7 // modify it under the terms of either one of the following licences: 8 // 9 // 1. The Eclipse Public License (EPL) version 1.0, 10 // included in this distribution in the file licence-epl-1.0.html 11 // or available at http://www.eclipse.org/legal/epl-v10.html 12 // 13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, 14 // included in this distribution in the file licence-lgpl-2.1.txt 15 // or available at http://www.gnu.org/licenses/lgpl.txt 16 // 17 // This library is distributed on an "AS IS" basis, 18 // WITHOUT WARRANTY OF ANY KIND, either express or implied. 19 // See the individual licence texts for more details. 20 21 package net.htmlparser.jericho; 22 23 import java.io.*; 24 25 /** 26 * Represents a single <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#h-3.2.2">attribute</a> 27 * name/value segment within a {@link StartTag}. 28 * <p> 29 * An instance of this class is a representation of a single attribute in the source document and is not modifiable. 30 * The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods 31 * provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}. 32 * <p> 33 * Obtained using the {@link Attributes#get(String key)} method. 34 * <p> 35 * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-attr">attributes</a>. 36 * 37 * @see Attributes 38 */ 39 public final class Attribute extends Segment { 40 private final String key; 41 private final Segment nameSegment; 42 private final Segment valueSegment; 43 private final Segment valueSegmentIncludingQuotes; 44 StartTag startTag=StartTag.NOT_CACHED; 45 46 static final String CHECKED="checked"; 47 static final String CLASS="class"; 48 static final String DISABLED="disabled"; 49 static final String ID="id"; 50 static final String MULTIPLE="multiple"; 51 static final String NAME="name"; 52 static final String SELECTED="selected"; 53 static final String STYLE="style"; 54 static final String TYPE="type"; 55 static final String VALUE="value"; 56 57 /** 58 * Constructs a new Attribute with no value part, called from Attributes class. 59 * <p> 60 * Note that the resulting Attribute segment has the same span as the supplied nameSegment. 61 * 62 * @param source the {@link Source} document. 63 * @param key the name of this attribute in lower case. 64 * @param nameSegment the segment representing the name. 65 */ Attribute(final Source source, final String key, final Segment nameSegment)66 Attribute(final Source source, final String key, final Segment nameSegment) { 67 this(source,key,nameSegment,null,null); 68 } 69 70 /** 71 * Constructs a new Attribute, called from Attributes class. 72 * <p> 73 * The resulting Attribute segment begins at the start of the nameSegment 74 * and finishes at the end of the valueSegmentIncludingQuotes. If this attribute 75 * has no value, it finishes at the end of the nameSegment. 76 * <p> 77 * If this attribute has no value, the <code>valueSegment</code> and <code>valueSegmentIncludingQuotes</code> must be null. 78 * The <valueSegmentIncludingQuotes</code> parameter must not be null if the <code>valueSegment</code> is not null, and vice versa 79 * 80 * @param source the {@link Source} document. 81 * @param key the name of this attribute in lower case. 82 * @param nameSegment the segment spanning the name. 83 * @param valueSegment the segment spanning the value. 84 * @param valueSegmentIncludingQuotes the segment spanning the value, including quotation marks if any. 85 */ Attribute(final Source source, final String key, final Segment nameSegment, final Segment valueSegment, final Segment valueSegmentIncludingQuotes)86 Attribute(final Source source, final String key, final Segment nameSegment, final Segment valueSegment, final Segment valueSegmentIncludingQuotes) { 87 super(source,nameSegment.getBegin(),(valueSegmentIncludingQuotes==null ? nameSegment.getEnd() : valueSegmentIncludingQuotes.getEnd())); 88 this.key=key; 89 this.nameSegment=nameSegment; 90 this.valueSegment=valueSegment; 91 this.valueSegmentIncludingQuotes=valueSegmentIncludingQuotes; 92 } 93 94 /** 95 * Returns the name of this attribute in lower case. 96 * <p> 97 * This package treats all attribute names as case insensitive, consistent with 98 * <a target="_blank" href="http://www.w3.org/TR/html401/">HTML</a> but not consistent with 99 * <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a>. 100 * 101 * @return the name of this attribute in lower case. 102 * @see #getName() 103 */ getKey()104 public String getKey() { 105 return key; 106 } 107 108 /** 109 * Returns the name of this attribute in original case. 110 * <p> 111 * This is exactly equivalent to {@link #getNameSegment()}<code>.toString()</code>. 112 * 113 * @return the name of this attribute in original case. 114 * @see #getKey() 115 */ getName()116 public String getName() { 117 return nameSegment.toString(); 118 } 119 120 /** 121 * Returns the segment spanning the {@linkplain #getName() name} of this attribute. 122 * @return the segment spanning the {@linkplain #getName() name} of this attribute. 123 * @see #getName() 124 */ getNameSegment()125 public Segment getNameSegment() { 126 return nameSegment; 127 } 128 129 /** 130 * Indicates whether this attribute has a value. 131 * <p> 132 * This method also returns <code>true</code> if this attribute has been assigned a zero-length value. 133 * <p> 134 * It only returns <code>false</code> if this attribute appears in 135 * <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-boolean_attribute-1">minimized form</a>. 136 * 137 * @return <code>true</code> if this attribute has a value, otherwise <code>false</code>. 138 */ hasValue()139 public boolean hasValue() { 140 return valueSegment!=null; 141 } 142 143 /** 144 * Returns the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute, 145 * or <code>null</code> if it {@linkplain #hasValue() has no value}. 146 * <p> 147 * This is equivalent to {@link CharacterReference}<code>.</code>{@link CharacterReference#decode(CharSequence,boolean) decode}<code>(</code>{@link #getValueSegment()}<code>,true)</code>. 148 * <p> 149 * Note that before version 1.4.1 this method returned the raw value of the attribute as it appears in the source document, 150 * without {@linkplain CharacterReference#decode(CharSequence,boolean) decoding}. 151 * <p> 152 * To obtain the raw value without decoding, use {@link #getValueSegment()}<code>.toString()</code>. 153 * <p> 154 * Special attention should be given to attributes that contain URLs, such as the 155 * <code><a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a></code> attribute. 156 * When such an attribute contains a URL with parameters (as described in the 157 * <a target="_blank" href="http://www.w3.org/MarkUp/html-spec/html-spec_8.html#SEC8.2.1">form-urlencoded media type</a>), 158 * the ampersand (<code>&</code>) characters used to separate the parameters should be 159 * {@linkplain CharacterReference#encode(CharSequence) encoded} to prevent the parameter names from being 160 * unintentionally interpreted as {@linkplain CharacterEntityReference character entity references}. 161 * This requirement is explicitly stated in the 162 * <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#h-5.3.2">HTML 4.01 specification section 5.3.2</a>. 163 * <p> 164 * For example, take the following element in the source document: 165 * <div style="margin: 0.5em"><code><a href="Report.jsp?chapt=2&sect=3">next</a></code></div> 166 * By default, calling 167 * {@link Element#getAttributes() getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue}<code>("href")</code> 168 * on this element returns the string 169 * "<code>Report.jsp?chapt=2§=3</code>", since the text "<code>&sect</code>" is interpreted as the rarely used 170 * character entity reference {@link CharacterEntityReference#_sect &sect;} (U+00A7), despite the fact that it is 171 * missing the {@linkplain CharacterReference#isTerminated() terminating semicolon} (<code>;</code>). 172 * <p> 173 * Most browsers recognise <a href="CharacterReference.html#Unterminated">unterminated</a> character entity references 174 * in attribute values representing a codepoint of U+00FF or below, but ignore those representing codepoints above this value. 175 * One relatively popular browser only recognises those representing a codepoint of U+003E or below, meaning it would 176 * have interpreted the URL in the above example differently to most other browsers. 177 * Most browsers also use different rules depending on whether the unterminated character reference is inside or outside 178 * of an attribute value, with both of these possibilities further split into different rules for 179 * {@linkplain CharacterEntityReference character entity references}, 180 * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character references</a>, and 181 * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character references</a>. 182 * <p> 183 * The behaviour of this library is determined by the current {@linkplain Config.CompatibilityMode compatibility mode} setting, 184 * which is determined by the static {@link Config#CurrentCompatibilityMode} property. 185 * 186 * @return the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}. 187 */ getValue()188 public String getValue() { 189 return CharacterReference.decode(valueSegment,true); 190 } 191 192 /** 193 * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}. 194 * @return the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}. 195 * @see #getValue() 196 */ getValueSegment()197 public Segment getValueSegment() { 198 return valueSegment; 199 } 200 201 /** 202 * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any, 203 * or <code>null</code> if it {@linkplain #hasValue() has no value}. 204 * <p> 205 * If the value is not enclosed by quotation marks, this is the same as the {@linkplain #getValueSegment() value segment} 206 * 207 * @return the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any, or <code>null</code> if it {@linkplain #hasValue() has no value}. 208 */ getValueSegmentIncludingQuotes()209 public Segment getValueSegmentIncludingQuotes() { 210 return valueSegmentIncludingQuotes; 211 } 212 213 /** 214 * Returns the character used to quote the value. 215 * <p> 216 * The return value is either a double-quote (<code>"</code>), a single-quote (<code>'</code>), or a space. 217 * 218 * @return the character used to quote the value, or a space if the value is not quoted or this attribute has no value. 219 */ getQuoteChar()220 public char getQuoteChar() { 221 if (valueSegment==valueSegmentIncludingQuotes) return ' '; // no quotes 222 return source.charAt(valueSegmentIncludingQuotes.getBegin()); 223 } 224 225 /** 226 * Returns the start tag to which this attribute belongs. 227 * @return the start tag to which this attribute belongs, or <code>null</code> if it is not within a start tag. 228 */ getStartTag()229 public StartTag getStartTag() { 230 if (startTag==StartTag.NOT_CACHED) { 231 final Tag tag=source.getEnclosingTag(begin); 232 startTag=(tag==null || tag instanceof EndTag) ? null : (StartTag)tag; 233 } 234 return startTag; 235 } 236 237 /** 238 * Returns a string representation of this object useful for debugging purposes. 239 * @return a string representation of this object useful for debugging purposes. 240 */ getDebugInfo()241 public String getDebugInfo() { 242 final StringBuilder sb=new StringBuilder().append(key).append(super.getDebugInfo()).append(",name=").append(nameSegment.getDebugInfo()); 243 if (hasValue()) 244 sb.append(",value=").append(valueSegment.getDebugInfo()).append('"').append(valueSegment).append('"').append(Config.NewLine); 245 else 246 sb.append(",NO VALUE").append(Config.NewLine); 247 return sb.toString(); 248 } 249 appendTidy(final Appendable appendable, Tag nextTag)250 Tag appendTidy(final Appendable appendable, Tag nextTag) throws IOException { 251 appendable.append(' ').append(nameSegment); 252 if (valueSegment!=null) { 253 appendable.append("=\""); 254 while (nextTag!=null && nextTag.begin<valueSegment.begin) nextTag=nextTag.getNextTag(); 255 if (nextTag==null || nextTag.begin>=valueSegment.end) { 256 appendTidyValue(appendable,valueSegment); 257 } else { 258 int i=valueSegment.begin; 259 while (nextTag!=null && nextTag.begin<valueSegment.end) { 260 appendTidyValue(appendable,new Segment(source,i,nextTag.begin)); 261 if (nextTag.end>valueSegment.end) { 262 appendable.append(new Segment(source,nextTag.begin,i=valueSegment.end)); 263 break; 264 } 265 appendable.append(nextTag); 266 i=nextTag.end; 267 nextTag=nextTag.getNextTag(); 268 } 269 if (i<valueSegment.end) appendTidyValue(appendable,new Segment(source,i,valueSegment.end)); 270 } 271 appendable.append('"'); 272 } 273 return nextTag; 274 } 275 appendTidyValue(final Appendable appendable, final CharSequence unencodedValue)276 private static void appendTidyValue(final Appendable appendable, final CharSequence unencodedValue) throws IOException { 277 CharacterReference.appendEncode(appendable,CharacterReference.decode(unencodedValue,true),false); 278 } 279 appendHTML(final Appendable appendable, final CharSequence name, final CharSequence value)280 static Appendable appendHTML(final Appendable appendable, final CharSequence name, final CharSequence value) throws IOException { 281 appendable.append(' ').append(name); 282 if (value!=null) { 283 appendable.append("=\""); 284 CharacterReference.appendEncode(appendable,value,false); 285 appendable.append('"'); 286 } 287 return appendable; 288 } 289 } 290