1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20 
21 package net.htmlparser.jericho;
22 
23 import java.io.*;
24 
25 /**
26  * Represents a single <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#h-3.2.2">attribute</a>
27  * name/value segment within a {@link StartTag}.
28  * <p>
29  * An instance of this class is a representation of a single attribute in the source document and is not modifiable.
30  * The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods
31  * provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}.
32  * <p>
33  * Obtained using the {@link Attributes#get(String key)} method.
34  * <p>
35  * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-attr">attributes</a>.
36  *
37  * @see Attributes
38  */
39 public final class Attribute extends Segment {
40 	private final String key;
41 	private final Segment nameSegment;
42 	private final Segment valueSegment;
43 	private final Segment valueSegmentIncludingQuotes;
44 	StartTag startTag=StartTag.NOT_CACHED;
45 
46 	static final String CHECKED="checked";
47 	static final String CLASS="class";
48 	static final String DISABLED="disabled";
49 	static final String ID="id";
50 	static final String MULTIPLE="multiple";
51 	static final String NAME="name";
52 	static final String SELECTED="selected";
53 	static final String STYLE="style";
54 	static final String TYPE="type";
55 	static final String VALUE="value";
56 
57 	/**
58 	 * Constructs a new Attribute with no value part, called from Attributes class.
59 	 * <p>
60 	 * Note that the resulting Attribute segment has the same span as the supplied nameSegment.
61 	 *
62 	 * @param source  the {@link Source} document.
63 	 * @param key  the name of this attribute in lower case.
64 	 * @param nameSegment  the segment representing the name.
65 	 */
Attribute(final Source source, final String key, final Segment nameSegment)66 	Attribute(final Source source, final String key, final Segment nameSegment) {
67 		this(source,key,nameSegment,null,null);
68 	}
69 
70 	/**
71 	 * Constructs a new Attribute, called from Attributes class.
72 	 * <p>
73 	 * The resulting Attribute segment begins at the start of the nameSegment
74 	 * and finishes at the end of the valueSegmentIncludingQuotes.  If this attribute
75 	 * has no value, it finishes at the end of the nameSegment.
76 	 * <p>
77 	 * If this attribute has no value, the <code>valueSegment</code> and <code>valueSegmentIncludingQuotes</code> must be null.
78 	 * The <valueSegmentIncludingQuotes</code> parameter must not be null if the <code>valueSegment</code> is not null, and vice versa
79 	 *
80 	 * @param source  the {@link Source} document.
81 	 * @param key  the name of this attribute in lower case.
82 	 * @param nameSegment  the segment spanning the name.
83 	 * @param valueSegment  the segment spanning the value.
84 	 * @param valueSegmentIncludingQuotes  the segment spanning the value, including quotation marks if any.
85 	 */
Attribute(final Source source, final String key, final Segment nameSegment, final Segment valueSegment, final Segment valueSegmentIncludingQuotes)86 	Attribute(final Source source, final String key, final Segment nameSegment, final Segment valueSegment, final Segment valueSegmentIncludingQuotes) {
87 		super(source,nameSegment.getBegin(),(valueSegmentIncludingQuotes==null ? nameSegment.getEnd() : valueSegmentIncludingQuotes.getEnd()));
88 		this.key=key;
89 		this.nameSegment=nameSegment;
90 		this.valueSegment=valueSegment;
91 		this.valueSegmentIncludingQuotes=valueSegmentIncludingQuotes;
92 	}
93 
94 	/**
95 	 * Returns the name of this attribute in lower case.
96 	 * <p>
97 	 * This package treats all attribute names as case insensitive, consistent with
98 	 * <a target="_blank" href="http://www.w3.org/TR/html401/">HTML</a> but not consistent with
99 	 * <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a>.
100 	 *
101 	 * @return the name of this attribute in lower case.
102 	 * @see #getName()
103 	 */
getKey()104 	public String getKey() {
105 		return key;
106 	}
107 
108 	/**
109 	 * Returns the name of this attribute in original case.
110 	 * <p>
111 	 * This is exactly equivalent to {@link #getNameSegment()}<code>.toString()</code>.
112 	 *
113 	 * @return the name of this attribute in original case.
114 	 * @see #getKey()
115 	 */
getName()116 	public String getName() {
117 		return nameSegment.toString();
118 	}
119 
120 	/**
121 	 * Returns the segment spanning the {@linkplain #getName() name} of this attribute.
122 	 * @return the segment spanning the {@linkplain #getName() name} of this attribute.
123 	 * @see #getName()
124 	 */
getNameSegment()125 	public Segment getNameSegment() {
126 		return nameSegment;
127 	}
128 
129 	/**
130 	 * Indicates whether this attribute has a value.
131 	 * <p>
132 	 * This method also returns <code>true</code> if this attribute has been assigned a zero-length value.
133 	 * <p>
134 	 * It only returns <code>false</code> if this attribute appears in
135 	 * <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-boolean_attribute-1">minimized form</a>.
136 	 *
137 	 * @return <code>true</code> if this attribute has a value, otherwise <code>false</code>.
138 	 */
hasValue()139 	public boolean hasValue() {
140 		return valueSegment!=null;
141 	}
142 
143 	/**
144 	 * Returns the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute,
145 	 * or <code>null</code> if it {@linkplain #hasValue() has no value}.
146 	 * <p>
147 	 * This is equivalent to {@link CharacterReference}<code>.</code>{@link CharacterReference#decode(CharSequence,boolean) decode}<code>(</code>{@link #getValueSegment()}<code>,true)</code>.
148 	 * <p>
149 	 * Note that before version 1.4.1 this method returned the raw value of the attribute as it appears in the source document,
150 	 * without {@linkplain CharacterReference#decode(CharSequence,boolean) decoding}.
151 	 * <p>
152 	 * To obtain the raw value without decoding, use {@link #getValueSegment()}<code>.toString()</code>.
153 	 * <p>
154 	 * Special attention should be given to attributes that contain URLs, such as the
155 	 * <code><a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a></code> attribute.
156 	 * When such an attribute contains a URL with parameters (as described in the
157 	 * <a target="_blank" href="http://www.w3.org/MarkUp/html-spec/html-spec_8.html#SEC8.2.1">form-urlencoded media type</a>),
158 	 * the ampersand (<code>&amp;</code>) characters used to separate the parameters should be
159 	 * {@linkplain CharacterReference#encode(CharSequence) encoded} to prevent the parameter names from being
160 	 * unintentionally interpreted as {@linkplain CharacterEntityReference character entity references}.
161 	 * This requirement is explicitly stated in the
162 	 * <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#h-5.3.2">HTML 4.01 specification section 5.3.2</a>.
163 	 * <p>
164 	 * For example, take the following element in the source document:
165 	 * <div style="margin: 0.5em"><code>&lt;a href="Report.jsp?chapt=2&amp;sect=3"&gt;next&lt;/a&gt;</code></div>
166 	 * By default, calling
167 	 * {@link Element#getAttributes() getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue}<code>("href")</code>
168 	 * on this element returns the string
169 	 * "<code>Report.jsp?chapt=2&sect;=3</code>", since the text "<code>&amp;sect</code>" is interpreted as the rarely used
170 	 * character entity reference {@link CharacterEntityReference#_sect &amp;sect;} (U+00A7), despite the fact that it is
171 	 * missing the {@linkplain CharacterReference#isTerminated() terminating semicolon} (<code>;</code>).
172 	 * <p>
173 	 * Most browsers recognise <a href="CharacterReference.html#Unterminated">unterminated</a> character entity references
174 	 * in attribute values representing a codepoint of U+00FF or below, but ignore those representing codepoints above this value.
175  	 * One relatively popular browser only recognises those representing a codepoint of U+003E or below, meaning it would
176  	 * have interpreted the URL in the above example differently to most other browsers.
177 	 * Most browsers also use different rules depending on whether the unterminated character reference is inside or outside
178 	 * of an attribute value, with both of these possibilities further split into different rules for
179 	 * {@linkplain CharacterEntityReference character entity references},
180 	 * <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character references</a>, and
181 	 * <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character references</a>.
182 	 * <p>
183 	 * The behaviour of this library is determined by the current {@linkplain Config.CompatibilityMode compatibility mode} setting,
184 	 * which is determined by the static {@link Config#CurrentCompatibilityMode} property.
185 	 *
186 	 * @return the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
187 	 */
getValue()188 	public String getValue() {
189 		return CharacterReference.decode(valueSegment,true);
190 	}
191 
192 	/**
193 	 * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
194 	 * @return the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
195 	 * @see #getValue()
196 	 */
getValueSegment()197 	public Segment getValueSegment() {
198 		return valueSegment;
199 	}
200 
201 	/**
202 	 * Returns the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any,
203 	 * or <code>null</code> if it {@linkplain #hasValue() has no value}.
204 	 * <p>
205 	 * If the value is not enclosed by quotation marks, this is the same as the {@linkplain #getValueSegment() value segment}
206 	 *
207 	 * @return the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any, or <code>null</code> if it {@linkplain #hasValue() has no value}.
208 	 */
getValueSegmentIncludingQuotes()209 	public Segment getValueSegmentIncludingQuotes() {
210 		return valueSegmentIncludingQuotes;
211 	}
212 
213 	/**
214 	 * Returns the character used to quote the value.
215 	 * <p>
216 	 * The return value is either a double-quote (<code>"</code>), a single-quote (<code>'</code>), or a space.
217 	 *
218 	 * @return the character used to quote the value, or a space if the value is not quoted or this attribute has no value.
219 	 */
getQuoteChar()220 	public char getQuoteChar() {
221 		if (valueSegment==valueSegmentIncludingQuotes) return ' '; // no quotes
222 		return source.charAt(valueSegmentIncludingQuotes.getBegin());
223 	}
224 
225 	/**
226 	 * Returns the start tag to which this attribute belongs.
227 	 * @return the start tag to which this attribute belongs, or <code>null</code> if it is not within a start tag.
228 	 */
getStartTag()229 	public StartTag getStartTag() {
230 		if (startTag==StartTag.NOT_CACHED) {
231 			final Tag tag=source.getEnclosingTag(begin);
232 			startTag=(tag==null || tag instanceof EndTag) ? null : (StartTag)tag;
233 		}
234 		return startTag;
235 	}
236 
237 	/**
238 	 * Returns a string representation of this object useful for debugging purposes.
239 	 * @return a string representation of this object useful for debugging purposes.
240 	 */
getDebugInfo()241 	public String getDebugInfo() {
242 		final StringBuilder sb=new StringBuilder().append(key).append(super.getDebugInfo()).append(",name=").append(nameSegment.getDebugInfo());
243 		if (hasValue())
244 			sb.append(",value=").append(valueSegment.getDebugInfo()).append('"').append(valueSegment).append('"').append(Config.NewLine);
245 		else
246 			sb.append(",NO VALUE").append(Config.NewLine);
247 		return sb.toString();
248 	}
249 
appendTidy(final Appendable appendable, Tag nextTag)250 	Tag appendTidy(final Appendable appendable, Tag nextTag) throws IOException {
251 		appendable.append(' ').append(nameSegment);
252 		if (valueSegment!=null) {
253 			appendable.append("=\"");
254 			while (nextTag!=null && nextTag.begin<valueSegment.begin) nextTag=nextTag.getNextTag();
255 			if (nextTag==null || nextTag.begin>=valueSegment.end) {
256 				appendTidyValue(appendable,valueSegment);
257 			} else {
258 				int i=valueSegment.begin;
259 				while (nextTag!=null && nextTag.begin<valueSegment.end) {
260 					appendTidyValue(appendable,new Segment(source,i,nextTag.begin));
261 					if (nextTag.end>valueSegment.end) {
262 						appendable.append(new Segment(source,nextTag.begin,i=valueSegment.end));
263 						break;
264 					}
265 					appendable.append(nextTag);
266 					i=nextTag.end;
267 					nextTag=nextTag.getNextTag();
268 				}
269 				if (i<valueSegment.end) appendTidyValue(appendable,new Segment(source,i,valueSegment.end));
270 			}
271 			appendable.append('"');
272 		}
273 		return nextTag;
274 	}
275 
appendTidyValue(final Appendable appendable, final CharSequence unencodedValue)276 	private static void appendTidyValue(final Appendable appendable, final CharSequence unencodedValue) throws IOException {
277 		CharacterReference.appendEncode(appendable,CharacterReference.decode(unencodedValue,true),false);
278 	}
279 
appendHTML(final Appendable appendable, final CharSequence name, final CharSequence value)280 	static Appendable appendHTML(final Appendable appendable, final CharSequence name, final CharSequence value) throws IOException {
281 		appendable.append(' ').append(name);
282 		if (value!=null) {
283 			appendable.append("=\"");
284 			CharacterReference.appendEncode(appendable,value,false);
285 			appendable.append('"');
286 		}
287 		return appendable;
288 	}
289 }
290