1 /* Copyright 2002-2005, 2010 Elliotte Rusty Harold
2 
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6 
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU Lesser General Public License for more details.
11 
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307  USA
16 
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@ibiblio.org. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */
21 
22 package nu.xom;
23 
24 import java.io.UnsupportedEncodingException;
25 
26 /**
27  * <p>
28  *   This class represents a run of text.
29  *   CDATA sections are not treated differently than
30  *   normal text. <code>Text</code> objects may be adjacent to other
31  *   <code>Text</code> objects.
32  * </p>
33  *
34  * <p>
35  *   The maximum size of a String or an array in Java limits the maximum
36  *   size of a text node to about 2 gigabytes. If you're stuffing Base-64
37  *   encoded movies inside one node, you will have problems. Try breaking
38  *   up the text into smaller, contiguous chunks. Even then you will
39  *   probably not be able to use XPath on the result.
40  * </p>
41  *
42  * @author Elliotte Rusty Harold
43  * @version 1.2.7
44  *
45  */
46 public class Text extends Node {
47 
48 
49     private byte[] data;
50 
51 
52     /**
53      * <p>
54      * This constructor creates a new <code>Text</code> object.
55      * The data is checked for  legality according to XML 1.0 rules.
56      * Characters that can be serialized by escaping them
57      * such as &lt; and &amp; are allowed. However, characters
58      * such as the form feed, null, vertical tab,
59      * unmatched halves of surrogate pairs,
60      * and 0xFFFE and 0xFFFF are not allowed.
61      * </p>
62      *
63      * @param data the initial text of the object
64      *
65      * @throws IllegalCharacterDataException if data contains any
66      *     characters which are illegal in well-formed XML 1.0 such as
67      *     null, vertical tab, or unmatched halves of surrogate pairs
68      */
Text(String data)69     public Text(String data) {
70         _setValue(data);
71     }
72 
73 
74     /**
75      * <p>
76      * Creates a copy of the specified <code>Text</code> object.
77      * </p>
78      *
79      * @param text the <code>Text</code> object to copy
80      */
Text(Text text)81     public Text(Text text) {
82         // I'm relying here on the data array being immutable.
83         // If this ever changes, e.g. by adding an append method,
84         // this method needs to change too.
85         this.data = text.data;
86     }
87 
88 
Text()89     private Text() {}
90 
91 
build(String data)92     static Text build(String data) {
93 
94         Text result = new Text();
95         try {
96             result.data = data.getBytes("UTF8");
97         }
98         catch (UnsupportedEncodingException ex) {
99             throw new RuntimeException(
100               "Bad VM! Does not support UTF-8"
101             );
102         }
103         return result;
104 
105     }
106 
107 
108     /**
109      * <p>
110      * Sets the content of the <code>Text</code> object
111      * to the specified data. The data is checked for
112      * legality according to XML 1.0 rules. Characters that
113      * can be serialized such as &lt; and &amp; are allowed.
114      * However, characters such as the form feed, null,
115      * vertical tab, unmatched halves of surrogate pairs,
116      * and 0xFFFE and 0xFFFF are not allowed. Passing null is the same
117      * as passing the empty string.
118      * </p>
119      *
120      * @param data the text to install in the object
121      *
122      * @throws IllegalCharacterDataException if data contains any
123      *     characters which are illegal in well-formed XML 1.0 such as
124      *     null, vertical tab, or unmatched halves of surrogate pairs
125      */
setValue(String data)126     public void setValue(String data) {
127         _setValue(data);
128     }
129 
130 
_setValue(String data)131     private void _setValue(String data) {
132 
133         if (data == null) data = "";
134         else Verifier.checkPCDATA(data);
135         try {
136             this.data = data.getBytes("UTF8");
137         }
138         catch (UnsupportedEncodingException ex) {
139             throw new RuntimeException(
140               "Bad VM! Does not support UTF-8"
141             );
142         }
143 
144     }
145 
146     /**
147      * <p>
148      * Returns the XPath 1.0 string-value of this <code>Text</code>
149      * node. The XPath string-value of a text node is the same as
150      * the text of the node.
151      * </p>
152      *
153      * @return the content of the node
154      */
getValue()155     public final String getValue() {
156 
157         try {
158             return new String(data, "UTF8");
159         }
160         catch (UnsupportedEncodingException ex) {
161             throw new RuntimeException(
162               "Bad VM! Does not support UTF-8"
163             );
164         }
165 
166     }
167 
168 
169     /**
170      * <p>
171      * Throws <code>IndexOutOfBoundsException</code> because
172      * texts do not have children.
173      * </p>
174      *
175      * @return never returns because texts do not have children;
176      *     always throws an exception.
177      *
178      * @param position the index of the child node to return
179      *
180      * @throws IndexOutOfBoundsException because texts
181      *     do not have children
182      */
getChild(int position)183     public final Node getChild(int position) {
184         throw new IndexOutOfBoundsException(
185           "LeafNodes do not have children");
186     }
187 
188 
189     /**
190      * <p>
191      * Returns 0 because texts do not have children.
192      * </p>
193      *
194      * @return zero
195      */
getChildCount()196     public final int getChildCount() {
197         return 0;
198     }
199 
200 
201     /**
202      * <p>
203      * Returns a deep copy of this <code>Text</code> with no parent,
204      * that can be added to this document or a different one.
205      * </p>
206      *
207      * @return a deep copy of this text node with no parent
208      */
copy()209     public Node copy() {
210 
211         if (isCDATASection()) {
212             return new CDATASection(this);
213         }
214         else {
215             return new Text(this);
216         }
217 
218     }
219 
220 
221     /**
222      * <p>
223      * Returns a string containing the XML serialization of this text
224      * node.  Unlike <code>getValue</code>, this method escapes
225      * characters such as &amp; and &lt; using entity references such
226      * as <code>&amp;amp;</code> and <code>&amp;lt;</code>.
227      * It escapes the carriage return (\r) as <code>&amp;#x0D;</code>.
228      * If this text node is a CDATA section, then it may wrap the value
229      * in CDATA section delimiters instead of escaping.
230      * </p>
231      *
232      * @return the string form of this text node
233      */
toXML()234     public final String toXML() {
235         return escapeText();
236     }
237 
238 
escapeText()239     String escapeText() {
240 
241         String s = getValue();
242         int length = s.length();
243         // Give the string buffer enough room for a couple of escaped characters
244         StringBuffer result = new StringBuffer(length+12);
245         for (int i = 0; i < length; i++) {
246             char c = s.charAt(i);
247             switch (c) {
248                 case '\r':
249                     result.append("&#x0D;");
250                     break;
251                 case 14:
252                     // impossible
253                     break;
254                 case 15:
255                     // impossible
256                     break;
257                 case 16:
258                     // impossible
259                     break;
260                 case 17:
261                     // impossible
262                     break;
263                 case 18:
264                     // impossible
265                     break;
266                 case 19:
267                     // impossible
268                     break;
269                 case 20:
270                     // impossible
271                     break;
272                 case 21:
273                     // impossible
274                     break;
275                 case 22:
276                     // impossible
277                     break;
278                 case 23:
279                     // impossible
280                     break;
281                 case 24:
282                     // impossible
283                     break;
284                 case 25:
285                     // impossible
286                     break;
287                 case 26:
288                     // impossible
289                     break;
290                 case 27:
291                     // impossible
292                     break;
293                 case 28:
294                     // impossible
295                     break;
296                 case 29:
297                     // impossible
298                     break;
299                 case 30:
300                     // impossible
301                     break;
302                 case 31:
303                     // impossible
304                     break;
305                 case ' ':
306                     result.append(' ');
307                     break;
308                 case '!':
309                     result.append('!');
310                     break;
311                 case '"':
312                     result.append('"');
313                     break;
314                 case '#':
315                     result.append('#');
316                     break;
317                 case '$':
318                     result.append('$');
319                     break;
320                 case '%':
321                     result.append('%');
322                     break;
323                 case '&':
324                     result.append("&amp;");
325                     break;
326                 case '\'':
327                     result.append('\'');
328                     break;
329                 case '(':
330                     result.append('(');
331                     break;
332                 case ')':
333                     result.append(')');
334                     break;
335                 case '*':
336                     result.append('*');
337                     break;
338                 case '+':
339                     result.append('+');
340                     break;
341                 case ',':
342                     result.append(',');
343                     break;
344                 case '-':
345                     result.append('-');
346                     break;
347                 case '.':
348                     result.append('.');
349                     break;
350                 case '/':
351                     result.append('/');
352                     break;
353                 case '0':
354                     result.append('0');
355                     break;
356                 case '1':
357                     result.append('1');
358                     break;
359                 case '2':
360                     result.append('2');
361                     break;
362                 case '3':
363                     result.append('3');
364                     break;
365                 case '4':
366                     result.append('4');
367                     break;
368                 case '5':
369                     result.append('5');
370                     break;
371                 case '6':
372                     result.append('6');
373                     break;
374                 case '7':
375                     result.append('7');
376                     break;
377                 case '8':
378                     result.append('8');
379                     break;
380                 case '9':
381                     result.append('9');
382                     break;
383                 case ':':
384                     result.append(':');
385                     break;
386                 case ';':
387                     result.append(';');
388                     break;
389                 case '<':
390                     result.append("&lt;");
391                     break;
392                 case '=':
393                     result.append('=');
394                     break;
395                 case '>':
396                     result.append("&gt;");
397                     break;
398                 default:
399                     result.append(c);
400             }
401         }
402 
403         return result.toString();
404 
405     }
406 
407 
isText()408     boolean isText() {
409         return true;
410     }
411 
412 
413     /**
414      * <p>
415      * Returns a <code>String</code>
416      * representation of this <code>Text</code> suitable for
417      * debugging and diagnosis. This is <em>not</em>
418      * the XML representation of this <code>Text</code> node.
419      * </p>
420      *
421      * @return a non-XML string representation of this node
422      */
toString()423     public final String toString() {
424 
425         return "[" + getClass().getName() + ": "
426           + escapeLineBreaksAndTruncate(getValue()) + "]";
427 
428     }
429 
430 
escapeLineBreaksAndTruncate(String s)431     static String escapeLineBreaksAndTruncate(String s) {
432 
433         int length = s.length();
434         boolean tooLong = length > 40;
435         if (length > 40) {
436             length = 35;
437             s = s.substring(0, 35);
438         }
439 
440         StringBuffer result = new StringBuffer(length);
441         for (int i = 0; i < length; i++) {
442             char c = s.charAt(i);
443             switch (c) {
444                 case '\n':
445                     result.append("\\n");
446                     break;
447                 case '\r':
448                     result.append("\\r");
449                     break;
450                 case '\t':
451                     result.append("\\t");
452                     break;
453                 default:
454                     result.append(c);
455             }
456         }
457         if (tooLong) result.append("...");
458 
459         return result.toString();
460 
461     }
462 
463 
isCDATASection()464     boolean isCDATASection() {
465         return false;
466     }
467 
468 
isEmpty()469     boolean isEmpty() {
470         return this.data.length == 0;
471     }
472 
473 
474 }