/*
 * Copyright (c) 2005, The Regents of the University of California, through
 * Lawrence Berkeley National Laboratory (subject to receipt of any required
 * approvals from the U.S. Dept. of Energy). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley
 * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or
 * upgrades to the features, functionality or performance of the source code
 * ("Enhancements") to anyone; however, if you choose to make your Enhancements
 * available either publicly, or directly to Lawrence Berkeley National
 * Laboratory, without imposing a separate written license agreement for such
 * Enhancements, then you hereby grant the following license: a non-exclusive,
 * royalty-free perpetual license to install, use, modify, prepare derivative
 * works, incorporate into other computer software, distribute, and sublicense
 * such enhancements or derivative works thereof, in binary and source code
 * form.
 */
package nux.xom.binary;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import nu.xom.Attribute;
import nu.xom.Comment;
import nu.xom.DocType;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.IllegalAddException;
import nu.xom.Node;
import nu.xom.NodeFactory;
import nu.xom.Nodes;
import nu.xom.ParentNode;
import nu.xom.ProcessingInstruction;
import nu.xom.Text;
import nu.xom.WellformednessException;
import nu.xom.XMLException;
import nux.xom.io.StreamingSerializer;

/**
 * Serializes (encodes) and deserializes (decodes) XOM XML documents to and from
 * an efficient and compact custom binary XML data format (termed <i>bnux </i>
 * format), without loss or change of any information. Serialization and
 * deserialization is much faster than with the standard textual XML format, and
 * the resulting binary data is more compressed than textual XML.
 *
 * <h4>Applicability</h4>
 *
 * The overall goal of the <i>bnux algorithm</i> is to maximize serialization
 * and deserialization (parsing) performance without requiring any schema
 * description. Serialization and deserialization speed are roughly balanced
 * against each other; neither side is particularly favoured over the other.
 * Another beneficial effect of the algorithm is that a considerable degree of
 * XML data redundancy is eliminated, but compression is more a welcome
 * side-effect than a primary goal in itself. The algorithm is primarily
 * intended for tightly coupled high-performance systems exchanging large
 * volumes of XML data over networks, as well as for compact main memory caches
 * and for <i>short-term </i> storage as BLOBs in backend databases or files
 * (e.g. "session" data with limited duration). In the case of BLOB storage,
 * selecting matching BLOBs can be sped up by maintaining a simple metaindex
 * side table for the most frequent access patterns. See the <a
 * href="#performance">performance results</a> below.
 * <p>
 * While the Java API is considered stable, the bnux data format should be
 * considered a black box: Its internals are under-documented and may change
 * without notice from release to release in backwards-incompatible ways. It
 * is unlikely that support for reading data written with older Nux versions
 * will ever be available. bnux is an exchange format but not an
 * interoperability format. Having said that, the data format is machine
 * architecture/platform independent. For example, a bnux file can be moved back
 * and forth between a 32 bit Intel little-endian machine and a 64 bit PowerPC
 * big-endian machine; it remains parseable no matter where.
 * <p>
 * This approach is expressly <b>not intended </b>as a replacement for standard
 * textual XML in loosely coupled systems where maximum long-term
 * interoperability is the overarching concern. It is also expressly <b>not
 * intended </b>for long-term data storage. If you store data in bnux format,
 * there is every chance you won't be able to read it back a year or two from
 * now, perhaps even sooner. Finally, it is probably unwise to use this class if
 * your application's performance requirements are not particularly stringent,
 * or if profiling indicates that the bottleneck is not related to XML
 * serialization/deserialization anyway.
 * <p>
 * The bnux serialization algorithm is a fully streaming block-oriented
 * algorithm, ideal for large numbers of very small to arbitrarily large
 * XML documents.
 * <p>
 * The bnux deserialization algorithm is a fully streaming algorithm and can
 * optionally be pushed through a {@link nu.xom.NodeFactory}. This enables
 * efficient filtering and can avoid the need to build a main memory tree, which
 * is particularly useful for arbitrarily large documents. For example, streaming
 * XQueries over binary XML can be expressed via the NodeFactory generated by a
 * {@link nux.xom.xquery.StreamingPathFilter}. In streaming mode, the binary
 * codec exactly mimics the NodeFactory based behaviour of the XOM
 * {@link nu.xom.Builder}.
 *
 * <h4>Faithfully Preserving XML</h4>
 *
 * Any and all arbitrary XOM XML documents are supported, and no schema is
 * required. A XOM document that is serialized and subsequently deserialized by
 * this class is <i>exactly the same </i> as the original document, preserving
 * "as is" all names and data for elements, namespaces, additional namespace
 * declarations, attributes, texts, document type, comments, processing
 * instructions, whitespace, Unicode characters including surrogates, etc. As a
 * result, the W3C XML Infoset and the W3C Canonical XML representation are
 * guaranteed to be preserved. In particular, the following always holds:
 *
 * <pre>
 * java.util.Arrays.equals(XOMUtil.toCanonicalXML(doc), XOMUtil
 * 		.toCanonicalXML(deserialize(serialize(doc))));
 * </pre>
 *
 * <h4>Optional ZLIB Compression</h4>
 *
 * The bnux algorithm considerably compresses XML data with little CPU
 * consumption, by its very design. However, bnux also has an option to further
 * compress/decompress its output/input with the ZLIB compression algorithm.
 * ZLIB is based on Huffman coding and is also used by the popular
 * <code>gzip</code> (e.g. {@link java.util.zip.Deflater}). ZLIB compression
 * is rather CPU intensive, but it typically yields strong compression factors,
 * in particular for documents containing mostly narrative text (e.g. the
 * Bible). For example, strong compression may be desirable over low-bandwidth
 * networks or when bnux data is known to be accessed rather infrequently. On
 * the other hand, ZLIB compression probably kills performance in the presence
 * of high-bandwidth networks such as ESnet, Internet2/Abilene or 10 Gigabit
 * Ethernet/InfiniBand LANs, even with high-end CPUs. CPU drain is also a
 * scalability problem in the presence of large amounts of concurrent
 * connections. An option ranging from 0 (no ZLIB compression; best performance)
 * to 1 (little ZLIB compression; reduced performance) to 9 (strongest ZLIB
 * compression; worst performance) allows one to configure the CPU/memory
 * consumption trade-off.
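 * <p>
 * For example, given a XOM document <code>doc</code>, one might trade speed
 * for size as follows (levels as described above):
 *
 * <pre>
 * BinaryXMLCodec codec = new BinaryXMLCodec();
 * byte[] fast  = codec.serialize(doc, 0); // no ZLIB; best performance
 * byte[] small = codec.serialize(doc, 9); // strongest ZLIB; worst performance
 * </pre>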
 *
 * <h4>Reliability</h4>
 *
 * This class has been successfully tested against some 50000 extremely
 * weird and unique test documents, including the W3C XML conformance test
 * suite, and no bugs are known.
 * <p>
 * Serialization employs no error checking at all, since malformed XOM input
 * documents are impossible to produce given XOM's design: XOM strictly enforces
 * wellformedness anyway. Deserialization employs some limited error checking,
 * throwing exceptions for any improper API usage, non-bnux input data, data
 * format version mismatch, or general binary data corruption. Beyond this,
 * deserialization relies on XOM's hard-wired wellformedness checks, just like
 * serialization does. Barring one of the above catastrophic situations, the
 * bnux algorithm will always correctly and faithfully reconstruct the exact
 * same well-formed XOM document.
 *
 * <h4>Example Usage:</h4>
 *
 * <pre>
 * // parse standard textual XML, convert to binary format, round-trip it and compare results
 * Document doc = new Builder().build(new File("samples/data/periodic.xml"));
 * BinaryXMLCodec codec = new BinaryXMLCodec();
 * byte[] bnuxDoc = codec.serialize(doc, 0);
 * Document doc2 = codec.deserialize(bnuxDoc);
 * boolean isEqual = java.util.Arrays.equals(
 *     XOMUtil.toCanonicalXML(doc), XOMUtil.toCanonicalXML(doc2));
 * System.out.println("isEqual = " + isEqual);
 * System.out.println(doc2.toXML());
 *
 * // write binary XML document to file
 * OutputStream out = new FileOutputStream("/tmp/periodic.xml.bnux");
 * out.write(bnuxDoc);
 * out.close();
 *
 * // read binary XML document from file
 * bnuxDoc = FileUtil.toByteArray(new FileInputStream("/tmp/periodic.xml.bnux"));
 * Document doc3 = codec.deserialize(bnuxDoc);
 * System.out.println(doc3.toXML());
 * </pre>
 *
 * <a name="performance"/>
 * <h4>Performance</h4>
 *
 * This class has been carefully profiled and optimized. Preliminary performance
 * results over a wide range of real-world documents are given below. A more
 * detailed presentation can be found at the Global Grid Forum <a
 * target="_blank"
 * href="http://www.ggf.org/GGF15/ggf_events_schedule_WSPerform.htm">Web
 * Services Performance Workshop</a>.
 * <p>
 * Contrasting bnux BinaryXMLCodec with the XOM Builder and Serializer:
 * <ul>
 * <li>Tree Deserialization speedup: 40-100 MB/s vs. 3-30 MB/s</li>
 * <li>Streaming Deserialization speedup: 60-500 MB/s vs. 3-30 MB/s</li>
 * <li>Tree Serialization speedup: 50-150 MB/s vs. 5-20 MB/s</li>
 * <li>Data compression factor: 1.0 - 4</li>
 * </ul>
 * For meaningful comparison, MB/s and compression factors are always given
 * normalized in relation to the original standard textual XML file size.
 * <ul>
 * <li>Benchmark test data: A wide variety of small to medium-sized XML
 * documents is used, including SOAP documents heavily using namespaces ( <a
 * target="_blank" href="http://xbis.sourceforge.net/">XBIS </a>), simple XML
 * formatted web server logs using no namespaces, RDF documents with lots of
 * attributes and namespaces, the periodic table, documents consisting of large
 * narrative text ( <a target="_blank"
 * href="http://www.oasis-open.org/cover/bosakShakespeare200.html">Shakespeare
 * </a>), publication citations ( <a target="_blank"
 * href="http://dblp.uni-trier.de/xml/">DBLP </a>), music title databases ( <a
 * target="_blank" href="http://www.freedb.org/">FreeDB </a>), Japanese
 * documents (XML conformance test suite), SVG image files, etc.</li>
 *
 * <li>Benchmark configuration: no ZLIB compression, xom-1.2, non-validating
 * XOM Builder using xerces-2.8.0, no DTD or schema, Sun JDK 1.5.0 server VM,
 * commodity PC 2004, Dual Pentium 4, 3.4 GHz, Redhat 9</li>
 * </ul>
 *
 * Example Interpretation:
 * <ul>
 * <li>Small documents: Results translate, for example, to ping-pong
 * round-tripping of typical 500 byte SOAP request/response message documents at
 * a rate of 25000 msg/s, compared to 2500 msg/s with XOM (excluding network
 * latency). More pronounced, 500 (150) byte documents with few namespaces
 * translate to 35000 (120000) msg/s, compared to 3500 (5000) msg/s with XOM.
 * Consequently, XML serialization and deserialization are probably no longer
 * your application's bottleneck, leaving, say, 95% CPU headroom free for other
 * application modules.</li>
 *
 * <li>Medium documents: When having a main-memory cache of several thousand 1
 * MB documents, each containing highly structured complex data, one can
 * deserialize, XQuery and serve from the cache at a rate of 50 documents/s,
 * compared to 5 documents/s with XOM.</li>
 *
 * </ul>
 * Note that, in contrast to other algorithms, these measurements include XOM
 * tree building and walking, and hence measure delivering data to and from
 * actual XML applications, rather than merely to and from a low-level SAX event
 * stream (which is considerably cheaper and deemed less useful).
 * <p>
 * The deserialization speedup is further multiplied when DTDs or schema
 * validation is used while parsing standard textual XML.
 * <p>
 * This class relies on advanced Java compiler optimizations, which take
 * considerable time to warm up. Hence, for comparative benchmarks, use a
 * server-class VM and make sure to repeat runs for at least 30 seconds.
 * <p>
 * Further, you will probably want to eliminate drastic XOM hotspots by
 * compiling XOM with "ant -Dfat=true jar" to maintain an internal String
 * instead of a UTF-8 encoded byte array in {@link nu.xom.Text}, which
 * eliminates the expensive character conversions implied for each access to a
 * Text object. This increases performance at the expense of memory footprint.
 * The measurements above report numbers using these patches, both for xom and
 * bnux. If you're curious about the whereabouts of bottlenecks, run java with
 * the non-perturbing '-server -agentlib:hprof=cpu=samples,depth=10' flags, then
 * study the trace log and correlate its hotspot trailer with its call stack
 * headers (see <a target="_blank"
 * href="http://java.sun.com/developer/technicalArticles/Programming/HPROF.html">
 * hprof tracing </a>).
 * <p>
 * Use class {@link nux.xom.sandbox.BinaryXMLTest} to reproduce results, verify
 * correctness or to evaluate performance for your own datasets.
 *
 * @author whoschek.AT.lbl.DOT.gov
 * @author $Author: hoschek $
 * @version $Revision: 1.179 $, $Date: 2006/06/18 21:25:02 $
 */
public class BinaryXMLCodec {

	/*
	 * TODO: add a StAX interface on top of bnux?
	 * e.g. createXMLStreamReader(byte[]), similar for XMLStreamWriter
	 * TODO: add coalescing of adjacent Text nodes on deserialization?
	 */

	/*
	 * TODO: My impression is that there is remaining potential for some speedup,
	 * both for serialization and deserialization performance. Ideas towards this
	 * end include:
	 *
	 * - Add option to always use UTF16 (level=-1)
	 * - Micro caches may become obsolete once XOM has its own internal QName LRU cache
	 * - Use a better low level namespace iteration
	 * - Split unified symbolTable into several smaller symbolTables for Text, Attributes, and other.
	 * - other?
	 */

	// for deserialization: factory to stream (push) into
	private NodeFactory factory;

	// for deserialization: unique symbols from deserialized symbolTable
	private String[] symbols;

	// for deserialization: are pages ZLIB compressed?
	private boolean isCompressed;

	// for deserialization and serialization: page buffer
	private ArrayByteList page; // multi-byte integers are ALWAYS in big-endian

	// for serialization:
	// holds unique strings found in document (qnames, texts, uris, etc.)
	private SymbolTable symbolTable;

	// for serialization:
	// byte-level node type tokens in XML document order; also length of next index(es)
	private ArrayByteList nodeTokens;

	// for serialization: indexes into symbolTable/entries
	private ArrayIntList indexData;

	// for serialization: has first page of current document already been written?
	private boolean isFirstPage = true;

	// ZLIB
	private Inflater decompressor; // ZLIB
	private Deflater compressor;   // ZLIB
	private int compressionLevel = -1; // initialize to "undefined"

	// for deserialization: avoids reverification of PCDATA
	private Text[] textCache;

	// for deserialization: avoids reverification of namespace URIs
	private String[] nameCache;
	private LRUHashMap1 internedNames;

	// for deserialization:
	// avoids reverification of qname and URI, as well as indexOf() calls, and saves string memory
	private NodeBuilder nodeBuilder;

	// for streaming serialization
	private OutputStream out;

	/**
	 * For serialization: (approximate) maximum number of bytes per page.
	 * <p>
	 * To enable true streaming, a serialized document consists of one or more
	 * independent consecutive pages. Each page contains a portion of the XML
	 * document, in document order. More specifically, each page consists of a
	 * tokenized byte array and corresponding symbols. Once a page has been
	 * read/written, related (heavy) state can be discarded, freeing memory. No
	 * more than one page needs to be held in memory at any given time. For very
	 * large documents this page design reduces memory consumption, increases
	 * throughput and reduces latency. For small to medium sized documents it
	 * makes next to no difference.
	 * <p>
	 * A small page capacity (e.g. 128 bytes) leads to lower latency per page
	 * but also lower throughput and lower compression overall. Conversely, a
	 * large page capacity (e.g. 1 MB) leads to higher throughput and higher
	 * compression, at the expense of higher latency. However, a very large page
	 * capacity (e.g. 10 MB) leads to memory subsystem pressure on streaming.
	 * Thus, here we use a happy medium: small enough to generate little memory
	 * subsystem pressure, and large enough to gain high throughput, retaining
	 * almost all compression stemming from redundancy-eliminating tokenization,
	 * with near zero overhead for small to medium sized documents, and
	 * outstanding performance for very large documents.
	 */
	private static final int MAX_PAGE_CAPACITY = 64 * 1024;
//	private static final int MAX_PAGE_CAPACITY = 1; // DEBUG only

	// low 3 bits for node token types
	// high 4 bits typically hold number of bytes of the next two indexes
	private static final int TEXT = 0;
	private static final int ATTRIBUTE = 1;
	private static final int BEGIN_ELEMENT = 2;
	private static final int END_ELEMENT = 3;
	private static final int COMMENT = 4;
	private static final int NAMESPACE_DECLARATION = 5;
	private static final int PROCESSING_INSTRUCTION = 6;
	private static final int DOC_TYPE = 7;

	private static final int BNUX_MAGIC = createMagicNumber(); // for sanity checks
	private static final byte VERSION = 7; // version of bnux data format
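	// document header layout (see writeXMLDeclaration/deserialize):
	// BNUX_MAGIC int (4 bytes) + version byte (1 byte; negative if ZLIB compressed)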
	private static final int DOCUMENT_HEADER_SIZE = 4 + 1; // in bytes
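	// page header layout (see writePage/readPage): five big-endian ints:
	// pageSize (negative marks the last page), symbolTableSize, decodedSize,
	// encodedSize, BNUX_MAGIC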
	private static final int PAGE_HEADER_SIZE = 4 + 4 + 4 + 4 + 4; // in bytes

	private static final boolean IS_EXTENDED_XOM = hasXOMExtensions();

	/**
	 * Marker for a non-existent systemID or publicID in a DocType. XML and the
	 * nu.xom.Verifier regard " " as an illegal ID, so it can never occur in
	 * practice. Thus we can use it as an unambiguous marker identifying a
	 * serialized null value.
	 */
	private static final String DOCTYPE_NULL_ID = " ";

	private static final boolean DEBUG = false; // VM does dead code elimination

	/** Reinitializes instance variables to virgin state. */
	private void reset() {
		// deserialization state:
		internedNames = null;
		nodeBuilder = null;
		factory = null;

		// serialization state:
		symbolTable = null;
		page = null;
		nodeTokens = null;
		indexData = null;
		isFirstPage = true;
		out = null;

		// better safe than sorry:
		try {
			if (decompressor != null) decompressor.end(); // free resources
		} finally {
			decompressor = null;
			try {
				if (compressor != null) compressor.end(); // free resources
			} finally {
				compressor = null;
			}
		}
	}

	/**
	 * Constructs an instance; an instance can be reused serially, but is not
	 * thread-safe, just like a {@link nu.xom.Builder}.
	 */
	public BinaryXMLCodec() {
	}

	/**
	 * Constructs a new streaming serializer that serializes bnux binary XML to
	 * the given underlying output stream, using the given ZLIB compression
	 * level.
	 * <p>
	 * An optional zlib compression level ranging from 0 (no ZLIB compression;
	 * best performance) to 1 (little ZLIB compression; reduced performance) to
	 * 9 (strongest ZLIB compression; worst performance) allows one to configure
	 * the CPU/memory consumption trade-off.
	 * <p>
	 * Unless there is a good reason to the contrary, you should always use
	 * level 0: the bnux algorithm typically already precompresses considerably.
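	 * <p>
	 * A minimal usage sketch (assuming the {@link nux.xom.io.StreamingSerializer}
	 * method names shown here; consult that interface for the authoritative
	 * signatures):
	 *
	 * <pre>
	 * StreamingSerializer ser = codec.createStreamingSerializer(out, 0);
	 * ser.writeXMLDeclaration();
	 * ser.writeStartTag(new Element("items"));
	 * ser.write(new Element("item")); // write any number of subtrees
	 * ser.writeEndTag();
	 * ser.writeEndDocument();
	 * </pre>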
	 *
	 * @param out
	 *            the underlying output stream to write to
	 * @param zlibCompressionLevel
	 *            a number in the range 0..9
	 * @return a streaming serializer
	 */
	public StreamingSerializer createStreamingSerializer(OutputStream out, int zlibCompressionLevel) {
		return new StreamingBinaryXMLSerializer(this, out, zlibCompressionLevel);
	}

	/**
	 * Returns whether or not the given input stream contains a bnux document.
	 * <p>
	 * A peek into the first 4 bytes is sufficient for unambiguous detection, as
	 * standard textual XML cannot start with any arbitrary four byte
	 * combination.
	 * <p>
	 * Finally, the read bytes are put back onto the stream, so they can be
	 * reread as part of subsequent parsing attempts. Therefore, the input
	 * stream must support <code>input.mark()</code> and
	 * <code>input.reset()</code>. For example, a
	 * {@link java.io.BufferedInputStream} is a good choice.
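	 * <p>
	 * For example (a sketch; the file name is merely illustrative):
	 *
	 * <pre>
	 * BinaryXMLCodec codec = new BinaryXMLCodec();
	 * InputStream in = new BufferedInputStream(new FileInputStream("/tmp/doc.bnux"));
	 * if (codec.isBnuxDocument(in)) {
	 *     Document doc = codec.deserialize(in, null); // builds the complete tree
	 * }
	 * </pre>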
	 *
	 * @param input
	 *            the stream to read from
	 * @return true if the stream contains a bnux document
	 * @throws IllegalArgumentException
	 *             if the underlying stream does not support
	 *             <code>input.mark()</code> and <code>input.reset()</code>.
	 * @throws IOException
	 *             if the underlying input stream encounters an I/O error
	 * @see InputStream#mark(int)
	 */
	public boolean isBnuxDocument(InputStream input) throws IOException {
		if (input == null)
			throw new IllegalArgumentException("input stream must not be null");
		if (!input.markSupported())
			throw new IllegalArgumentException("markSupported() must be true");

		int magicBytes = 4;
		input.mark(magicBytes);
		try {
			ArrayByteList list = new ArrayByteList(magicBytes);
			if (!list.ensureRemaining(input, magicBytes)) {
				return false; // stream contains less than 4 bytes
			}
			return list.getInt() == BNUX_MAGIC;
		} finally {
			input.reset(); // unread the header
		}
	}

	/**
	 * Equivalent to
	 * <code>deserialize(new ByteArrayInputStream(bnuxDocument), new NodeFactory())</code>.
	 *
	 * @param bnuxDocument
	 *            the bnux document to deserialize.
	 * @return the new XOM document obtained from deserialization.
	 * @throws BinaryParsingException
	 *             if the bnux document is unreadable or corrupt for some reason
	 */
	public Document deserialize(byte[] bnuxDocument) throws BinaryParsingException {
		if (bnuxDocument == null)
			throw new IllegalArgumentException("bnuxDocument must not be null");

		try {
			return deserialize(new ByteArrayInputStream(bnuxDocument), null);
		} catch (IOException e) {
			throw new BinaryParsingException(e); // can never happen
		}
	}

	/**
	 * Returns the XOM document obtained by deserializing the next binary XML
	 * document from the given input stream.
	 * <p>
	 * If the document is in ZLIB compressed bnux format, it will be
	 * auto-detected and auto-decompressed as part of deserialization.
	 * <p>
	 * This method exactly mimics the NodeFactory based behaviour of the XOM
	 * {@link nu.xom.Builder}. A NodeFactory enables efficient filtering and
	 * can avoid the need to build a main memory tree, which is particularly
	 * useful for large documents. For example, streaming XQueries over binary
	 * XML can be expressed via the NodeFactory generated by a
	 * {@link nux.xom.xquery.StreamingPathFilter}. Binary XML files can be
	 * converted to and from standard textual XML files via a
	 * {@link nux.xom.pool.XOMUtil#getRedirectingNodeFactory(StreamingSerializer)}. For
	 * other example factories, see {@link nux.xom.pool.XOMUtil}.
	 * <p>
	 * Bnux is a self-framing data format: It knows where the end of a document
	 * occurs. An input stream can contain any number of independent documents,
	 * one after another. Thus, this method reads from the stream as many bytes
	 * as required for the current document, but no more than that. Unlike SAX
	 * XML parsers and unlike a {@link nu.xom.Builder}, it does not read until
	 * end-of-stream (EOS), and it does not auto-close the input stream. If this
	 * method returns successfully, the input stream has been positioned one
	 * byte past the current bnux document, ready to deserialize the following
	 * document, if any. It is the responsibility of the caller to ensure the
	 * input stream gets properly closed when deemed appropriate.
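	 * <p>
	 * For example, a stream holding several documents back-to-back can be
	 * drained as follows (a sketch; requires mark/reset support for the
	 * <code>isBnuxDocument</code> probe):
	 *
	 * <pre>
	 * BinaryXMLCodec codec = new BinaryXMLCodec();
	 * InputStream in = new BufferedInputStream(new FileInputStream("/tmp/docs.bnux"));
	 * while (codec.isBnuxDocument(in)) {
	 *     Document doc = codec.deserialize(in, null);
	 *     // ... process doc ...
	 * }
	 * in.close();
	 * </pre>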
	 *
	 * @param input
	 *            the stream to read and deserialize from
	 * @param factory
	 *            the node factory to stream into. May be <code>null</code> in
	 *            which case the default XOM NodeFactory is used, building the
	 *            complete XML document tree.
	 * @return the new XOM document obtained from deserialization.
	 * @throws BinaryParsingException
	 *             if the bnux document is unreadable or corrupt for some reason
	 * @throws IOException
	 *             if the underlying input stream encounters an I/O error
	 */
	public Document deserialize(InputStream input, NodeFactory factory)
			throws BinaryParsingException, IOException {

		if (input == null)
			throw new IllegalArgumentException("input stream must not be null");
		if (factory == null) factory = new NodeFactory();

		// read document header
		if (page == null) page = new ArrayByteList(256);
		page.clear();
		if (!page.ensureRemaining(input, 4 + 1 + 1 + 4))
			throw new BinaryParsingException("Missing bnux document header");

		int magic = page.getInt();
		if (magic != BNUX_MAGIC) throw new BinaryParsingException(
			"Bnux magic number mismatch: " + magic + ", must be: " + BNUX_MAGIC);

		int version = page.get();
		isCompressed = version < 0;
		if (isCompressed) version = -version;
		if (version != VERSION) throw new BinaryParsingException(
			"Bnux data format version mismatch: " + version + ", must be: " + VERSION);
		if (isCompressed) {
			if (decompressor == null) decompressor = new Inflater();
		}

		if (page.get() != DOC_TYPE) // surrogate hack to identify BEGIN_PAGE
			throw new BinaryParsingException("Illegal bnux page header marker");

		// prepare
		if (internedNames == null) internedNames = new LRUHashMap1(128);
		if (nodeBuilder == null) nodeBuilder = new NodeBuilder();

		// parse node token data and packed indexes, building the XOM tree
		this.factory = factory;
		try {
			return readDocument(page, input);
		} catch (Throwable t) {
			reset(); // better safe than sorry
			if (t instanceof Error) {
				throw (Error) t;
			} else if (t instanceof BinaryParsingException) {
				throw (BinaryParsingException) t;
			} else if (t instanceof IOException) {
				throw (IOException) t;
			} else {
				throw new BinaryParsingException(t);
			}
		} finally {
			this.symbols = null; // help gc
			this.textCache = null; // help gc
			this.nameCache = null; // help gc
			this.factory = null; // help gc
//			if (decompressor != null) decompressor.end();
//			decompressor = null;
		}
	}

	/**
	 * Returns the bnux binary XML document obtained by serializing the given
	 * XOM document.
	 * <p>
	 * An optional zlib compression level ranging from 0 (no ZLIB compression;
	 * best performance) to 1 (little ZLIB compression; reduced performance) to
	 * 9 (strongest ZLIB compression; worst performance) allows one to configure
	 * the CPU/memory consumption trade-off.
	 * <p>
	 * Unless there is a good reason to the contrary, you should always use
	 * level 0: the bnux algorithm typically already precompresses considerably.
	 *
	 * @param document
	 *            the XOM document to serialize
	 * @param zlibCompressionLevel
	 *            a number in the range 0..9
	 * @return the bnux document obtained from serialization.
	 * @throws IllegalArgumentException
	 *             if the compression level is out of range.
	 */
	public byte[] serialize(Document document, int zlibCompressionLevel)
			throws IllegalArgumentException {

		ByteArrayOutputStream result = new ByteArrayOutputStream(256);
		try {
			serialize(document, zlibCompressionLevel, result);
		} catch (IOException e) {
			throw new RuntimeException(e); // can never happen
		}
		return result.toByteArray();
	}

	/**
	 * Serializes the given XOM document as a bnux binary XML document onto
	 * the given output stream.
	 * <p>
	 * An optional zlib compression level ranging from 0 (no ZLIB compression;
	 * best performance) to 1 (little ZLIB compression; reduced performance) to
	 * 9 (strongest ZLIB compression; worst performance) allows one to configure
	 * the CPU/memory consumption trade-off.
	 * <p>
	 * Unless there is a good reason to the contrary, you should always use
	 * level 0: the bnux algorithm typically already precompresses considerably.
	 *
	 * @param document
	 *            the XOM document to serialize
	 * @param zlibCompressionLevel
	 *            a number in the range 0..9
	 * @param out
	 *            the output stream to write to
	 * @throws IllegalArgumentException
	 *             if the compression level is out of range.
	 * @throws IOException
	 *             if the underlying output stream encounters an I/O error
	 */
	public void serialize(Document document, int zlibCompressionLevel,
			OutputStream out) throws IllegalArgumentException, IOException {

		if (document == null)
			throw new IllegalArgumentException("XOM document must not be null");
		if (zlibCompressionLevel < 0 || zlibCompressionLevel > 9)
			throw new IllegalArgumentException("Compression level must be 0..9");
		if (out == null)
			throw new IllegalArgumentException("Output stream must not be null");

		try {
			setOutputStream(zlibCompressionLevel, out);
			writeDocument(document); // generate output
		} catch (Throwable t) {
			reset(); // better safe than sorry
			if (t instanceof Error) {
				throw (Error) t;
			} else if (t instanceof RuntimeException) {
				throw (RuntimeException) t;
			} else if (t instanceof IOException) {
				throw (IOException) t;
			} else {
				throw new RuntimeException(t);
			}
		} finally {
			this.symbolTable = null; // help gc
			this.out = null;
		}
	}

	final void setOutputStream(int zlibCompressionLevel, OutputStream out) {
		if (zlibCompressionLevel > 0) {
			if (compressor == null || zlibCompressionLevel != compressionLevel) {
				if (compressor != null) compressor.end(); // free resources
				compressor = new Deflater(zlibCompressionLevel);
			}
		}
		compressionLevel = zlibCompressionLevel;
		this.out = out;
	}

	/** Prepares reading from the next page. */
	private void readPage(ArrayByteList src, InputStream input)
			throws BinaryParsingException, IOException {

		if (DEBUG) System.err.println("reading page");
		if (!src.ensureRemaining(input, 4))
			throw new BinaryParsingException("Missing remaining bnux page size");
		int pageSize = src.getInt();
		if (src.remaining() != 0)
			throw new IllegalStateException("Internal codec bug");

		boolean isLastPage = pageSize < 0;
		if (isLastPage) pageSize = -pageSize;
//		if (DEBUG) System.err.println("pageSize = " + pageSize);
		if (!isLastPage) {
			pageSize++; // read one byte past page, fetching PAGE_BEGIN marker
		}
		if (!src.ensureRemaining(input, pageSize))
			throw new BinaryParsingException("Missing remaining bnux page body");

		if (isCompressed) decompress(src);

		int symbolTableSize = src.getInt();
		if (symbolTableSize < 0)
			throw new BinaryParsingException("Negative symbol table size");
		int decodedSize = src.getInt();
		if (decodedSize < 0)
			throw new BinaryParsingException("Negative decodedSize");
		int encodedSize = src.getInt();
		if (encodedSize < 0)
			throw new BinaryParsingException("Negative encodedSize");

		// read symbolTable
		this.symbols = null; // help gc
		if (decodedSize == encodedSize) { // safe trick, faster
			// Note that 7 bit ASCII is a proper subset of UTF-8
			this.symbols = src.getASCIIStrings(symbolTableSize);
		} else {
			this.symbols = src.getUTF8Strings(symbolTableSize);
		}
//		this.symbols = src.getUTF16Strings(symbolTableSize);
		if (DEBUG) System.err.println("read symbols = " + Arrays.asList(symbols));

		int magic = src.getInt();
		if (magic != BNUX_MAGIC) throw new BinaryParsingException(
			"Bnux magic number mismatch: " + magic + ", must be: " + BNUX_MAGIC);

		// reset caches in preparation for XML token decoding
		if (this.nameCache == null) {
			nameCache = new String[Math.min(64, symbolTableSize)];
		} else {
			for (int i=nameCache.length; --i >= 0; ) nameCache[i] = null;
		}

		if (factory.getClass() == NodeFactory.class) { // fast path
			if (this.textCache == null) {
				textCache = new Text[Math.min(256, symbolTableSize)];
			} else {
				for (int i=textCache.length; --i >= 0; ) textCache[i] = null;
			}
		}
//		this.nameCache = null; // help gc
//		this.nameCache = new String[Math.min(64, symbolTableSize)];
//		this.textCache = null; // help gc
//		if (factory.getClass() == NodeFactory.class) { // fast path
//			this.textCache = new Text[Math.min(128, symbolTableSize)];
//		}
	}

	private void decompress(ArrayByteList src) throws BinaryParsingException {
		if (nodeTokens == null) nodeTokens = new ArrayByteList();
		nodeTokens.clear();

		try {
			nodeTokens.add(decompressor, src);
		} catch (DataFormatException e) {
			String s = e.getMessage();
			throw new BinaryParsingException(
					s != null ? s : "Invalid ZLIB data format", e);
		}

		src.swap(nodeTokens); // replace src with nodeTokens
		nodeTokens.clear();
	}

	/** Parses document from encoded src buffer; tokens appear in document order. */
	private Document readDocument(ArrayByteList src, InputStream input)
			throws BinaryParsingException, IOException {

		if (DEBUG) System.err.println("reading document");
		readPage(src, input);
		Document doc = factory.startMakingDocument();
//		doc.setBaseURI(symbols[src.getInt()]);
		doc.setBaseURI(getInternedName(src.getInt()));
		boolean hasRootElement = false;
		int i = 0;

		// add children of document, retaining the exact same order found in input
		while (src.remaining() > 0) {
			Nodes nodes;
			int type = src.get(); // look ahead
			if (DEBUG) System.err.println("reading type = " + toString(type));
			switch (type & 0x07) { // three low bits indicate node type
				case TEXT: {
					throw new BinaryParsingException("Unreachable text");
				}
				case ATTRIBUTE: {
					throw new BinaryParsingException("Unreachable attribute");
				}
				case BEGIN_ELEMENT: {
					if (factory.getClass() == NodeFactory.class) { // fast path
						Element root = readStartTag(src, type);
						readElement(src, root, input); // reads entire subtree
						nodes = new Nodes(root);
					} else { // slow path
						Element root = readStartTagF(src, type, true);
						if (root == null) {
							throw new NullPointerException("Factory failed to create root element.");
						}
						doc.setRootElement(root);
						readElementF(src, root, input);
						nodes = factory.finishMakingElement(root);
					}
					break;
				}
				case END_ELEMENT: {
					throw new BinaryParsingException("Unreachable end of element");
				}
				case COMMENT: {
					nodes = readCommentF(src, type);
					break;
				}
				case NAMESPACE_DECLARATION: {
					throw new BinaryParsingException("Unreachable namespace declaration");
				}
				case PROCESSING_INSTRUCTION: {
					nodes = readProcessingInstructionF(src);
					break;
				}
				case DOC_TYPE: {
					nodes = readDocTypeF(src);
					break;
				}
				default: {
					throw new BinaryParsingException("Illegal node type code=" + type);
				}
			}

			// append nodes:
			for (int j=0; j < nodes.size(); j++) {
				Node node = nodes.get(j);
				if (node instanceof Element) { // replace fake root with real root
					if (hasRootElement) {
						throw new IllegalAddException(
							"Factory returned multiple root elements");
					}
					doc.setRootElement((Element) node);
					hasRootElement = true;
				} else {
					doc.insertChild(node, i);
				}
				i++;
			}
		}

		if (!hasRootElement) throw new WellformednessException(
				"Factory attempted to remove the root element");
		factory.finishMakingDocument(doc);
		if (DEBUG) System.err.println("finished reading document");
		return doc;
	}

	/** Reads start tag and returns a corresponding empty element. */
	private Element readStartTag(ArrayByteList src, int type) {
		String qname = readString(src, 4, type);
		String namespaceURI = readName(src, 6, type);
		return this.nodeBuilder.createElement(qname, namespaceURI);
//		return new Element(qname, namespaceURI);
	}

	private Element readStartTagF(ArrayByteList src, int type, boolean isRoot) {
		String qname = readString(src, 4, type);
		String namespaceURI = readName(src, 6, type);
		return isRoot ?
			factory.makeRootElement(qname, namespaceURI) :
			factory.startMakingElement(qname, namespaceURI);
	}

	/** Iterative pull parser reading an entire element subtree. */
	private void readElement(ArrayByteList src, Element current, InputStream input)
		throws BinaryParsingException, IOException {

		while (true) {
			Node node = null;
			Element down = null;
			int type = src.get(); // look ahead
//			if (DEBUG) System.err.println("reading type = " + toString(type));
			switch (type & 0x07) { // three low bits indicate node type
				case TEXT: {
					node = readText(src, type);
					break;
				}
				case ATTRIBUTE: {
					readAttribute(src, current, type);
					continue;
				}
				case BEGIN_ELEMENT: {
					down = readStartTag(src, type);
					node = down;
					break;
				}
				case END_ELEMENT: {
					current = (Element) current.getParent();
					if (current == null) return; // we're done with the root element
					continue;
				}
				case COMMENT: {
					node = readComment(src, type);
					break;
				}
				case NAMESPACE_DECLARATION: {
					readNamespaceDeclaration(src, current, type);
					continue;
				}
				case PROCESSING_INSTRUCTION: {
					node = readProcessingInstruction(src);
					break;
				}
				case DOC_TYPE: { // surrogate hack to identify BEGIN_PAGE
					readPage(src, input);
					continue;
				}
			}

//			if (DEBUG) System.err.println("read node=" + node.toXML());

			// assert node != null
			if (IS_EXTENDED_XOM) { // xom-1.1 + patch
				current.fastInsertChild(node, current.getChildCount());
			} else {
				current.insertChild(node, current.getChildCount());
			}

			if (down != null) current = down; // recurse down
		}
	}

	/** Iterative pull parser reading an entire element subtree (using NodeFactory). */
	private void readElementF(ArrayByteList src, Element current, InputStream input)
			throws BinaryParsingException, IOException {

//		final ArrayList stack = new ArrayList();
		final FastStack stack = new FastStack();
		stack.push(current);
		boolean addAttributesAndNamespaces = true;

		while (true) {
			Nodes nodes = null;
			int type = src.get(); // look ahead
//			if (DEBUG) System.err.println("reading type = " + toString(type));

			switch (type & 0x07) { // three low bits indicate node type
				case TEXT: {
					nodes = readTextF(src, type);
					break;
				}
				case ATTRIBUTE: {
					Element elem = addAttributesAndNamespaces ? current : null;
					nodes = readAttributeF(src, elem, type);
					break;
				}
				case BEGIN_ELEMENT: {
					Element elem = readStartTagF(src, type, false);
					stack.push(elem); // even if it's null
					if (elem != null) {
						current.insertChild(elem, current.getChildCount());
						current = elem; // recurse down
					}
					addAttributesAndNamespaces = elem != null;
					continue;
				}
				case END_ELEMENT: {
					Element elem = stack.pop();
					if (elem == null) {
						continue; // skip element
					}
					ParentNode parent = elem.getParent();
					if (parent == null) throwTamperedWithParent();
					if (parent instanceof Document) {
						return; // we're done with the root element
					}

					current = (Element) parent; // recurse up
					nodes = factory.finishMakingElement(elem);

					if (nodes.size()==1 && nodes.get(0)==elem) { // same node? (common case)
						continue; // optimization: no need to remove and then re-add the same element
					}

					if (current.getChildCount()-1 < 0) throwTamperedWithParent();
					current.removeChild(current.getChildCount()-1);
					break;
				}
				case COMMENT: {
					nodes = readCommentF(src, type);
					break;
				}
				case NAMESPACE_DECLARATION: {
					Element elem = addAttributesAndNamespaces ? current : null;
					readNamespaceDeclaration(src, elem, type);
					continue;
				}
				case PROCESSING_INSTRUCTION: {
					nodes = readProcessingInstructionF(src);
					break;
				}
				case DOC_TYPE: { // surrogate hack for BEGIN_PAGE
					readPage(src, input);
					continue;
				}
			}

			appendNodes(current, nodes);
		}
	}

	private static void appendNodes(Element elem, Nodes nodes) {
		if (nodes != null) {
			int size = nodes.size();
			for (int i=0; i < size; i++) {
				Node node = nodes.get(i);
				if (node instanceof Attribute) {
					elem.addAttribute((Attribute) node);
				} else {
					elem.insertChild(node, elem.getChildCount());
				}
			}
		}
	}

	private static void throwTamperedWithParent() {
		throw new XMLException("Factory has tampered with a parent pointer " +
				"of ancestor-or-self in finishMakingElement()");
	}

	private void readAttribute(ArrayByteList src, Element dst, int type) throws BinaryParsingException {
		String qname = readString(src, 4, type);
		String namespaceURI = readName(src, 6, type);
		String value = readString(src, 4, src.get());
		Attribute.Type attrType = Util.getAttributeType(src.get());
		Attribute attr = this.nodeBuilder.createAttribute(qname, namespaceURI, value, attrType);
//		Attribute attr = new Attribute(qname, namespaceURI, value, attrType);
		dst.addAttribute(attr);
	}

	private Nodes readAttributeF(ArrayByteList src, Element dst, int type) throws BinaryParsingException {
		String qname = readString(src, 4, type);
		String namespaceURI = readName(src, 6, type);
		String value = readString(src, 4, src.get());
		Attribute.Type attrType = Util.getAttributeType(src.get());
		if (dst == null) return null; // NONE;
		return factory.makeAttribute(qname, namespaceURI, value, attrType);
	}

	private Comment readComment(ArrayByteList src, int type) {
		return new Comment(readString(src, 4, type));
	}

	private Nodes readCommentF(ArrayByteList src, int type) {
		return factory.makeComment(readString(src, 4, type));
	}

	private void readNamespaceDeclaration(ArrayByteList src, Element dst, int type) {
		String prefix = readString(src, 4, type);
		String uri = readName(src, 6, type);
		if (dst != null) dst.addNamespaceDeclaration(prefix, uri);
	}

	private ProcessingInstruction readProcessingInstruction(ArrayByteList src) {
		int type = src.get(src.position() - 1);
		String target = readString(src, 4, type);
		String value = readString(src, 6, type);
		return new ProcessingInstruction(target, value);
	}

	private Nodes readProcessingInstructionF(ArrayByteList src) {
		int type = src.get(src.position() - 1);
		String target = readString(src, 4, type);
		String value = readString(src, 6, type);
		return factory.makeProcessingInstruction(target, value);
	}

	/** Does not pack indexes of doctype (infrequent anyway). */
	private Nodes readDocTypeF(ArrayByteList src) {
		String rootElementName = symbols[src.getInt()];
		String publicID = symbols[src.getInt()];
		if (DOCTYPE_NULL_ID.equals(publicID)) publicID = null;
		String systemID = symbols[src.getInt()];
		if (DOCTYPE_NULL_ID.equals(systemID)) systemID = null;
		String internalDTDSubset = symbols[src.getInt()];
		if (internalDTDSubset.length() == 0) internalDTDSubset = null;

		Nodes nodes = factory.makeDocType(rootElementName, publicID, systemID);
		for (int i=0; i < nodes.size(); i++) {
			if (nodes.get(i) instanceof DocType) {
				DocType docType = (DocType) nodes.get(i);
				if (docType.getInternalDTDSubset().length() == 0) {
					try {
						docType.setInternalDTDSubset(internalDTDSubset);
					} catch (IllegalAccessError e) {
						; // ignore; setInternalDTDSubset() is private in xom < 1.1
					}
				}
			}
		}
		return nodes;
	}

	// try to avoid XML reverification by caching repetitive Texts
	private Text readText(ArrayByteList src, int type) {
		int i = readSymbol(src, 4, type);
		Text text;
		if (i < textCache.length) {
			text = textCache[i];
			if (text != null) return new Text(text);
		}
		text = new Text(symbols[i]);
		if (i < textCache.length) textCache[i] = text;
		return text;
	}

	private Nodes readTextF(ArrayByteList src, int type) {
		return factory.makeText(readString(src, 4, type));
	}

	/** Reads string via packed index from symbolTable. */
	private String readString(ArrayByteList src, int shift, int type) {
		int i = readSymbol(src, shift, type);
		if (i < 0) return "";
		return symbols[i];
	}

	/** Reads string via packed index from symbolTable. */
	private static int readSymbol(ArrayByteList src, int shift, int type) {
		// assert shift == 4 || shift == 6
		if (Util.isInlinedIndex(type)) {
			if (shift == 6) return -1;
			return Util.getInlinedIndex(type);
		}

		switch ((type >>> shift) & 0x03) { // look at two bits indicating index length
			case 0 : return Util.getUnsignedByte(src.get());
			case 1 : return Util.getUnsignedShort(src.getShort());
			case 2 : return -1;
			default: return src.getInt();
		}
	}

	private String readName(ArrayByteList src, int shift, int type) {
		int i = readSymbol(src, shift, type);
		if (i < 0) return "";
		if (i < nameCache.length) {
			String name = nameCache[i];
			if (name == null) { // cache miss
				name = getInternedName(i);
				nameCache[i] = name;
			}
			return name;
		}
		return symbols[i];
	}

	private String getInternedName(int i) {
		String name = symbols[i];
		if (name.length() == 0) {
			name = "";
		} else {
			name = (String) internedNames.get(name);
			if (name == null) {
				name = symbols[i];
				internedNames.put(name, name);
			}
		}
		return name;
	}

	/** Writes nodeTokens, indexData, symbolTable to output stream. */
	private void writePage(boolean isLastPage) throws IOException {
		if (DEBUG) System.err.println("writing page");
		Entry[] entries = symbolTable.getEntries();
		int numChars = symbolTable.numCharacters();

		// reorder entries and update indexData accordingly.
		packSort(entries, indexData);
//		if (DEBUG) printStatistics(entries);

		// add header to page
		page.ensureCapacity(page.size() + 1 + 4 + 4 + 4 + 4 + 4 + numChars*4 + entries.length +
				nodeTokens.size() + indexData.size() + numChars/100); // an educated guess
		page.add((byte) DOC_TYPE); // BEGIN_PAGE marker surrogate
		page.addInt(0); // pageSize dummy placeholder
		int pageOffset = page.size();
		page.addInt(entries.length);
		page.addInt(numChars + entries.length);  // decodedSize (+zero terminator)
		page.addInt(0); // encodedSize dummy placeholder
		int encodedOffset = page.size();

		// add symbolTable to page
		encodeSymbols(entries, page); // assert: no need to expand underlying array
		int encodedSize = page.size() - encodedOffset;
		page.setInt(encodedOffset-4, encodedSize); // replace dummy placeholder
		entries = null; // help gc

		// add node tokens and packed symbol indexes to page
		page.addInt(BNUX_MAGIC);
		encodeTokens(nodeTokens, indexData.asArray(), page);

		int pageSize;
		nodeTokens.clear();
		if (compressionLevel > 0) { // compress page body
			page.position(pageOffset);
			nodeTokens.add(compressor, page);
			page.remove(pageOffset, page.size());
			pageSize = nodeTokens.size();
		} else {
			pageSize = page.size() - pageOffset;
		}
		if (isLastPage) pageSize = -pageSize;
		page.setInt(pageOffset-4, pageSize); // replace pageSize dummy placeholder

		// having filled the buffers, flush them onto underlying output stream
		page.write(out);
		nodeTokens.write(out);

		// reset
		if (!isLastPage) symbolTable.clear();
		page.clear();
		nodeTokens.clear();
		indexData.clear();
		if (DEBUG) System.err.println("finished writing page");
	}

	private void writeDocument(Document doc) throws IOException {
		if (DEBUG) System.err.println("writing document");
		writeXMLDeclaration(doc.getBaseURI());
		for (int i = 0; i < doc.getChildCount(); i++) {
			Node node = doc.getChild(i);
			if (node instanceof Element) {
				writeElement((Element) node);
			} else if (node instanceof Comment) {
				writeComment((Comment) node);
			} else if (node instanceof ProcessingInstruction) {
				writeProcessingInstruction((ProcessingInstruction) node);
			} else if (node instanceof DocType) {
				writeDocType((DocType) node);
			} else {
				throw new IllegalAddException("Cannot write node type: " + node);
			}
		}
		writeEndDocument();
		if (DEBUG) System.err.println("finished writing document");
	}

	final void writeXMLDeclaration(String baseURI) {
		if (baseURI == null) baseURI = "";

		// setup
		symbolTable = new SymbolTable();
		if (nodeTokens == null) nodeTokens = new ArrayByteList();
		nodeTokens.clear();
		if (indexData == null) indexData = new ArrayIntList();
		indexData.clear();

		// write bnux document header
		if (page == null) page = new ArrayByteList(256);
		page.clear();
		page.ensureCapacity(DOCUMENT_HEADER_SIZE + PAGE_HEADER_SIZE + 1);
		page.addInt(BNUX_MAGIC);
		int version = VERSION;
		if (compressionLevel > 0) version = -version;
		page.add((byte)version);

		isFirstPage = true;
		writeIndex(baseURI);
	}

	final void writeEndDocument() throws IOException {
		flush(true);
	}

	// assertion: must not be called with !isLastPage
	// if we're not at a safe point, i.e. at nesting depth == 0
	final void flush(boolean isLastPage) throws IOException {
		try {
			if (nodeTokens.size() > 0) { // anything remaining to be written?
				writePage(isLastPage);
			}
			out.flush();
		} finally {
			if (isLastPage) {
				this.symbolTable = null; // help gc
				this.out = null;
			}
			nodeTokens.clear();
		}
	}

1335 	/** Encodes a node into the intermediate unpacked binary form */
1336 	private void writeChild(Node node) throws IOException {
1337 		if (node instanceof Element) {
1338 			writeElement((Element) node);
1339 		} else if (node instanceof Text) {
1340 			writeText((Text) node);
1341 		} else if (node instanceof Comment) {
1342 			writeComment((Comment) node);
1343 		} else if (node instanceof ProcessingInstruction) {
1344 			writeProcessingInstruction((ProcessingInstruction) node);
1345 		} else {
1346 			throw new IllegalAddException("Cannot write node type: " + node);
1347 		}
1348 	}
1349 
1350 	final void writeElement(Element elem) throws IOException {
1351 		writeStartTag(elem);
1352 
1353 		for (int i = 0; i < elem.getChildCount(); i++) {
1354 			writeChild(elem.getChild(i));
1355 		}
1356 
1357 		writeEndTag();
1358 	}
1359 
1360 	final void writeStartTag(Element elem) {
1361 		writeIndex(elem.getNamespacePrefix(), elem.getLocalName());
1362 
1363 		int type = BEGIN_ELEMENT;
1364 		if (elem.getNamespaceURI().length() == 0) {
1365 			type = Util.noNamespace(type);
1366 		} else {
1367 			writeIndex(elem.getNamespaceURI());
1368 		}
1369 		nodeTokens.add((byte)type);
1370 
1371 		for (int i = 0; i < elem.getAttributeCount(); i++) {
1372 			writeAttribute(elem.getAttribute(i));
1373 		}
1374 
1375 		if (IS_EXTENDED_XOM) {
1376 			writeNamespaceDeclarationsFast(elem);
1377 		} else {
1378 			writeNamespaceDeclarations(elem);
1379 		}
1380 	}
1381 
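	// Note: page capacity is checked only at end tags, so a page always breaks
	// between complete node tokens; each non-final page break also restarts
	// the symbol table from scratch (see symbolTable.clear() in writePage).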
1382 	final void writeEndTag() throws IOException {
1383 		if (nodeTokens.size() + indexData.size() + symbolTable.numCharacters()
1384 				+ symbolTable.size() >= MAX_PAGE_CAPACITY) {
1385 			writePage(false); // write nodeTokens, indexData, symbolTable to output stream
1386 		}
1387 
1388 		nodeTokens.add((byte)END_ELEMENT);
1389 	}
1390 
1391 	private void writeAttribute(Attribute attr) {
1392 		writeIndex(attr.getNamespacePrefix(), attr.getLocalName());
1393 
1394 		int type = ATTRIBUTE;
1395 		if (attr.getNamespaceURI().length() == 0) {
1396 			type = Util.noNamespace(type);
1397 		} else {
1398 			writeIndex(attr.getNamespaceURI());
1399 		}
1400 
1401 		writeIndex(attr.getValue());
1402 		nodeTokens.add((byte)type);
1403 		nodeTokens.add(Util.getAttributeTypeCode(attr));
1404 	}
1405 
1406 	final void writeComment(Comment comment) {
1407 		nodeTokens.add((byte)COMMENT);
1408 		writeIndex(comment.getValue());
1409 	}
1410 
	/** Does not pack the doctype's indexes (doctypes are infrequent anyway). */
1412 	final void writeDocType(DocType docType) {
1413 		nodeTokens.add((byte)DOC_TYPE);
1414 		writeIndex(docType.getRootElementName());
1415 		writeIndex(docType.getPublicID() == null ? DOCTYPE_NULL_ID : docType.getPublicID());
1416 		writeIndex(docType.getSystemID() == null ? DOCTYPE_NULL_ID : docType.getSystemID());
1417 		writeIndex(docType.getInternalDTDSubset() == null ? "" : docType.getInternalDTDSubset());
1418 	}
1419 
1420 	// requires xom-1.1 + patches
1421 	private void writeNamespaceDeclarationsFast(Element elem) {
1422 		String[] decls = elem.getAdditionalNamespaceDeclarations();
1423 		for (int i=0; i < decls.length; ) {
1424 			nodeTokens.add((byte)NAMESPACE_DECLARATION);
1425 			writeIndex(decls[i++]); // prefix
1426 			writeIndex(decls[i++]); // URI
1427 		}
1428 	}
1429 
1430 	private void writeNamespaceDeclarations(Element elem) {
1431 		int count = elem.getNamespaceDeclarationCount();
1432 		if (count == 1)
1433 			return; // elem.getNamespaceURI() has already been written
1434 
1435 		for (int i = 0; i < count; i++) {
1436 			String prefix = elem.getNamespacePrefix(i);
1437 			String uri = elem.getNamespaceURI(prefix);
1438 			if (prefix.equals(elem.getNamespacePrefix()) && uri.equals(elem.getNamespaceURI())) {
1439 //				if (DEBUG) System.err.println("********** NAMESPACE IGNORED ON WRITE ***************\n");
1440 				continue;
1441 			}
1442 			nodeTokens.add((byte)NAMESPACE_DECLARATION);
1443 			writeIndex(prefix);
1444 			writeIndex(uri);
1445 		}
1446 	}
1447 
1448 	final void writeProcessingInstruction(ProcessingInstruction pi) {
1449 		nodeTokens.add((byte)PROCESSING_INSTRUCTION);
1450 		writeIndex(pi.getTarget());
1451 		writeIndex(pi.getValue());
1452 	}
1453 
1454 	final void writeText(Text text) {
1455 		nodeTokens.add((byte)TEXT);
1456 		writeIndex(text.getValue());
1457 	}
1458 
1459 	/** Puts symbol into symbolTable and appends the string's index to indexData */
1460 	private final void writeIndex(String symbol) {
1461 		writeIndex("", symbol);
1462 	}
1463 
1464 	/** Puts symbol into symbolTable and appends the string's index to indexData */
1465 	private final void writeIndex(String prefix, String localName) {
1466 		int index = symbolTable.addSymbol(prefix, localName);
1467 		indexData.add(index);
1468 	}
1469 
1470 	/** Converts strings of symbolTable into UTF-8 bytes */
1471 	private void encodeSymbols(Entry[] entries, ArrayByteList dst) {
1472 		/*
1473 		 * As an optimization, one could use dst.ensureCapacity(dst.size + 4 *
1474 		 * sum(entries.key.length) + entries.length) then use plain array
1475 		 * accesses via dst.addUTF8Entries(entries) instead of many small
1476 		 * dst.add(byte) calls. Update: I benchmarked various variants of this
1477 		 * idea; none of them turned out to be worthwhile, so, for the time
1478 		 * being, we'll keep the simple version below.
1479 		 */
1480 //		if (DEBUG) System.err.println("encoding symbols = " + toString(entries));
1481 		int len = entries.length;
1482 		for (int i=0; i < len; i++) {
1483 			Entry entry = entries[i];
1484 			dst.addUTF8String(entry.getKey1(), entry.getKey2());
1485 			//dst.addUTF16String(entry.getKey1(), entry.getKey2());
1486 		}
1487 	}
1488 
1489 	/**
	 * Sorts entries descending by symbol frequency (number of occurrences) and
	 * updates indexData accordingly. This allows frequent 4-byte indexes to be
	 * compressed into a single byte. FAQ: this sort is *not* the bottleneck.
1493 	 */
1494 	private void packSort(Entry[] entries, ArrayIntList indexData) {
1495 		if (!DEBUG && entries.length <= 256) { // 0, 1, ... , 255
1496 			return; // no need to sort indexes - all fit into one unsigned byte anyway
1497 		}
1498 
1499 		// Swap entries with frequency == 1 to end of array (typically Text nodes).
1500 		// There's no need to sort those with O(N log N).
1501 		int head = entries.length;
1502 		for (int i=entries.length; --i >= 0; ) {
1503 			Entry e = entries[i];
1504 			if (e.getFrequency() == 1) {
1505 				head--;
1506 				entries[i] = entries[head];
1507 				entries[head] = e;
1508 			}
1509 		}
1510 //		if (DEBUG) System.err.println("len=" + entries.length + ", #f>1=" + (100.0f * head / entries.length));
1511 
1512 		// sort remaining entries descending by frequency.
1513 		Arrays.sort(entries, 0, head,
1514 			new Comparator() {
1515 				public final int compare(Object e1, Object e2) {
1516 					int f1 = ((Entry) e1).getFrequency();
1517 					int f2 = ((Entry) e2).getFrequency();
1518 					return f2 - f1;
1519 				}
1520 			}
1521 		);
1522 
		// reorder indexData to match the sorted order: the sort moved the entry
		// originally registered under index entries[i].getIndex() to slot i
1525 		int[] indexes = new int[entries.length];
1526 		for (int i=entries.length; --i >= 0; ) {
1527 			indexes[entries[i].getIndex()] = i;
1528 		}
1529 
1530 		int[] ix = indexData.asArray();
1531 		for (int i=indexData.size(); --i >= 0; ) {
1532 			ix[i] = indexes[ix[i]];
1533 		}
		// post-condition: each indexData[k] now holds the symbol's slot in the sorted entries array
1535 	}
1536 
1537 	/**
1538 	 * Writes nodeTokens in document order; in the process stitches in packed
1539 	 * indexes referring to symbols in the symbolTable.
1540 	 */
1541 	private void encodeTokens(ArrayByteList tokenList, int[] indexes, ArrayByteList dst) {
1542 		byte[] tokens = tokenList.asArray();
1543 		int size = tokenList.size();
1544 		int i = 0;
1545 		int j = 0;
1546 		if (isFirstPage) dst.addInt(indexes[i++]); // document baseURI
1547 		isFirstPage = false;
1548 
1549 		while (j < size) {
1550 			int type = tokens[j++];
1551 			dst.add((byte)type);
1552 
1553 //			if (DEBUG) System.err.println("encoding type = " + toString(type));
1554 			switch (type & 0x07) {
1555 				case TEXT: {
1556 					Util.packOneIndex(dst, indexes[i++], type); // value
1557 					break;
1558 				}
1559 				case ATTRIBUTE: {
1560 					if (Util.hasNoNamespace(type)) { // qname
1561 						Util.packOneIndex(dst, indexes[i++], type);
1562 					} else { // qname, URI
1563 						Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
1564 					}
1565 					dst.add((byte)0);
1566 					Util.packOneIndex(dst, indexes[i++], 0); // value
1567 					dst.add(tokens[j++]); // attrType
1568 					break;
1569 				}
1570 				case BEGIN_ELEMENT: {
1571 					if (Util.hasNoNamespace(type)) {
1572 						Util.packOneIndex(dst, indexes[i++], type); // qname
1573 					} else {
1574 						Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type); // qname, URI
1575 					}
1576 					break;
1577 				}
1578 				case END_ELEMENT: {
1579 					break; // nothing to do
1580 				}
1581 				case COMMENT: {
1582 					Util.packOneIndex(dst, indexes[i++], type); // value
1583 					break;
1584 				}
1585 				case NAMESPACE_DECLARATION: { // prefix, URI
1586 					Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
1587 					break;
1588 				}
1589 				case PROCESSING_INSTRUCTION: { // target, value
1590 					Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
1591 					break;
1592 				}
1593 				case DOC_TYPE: { // infrequent; no need to pack indexes
1594 					dst.addInt(indexes[i++]); // rootElementName
1595 					dst.addInt(indexes[i++]); // publicID
1596 					dst.addInt(indexes[i++]); // systemID
1597 					dst.addInt(indexes[i++]); // internalDTDSubset
1598 					break;
1599 				}
1600 				default: {
1601 					throw new IllegalArgumentException("illegal node type");
1602 				}
1603 			}
1604 		}
1605 	}
1606 
1607 	private static int createMagicNumber() {
1608 		// 0xE0, 0x01, 0xDF, 0xFE = -32, 1, -33, -2 = -536748034
1609 		// Thanks to Paul.Sandoz@Sun.COM
1610 		ArrayByteList magic = new ArrayByteList(4);
1611 		magic.add((byte)0xE0);
1612 		magic.add((byte)0x01);
1613 		magic.add((byte)0xDF);
1614 		magic.add((byte)0xFE);
1615 		return magic.getInt();
1616 	}
1617 
	/** Does xom.jar contain our performance extensions? */
1619 	private static boolean hasXOMExtensions() {
1620 		try {
1621 			ParentNode.class.getMethod("fastInsertChild", new Class[] {Node.class, Integer.TYPE});
			Element.class.getMethod("getAdditionalNamespaceDeclarations", (Class[]) null);
1623 			return true;
1624 		} catch (Throwable t) {
1625 			return false;
1626 		}
1627 	}
1628 
1629 	private static String toString(int type) { // DEBUG only
1630 		switch (type & 0x07) {
1631 			case TEXT: return "TEXT";
1632 			case ATTRIBUTE: return "ATTRIBUTE";
1633 			case BEGIN_ELEMENT: return "BEGIN_ELEMENT";
1634 			case END_ELEMENT: return "END_ELEMENT";
1635 			case COMMENT: return "COMMENT";
1636 			case NAMESPACE_DECLARATION: return "NAMESPACE_DECLARATION";
1637 			case PROCESSING_INSTRUCTION: return "PROCESSING_INSTRUCTION";
1638 			case DOC_TYPE: return "DOC_TYPE";
1639 			default: {
1640 				throw new IllegalArgumentException(
1641 						"Illegal node type code=" + (type & 0x07));
1642 			}
1643 		}
1644 	}
1645 
1646 	private static String toString(Entry[] entries) { // DEBUG only
1647 		ArrayList list = new ArrayList();
1648 		for (int i=0; i < entries.length; i++) {
1649 			list.add(entries[i].getQualifiedName());
1650 		}
1651 		return list.toString();
1652 	}
1653 
1654 
1655 	///////////////////////////////////////////////////////////////////////////////
1656 	// Nested classes:
1657 	///////////////////////////////////////////////////////////////////////////////
1658 
1659 	/**
1660 	 * Table of unique symbols for tokenization on XML serialization. This is a
	 * classic textbook hash algorithm, adapted to meet our specific
1662 	 * performance needs. It's close to the one used by the JDK HashMap.
1663 	 * Maintains a map of (String, String) ==> (index, frequency) associations.
1664 	 */
1665 	private static final class SymbolTable { // not a public class!
1666 
1667 		private static final float LOAD_FACTOR = 0.75f;
1668 		private static final int INITIAL_CAPACITY = 16;
1669 		private Entry[] entries = new Entry[INITIAL_CAPACITY];
1670 		private int threshold = (int) (INITIAL_CAPACITY * LOAD_FACTOR);
1671 		private int size = 0;
1672 		private int numChars = 0;
1673 
1674 		/** Constructs and returns a table with default parameters. */
1675 		public SymbolTable() {
1676 		}
1677 
1678 		/** Removes all entries from this table, retaining the current capacity. */
1679 		public void clear() {
1680 			size = 0;
1681 			numChars = 0;
1682 			Entry[] src = entries;
1683 			for (int i=src.length; --i >= 0; ) src[i] = null;
1684 		}
1685 
1686 		/** Returns the total number of characters occupied by all symbol strings. */
1687 		public int numCharacters() {
1688 			return numChars;
1689 		}
1690 
1691 		/** Returns the number of symbols. */
1692 		public int size() {
1693 			return size;
1694 		}
1695 
1696 		/**
1697 		 * Adds the given symbol to the table if not already present. Otherwise
1698 		 * increments its frequency counter.
1699 		 *
		 * A symbol is structured like a lexical XML QName:
		 * symbol = key1 + ":" + key2   (if key1 is a non-empty string)
		 * symbol = key2                (if key1 is the empty string)
1703 		 *
		 * @return the sequence number N >= 0 assigned to the symbol when it was
		 *         first added; symbols are numbered 0, 1, 2, ... in insertion
		 *         order, and re-adding an existing symbol returns its original
		 *         number.
1706 		 */
1707 		public int addSymbol(String key1, String key2) {
1708 			// assert: key1 and key2 are non-null
1709 			int hash = hash(key1, key2);
1710 			int i = hash & (entries.length - 1);
1711 			Entry entry = findEntry(key1, key2, entries[i], hash);
1712 			if (entry != null) {
1713 				entry.frequency++;
1714 				return entry.index;
1715 			}
1716 
1717 			// not found; add entry for key --> (index=size, freq=1) mapping
1718 			// new entry is inserted at head of chain
1719 //			if (DEBUG) checkNULChar(key1);
1720 //			if (DEBUG) checkNULChar(key2);
1721 			numChars += key1.length() + key2.length();
1722 			if (key1.length() != 0) numChars++;
1723 			entries[i] = new Entry(key1, key2, hash, entries[i], size);
1724 			if (size >= threshold) rehash();
1725 			return size++;
1726 		}
1727 
1728 		private static Entry findEntry(String key1, String key2, Entry cursor, int hash) {
1729 			while (cursor != null) { // scan collision chain
1730 				if (hash == cursor.hash && eq(key2, cursor.key2) && eq(key1, cursor.key1)) {
1731 					cursor.key1 = key1; // speeds up future lookups: equals() vs. ==
1732 					cursor.key2 = key2; // speeds up future lookups: equals() vs. ==
1733 					return cursor;
1734 				}
1735 				cursor = cursor.next;
1736 			}
1737 			return null;
1738 		}
1739 
1740 		/**
1741 		 * Expands the capacity of this table, rehashing all entries into
1742 		 * corresponding new slots.
1743 		 */
1744 		private void rehash() {
1745 			Entry[] src = entries;
1746 			int capacity = 2 * src.length;
1747 			Entry[] dst = new Entry[capacity];
1748 
1749 			for (int i = src.length; --i >= 0; ) {
1750 				Entry e = src[i];
1751 				while (e != null) { // walk collision chain
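					// capacity is always a power of two, so this mask selects the
					// low bits: like (e.hash % capacity), but non-negative even
					// for negative hash values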
1752 					int j = e.hash & (capacity - 1);
1753 					Entry next = e.next;
1754 					e.next = dst[j];
1755 					dst[j] = e; // insert e at head of chain
1756 					e = next;
1757 				}
1758 			}
1759 			entries = dst;
1760 			threshold = (int) (capacity * LOAD_FACTOR);
1761 		}
1762 
1763 		/**
1764 		 * Returns all table entries, sorted ascending by entry.index. The
1765 		 * result can subsequently be used to sort by symbol frequency, or
		 * similar. Much faster than iterating over a map's entrySet() would
		 * be.
1768 		 */
1769 		public Entry[] getEntries() {
1770 			Entry[] dst = new Entry[size];
1771 			Entry[] src = entries;
1772 			for (int i = src.length; --i >= 0; ) {
1773 				Entry e = src[i];
1774 				while (e != null) { // walk collision chain
1775 					dst[e.index] = e;
1776 					e = e.next;
1777 				}
1778 			}
1779 			return dst;
1780 		}
1781 
1782 		private static int hash(String key1, String key2) {
1783 			int h = key2.hashCode();
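			// identity comparison against the interned "" literal is just a fast
			// path; since "".hashCode() == 0, XOR-ing an empty key1 in would not
			// change h, so even a non-interned empty prefix hashes consistently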
1784 			if (key1 != "") h = key1.hashCode() ^ h;
1785 			return auxiliaryHash(h);
1786 //			return auxiliaryHash(key1.hashCode() ^ key2.hashCode());
1787 		}
1788 
1789 		/**
1790 		 * Auxiliary hash function that defends against poor base hash
1791 		 * functions. Ensures more uniform hash distribution, hence reducing the
1792 		 * probability of pathologically long collision chains, in particular
1793 		 * for short key symbols that are quite similar to each other, or XML
1794 		 * boundary whitespace (worst case scenario).
1795 		 */
1796 		private static int auxiliaryHash(int h) {
1797 			h += ~(h << 9);
1798 			h ^= (h >>> 14);
1799 			h += (h << 4);
1800 			h ^= (h >>> 10);
1801 			return h;
1802 		}
1803 
1804 		private static boolean eq(String x, String y) {
1805 			return x == y || x.equals(y);
1806 		}
1807 
		/** Sanity check; unnecessary since NULs have already been checked by nu.xom.Verifier. */
1809 		private static void checkNULChar(String key) {
1810 			int i = key.indexOf((char)0);
1811 			if (i >= 0) {
1812 				throw new IllegalArgumentException(
1813 					"Symbol must not contain C0 control character NUL (char 0x00) [index:" + i
1814 					+ " within '" + key + "']");
1815 			}
1816 		}
1817 
1818 	}
1819 
1820 	/**
1821 	 * A value in the SymbolTable.
1822 	 */
1823 	private static final class Entry {
1824 
1825 		String key1; // prefix or ""
1826 		String key2; // localName or Text or Comment or similar
1827 		final int hash;   // cache for symbol's hash code
1828 		final int index;  // index to correlate Entry with indexList on XML serialization
		int frequency = 1;// number of occurrences of symbol within current XML document
1830 		Entry next;       // successor in collision chain, mapping to the same hash slot
1831 
1832 		public Entry(String key1, String key2, int hash, Entry next, int index) {
1833 			this.key1 = key1;
1834 			this.key2 = key2;
1835 			this.hash = hash;
1836 			this.next = next;
1837 			this.index = index;
1838 		}
1839 
1840 		public String getKey1()    { return key1; }
1841 		public String getKey2()    { return key2; }
1842 		public int getIndex()     { return index; }
1843 		public int getFrequency() { return frequency; }
1844 
1845 		public String getQualifiedName() { // DEBUG only
1846 			if (key1.length() == 0) return key2;
1847 			return key1 + ':' + key2;
1848 		}
1849 		public String toString() { // DEBUG only
1850 			return "[key1=" + key1 + ", key2=" + key2 + ", freq=" + frequency + "]";
1851 		}
1852 
1853 	}
1854 
1855 
1856 	///////////////////////////////////////////////////////////////////////////////
1857 	// Nested classes:
1858 	///////////////////////////////////////////////////////////////////////////////
1859 
1860 	/**
1861 	 * Fast replacement for ArrayList and java.util.Stack. Possibly premature
1862 	 * and unnecessary?
1863 	 */
1864 	private static final class FastStack {
1865 		private Element[] elements = new Element[10];
1866 		private int size = 0;
1867 
1868 		public Element pop() {
1869 			Element elem = elements[size-1];
1870 			elements[--size] = null; // help gc
1871 			return elem;
1872 		}
1873 
1874 		public void push(Element elem) {
1875 			if (size == elements.length) ensureCapacity(size + 1);
1876 			elements[size++] = elem;
1877 		}
1878 
1879 		private void ensureCapacity(int minCapacity) {
1880 			if (minCapacity > elements.length) {
1881 				int newCapacity = Math.max(minCapacity, 2 * elements.length + 1);
1882 				elements = subArray(0, size, newCapacity);
1883 			}
1884 		}
1885 
1886 		/** Small helper method eliminating redundancy. */
1887 		private Element[] subArray(int from, int length, int capacity) {
1888 			Element[] subArray = new Element[capacity];
1889 			System.arraycopy(elements, from, subArray, 0, length);
1890 			return subArray;
1891 		}
1892 
1893 	}
1894 
1895 }