1 /* 2 * Copyright (c) 2005, The Regents of the University of California, through 3 * Lawrence Berkeley National Laboratory (subject to receipt of any required 4 * approvals from the U.S. Dept. of Energy). All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * (1) Redistributions of source code must retain the above copyright notice, 10 * this list of conditions and the following disclaimer. 11 * 12 * (2) Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * (3) Neither the name of the University of California, Lawrence Berkeley 17 * National Laboratory, U.S. Dept. of Energy nor the names of its contributors 18 * may be used to endorse or promote products derived from this software without 19 * specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * You are under no obligation whatsoever to provide any bug fixes, patches, or 34 * upgrades to the features, functionality or performance of the source code 35 * ("Enhancements") to anyone; however, if you choose to make your Enhancements 36 * available either publicly, or directly to Lawrence Berkeley National 37 * Laboratory, without imposing a separate written license agreement for such 38 * Enhancements, then you hereby grant the following license: a non-exclusive, 39 * royalty-free perpetual license to install, use, modify, prepare derivative 40 * works, incorporate into other computer software, distribute, and sublicense 41 * such enhancements or derivative works thereof, in binary and source code 42 * form. 43 */ 44 package nux.xom.binary; 45 46 import java.io.ByteArrayInputStream; 47 import java.io.ByteArrayOutputStream; 48 import java.io.IOException; 49 import java.io.InputStream; 50 import java.io.OutputStream; 51 import java.util.ArrayList; 52 import java.util.Arrays; 53 import java.util.Comparator; 54 import java.util.zip.DataFormatException; 55 import java.util.zip.Deflater; 56 import java.util.zip.Inflater; 57 58 import nu.xom.Attribute; 59 import nu.xom.Comment; 60 import nu.xom.DocType; 61 import nu.xom.Document; 62 import nu.xom.Element; 63 import nu.xom.IllegalAddException; 64 import nu.xom.Node; 65 import nu.xom.NodeFactory; 66 import nu.xom.Nodes; 67 import nu.xom.ParentNode; 68 import nu.xom.ProcessingInstruction; 69 import nu.xom.Text; 70 import nu.xom.WellformednessException; 71 import nu.xom.XMLException; 72 import nux.xom.io.StreamingSerializer; 73 74 /** 75 * Serializes (encodes) and deserializes (decodes) XOM XML documents to and from 76 * an efficient and compact custom binary XML data format (termed <i>bnux </i> 77 * format), without loss or change of any information. 
Serialization and 78 * deserialization is much faster than with the standard textual XML format, and 79 * the resulting binary data is more compressed than textual XML. 80 * 81 * <h4>Applicability</h4> 82 * 83 * The overall goal of the <i>bnux algorithm</i> is to maximize serialization 84 * and deserialization (parsing) performance without requiring any schema 85 * description. Serialization and deserialization speed are roughly balanced 86 * against each other; neither side is particularly favoured over the other. 87 * Another benefitial effect of the algorithm is that a considerable degree of 88 * XML data redundancy is eliminated, but compression is more a welcome 89 * side-effect than a primary goal in itself. The algorithm is primarily 90 * intended for tightly coupled high-performance systems exchanging large 91 * volumes of XML data over networks, as well as for compact main memory caches 92 * and for <i>short-term </i> storage as BLOBs in backend databases or files 93 * (e.g. "session" data with limited duration). In the case of BLOB storage, 94 * selecting matching BLOBs can be sped up by maintaining a simple metaindex 95 * side table for the most frequent access patterns. See the <a 96 * href="#performance">performance results</a> below. 97 * <p> 98 * While the Java API is considered stable, the bnux data format should be 99 * considered a black box: Its internals are under-documented and may change 100 * without notice from release to release in backwards-incompatible manners. It 101 * is unlikely that support for reading data written with older Nux versions 102 * will ever be available. bnux is an exchange format but not an 103 * interoperability format. Having said that, the data format is machine 104 * architecture/platform independent. For example a bnux file can be moved back 105 * and forth between a 32 bit Intel little-endian machine and a 64 bit PowerPC 106 * big-endian machine; it remains parseable no matter where. 
 * <p>
 * This approach is expressly <b>not intended </b>as a replacement for standard
 * textual XML in loosely coupled systems where maximum long-term
 * interoperability is the overarching concern. It is also expressly <b>not
 * intended </b>for long-term data storage. If you store data in bnux format
 * there's every chance you won't be able to read it back a year or two from
 * now, or even earlier. Finally, it is probably unwise to use this class if
 * your application's performance requirements are not particularly stringent,
 * or profiling indicates that the bottleneck is not related to XML
 * serialization/deserialization anyway.
 * <p>
 * The bnux serialization algorithm is a fully streaming block-oriented
 * algorithm, ideal for large numbers of very small to arbitrarily large
 * XML documents.
 * <p>
 * The bnux deserialization algorithm is a fully streaming algorithm and can
 * optionally be pushed through a {@link nu.xom.NodeFactory}. This enables
 * efficient filtering and can avoid the need to build a main memory tree, which
 * is particularly useful for arbitrarily large documents. For example, streaming
 * XQueries over binary XML can be expressed via the NodeFactory generated by a
 * {@link nux.xom.xquery.StreamingPathFilter}. In streaming mode, the binary
 * codec exactly mimics the NodeFactory based behaviour of the XOM
 * {@link nu.xom.Builder}.
 *
 * <h4>Faithfully Preserving XML</h4>
 *
 * Any and all arbitrary XOM XML documents are supported, and no schema is
 * required. A XOM document that is serialized and subsequently deserialized by
 * this class is <i>exactly the same </i> as the original document, preserving
 * "as is" all names and data for elements, namespaces, additional namespace
 * declarations, attributes, texts, document type, comments, processing
 * instructions, whitespace, Unicode characters including surrogates, etc.
As a 139 * result, the W3C XML Infoset and the W3C Canonical XML representation is 140 * guaranteed to be preserved. In particular there always holds: 141 * 142 * <pre> 143 * java.util.Arrays.equals(XOMUtil.toCanonicalXML(doc), XOMUtil 144 * .toCanonicalXML(deserialize(serialize(doc)))); 145 * </pre> 146 * 147 * <h4>Optional ZLIB Compression</h4> 148 * 149 * The bnux algorithm considerably compresses XML data with little CPU 150 * consumption, by its very design. However, bnux also has an option to further 151 * compress/decompress its output/input with the ZLIB compression algorithm. 152 * ZLIB is based on Huffman coding and also used by the popular 153 * <code>gzip</code> (e.g. {@link java.util.zip.Deflater}). ZLIB compression 154 * is rather CPU intensive, but it typically yields strong compression factors, 155 * in particular for documents containing mostly narrative text (e.g. the 156 * bible). For example, strong compression may be desirable over low-bandwith 157 * networks or when bnux data is known to be accessed rather infrequently. On 158 * the other hand, ZLIB compression probably kills performance in the presence 159 * of high-bandwidth networks such as ESnet, Internet2/Abilene or 10 Gigabit 160 * Ethernet/InfiniBand LANs, even with high-end CPUs. CPU drain is also a 161 * scalability problem in the presence of large amounts of concurrent 162 * connections. An option ranging from 0 (no ZLIB compression; best performance) 163 * to 1 (little ZLIB compression; reduced performance) to 9 (strongest ZLIB 164 * compression; worst performance) allows one to configure the CPU/memory 165 * consumption trade-off. 166 * 167 * <h4>Reliability</h4> 168 * 169 * This class has been successfully tested against some 50000 extremely 170 * weird and unique test documents, including the W3C XML conformance test 171 * suite, and no bugs are known. 
172 * <p> 173 * Serialization employs no error checking at all, since malformed XOM input 174 * documents are impossible to produce given XOM's design: XOM strictly enforces 175 * wellformedness anyway. Deserialization employs some limited error checking, 176 * throwing exceptions for any improper API usage, non-bnux input data, data 177 * format version mismatch, or general binary data corruption. Beyond this, 178 * deserialization relies on XOM's hard-wired wellformedness checks, just like 179 * serialization does. Barring one of the above catastrophic situations, the 180 * bnux algorithm will always correctly and faithfully reconstruct the exact 181 * same well-formed XOM document. 182 * 183 * <h4>Example Usage:</h4> 184 * 185 * <pre> 186 * // parse standard textual XML, convert to binary format, round-trip it and compare results 187 * Document doc = new Builder().build(new File("samples/data/periodic.xml")); 188 * BinaryXMLCodec codec = new BinaryXMLCodec(); 189 * byte[] bnuxDoc = codec.serialize(doc, 0); 190 * Document doc2 = codec.deserialize(bnuxDoc); 191 * boolean isEqual = java.util.Arrays.equals( 192 * XOMUtil.toCanonicalXML(doc), XOMUtil.toCanonicalXML(doc2)); 193 * System.out.println("isEqual = " + isEqual); 194 * System.out.println(doc2.toXML()); 195 * 196 * // write binary XML document to file 197 * OutputStream out = new FileOutputStream("/tmp/periodic.xml.bnux"); 198 * out.write(bnuxDoc); 199 * out.close(); 200 * 201 * // read binary XML document from file 202 * bnuxDoc = FileUtil.toByteArray(new FileInputStream("/tmp/periodic.xml.bnux")); 203 * Document doc3 = codec.deserialize(bnuxDoc); 204 * System.out.println(doc3.toXML()); 205 * </pre> 206 * 207 * <a name="performance"/> 208 * <h4>Performance</h4> 209 * 210 * This class has been carefully profiled and optimized. Preliminary performance 211 * results over a wide range of real-world documents are given below. 
A more 212 * detailed presentation can be found at the Global Grid Forum <a 213 * target="_blank" 214 * href="http://www.ggf.org/GGF15/ggf_events_schedule_WSPerform.htm">Web 215 * Services Performance Workshop</a>. 216 * <p> 217 * Contrasting bnux BinaryXMLCodec with the XOM Builder and Serializer: 218 * <ul> 219 * <li>Tree Deserialization speedup: 40-100 MB/s vs. 3-30 MB/s</li> 220 * <li>Streaming Deserialization speedup: 60-500 MB/s vs. 3-30 MB/s</li> 221 * <li>Tree Serialization speedup: 50-150 MB/s vs. 5-20 MB/s</li> 222 * <li>Data compression factor: 1.0 - 4</li> 223 * </ul> 224 * For meaningful comparison, MB/s and compression factors are always given 225 * normalized in relation to the original standard textual XML file size. 226 * <ul> 227 * <li>Benchmark test data: A wide variety of small to medium large XML 228 * documents is used, including SOAP documents heavily using namespaces ( <a 229 * target="_blank" href="http://xbis.sourceforge.net/">XBIS </a>), simple XML 230 * formatted web server logs using no namespaces, RDF documents with lots of 231 * attributes and namespaces, the periodic table, documents consisting of large 232 * narrative text ( <a target="_blank" 233 * href="http://www.oasis-open.org/cover/bosakShakespeare200.html">Shakespeare 234 * </a>), publication citations ( <a target="_blank" 235 * href="http://dblp.uni-trier.de/xml/">DBLP </a>), music title databases ( <a 236 * target="_blank" href="http://www.freedb.org/">FreeDB </a>), Japanese 237 * documents (XML conformance test suite), SVG image files, etc.</li> 238 * 239 * <li>Benchmark configuration: no ZLIB compression, xom-1.2, non-validating 240 * XOM Builder using xerces-2.8.0, no DTD or schema, Sun JDK 1.5.0 server VM, 241 * commodity PC 2004, Dual Pentium 4, 3.4 GHz, Redhat 9</li> 242 * </ul> 243 * 244 * Example Interpretation: 245 * <ul> 246 * <li>Small documents: Results translate, for example, to ping-pong 247 * round-tripping of typical 500 byte SOAP request/response message 
documents at
 * a rate of 25000 msg/s, compared to 2500 msg/s with XOM (excluding network
 * latency). More pronounced, 500 (150) byte documents with few namespaces
 * translate to 35000 (120000) msg/s, compared to 3500 (5000) msg/s with XOM.
 * Consequently, XML serialization and deserialization are probably no more your
 * application's bottleneck, leaving, say, 95% CPU headroom free for other
 * application modules.</li>
 *
 * <li>Medium documents: When having a main-memory cache of several thousand 1
 * MB documents, each containing highly structured complex data, one can
 * deserialize, XQuery and serve from the cache at a rate of 50 documents/s,
 * compared to 5 documents/s with XOM.</li>
 *
 * </ul>
 * Note that in contrast to other algorithms, these measurements include XOM
 * tree building and walking, hence measures delivering data to and from actual
 * XML applications, rather than merely to and from a low-level SAX event stream
 * (which is considerably cheaper and deemed less useful).
 * <p>
 * The deserialization speedup is further multiplied when DTDs or schema
 * validation is used while parsing standard textual XML.
 * <p>
 * This class relies on advanced Java compiler optimizations, which take
 * considerable time to warm up. Hence, for comparative benchmarks, use a
 * server-class VM and make sure to repeat runs for at least 30 seconds.
 * <p>
 * Further, you will probably want to eliminate drastic XOM hotspots by
 * compiling XOM with "ant -Dfat=true jar" to maintain an internal String
 * instead of a UTF-8 encoded byte array in {@link nu.xom.Text}, which
 * eliminates the expensive character conversions implied for each access to a
 * Text object. This increases performance at the expense of memory footprint.
 * The measurements above report numbers using these patches, both for xom and
 * bnux.
If you're curious about the whereabouts of bottlenecks, run java with 280 * the non-perturbing '-server -agentlib:hprof=cpu=samples,depth=10' flags, then 281 * study the trace log and correlate its hotspot trailer with its call stack 282 * headers (see <a target="_blank" 283 * href="http://java.sun.com/developer/technicalArticles/Programming/HPROF.html"> 284 * hprof tracing </a>). 285 * <p> 286 * Use class {@link nux.xom.sandbox.BinaryXMLTest} to reproduce results, verify 287 * correctness or to evaluate performance for your own datasets. 288 * 289 * @author whoschek.AT.lbl.DOT.gov 290 * @author $Author: hoschek $ 291 * @version $Revision: 1.179 $, $Date: 2006/06/18 21:25:02 $ 292 */ 293 public class BinaryXMLCodec { 294 295 /* 296 * TODO: add a StAX interface on top of bnux? 297 * e.g. createXMLStreamReader(byte[]), similar for XMLStreamWriter 298 * TODO: add coalescing of adjacent Text nodes on deserialization? 299 */ 300 301 /* 302 * TODO: My impression is that there is remaining potential for some speedup, 303 * both for serialization and deserialization performance. Ideas towards this 304 * end include: 305 * 306 * - Add option to always use UTF16 (level=-1) 307 * - Micro caches may become obsolete once XOM has its own internal QName LRU cache 308 * - Use a better low level namespace iteration 309 * - Split unified symbolTable into several smaller symbolTables for Text, Attributes, and other. 310 * - other? 311 */ 312 313 // for deserialization: factory to stream (push) into 314 private NodeFactory factory; 315 316 // for deserialization: unique symbols from deserialized symbolTable 317 private String[] symbols; 318 319 // for deserialization: are pages ZLIB compressed? 320 private boolean isCompressed; 321 322 // for deserialization and serialization: page buffer 323 private ArrayByteList page; // multi-byte integers are ALWAYS in big-endian 324 325 // for serialization: 326 // holds unique strings found in document (qnames, texts, uris, etc.) 
327 private SymbolTable symbolTable; 328 329 // for serialization: 330 // byte-level node type tokens in XML document order; also length of next index(es) 331 private ArrayByteList nodeTokens; 332 333 // for serialization: indexes into symbolTable/entries 334 private ArrayIntList indexData; 335 336 // for serialization: has first page of current document already been written? 337 private boolean isFirstPage = true; 338 339 // ZLIB 340 private Inflater decompressor; // ZLIB 341 private Deflater compressor; // ZLIB 342 private int compressionLevel = -1; // initialize to "undefined" 343 344 // for deserialization: avoids reverification of PCDATA 345 private Text[] textCache; 346 347 // for deserialization: avoids reverification of namespace URIs 348 private String[] nameCache; 349 private LRUHashMap1 internedNames; 350 351 // for deserialization: 352 // avoids reverification of qname and URI, as well as indexOf() calls, and saves string memory 353 private NodeBuilder nodeBuilder; 354 355 // for streaming serialization 356 private OutputStream out; 357 358 /** 359 * For serialization: (approximate) maximum number of bytes per page. 360 * <p> 361 * To enable true streaming, a serialized document consists of one or more 362 * independent consecutive pages. Each page contains a portion of the XML 363 * document, in document order. More specifically, each page consists of a 364 * tokenized byte array and corresponding symbols. Once a page has been 365 * read/written related (heavy) state can be discarded, freeing memory. No 366 * more than one page needs to be held in memory at any given time. For very 367 * large documents this page design reduces memory consumption, increases 368 * throughput and reduces latency. For small to medium sized documents it 369 * makes next to no difference. 370 * <p> 371 * A small page capacity (e.g. 128 bytes) leads to lower latency per page 372 * but also lower throughput and lower compression overall. 
Conversely, a 373 * large page capacity (e.g. 1 MB) leads to higher throughput and higher 374 * compression, at the expense of higher latency. However, a very large page 375 * capacity (e.g. 10 MB) leads to memory subsystem pressure on streaming. 376 * Thus, here we use a happy medium, small enough to generate little memory 377 * subsystem pressure, and large enough to gain high throughput, retain 378 * almost all compression stemming from redundancy eliminating tokenization, 379 * with near zero overhead for small to medium sized documents, and 380 * outstanding performance for very large documents. 381 */ 382 private static final int MAX_PAGE_CAPACITY = 64 * 1024; 383 // private static final int MAX_PAGE_CAPACITY = 1; // DEBUG only 384 385 // low 3 bits for node token types 386 // high 4 bits typically hold number of bytes of the next two indexes 387 private static final int TEXT = 0; 388 private static final int ATTRIBUTE = 1; 389 private static final int BEGIN_ELEMENT = 2; 390 private static final int END_ELEMENT = 3; 391 private static final int COMMENT = 4; 392 private static final int NAMESPACE_DECLARATION = 5; 393 private static final int PROCESSING_INSTRUCTION = 6; 394 private static final int DOC_TYPE = 7; 395 396 private static final int BNUX_MAGIC = createMagicNumber(); // for sanity checks 397 private static final byte VERSION = 7; // version of bnux data format 398 private static final int DOCUMENT_HEADER_SIZE = 4 + 1; // in bytes 399 private static final int PAGE_HEADER_SIZE = 4 + 4 + 4 + 4 + 4; // in bytes 400 401 private static final boolean IS_EXTENDED_XOM = hasXOMExtensions(); 402 403 /** 404 * Marker for non-existant systemID or publicID in DocType. XML and the 405 * nu.xom.Verifier regard " " as an illegal ID, so it can never occur in 406 * practice. Thus we can use it as an unambigous marker identifying a 407 * serialized null value. 
408 */ 409 private static final String DOCTYPE_NULL_ID = " "; 410 411 private static final boolean DEBUG = false; // VM does dead code elimination 412 413 /** Reinitializes instance variables to virgin state. */ reset()414 private void reset() { 415 // deserialization state: 416 internedNames = null; 417 nodeBuilder = null; 418 factory = null; 419 420 // serialization state: 421 symbolTable = null; 422 page = null; 423 nodeTokens = null; 424 indexData = null; 425 isFirstPage = true; 426 out = null; 427 428 // better safe than sorry: 429 try { 430 if (decompressor != null) decompressor.end(); // free resources 431 } finally { 432 decompressor = null; 433 try { 434 if (compressor != null) compressor.end(); // free resources 435 } finally { 436 compressor = null; 437 } 438 } 439 } 440 441 /** 442 * Constructs an instance; An instance can be reused serially, but is not 443 * thread-safe, just like a {@link nu.xom.Builder}. 444 */ BinaryXMLCodec()445 public BinaryXMLCodec() { 446 } 447 448 /** 449 * Constructs a new streaming serializer that serializes bnux binary XML to 450 * the given underlying output stream, using the given ZLIB compression 451 * level. 452 * <p> 453 * An optional zlib compression level ranging from 0 (no ZLIB compression; 454 * best performance) to 1 (little ZLIB compression; reduced performance) to 455 * 9 (strongest ZLIB compression; worst performance) allows one to configure 456 * the CPU/memory consumption trade-off. 457 * <p> 458 * Unless there is a good reason to the contrary, you should always use 459 * level 0: the bnux algorithm typically already precompresses considerably. 
460 * 461 * @param out 462 * the underlying output stream to write to 463 * @param zlibCompressionLevel 464 * a number in the range 0..9 465 * @return a streaming serializer 466 */ createStreamingSerializer(OutputStream out, int zlibCompressionLevel)467 public StreamingSerializer createStreamingSerializer(OutputStream out, int zlibCompressionLevel) { 468 return new StreamingBinaryXMLSerializer(this, out, zlibCompressionLevel); 469 } 470 471 /** 472 * Returns whether or not the given input stream contains a bnux document. 473 * <p> 474 * A peek into the first 4 bytes is sufficient for unambigous detection, as 475 * standard textual XML cannot start with any arbitrary four byte 476 * combination. 477 * <p> 478 * Finally, the read bytes are put back onto the stream, so they can be 479 * reread as part of subsequent parsing attempts. Therefore, the input 480 * stream must support <code>input.mark()</code> and 481 * <code>input.reset()</code>. For example, a 482 * {@link java.io.BufferedInputStream} is a good choice. 483 * 484 * @param input 485 * the stream to read from 486 * @return true if the stream contains a bnux document 487 * @throws IllegalArgumentException 488 * if the underlying stream does not support 489 * <code>input.mark()</code> and <code>input.reset()</code>. 
490 * @throws IOException 491 * if the underlying input stream encounters an I/O error 492 * @see InputStream#mark(int) 493 */ isBnuxDocument(InputStream input)494 public boolean isBnuxDocument(InputStream input) throws IOException { 495 if (input == null) 496 throw new IllegalArgumentException("input stream must not be null"); 497 if (!input.markSupported()) 498 throw new IllegalArgumentException("markSupported() must be true"); 499 500 int magicBytes = 4; 501 input.mark(magicBytes); 502 try { 503 ArrayByteList list = new ArrayByteList(magicBytes); 504 if (!list.ensureRemaining(input, magicBytes)) { 505 return false; // stream contains less than 4 bytes 506 } 507 return list.getInt() == BNUX_MAGIC; 508 } finally { 509 input.reset(); // unread the header 510 } 511 } 512 513 /** 514 * Equivalent to 515 * <code>deserialize(new ByteArrayInputStream(input), new NodeFactory())</code>. 516 * 517 * @param bnuxDocument 518 * the bnux document to deserialize. 519 * @return the new XOM document obtained from deserialization. 520 * @throws BinaryParsingException 521 * if the bnux document is unreadable or corrupt for some reason 522 */ deserialize(byte[] bnuxDocument)523 public Document deserialize(byte[] bnuxDocument) throws BinaryParsingException { 524 if (bnuxDocument == null) 525 throw new IllegalArgumentException("bnuxDocument must not be null"); 526 527 try { 528 return deserialize(new ByteArrayInputStream(bnuxDocument), null); 529 } catch (IOException e) { 530 throw new BinaryParsingException(e); // can never happen 531 } 532 } 533 534 /** 535 * Returns the XOM document obtained by deserializing the next binary XML 536 * document from the given input stream. 537 * <p> 538 * If the document is in ZLIB compressed bnux format, it will be 539 * auto-detected and auto-decompressed as part of deserialization. 540 * <p> 541 * This method exactly mimics the NodeFactory based behaviour of the XOM 542 * {@link nu.xom.Builder}. 
A NodeFactory enables efficient filtering and 543 * can avoid the need to build a main memory tree, which is particularly 544 * useful for large documents. For example, streaming XQueries over binary 545 * XML can be expressed via the NodeFactory generated by a 546 * {@link nux.xom.xquery.StreamingPathFilter}. Binary XML files can be 547 * converted to and from standard textual XML files via a 548 * {@link nux.xom.pool.XOMUtil#getRedirectingNodeFactory(StreamingSerializer)}. For 549 * other example factories, see {@link nux.xom.pool.XOMUtil}. 550 * <p> 551 * Bnux is a self-framing data format: It knows where the end of a document 552 * occurs. An input stream can contain any number of independent documents, 553 * one after another. Thus, this method reads from the stream as many bytes 554 * as required for the current document, but no more than that. Unlike SAX 555 * XML parsers and unlike a {@link nu.xom.Builder}, it does not read until 556 * end-of-stream (EOS), and it does not auto-close the input stream. If this 557 * method returns successfully, the input stream has been positioned one 558 * byte past the current bnux document, ready to deserialize the following 559 * document, if any. It is the responsibility of the caller to ensure the 560 * input stream gets properly closed when deemed appropriate. 561 * 562 * @param input 563 * the stream to read and deserialize from 564 * @param factory 565 * the node factory to stream into. May be <code>null</code> in 566 * which case the default XOM NodeFactory is used, building the 567 * complete XML document tree. 568 * @return the new XOM document obtained from deserialization. 
569 * @throws BinaryParsingException 570 * if the bnux document is unreadable or corrupt for some reason 571 * @throws IOException 572 * if the underlying input stream encounters an I/O error 573 */ deserialize(InputStream input, NodeFactory factory)574 public Document deserialize(InputStream input, NodeFactory factory) 575 throws BinaryParsingException, IOException { 576 577 if (input == null) 578 throw new IllegalArgumentException("input stream must not be null"); 579 if (factory == null) factory = new NodeFactory(); 580 581 // read document header 582 if (page == null) page = new ArrayByteList(256); 583 page.clear(); 584 if (!page.ensureRemaining(input, 4 + 1 + 1 + 4)) 585 throw new BinaryParsingException("Missing bnux document header"); 586 587 int magic = page.getInt(); 588 if (magic != BNUX_MAGIC) throw new BinaryParsingException( 589 "Bnux magic number mismatch: " + magic + ", must be: " + BNUX_MAGIC); 590 591 int version = page.get(); 592 isCompressed = version < 0; 593 if (isCompressed) version = -version; 594 if (version != VERSION) throw new BinaryParsingException( 595 "Bnux data format version mismatch: " + version + ", must be: " + VERSION); 596 if (isCompressed) { 597 if (decompressor == null) decompressor = new Inflater(); 598 } 599 600 if (page.get() != DOC_TYPE) // surrogate hack to identify BEGIN_PAGE 601 throw new BinaryParsingException("Illegal bnux page header marker"); 602 603 // prepare 604 if (internedNames == null) internedNames = new LRUHashMap1(128); 605 if (nodeBuilder == null) nodeBuilder = new NodeBuilder(); 606 607 // parse node token data and packed indexes, building the XOM tree 608 this.factory = factory; 609 try { 610 return readDocument(page, input); 611 } catch (Throwable t) { 612 reset(); // better safe than sorry 613 if (t instanceof Error) { 614 throw (Error) t; 615 } else if (t instanceof BinaryParsingException) { 616 throw (BinaryParsingException) t; 617 } else if (t instanceof IOException) { 618 throw (IOException) t; 619 
} else { 620 throw new BinaryParsingException(t); 621 } 622 } finally { 623 this.symbols = null; // help gc 624 this.textCache = null; // help gc 625 this.nameCache = null; // help gc 626 this.factory = null; // help gc 627 // if (decompressor != null) decompressor.end(); 628 // decompressor = null; 629 } 630 } 631 632 /** 633 * Returns the bnux binary XML document obtained by serializing the given 634 * XOM document. 635 * <p> 636 * An optional zlib compression level ranging from 0 (no ZLIB compression; 637 * best performance) to 1 (little ZLIB compression; reduced performance) to 638 * 9 (strongest ZLIB compression; worst performance) allows one to configure 639 * the CPU/memory consumption trade-off. 640 * <p> 641 * Unless there is a good reason to the contrary, you should always use 642 * level 0: the bnux algorithm typically already precompresses considerably. 643 * 644 * @param document 645 * the XOM document to serialize 646 * @param zlibCompressionLevel 647 * a number in the range 0..9 648 * @return the bnux document obtained from serialization. 649 * @throws IllegalArgumentException 650 * if the compression level is out of range. 651 */ 652 public byte[] serialize(Document document, int zlibCompressionLevel) 653 throws IllegalArgumentException { 654 655 ByteArrayOutputStream result = new ByteArrayOutputStream(256); 656 try { 657 serialize(document, zlibCompressionLevel, result); 658 } catch (IOException e) { 659 throw new RuntimeException(e); // can never happen 660 } 661 return result.toByteArray(); 662 } 663 664 /** 665 * Serializes the given XOM document as a bnux binary XML document onto 666 * the given output stream. 667 * <p> 668 * An optional zlib compression level ranging from 0 (no ZLIB compression; 669 * best performance) to 1 (little ZLIB compression; reduced performance) to 670 * 9 (strongest ZLIB compression; worst performance) allows one to configure 671 * the CPU/memory consumption trade-off. 
672 * <p> 673 * Unless there is a good reason to the contrary, you should always use 674 * level 0: the bnux algorithm typically already precompresses considerably. 675 * 676 * @param document 677 * the XOM document to serialize 678 * @param zlibCompressionLevel 679 * a number in the range 0..9 680 * @param out 681 * the output stream to write to 682 * @throws IllegalArgumentException 683 * if the compression level is out of range. 684 * @throws IOException 685 * if the underlying output stream encounters an I/O error 686 */ 687 public void serialize(Document document, int zlibCompressionLevel, 688 OutputStream out) throws IllegalArgumentException, IOException { 689 690 if (document == null) 691 throw new IllegalArgumentException("XOM document must not be null"); 692 if (zlibCompressionLevel < 0 || zlibCompressionLevel > 9) 693 throw new IllegalArgumentException("Compression level must be 0..9"); 694 if (out == null) 695 throw new IllegalArgumentException("Output stream must not be null"); 696 697 try { 698 setOutputStream(zlibCompressionLevel, out); 699 writeDocument(document); // generate output 700 } catch (Throwable t) { 701 reset(); // better safe than sorry 702 if (t instanceof Error) { 703 throw (Error) t; 704 } else if (t instanceof RuntimeException) { 705 throw (RuntimeException) t; 706 } else if (t instanceof IOException) { 707 throw (IOException) t; 708 } else { 709 throw new RuntimeException(t); 710 } 711 } finally { 712 this.symbolTable = null; // help gc 713 this.out = null; 714 } 715 } 716 717 final void setOutputStream(int zlibCompressionLevel, OutputStream out) { 718 if (zlibCompressionLevel > 0) { 719 if (compressor == null || zlibCompressionLevel != compressionLevel) { 720 if (compressor != null) compressor.end(); // free resources 721 compressor = new Deflater(zlibCompressionLevel); 722 } 723 } 724 compressionLevel = zlibCompressionLevel; 725 this.out = out; 726 } 727 728 /** Prepares reading from the next page. 
     */
    private void readPage(ArrayByteList src, InputStream input)
            throws BinaryParsingException, IOException {

        if (DEBUG) System.err.println("reading page");
        // next 4 bytes hold the (signed) page body size
        if (!src.ensureRemaining(input, 4))
            throw new BinaryParsingException("Missing remaining bnux page size");
        int pageSize = src.getInt();
        if (src.remaining() != 0)
            throw new IllegalStateException("Internal codec bug");

        // a negative size marks the last page of the document
        boolean isLastPage = pageSize < 0;
        if (isLastPage) pageSize = -pageSize;
        // if (DEBUG) System.err.println("pageSize = " + pageSize);
        if (!isLastPage) {
            pageSize++; // read one byte past page, fetching PAGE_BEGIN marker
        }
        if (!src.ensureRemaining(input, pageSize))
            throw new BinaryParsingException("Missing remaining bnux page body");

        if (isCompressed) decompress(src);

        // page header: symbol table size, decoded and encoded byte counts
        int symbolTableSize = src.getInt();
        if (symbolTableSize < 0)
            throw new BinaryParsingException("Negative symbol table size");
        int decodedSize = src.getInt();
        if (decodedSize < 0)
            throw new BinaryParsingException("Negative decodedSize");
        int encodedSize = src.getInt();
        if (encodedSize < 0)
            throw new BinaryParsingException("Negative encodedSize");

        // read symbolTable
        this.symbols = null; // help gc
        if (decodedSize == encodedSize) { // safe trick, faster
            // Note that 7 bit ASCII is a proper subset of UTF-8
            this.symbols = src.getASCIIStrings(symbolTableSize);
        } else {
            this.symbols = src.getUTF8Strings(symbolTableSize);
        }
        // this.symbols = src.getUTF16Strings(symbolTableSize);
        if (DEBUG) System.err.println("read symbols = " + Arrays.asList(symbols));

        int magic = src.getInt();
        if (magic != BNUX_MAGIC) throw new BinaryParsingException(
            "Bnux magic number mismatch: " + magic + ", must be: " + BNUX_MAGIC);

        // reset caches in preparation for XML token decoding
        if (this.nameCache == null) {
            nameCache = new String[Math.min(64, symbolTableSize)];
        } else {
            for (int i=nameCache.length; --i >= 0; ) nameCache[i] = null;
        }

        if (factory.getClass() == NodeFactory.class) { // fast path
            if (this.textCache == null) {
                textCache = new Text[Math.min(256, symbolTableSize)];
            } else {
                for (int i=textCache.length; --i >= 0; ) textCache[i] = null;
            }
        }
        // this.nameCache = null; // help gc
        // this.nameCache = new String[Math.min(64, symbolTableSize)];
        // this.textCache = null; // help gc
        // if (factory.getClass() == NodeFactory.class) { // fast path
        //     this.textCache = new Text[Math.min(128, symbolTableSize)];
        // }
    }

    /** Inflates the ZLIB compressed page body held in src, replacing it in place. */
    private void decompress(ArrayByteList src) throws BinaryParsingException {
        if (nodeTokens == null) nodeTokens = new ArrayByteList();
        nodeTokens.clear();

        try {
            nodeTokens.add(decompressor, src);
        } catch (DataFormatException e) {
            // preserve the root cause for diagnosis
            String s = e.getMessage();
            throw new BinaryParsingException(
                s != null ? s : "Invalid ZLIB data format", e);
        }

        src.swap(nodeTokens); // replace src with nodeTokens
        nodeTokens.clear();
    }

    /** Parses document from encoded src buffer; tokens appear in document order.
     */
    private Document readDocument(ArrayByteList src, InputStream input)
            throws BinaryParsingException, IOException {

        if (DEBUG) System.err.println("reading document");
        readPage(src, input);
        Document doc = factory.startMakingDocument();
        // doc.setBaseURI(symbols[src.getInt()]);
        doc.setBaseURI(getInternedName(src.getInt()));
        boolean hasRootElement = false;
        int i = 0; // next insertion position for non-element children

        // add children of document, retaining the exact same order found in input
        while (src.remaining() > 0) {
            Nodes nodes;
            int type = src.get(); // look ahead
            if (DEBUG) System.err.println("reading type = " + toString(type));
            switch (type & 0x07) { // three low bits indicate node type
                case TEXT: {
                    throw new BinaryParsingException("Unreachable text");
                }
                case ATTRIBUTE: {
                    throw new BinaryParsingException("Unreachable attribute");
                }
                case BEGIN_ELEMENT: {
                    if (factory.getClass() == NodeFactory.class) { // fast path
                        Element root = readStartTag(src, type);
                        readElement(src, root, input); // reads entire subtree
                        nodes = new Nodes(root);
                    } else { // slow path
                        Element root = readStartTagF(src, type, true);
                        if (root == null) {
                            throw new NullPointerException("Factory failed to create root element.");
                        }
                        doc.setRootElement(root);
                        readElementF(src, root, input);
                        nodes = factory.finishMakingElement(root);
                    }
                    break;
                }
                case END_ELEMENT: {
                    throw new BinaryParsingException("Unreachable end of element");
                }
                case COMMENT: {
                    nodes = readCommentF(src, type);
                    break;
                }
                case NAMESPACE_DECLARATION: {
                    throw new BinaryParsingException("Unreachable namespace declaration");
                }
                case PROCESSING_INSTRUCTION: {
                    nodes = readProcessingInstructionF(src);
                    break;
                }
                case DOC_TYPE: {
                    nodes = readDocTypeF(src);
                    break;
                }
                default: {
                    throw new BinaryParsingException("Illegal node type code=" + type);
                }
            }

            // append nodes:
            for (int j=0; j < nodes.size(); j++) {
                Node node = nodes.get(j);
                if (node instanceof Element) { // replace fake root with real root
                    if (hasRootElement) {
                        throw new IllegalAddException(
                            "Factory returned multiple root elements");
                    }
                    doc.setRootElement((Element) node);
                    hasRootElement = true;
                } else {
                    doc.insertChild(node, i);
                }
                i++;
            }
        }

        if (!hasRootElement) throw new WellformednessException(
            "Factory attempted to remove the root element");
        factory.finishMakingDocument(doc);
        if (DEBUG) System.err.println("finished reading document");
        return doc;
    }

    /** Reads start tag and returns a corresponding empty element */
    private Element readStartTag(ArrayByteList src, int type) {
        String qname = readString(src, 4, type);
        String namespaceURI = readName(src, 6, type);
        return this.nodeBuilder.createElement(qname, namespaceURI);
        // return new Element(qname, namespaceURI);
    }

    /** Reads start tag via the user-supplied NodeFactory (slow path). */
    private Element readStartTagF(ArrayByteList src, int type, boolean isRoot) {
        String qname = readString(src, 4, type);
        String namespaceURI = readName(src, 6, type);
        return isRoot ?
            factory.makeRootElement(qname, namespaceURI) :
            factory.startMakingElement(qname, namespaceURI);
    }

    /** Iterative pull parser reading an entire element subtree.
*/ 917 private void readElement(ArrayByteList src, Element current, InputStream input) 918 throws BinaryParsingException, IOException { 919 920 while (true) { 921 Node node = null; 922 Element down = null; 923 int type = src.get(); // look ahead 924 // if (DEBUG) System.err.println("reading type = " + toString(type)); 925 switch (type & 0x07) { // three low bits indicate node type 926 case TEXT: { 927 node = readText(src, type); 928 break; 929 } 930 case ATTRIBUTE: { 931 readAttribute(src, current, type); 932 continue; 933 } 934 case BEGIN_ELEMENT: { 935 down = readStartTag(src, type); 936 node = down; 937 break; 938 } 939 case END_ELEMENT: { 940 current = (Element) current.getParent(); 941 if (current == null) return; // we're done with the root element 942 continue; 943 } 944 case COMMENT: { 945 node = readComment(src, type); 946 break; 947 } 948 case NAMESPACE_DECLARATION: { 949 readNamespaceDeclaration(src, current, type); 950 continue; 951 } 952 case PROCESSING_INSTRUCTION: { 953 node = readProcessingInstruction(src); 954 break; 955 } 956 case DOC_TYPE: { // surrogate hack to identify BEGIN_PAGE 957 readPage(src, input); 958 continue; 959 } 960 } 961 962 // if (DEBUG) System.err.println("read node=" + node.toXML()); 963 964 // assert node != null 965 if (IS_EXTENDED_XOM) { // xom-1.1 + patch 966 current.fastInsertChild(node, current.getChildCount()); 967 } else { 968 current.insertChild(node, current.getChildCount()); 969 } 970 971 if (down != null) current = down; // recurse down 972 } 973 } 974 975 /** Iterative pull parser reading an entire element subtree (using NodeFactory). 
     */
    private void readElementF(ArrayByteList src, Element current, InputStream input)
            throws BinaryParsingException, IOException {

        // final ArrayList stack = new ArrayList();
        final FastStack stack = new FastStack();
        stack.push(current);
        // false while inside an element the factory chose to skip (returned null)
        boolean addAttributesAndNamespaces = true;

        while (true) {
            Nodes nodes = null;
            int type = src.get(); // look ahead
            // if (DEBUG) System.err.println("reading type = " + toString(type));

            switch (type & 0x07) { // three low bits indicate node type
                case TEXT: {
                    nodes = readTextF(src, type);
                    break;
                }
                case ATTRIBUTE: {
                    Element elem = addAttributesAndNamespaces ? current : null;
                    nodes = readAttributeF(src, elem, type);
                    break;
                }
                case BEGIN_ELEMENT: {
                    Element elem = readStartTagF(src, type, false);
                    stack.push(elem); // even if it's null
                    if (elem != null) {
                        current.insertChild(elem, current.getChildCount());
                        current = elem; // recurse down
                    }
                    addAttributesAndNamespaces = elem != null;
                    continue;
                }
                case END_ELEMENT: {
                    Element elem = stack.pop();
                    if (elem == null) {
                        continue; // skip element
                    }
                    ParentNode parent = elem.getParent();
                    if (parent == null) throwTamperedWithParent();
                    if (parent instanceof Document) {
                        return; // we're done with the root element
                    }

                    current = (Element) parent; // recurse up
                    nodes = factory.finishMakingElement(elem);

                    if (nodes.size()==1 && nodes.get(0)==elem) { // same node? (common case)
                        continue; // optimization: no need to remove and then readd same element
                    }

                    if (current.getChildCount()-1 < 0) throwTamperedWithParent();
                    current.removeChild(current.getChildCount()-1);
                    break;
                }
                case COMMENT: {
                    nodes = readCommentF(src, type);
                    break;
                }
                case NAMESPACE_DECLARATION: {
                    Element elem = addAttributesAndNamespaces ? current : null;
                    readNamespaceDeclaration(src, elem, type);
                    continue;
                }
                case PROCESSING_INSTRUCTION: {
                    nodes = readProcessingInstructionF(src);
                    break;
                }
                case DOC_TYPE: { // surrogate hack for BEGIN_PAGE
                    readPage(src, input);
                    continue;
                }
            }

            appendNodes(current, nodes);
        }
    }

    /** Attaches factory-produced nodes to elem (attributes vs. children). */
    private static void appendNodes(Element elem, Nodes nodes) {
        if (nodes != null) {
            int size = nodes.size();
            for (int i=0; i < size; i++) {
                Node node = nodes.get(i);
                if (node instanceof Attribute) {
                    elem.addAttribute((Attribute) node);
                } else {
                    elem.insertChild(node, elem.getChildCount());
                }
            }
        }
    }

    private static void throwTamperedWithParent() {
        throw new XMLException("Factory has tampered with a parent pointer " +
            "of ancestor-or-self in finishMakingElement()");
    }

    private void readAttribute(ArrayByteList src, Element dst, int type) throws BinaryParsingException {
        String qname = readString(src, 4, type);
        String namespaceURI = readName(src, 6, type);
        String value = readString(src, 4, src.get());
        Attribute.Type attrType = Util.getAttributeType(src.get());
        Attribute attr = this.nodeBuilder.createAttribute(qname, namespaceURI, value, attrType);
        // Attribute attr = new Attribute(qname, namespaceURI, value, attrType);
        dst.addAttribute(attr);
    }

    private Nodes readAttributeF(ArrayByteList src, Element dst, int type) throws BinaryParsingException {
        // always consume the attribute tokens, even when the element is skipped
        String qname = readString(src, 4, type);
        String namespaceURI = readName(src, 6, type);
        String value = readString(src, 4, src.get());
        Attribute.Type attrType = Util.getAttributeType(src.get());
        if (dst == null) return null; // NONE;
        return factory.makeAttribute(qname, namespaceURI, value, attrType);
    }

    private Comment readComment(ArrayByteList src, int type) {
        return new Comment(readString(src, 4, type));
    }

    private Nodes readCommentF(ArrayByteList src, int type) {
        return factory.makeComment(readString(src, 4, type));
    }

    private void readNamespaceDeclaration(ArrayByteList src, Element dst, int type) {
        String prefix = readString(src, 4, type);
        String uri = readName(src, 6, type);
        if (dst != null) dst.addNamespaceDeclaration(prefix, uri);
    }

    private ProcessingInstruction readProcessingInstruction(ArrayByteList src) {
        int type = src.get(src.position() - 1); // reread the look-ahead type byte
        String target = readString(src, 4, type);
        String value = readString(src, 6, type);
        return new ProcessingInstruction(target, value);
    }

    private Nodes readProcessingInstructionF(ArrayByteList src) {
        int type = src.get(src.position() - 1); // reread the look-ahead type byte
        String target = readString(src, 4, type);
        String value = readString(src, 6, type);
        return factory.makeProcessingInstruction(target, value);
    }

    /** Does not pack indexes of doctype (infrequent anyway) */
    private Nodes readDocTypeF(ArrayByteList src) {
        String rootElementName = symbols[src.getInt()];
        String publicID = symbols[src.getInt()];
        if (DOCTYPE_NULL_ID.equals(publicID)) publicID = null;
        String systemID = symbols[src.getInt()];
        if (DOCTYPE_NULL_ID.equals(systemID)) systemID = null;
        String internalDTDSubset = symbols[src.getInt()];
        if (internalDTDSubset.length() == 0) internalDTDSubset = null;

        Nodes nodes = factory.makeDocType(rootElementName, publicID, systemID);
        for (int i=0; i < nodes.size(); i++) {
            if (nodes.get(i) instanceof DocType) {
                DocType docType = (DocType) nodes.get(i);
                if (docType.getInternalDTDSubset().length() == 0) {
                    try {
                        docType.setInternalDTDSubset(internalDTDSubset);
                    } catch (IllegalAccessError e) {
                        ; // ignore; setInternalDTDSubset() is private in xom < 1.1
                    }
                }
            }
        }
        return nodes;
    }

    // try to avoid XML reverification by caching repetitive Texts
    private Text readText(ArrayByteList src, int type) {
        int i = readSymbol(src, 4, type);
        Text text;
        if (i < textCache.length) {
            text = textCache[i];
            if (text != null) return new Text(text); // cache hit: copy, skip reverification
        }
        text = new Text(symbols[i]);
        if (i < textCache.length) textCache[i] = text;
        return text;
    }

    private Nodes readTextF(ArrayByteList src, int type) {
        return factory.makeText(readString(src, 4, type));
    }

    /** Reads string via packed index from symbolTable; "" if no index is present. */
    private String readString(ArrayByteList src, int shift, int type) {
        int i = readSymbol(src, shift, type);
        if (i < 0) return "";
        return symbols[i];
    }

    /** Reads and unpacks a symbol table index; -1 if no index is present. */
    private static int readSymbol(ArrayByteList src, int shift, int type) {
        // assert shift == 4 || shift == 6
        if (Util.isInlinedIndex(type)) {
            if (shift == 6) return -1;
            return Util.getInlinedIndex(type);
        }

        switch ((type >>> shift) & 0x03) { // look at two bits indicating index length
            case 0 : return Util.getUnsignedByte(src.get());
            case 1 : return Util.getUnsignedShort(src.getShort());
            case 2 : return -1; // no index stored
            default: return src.getInt();
        }
    }

    /** Like readString, but additionally memoizes interned names in nameCache. */
    private String readName(ArrayByteList src, int shift, int type) {
        int i = readSymbol(src, shift, type);
        if (i < 0) return "";
        if (i < nameCache.length) {
            String name = nameCache[i];
            if (name == null) { // cache miss
                name = getInternedName(i);
                nameCache[i] = name;
            }
            return name;
        }
        return symbols[i];
    }

    /** Returns a canonical representation of symbols[i] via the internedNames map. */
    private String getInternedName(int i) {
        String name = symbols[i];
        if (name.length() == 0) {
            name = ""; // canonical empty string
        } else {
            name = (String) internedNames.get(name);
            if (name == null) { // first occurrence; register as canonical
                name = symbols[i];
                internedNames.put(name, name);
            }
        }
        return name;
    }

    /** Writes nodeTokens, indexData, symbolTable to output stream */
    private void writePage(boolean isLastPage) throws IOException {
        if (DEBUG) System.err.println("writing page");
        Entry[] entries = symbolTable.getEntries();
        int numChars = symbolTable.numCharacters();

        // reorder entries and update indexData accordingly.
        packSort(entries, indexData);
        // if (DEBUG) printStatistics(entries);

        // add header to page
        page.ensureCapacity(page.size() + 1 + 4 + 4 + 4 + 4 + 4 + numChars*4 + entries.length +
            nodeTokens.size() + indexData.size() + numChars/100); // an educated guess
        page.add((byte) DOC_TYPE); // BEGIN_PAGE marker surrogate
        page.addInt(0); // pageSize dummy placeholder
        int pageOffset = page.size();
        page.addInt(entries.length);
        page.addInt(numChars + entries.length); // decodedSize (+zero terminator)
        page.addInt(0); // encodedSize dummy placeholder
        int encodedOffset = page.size();

        // add symbolTable to page
        encodeSymbols(entries, page); // assert: no need to expand underlying array
        int encodedSize = page.size() - encodedOffset;
        page.setInt(encodedOffset-4, encodedSize); // replace dummy placeholder
        entries = null; // help gc

        // add node tokens and packed symbol indexes to page
        page.addInt(BNUX_MAGIC);
        encodeTokens(nodeTokens, indexData.asArray(), page);

        int pageSize;
        nodeTokens.clear();
        if (compressionLevel > 0) { // compress page body
            page.position(pageOffset);
            nodeTokens.add(compressor, page);
            page.remove(pageOffset, page.size());
            pageSize = nodeTokens.size();
        } else {
            pageSize = page.size() - pageOffset;
        }
        if (isLastPage) pageSize = -pageSize; // negative size marks the last page
        page.setInt(pageOffset-4, pageSize); // replace pageSize dummy placeholder

        // having filled the buffers, flush them onto underlying output stream
        page.write(out);
        nodeTokens.write(out);

        // reset
        if (!isLastPage) symbolTable.clear();
        page.clear();
        nodeTokens.clear();
        indexData.clear();
        if (DEBUG) System.err.println("finished writing page");
    }

    /** Serializes the document's children in order, then flushes the final page. */
    private void writeDocument(Document doc) throws IOException {
        if (DEBUG) System.err.println("writing document");
        writeXMLDeclaration(doc.getBaseURI());
        for (int i = 0; i < doc.getChildCount(); i++) {
            Node node = doc.getChild(i);
            if (node instanceof Element) {
                writeElement((Element) node);
            } else if (node instanceof Comment) {
                writeComment((Comment) node);
            } else if (node instanceof ProcessingInstruction) {
                writeProcessingInstruction((ProcessingInstruction) node);
            } else if (node instanceof DocType) {
                writeDocType((DocType) node);
            } else {
                throw new IllegalAddException("Cannot write node type: " + node);
            }
        }
        writeEndDocument();
        if (DEBUG) System.err.println("finished writing document");
    }

    /** Sets up buffers and writes the bnux document header. */
    final void writeXMLDeclaration(String baseURI) {
        if (baseURI == null) baseURI = "";

        // setup
        symbolTable = new SymbolTable();
        if (nodeTokens == null) nodeTokens = new ArrayByteList();
        nodeTokens.clear();
        if (indexData == null) indexData = new ArrayIntList();
        indexData.clear();

        // write bnux document header
        if (page == null) page = new ArrayByteList(256);
        page.clear();
        page.ensureCapacity(DOCUMENT_HEADER_SIZE + PAGE_HEADER_SIZE + 1);
        page.addInt(BNUX_MAGIC);
        int version = VERSION;
        if (compressionLevel > 0) version = -version; // negative version marks compression
        page.add((byte)version);

        isFirstPage = true;
        writeIndex(baseURI);
    }

    final void writeEndDocument() throws IOException {
        flush(true);
    }

    // assertion: must not be called with !isLastPage
    // if we're not at a safe point, i.e. at nesting depth == 0
    final void flush(boolean isLastPage) throws IOException {
        try {
            if (nodeTokens.size() > 0) { // anything remaining to be written?
                writePage(isLastPage);
            }
            out.flush();
        } finally {
            if (isLastPage) {
                this.symbolTable = null; // help gc
                this.out = null;
            }
            nodeTokens.clear();
        }
    }

    /** Encodes a node into the intermediate unpacked binary form */
    private void writeChild(Node node) throws IOException {
        if (node instanceof Element) {
            writeElement((Element) node);
        } else if (node instanceof Text) {
            writeText((Text) node);
        } else if (node instanceof Comment) {
            writeComment((Comment) node);
        } else if (node instanceof ProcessingInstruction) {
            writeProcessingInstruction((ProcessingInstruction) node);
        } else {
            throw new IllegalAddException("Cannot write node type: " + node);
        }
    }

    final void writeElement(Element elem) throws IOException {
        writeStartTag(elem);

        for (int i = 0; i < elem.getChildCount(); i++) {
            writeChild(elem.getChild(i));
        }

        writeEndTag();
    }

    final void writeStartTag(Element elem) {
        writeIndex(elem.getNamespacePrefix(), elem.getLocalName());

        int type = BEGIN_ELEMENT;
        if (elem.getNamespaceURI().length() == 0) {
            type = Util.noNamespace(type);
        } else {
            writeIndex(elem.getNamespaceURI());
        }
        nodeTokens.add((byte)type);

        for (int i = 0; i < elem.getAttributeCount(); i++) {
            writeAttribute(elem.getAttribute(i));
        }

        if (IS_EXTENDED_XOM) {
            writeNamespaceDeclarationsFast(elem);
        } else {
            writeNamespaceDeclarations(elem);
        }
    }

    final void writeEndTag() throws IOException {
        // element boundary is a safe point: flush a page once buffers are full
        if (nodeTokens.size() + indexData.size() + symbolTable.numCharacters()
                + symbolTable.size() >= MAX_PAGE_CAPACITY) {
            writePage(false); // write nodeTokens, indexData, symbolTable to output stream
        }

        nodeTokens.add((byte)END_ELEMENT);
    }

    private void writeAttribute(Attribute attr) {
        writeIndex(attr.getNamespacePrefix(), attr.getLocalName());

        int type = ATTRIBUTE;
        if (attr.getNamespaceURI().length() == 0) {
            type = Util.noNamespace(type);
        } else {
            writeIndex(attr.getNamespaceURI());
        }

        writeIndex(attr.getValue());
        nodeTokens.add((byte)type);
        nodeTokens.add(Util.getAttributeTypeCode(attr));
    }

    final void writeComment(Comment comment) {
        nodeTokens.add((byte)COMMENT);
        writeIndex(comment.getValue());
    }

    /** Does not pack indexes of doctype (infrequent anyway) */
    final void writeDocType(DocType docType) {
        nodeTokens.add((byte)DOC_TYPE);
        writeIndex(docType.getRootElementName());
        writeIndex(docType.getPublicID() == null ? DOCTYPE_NULL_ID : docType.getPublicID());
        writeIndex(docType.getSystemID() == null ? DOCTYPE_NULL_ID : docType.getSystemID());
        writeIndex(docType.getInternalDTDSubset() == null ?
            "" : docType.getInternalDTDSubset());
    }

    // requires xom-1.1 + patches
    private void writeNamespaceDeclarationsFast(Element elem) {
        // decls is a flattened sequence of (prefix, URI) pairs
        String[] decls = elem.getAdditionalNamespaceDeclarations();
        for (int i=0; i < decls.length; ) {
            nodeTokens.add((byte)NAMESPACE_DECLARATION);
            writeIndex(decls[i++]); // prefix
            writeIndex(decls[i++]); // URI
        }
    }

    private void writeNamespaceDeclarations(Element elem) {
        int count = elem.getNamespaceDeclarationCount();
        if (count == 1)
            return; // elem.getNamespaceURI() has already been written

        for (int i = 0; i < count; i++) {
            String prefix = elem.getNamespacePrefix(i);
            String uri = elem.getNamespaceURI(prefix);
            // skip the element's own namespace; writeStartTag already emitted it
            if (prefix.equals(elem.getNamespacePrefix()) && uri.equals(elem.getNamespaceURI())) {
                // if (DEBUG) System.err.println("********** NAMESPACE IGNORED ON WRITE ***************\n");
                continue;
            }
            nodeTokens.add((byte)NAMESPACE_DECLARATION);
            writeIndex(prefix);
            writeIndex(uri);
        }
    }

    final void writeProcessingInstruction(ProcessingInstruction pi) {
        nodeTokens.add((byte)PROCESSING_INSTRUCTION);
        writeIndex(pi.getTarget());
        writeIndex(pi.getValue());
    }

    final void writeText(Text text) {
        nodeTokens.add((byte)TEXT);
        writeIndex(text.getValue());
    }

    /** Puts symbol into symbolTable and appends the string's index to indexData */
    private final void writeIndex(String symbol) {
        writeIndex("", symbol);
    }

    /** Puts symbol into symbolTable and appends the string's index to indexData */
    private final void writeIndex(String prefix, String localName) {
        int index = symbolTable.addSymbol(prefix, localName);
        indexData.add(index);
    }

    /** Converts strings of symbolTable into UTF-8 bytes */
    private void encodeSymbols(Entry[] entries, ArrayByteList dst) {
        /*
         * As an optimization, one could use dst.ensureCapacity(dst.size + 4 *
         * sum(entries.key.length) + entries.length) then use plain array
         * accesses via dst.addUTF8Entries(entries) instead of many small
         * dst.add(byte) calls. Update: I benchmarked various variants of this
         * idea; none of them turned out to be worthwhile, so, for the time
         * being, we'll keep the simple version below.
         */
        // if (DEBUG) System.err.println("encoding symbols = " + toString(entries));
        int len = entries.length;
        for (int i=0; i < len; i++) {
            Entry entry = entries[i];
            dst.addUTF8String(entry.getKey1(), entry.getKey2());
            //dst.addUTF16String(entry.getKey1(), entry.getKey2());
        }
    }

    /**
     * Sorts entries descending by symbol frequency (# of occurrences) and
     * updates indexData accordingly. This allows to compress frequent 4 byte
     * indexes into a single byte. FAQ: this sort is *not* the bottleneck.
     */
    private void packSort(Entry[] entries, ArrayIntList indexData) {
        if (!DEBUG && entries.length <= 256) { // 0, 1, ... , 255
            return; // no need to sort indexes - all fit into one unsigned byte anyway
        }

        // Swap entries with frequency == 1 to end of array (typically Text nodes).
        // There's no need to sort those with O(N log N).
        int head = entries.length;
        for (int i=entries.length; --i >= 0; ) {
            Entry e = entries[i];
            if (e.getFrequency() == 1) {
                head--;
                entries[i] = entries[head];
                entries[head] = e;
            }
        }
        // if (DEBUG) System.err.println("len=" + entries.length + ", #f>1=" + (100.0f * head / entries.length));

        // sort remaining entries descending by frequency.
        Arrays.sort(entries, 0, head,
            new Comparator() {
                public final int compare(Object e1, Object e2) {
                    int f1 = ((Entry) e1).getFrequency();
                    int f2 = ((Entry) e2).getFrequency();
                    return f2 - f1; // descending (frequencies are non-negative counts)
                }
            }
        );

        // reorder indexData with sorted indexes
        // since sort has moved entries[indexData[k]] to entries[i]
        int[] indexes = new int[entries.length];
        for (int i=entries.length; --i >= 0; ) {
            indexes[entries[i].getIndex()] = i;
        }

        int[] ix = indexData.asArray();
        for (int i=indexData.size(); --i >= 0; ) {
            ix[i] = indexes[ix[i]];
        }
        // post-condition: entries[indexData[k]] corresponds to entries[i]
    }

    /**
     * Writes nodeTokens in document order; in the process stitches in packed
     * indexes referring to symbols in the symbolTable.
     */
    private void encodeTokens(ArrayByteList tokenList, int[] indexes, ArrayByteList dst) {
        byte[] tokens = tokenList.asArray();
        int size = tokenList.size();
        int i = 0; // next symbol index to consume
        int j = 0; // next token byte to consume
        if (isFirstPage) dst.addInt(indexes[i++]); // document baseURI
        isFirstPage = false;

        while (j < size) {
            int type = tokens[j++];
            dst.add((byte)type);

            // if (DEBUG) System.err.println("encoding type = " + toString(type));
            switch (type & 0x07) {
                case TEXT: {
                    Util.packOneIndex(dst, indexes[i++], type); // value
                    break;
                }
                case ATTRIBUTE: {
                    if (Util.hasNoNamespace(type)) { // qname
                        Util.packOneIndex(dst, indexes[i++], type);
                    } else { // qname, URI
                        Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
                    }
                    dst.add((byte)0);
                    Util.packOneIndex(dst, indexes[i++], 0); // value
                    dst.add(tokens[j++]); // attrType
                    break;
                }
                case BEGIN_ELEMENT: {
                    if (Util.hasNoNamespace(type)) {
                        Util.packOneIndex(dst, indexes[i++], type); // qname
                    } else {
                        Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type); // qname, URI
                    }
                    break;
                }
                case END_ELEMENT: {
                    break; // nothing to do
                }
                case COMMENT: {
                    Util.packOneIndex(dst, indexes[i++], type); // value
                    break;
                }
                case NAMESPACE_DECLARATION: { // prefix, URI
                    Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
                    break;
                }
                case PROCESSING_INSTRUCTION: { // target, value
                    Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
                    break;
                }
                case DOC_TYPE: { // infrequent; no need to pack indexes
                    dst.addInt(indexes[i++]); // rootElementName
                    dst.addInt(indexes[i++]); // publicID
                    dst.addInt(indexes[i++]); // systemID
                    dst.addInt(indexes[i++]); // internalDTDSubset
                    break;
                }
                default: {
                    throw new IllegalArgumentException("illegal node type");
                }
            }
        }
    }

    private static int createMagicNumber() {
        // 0xE0, 0x01, 0xDF, 0xFE = -32, 1, -33, -2 = -536748034
        // Thanks to Paul.Sandoz@Sun.COM
        ArrayByteList magic = new ArrayByteList(4);
        magic.add((byte)0xE0);
        magic.add((byte)0x01);
        magic.add((byte)0xDF);
        magic.add((byte)0xFE);
        return magic.getInt();
    }

    /** does xom.jar contain our performance extensions?
*/
private static boolean hasXOMExtensions() {
    // Probe by reflection for methods that only exist in a patched xom.jar;
    // any failure (missing method, linkage error, security) means "no".
    try {
        ParentNode.class.getMethod("fastInsertChild", new Class[] {Node.class, Integer.TYPE});
        Element.class.getMethod("getAdditionalNamespaceDeclarations", null);
        return true;
    } catch (Throwable t) {
        return false;
    }
}

/** Returns a human readable name for the given node type code. */
private static String toString(int type) { // DEBUG only
    switch (type & 0x07) {
        case TEXT: return "TEXT";
        case ATTRIBUTE: return "ATTRIBUTE";
        case BEGIN_ELEMENT: return "BEGIN_ELEMENT";
        case END_ELEMENT: return "END_ELEMENT";
        case COMMENT: return "COMMENT";
        case NAMESPACE_DECLARATION: return "NAMESPACE_DECLARATION";
        case PROCESSING_INSTRUCTION: return "PROCESSING_INSTRUCTION";
        case DOC_TYPE: return "DOC_TYPE";
        default: {
            throw new IllegalArgumentException(
                "Illegal node type code=" + (type & 0x07));
        }
    }
}

/** Returns the qualified names of all entries, for diagnostics. */
private static String toString(Entry[] entries) { // DEBUG only
    ArrayList list = new ArrayList();
    for (int i=0; i < entries.length; i++) {
        list.add(entries[i].getQualifiedName());
    }
    return list.toString();
}


///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////

/**
 * Table of unique symbols for tokenization on XML serialization. This is a
 * classic text book hash algorithm, adapted to meet our specific
 * performance needs. It's close to the one used by the JDK HashMap.
 * Maintains a map of (String, String) ==> (index, frequency) associations.
 */
private static final class SymbolTable { // not a public class!

    private static final float LOAD_FACTOR = 0.75f;
    private static final int INITIAL_CAPACITY = 16;
    private Entry[] entries = new Entry[INITIAL_CAPACITY]; // hash slots; length is always a power of 2
    private int threshold = (int) (INITIAL_CAPACITY * LOAD_FACTOR); // resize when size reaches this
    private int size = 0;     // number of distinct symbols; also the next index to assign
    private int numChars = 0; // running total of symbol string lengths (incl. ':' separators)

    /** Constructs and returns a table with default parameters. */
    public SymbolTable() {
    }

    /** Removes all entries from this table, retaining the current capacity. */
    public void clear() {
        size = 0;
        numChars = 0;
        Entry[] src = entries;
        for (int i=src.length; --i >= 0; ) src[i] = null;
    }

    /** Returns the total number of characters occupied by all symbol strings. */
    public int numCharacters() {
        return numChars;
    }

    /** Returns the number of symbols. */
    public int size() {
        return size;
    }

    /**
     * Adds the given symbol to the table if not already present. Otherwise
     * increments its frequency counter.
     *
     * A symbol is structured like a lexical XML QName.
     * symbol: key1 + ":" + key2 (if key1 is non-empty string)
     * symbol: key2 (if key1 is empty string)
     *
     * @return a sequence number N >= 0 indicating that the symbol was added
     *         to this table as the N-th entry, in order; for an existing
     *         symbol, the sequence number it was originally assigned.
     */
    public int addSymbol(String key1, String key2) {
        // assert: key1 and key2 are non-null
        int hash = hash(key1, key2);
        int i = hash & (entries.length - 1); // fast modulo; requires power-of-2 capacity
        Entry entry = findEntry(key1, key2, entries[i], hash);
        if (entry != null) {
            entry.frequency++;
            return entry.index;
        }

        // not found; add entry for key --> (index=size, freq=1) mapping
        // new entry is inserted at head of chain
        // if (DEBUG) checkNULChar(key1);
        // if (DEBUG) checkNULChar(key2);
        numChars += key1.length() + key2.length();
        if (key1.length() != 0) numChars++; // account for the ':' separator
        entries[i] = new Entry(key1, key2, hash, entries[i], size);
        if (size >= threshold) rehash();
        return size++;
    }

    /** Scans the collision chain starting at cursor for the given symbol; null if absent. */
    private static Entry findEntry(String key1, String key2, Entry cursor, int hash) {
        while (cursor != null) { // scan collision chain
            if (hash == cursor.hash && eq(key2, cursor.key2) && eq(key1, cursor.key1)) {
                cursor.key1 = key1; // speeds up future lookups: equals() vs. ==
                cursor.key2 = key2; // speeds up future lookups: equals() vs. ==
                return cursor;
            }
            cursor = cursor.next;
        }
        return null;
    }

    /**
     * Expands the capacity of this table, rehashing all entries into
     * corresponding new slots. Note: head-insertion reverses the relative
     * order within a chain, which is harmless since chains are unordered.
     */
    private void rehash() {
        Entry[] src = entries;
        int capacity = 2 * src.length; // stays a power of 2
        Entry[] dst = new Entry[capacity];

        for (int i = src.length; --i >= 0; ) {
            Entry e = src[i];
            while (e != null) { // walk collision chain
                int j = e.hash & (capacity - 1);
                Entry next = e.next;
                e.next = dst[j];
                dst[j] = e; // insert e at head of chain
                e = next;
            }
        }
        entries = dst;
        threshold = (int) (capacity * LOAD_FACTOR);
    }

    /**
     * Returns all table entries, sorted ascending by entry.index. The
     * result can subsequently be used to sort by symbol frequency, or
     * similar. Much faster than an entrySet().iterator().next() loop would
     * be. Works because the indexes 0..size-1 form a permutation.
     */
    public Entry[] getEntries() {
        Entry[] dst = new Entry[size];
        Entry[] src = entries;
        for (int i = src.length; --i >= 0; ) {
            Entry e = src[i];
            while (e != null) { // walk collision chain
                dst[e.index] = e;
                e = e.next;
            }
        }
        return dst;
    }

    /** Combines both key hashes; result is independent of how "" is passed
     *  since "".hashCode() == 0 makes the XOR a no-op (the reference
     *  comparison below is merely a fast path that skips the call). */
    private static int hash(String key1, String key2) {
        int h = key2.hashCode();
        if (key1 != "") h = key1.hashCode() ^ h;
        return auxiliaryHash(h);
        // return auxiliaryHash(key1.hashCode() ^ key2.hashCode());
    }

    /**
     * Auxiliary hash function that defends against poor base hash
     * functions. Ensures more uniform hash distribution, hence reducing the
     * probability of pathologically long collision chains, in particular
     * for short key symbols that are quite similar to each other, or XML
     * boundary whitespace (worst case scenario).
     */
    private static int auxiliaryHash(int h) {
        h += ~(h << 9);
        h ^= (h >>> 14);
        h += (h << 4);
        h ^= (h >>> 10);
        return h;
    }

    /** Equality with an identity fast path (works with findEntry's interning). */
    private static boolean eq(String x, String y) {
        return x == y || x.equals(y);
    }

    /** Sanity check; Unnecessary since NULs have already been checked by nu.xom.Verifier. */
    private static void checkNULChar(String key) {
        int i = key.indexOf((char)0);
        if (i >= 0) {
            throw new IllegalArgumentException(
                "Symbol must not contain C0 control character NUL (char 0x00) [index:" + i
                + " within '" + key + "']");
        }
    }

}

/**
 * A value in the SymbolTable.
1822 */ 1823 private static final class Entry { 1824 1825 String key1; // prefix or "" 1826 String key2; // localName or Text or Comment or similar 1827 final int hash; // cache for symbol's hash code 1828 final int index; // index to correlate Entry with indexList on XML serialization 1829 int frequency = 1;// number of occurances of symbol within current XML document 1830 Entry next; // successor in collision chain, mapping to the same hash slot 1831 1832 public Entry(String key1, String key2, int hash, Entry next, int index) { 1833 this.key1 = key1; 1834 this.key2 = key2; 1835 this.hash = hash; 1836 this.next = next; 1837 this.index = index; 1838 } 1839 1840 public String getKey1() { return key1; } 1841 public String getKey2() { return key2; } 1842 public int getIndex() { return index; } 1843 public int getFrequency() { return frequency; } 1844 1845 public String getQualifiedName() { // DEBUG only 1846 if (key1.length() == 0) return key2; 1847 return key1 + ':' + key2; 1848 } 1849 public String toString() { // DEBUG only 1850 return "[key1=" + key1 + ", key2=" + key2 + ", freq=" + frequency + "]"; 1851 } 1852 1853 } 1854 1855 1856 /////////////////////////////////////////////////////////////////////////////// 1857 // Nested classes: 1858 /////////////////////////////////////////////////////////////////////////////// 1859 1860 /** 1861 * Fast replacement for ArrayList and java.util.Stack. Possibly premature 1862 * and unnecessary? 
*/
private static final class FastStack {

    private Element[] elements = new Element[10]; // backing array, grown on demand
    private int size = 0; // number of elements currently on the stack

    /** Removes and returns the most recently pushed element. */
    public Element pop() {
        size--;
        Element top = elements[size];
        elements[size] = null; // clear slot so gc can reclaim the element
        return top;
    }

    /** Pushes the given element onto the top of the stack. */
    public void push(Element elem) {
        ensureCapacity(size + 1);
        elements[size++] = elem;
    }

    /** Grows the backing array, if necessary, to hold at least minCapacity elements. */
    private void ensureCapacity(int minCapacity) {
        if (minCapacity > elements.length) {
            // at least double (plus one) to keep pushes amortized O(1)
            Element[] grown = new Element[Math.max(minCapacity, 2 * elements.length + 1)];
            System.arraycopy(elements, 0, grown, 0, size);
            elements = grown;
        }
    }

}

}