1 /* ValidationConsumer.java --
2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 package gnu.xml.pipeline;
39 
40 import java.io.IOException;
41 import java.io.StringReader;
42 import java.io.StringWriter;
43 import java.util.EmptyStackException;
44 import java.util.Enumeration;
45 import java.util.Hashtable;
46 import java.util.Stack;
47 import java.util.StringTokenizer;
48 import java.util.Vector;
49 
50 import org.xml.sax.Attributes;
51 import org.xml.sax.EntityResolver;
52 import org.xml.sax.ErrorHandler;
53 import org.xml.sax.InputSource;
54 import org.xml.sax.Locator;
55 import org.xml.sax.SAXException;
56 import org.xml.sax.SAXParseException;
57 import org.xml.sax.XMLReader;
58 import org.xml.sax.helpers.XMLReaderFactory;
59 
60 /**
61  * This class checks SAX2 events to report validity errors; it works as
62  * both a filter and a terminus on an event pipeline.  It relies on the
63  * producer of SAX events to:  </p> <ol>
64  *
65  *	<li> Conform to the specification of a non-validating XML parser that
66  *	reads all external entities, reported using SAX2 events. </li>
67  *
68  *	<li> Report ignorable whitespace as such (through the ContentHandler
69  *	interface).  This is, strictly speaking, optional for nonvalidating
70  *	XML processors.  </li>
71  *
72  *	<li> Make SAX2 DeclHandler callbacks, with default
73  *	attribute values already normalized (and without "&lt;").</li>
74  *
75  *	<li> Make SAX2 LexicalHandler startDTD() and endDTD ()
76  *	callbacks. </li>
77  *
78  *	<li> Act as if the <em>(URI)/namespace-prefixes</em> property were
79  *	set to true, by providing XML 1.0 names and all <code>xmlns*</code>
80  *	attributes (rather than omitting either or both). </li>
81  *
82  *	</ol>
83  *
84  * <p> At this writing, the major SAX2 parsers (such as &AElig;lfred2,
85  * Crimson, and Xerces) meet these requirements, and this validation
86  * module is used by the optional &AElig;lfred2 validation support.
87  * </p>
88  *
89  * <p> Note that because this is a layered validator, it has to duplicate some
90  * work that the parser is doing; there are also other cost to layering.
91  * However, <em>because of layering it doesn't need a parser</em> in order
92  * to work! You can use it with anything that generates SAX events, such
93  * as an application component that wants to detect invalid content in
94  * a changed area without validating an entire document, or which wants to
95  * ensure that it doesn't write invalid data to a communications partner.</p>
96  *
97  * <p> Also, note that because this is a layered validator, the line numbers
98  * reported for some errors may seem strange.  For example, if an element does
99  * not permit character content, the validator
100  * will use the locator provided to it.
101  * That might reflect the last character of a <em>characters</em> event
102  * callback, rather than the first non-whitespace character. </p>
103  *
104  * <hr />
105  *
106  * <!--
107  * <p> Of interest is the fact that unlike most currently known XML validators,
108  * this one can report some cases of non-determinism in element content models.
109  * It is a compile-time option, enabled by default.  This will only report
110  * such XML errors if they relate to content actually appearing in a document;
111  * content models aren't aggressively scanned for non-deterministic structure.
112  * Documents which trigger such non-deterministic transitions may be handled
113  * differently by different validating parsers, without losing conformance
114  * to the XML specification. </p>
115  * -->
116  *
117  * <p> Current limitations of the validation performed are in roughly three
118  * categories.  </p>
119  *
120  * <p> The first category represents constraints which demand violations
121  * of software layering:  exposing lexical details, one of the first things
122  * that <em>application</em> programming interfaces (APIs) hide.  These
123  * invariably relate to XML entity handling, and to historical oddities
124  * of the XML validation semantics.  Curiously,
125  * recent (Autumn 1999) conformance testing showed that these constraints are
126  * among those handled worst by existing XML validating parsers.  Arguments
127  * have been made that each of these VCs should be turned into WFCs (most
128  * of them) or discarded (popular for the standalone declaration); in short,
129  * that these are bugs in the XML specification (not all via SGML): </p><ul>
130  *
131  *	<li> The <em>Proper Declaration/PE Nesting</em> and
132  *	<em>Proper Group/PE Nesting</em> VCs can't be tested because they
133  *	require access to particularly low level lexical level information.
134  *	In essence, the reason XML isn't a simple thing to parse is that
135  *	it's not a context free grammar, and these constraints elevate that
136  *	SGML-derived context sensitivity to the level of a semantic rule.
137  *
138  *	<li> The <em>Standalone Document Declaration</em> VC can't be
139  *	tested.  This is for two reasons.  First, this flag isn't made
140  *	available through SAX2.  Second, it also requires breaking that
141  *	lexical layering boundary.  (If you ever wondered why classes
142  *	in compiler construction or language design barely mention the
143  *	existence of context-sensitive grammars, it's because of messy
144  *	issues like these.)
145  *
146  *	<li> The <em>Entity Declared</em> VC can't be tested, because it
147  *	also requires breaking that lexical layering boundary!  There's also
148  *	another issue: the VC wording (and seemingly intent) is ambiguous.
149  *	(This is still true in the "Second edition" XML spec.)
150  *	Since there is a WFC of the same name, everyone's life would be
151  *	easier if references to undeclared parsed entities were always well
152  *	formedness errors, regardless of whether they're parameter entities
153  *	or not.  (Note that nonvalidating parsers are not required
154  *	to report all such well formedness errors if they don't read external
155  *	parameter entities, although currently most XML parsers read them
156  *	in an attempt to avoid problems from inconsistent parser behavior.)
157  *
158  *	</ul>
159  *
160  * <p> The second category of limitations on this validation represent
161  * constraints associated with information that is not guaranteed to be
162  * available (or in one case, <em>is guaranteed not to be available</em>,
163  * through the SAX2 API: </p><ul>
164  *
165  *	<li> The <em>Unique Element Type Declaration</em> VC may not be
166  *	reportable, if the underlying parser happens not to expose
167  *	multiple declarations.   (&AElig;lfred2 reports these validity
168  *	errors directly.)</li>
169  *
170  *	<li> Similarly, the <em>Unique Notation Name</em> VC, added in the
171  *	14-January-2000 XML spec errata to restrict typing models used by
172  *	elements, may not be reportable.  (&AElig;lfred reports these
173  *	validity errors directly.) </li>
174  *
175  *	</ul>
176  *
177  * <p> A third category relates to ease of implementation.  (Think of this
178  * as "bugs".)  The most notable issue here is character handling.  Rather
179  * than attempting to implement the voluminous character tables in the XML
180  * specification (Appendix B), Unicode rules are used directly from
181  * the java.lang.Character class.  Recent JVMs have begun to diverge from
182  * the original specification for that class (Unicode 2.0), meaning that
183  * different JVMs may handle that aspect of conformance differently.
184  * </p>
185  *
186  * <p> Note that for some of the validity errors that SAX2 does not
187  * expose, a nonvalidating parser is permitted (by the XML specification)
188  * to report validity errors.  When used with a parser that does so for
189  * the validity constraints mentioned above (or any other SAX2 event
190  * stream producer that does the same thing), overall conformance is
191  * substantially improved.
192  *
193  * @see gnu.xml.aelfred2.SAXDriver
194  * @see gnu.xml.aelfred2.XmlReader
195  *
196  * @author David Brownell
197  */
198 public final class ValidationConsumer extends EventFilter
199 {
200     // report error if we happen to notice a non-deterministic choice?
201     // we won't report buggy content models; just buggy instances
202     private static final boolean	warnNonDeterministic = false;
203 
204     // for tracking active content models
205     private String		rootName;
206     private Stack		contentStack = new Stack ();
207 
208     // flags for "saved DTD" processing
209     private boolean		disableDeclarations;
210     private boolean		disableReset;
211 
212     //
213     // most VCs get tested when we see element start tags.  the per-element
214     // info (including attributes) recorded here duplicates that found inside
215     // many nonvalidating parsers, hence dual lookups etc ... that's why a
216     // layered validator isn't going to be as fast as a non-layered one.
217     //
218 
219     // key = element name; value = ElementInfo
220     private Hashtable		elements = new Hashtable ();
221 
222     // some VCs relate to ID/IDREF/IDREFS attributes
223     // key = id; value = boolean true (defd) or false (refd)
224     private Hashtable		ids = new Hashtable ();
225 
226     // we just record declared notation and unparsed entity names.
227     // the implementation here is simple/slow; these features
228     // are seldom used, one hopes they'll wither away soon
229     private Vector		notations = new Vector (5, 5);
230     private Vector		nDeferred = new Vector (5, 5);
231     private Vector		unparsed = new Vector (5, 5);
232     private Vector		uDeferred = new Vector (5, 5);
233 
234 	// note: DocBk 3.1.7 XML defines over 2 dozen notations,
235 	// used when defining unparsed entities for graphics
236 	// (and maybe in other places)
237 
238 
239 
240     /**
241      * Creates a pipeline terminus which consumes all events passed to
242      * it; this will report validity errors as if they were fatal errors,
243      * unless an error handler is assigned.
244      *
245      * @see #setErrorHandler
246      */
247 	// constructor used by PipelineFactory
248 	    // ... and want one taking system ID of an external subset
ValidationConsumer()249     public ValidationConsumer ()
250     {
251 	this (null);
252     }
253 
254     /**
255      * Creates a pipeline filter which reports validity errors and then
256      * passes events on to the next consumer if they were not fatal.
257      *
258      * @see #setErrorHandler
259      */
260 	// constructor used by PipelineFactory
261 	    // ... and want one taking system ID of an external subset
262 	    // (which won't send declaration events)
ValidationConsumer(EventConsumer next)263     public ValidationConsumer (EventConsumer next)
264     {
265 	super (next);
266 
267 	setContentHandler (this);
268 	setDTDHandler (this);
269 	try { setProperty (DECL_HANDLER, this); }
270 	catch (Exception e) { /* "can't happen" */ }
271 	try { setProperty (LEXICAL_HANDLER, this); }
272 	catch (Exception e) { /* "can't happen" */ }
273     }
274 
275 
276     private static final String	fakeRootName
277 	= ":Nobody:in:their_Right.Mind_would:use:this-name:1x:";
278 
279     /**
280      * Creates a validation consumer which is preloaded with the DTD provided.
281      * It does this by constructing a document with that DTD, then parsing
282      * that document and recording its DTD declarations.  Then it arranges
283      * not to modify that information.
284      *
285      * <p> The resulting validation consumer will only validate against
286      * the specified DTD, regardless of whether some other DTD is found
287      * in a document being parsed.
288      *
289      * @param rootName The name of the required root element; if this is
290      *	null, any root element name will be accepted.
291      * @param publicId If non-null and there is a non-null systemId, this
292      *	identifier provides an alternate access identifier for the DTD's
293      *	external subset.
294      * @param systemId If non-null, this is a URI (normally URL) that
295      *	may be used to access the DTD's external subset.
296      * @param internalSubset If non-null, holds literal markup declarations
297      *	comprising the DTD's internal subset.
298      * @param resolver If non-null, this will be provided to the parser for
299      *	use when resolving parameter entities (including any external subset).
300      * @param resolver If non-null, this will be provided to the parser for
301      *	use when resolving parameter entities (including any external subset).
302      * @param minimalElement If non-null, a minimal valid document.
303      *
304      * @exception SAXNotSupportedException If the default SAX parser does
305      *	not support the standard lexical or declaration handlers.
306      * @exception SAXParseException If the specified DTD has either
307      *	well-formedness or validity errors
308      * @exception IOException If the specified DTD can't be read for
309      *	some reason
310      */
ValidationConsumer( String rootName, String publicId, String systemId, String internalSubset, EntityResolver resolver, String minimalDocument )311     public ValidationConsumer (
312 	String		rootName,
313 	String		publicId,
314 	String		systemId,
315 	String		internalSubset,
316 	EntityResolver	resolver,
317 	String		minimalDocument
318     ) throws SAXException, IOException
319     {
320 	this (null);
321 
322 	disableReset = true;
323 	if (rootName == null)
324 	    rootName = fakeRootName;
325 
326 	//
327 	// Synthesize document with that DTD; is it possible to do
328 	// better for the declaration of the root element?
329 	//
330 	// NOTE:  can't use SAX2 to write internal subsets.
331 	//
332 	StringWriter	writer = new StringWriter ();
333 
334 	writer.write ("<!DOCTYPE ");
335 	writer.write (rootName);
336 	if (systemId != null) {
337 	    writer.write ("\n  ");
338 	    if (publicId != null) {
339 		writer.write ("PUBLIC '");
340 		writer.write (publicId);
341 		writer.write ("'\n\t'");
342 	    } else
343 		writer.write ("SYSTEM '");
344 	    writer.write (systemId);
345 	    writer.write ("'");
346 	}
347 	writer.write (" [ ");
348 	if (rootName == fakeRootName) {
349 	    writer.write ("\n<!ELEMENT ");
350 	    writer.write (rootName);
351 	    writer.write (" EMPTY>");
352 	}
353 	if (internalSubset != null)
354 	    writer.write (internalSubset);
355 	writer.write ("\n ]>");
356 
357 	if (minimalDocument != null) {
358 	    writer.write ("\n");
359 	    writer.write (minimalDocument);
360 	    writer.write ("\n");
361 	} else {
362 	    writer.write (" <");
363 	    writer.write (rootName);
364 	    writer.write ("/>\n");
365 	}
366 	minimalDocument = writer.toString ();
367 
368 	//
369 	// OK, load it
370 	//
371 	XMLReader	producer;
372 
373 	producer = XMLReaderFactory.createXMLReader ();
374 	bind (producer, this);
375 
376 	if (resolver != null)
377 	    producer.setEntityResolver (resolver);
378 
379 	InputSource	in;
380 
381 	in = new InputSource (new StringReader (minimalDocument));
382 	producer.parse (in);
383 
384 	disableDeclarations = true;
385 	if (rootName == fakeRootName)
386 	    this.rootName = null;
387     }
388 
resetState()389     private void resetState ()
390     {
391 	if (!disableReset) {
392 	    rootName = null;
393 	    contentStack.removeAllElements ();
394 	    elements.clear ();
395 	    ids.clear ();
396 
397 	    notations.removeAllElements ();
398 	    nDeferred.removeAllElements ();
399 	    unparsed.removeAllElements ();
400 	    uDeferred.removeAllElements ();
401 	}
402     }
403 
404 
warning(String description)405     private void warning (String description)
406     throws SAXException
407     {
408 	ErrorHandler		errHandler = getErrorHandler ();
409 	Locator			locator = getDocumentLocator ();
410 	SAXParseException	err;
411 
412 	if (errHandler == null)
413 	    return;
414 
415 	if (locator == null)
416 	    err = new SAXParseException (description, null, null, -1, -1);
417 	else
418 	    err = new SAXParseException (description, locator);
419 	errHandler.warning (err);
420     }
421 
422     // package private (for ChildrenRecognizer)
error(String description)423     private void error (String description)
424     throws SAXException
425     {
426 	ErrorHandler		errHandler = getErrorHandler ();
427 	Locator			locator = getDocumentLocator ();
428 	SAXParseException	err;
429 
430 	if (locator == null)
431 	    err = new SAXParseException (description, null, null, -1, -1);
432 	else
433 	    err = new SAXParseException (description, locator);
434 	if (errHandler != null)
435 	    errHandler.error (err);
436 	else	// else we always treat it as fatal!
437 	    throw err;
438     }
439 
fatalError(String description)440     private void fatalError (String description)
441     throws SAXException
442     {
443 	ErrorHandler		errHandler = getErrorHandler ();
444 	Locator			locator = getDocumentLocator ();
445 	SAXParseException	err;
446 
447 	if (locator != null)
448 	    err = new SAXParseException (description, locator);
449 	else
450 	    err = new SAXParseException (description, null, null, -1, -1);
451 	if (errHandler != null)
452 	    errHandler.fatalError (err);
453 	// we always treat this as fatal, regardless of the handler
454 	throw err;
455     }
456 
457 
isExtender(char c)458     private static boolean isExtender (char c)
459     {
460 	// [88] Extender ::= ...
461 	return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
462 	       || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
463 	       || (c >= 0x3031 && c <= 0x3035)
464 	       || (c >= 0x309d && c <= 0x309e)
465 	       || (c >= 0x30fc && c <= 0x30fe);
466     }
467 
468 
469     // use augmented Unicode rules, not full XML rules
isName(String name, String context, String id)470     private boolean isName (String name, String context, String id)
471     throws SAXException
472     {
473 	char	buf [] = name.toCharArray ();
474 	boolean	pass = true;
475 
476 	if (!Character.isUnicodeIdentifierStart (buf [0])
477 		&& ":_".indexOf (buf [0]) == -1)
478 	    pass = false;
479 	else {
480 	    int max = buf.length;
481 	    for (int i = 1; pass && i < max; i++) {
482 		char c = buf [i];
483 		if (!Character.isUnicodeIdentifierPart (c)
484 			&& ":-_.".indexOf (c) == -1
485 			&& !isExtender (c))
486 		    pass = false;
487 	    }
488 	}
489 
490 	if (!pass)
491 	    error ("In " + context + " for " + id
492 		+ ", '" + name + "' is not a name");
493 	return pass;	// true == OK
494     }
495 
496     // use augmented Unicode rules, not full XML rules
isNmtoken(String nmtoken, String context, String id)497     private boolean isNmtoken (String nmtoken, String context, String id)
498     throws SAXException
499     {
500 	char	buf [] = nmtoken.toCharArray ();
501 	boolean	pass = true;
502 	int	max = buf.length;
503 
504 	// XXX make this share code with isName
505 
506 	for (int i = 0; pass && i < max; i++) {
507 		char c = buf [i];
508 	    if (!Character.isUnicodeIdentifierPart (c)
509 		    && ":-_.".indexOf (c) == -1
510 		    && !isExtender (c))
511 		pass = false;
512 	}
513 
514 	if (!pass)
515 	    error ("In " + context + " for " + id
516 		+ ", '" + nmtoken + "' is not a name token");
517 	return pass;	// true == OK
518     }
519 
checkEnumeration(String value, String type, String name)520     private void checkEnumeration (String value, String type, String name)
521     throws SAXException
522     {
523 	if (!hasMatch (value, type))
524 	    // VC: Enumeration
525 	    error ("Value '" + value
526 		+ "' for attribute '" + name
527 		+ "' is not permitted: " + type);
528     }
529 
530     // used to test enumerated attributes and mixed content models
531     // package private
hasMatch(String value, String orList)532     static boolean hasMatch (String value, String orList)
533     {
534 	int len = value.length ();
535 	int max = orList.length () - len;
536 
537 	for (int start = 0;
538 		(start = orList.indexOf (value, start)) != -1;
539 		start++) {
540 	    char c;
541 
542 	    if (start > max)
543 		break;
544 	    c = orList.charAt (start - 1);
545 	    if (c != '|' && c != '('/*)*/)
546 		continue;
547 	    c = orList.charAt (start + len);
548 	    if (c != '|' && /*(*/ c != ')')
549 		continue;
550 	    return true;
551 	}
552 	return false;
553     }
554 
555     /**
556      * <b>LexicalHandler</b> Records the declaration of the root
557      * element, so it can be verified later.
558      * Passed to the next consumer, unless this one was
559      * preloaded with a particular DTD.
560      */
startDTD(String name, String publicId, String systemId)561     public void startDTD (String name, String publicId, String systemId)
562     throws SAXException
563     {
564 	if (disableDeclarations)
565 	    return;
566 
567 	rootName = name;
568 	super.startDTD (name, publicId, systemId);
569     }
570 
571     /**
572      * <b>LexicalHandler</b> Verifies that all referenced notations
573      * and unparsed entities have been declared.
574      * Passed to the next consumer, unless this one was
575      * preloaded with a particular DTD.
576      */
endDTD()577     public void endDTD ()
578     throws SAXException
579     {
580 	if (disableDeclarations)
581 	    return;
582 
583 	// this is a convenient hook for end-of-dtd checks, but we
584 	// could also trigger it in the first startElement call.
585 	// locator info is more appropriate here though.
586 
587 	// VC: Notation Declared (NDATA can refer to them before decls,
588 	//	as can NOTATION attribute enumerations and defaults)
589 	int length = nDeferred.size ();
590 	for (int i = 0; i < length; i++) {
591 	    String notation = (String) nDeferred.elementAt (i);
592 	    if (!notations.contains (notation)) {
593 		error ("A declaration referred to notation '" + notation
594 			+ "' which was never declared");
595 	    }
596 	}
597 	nDeferred.removeAllElements ();
598 
599 	// VC: Entity Name (attribute values can refer to them
600 	//	before they're declared); VC Attribute Default Legal
601 	length = uDeferred.size ();
602 	for (int i = 0; i < length; i++) {
603 	    String entity = (String) uDeferred.elementAt (i);
604 	    if (!unparsed.contains (entity)) {
605 		error ("An attribute default referred to entity '" + entity
606 			+ "' which was never declared");
607 	    }
608 	}
609 	uDeferred.removeAllElements ();
610 	super.endDTD ();
611     }
612 
613 
614     // These are interned, so we can rely on "==" to find the type of
615     // all attributes except enumerations ...
616     // "(this|or|that|...)" and "NOTATION (this|or|that|...)"
617     static final String types [] = {
618 	"CDATA",
619 	"ID", "IDREF", "IDREFS",
620 	"NMTOKEN", "NMTOKENS",
621 	"ENTITY", "ENTITIES"
622     };
623 
624 
625     /**
626      * <b>DecllHandler</b> Records attribute declaration for later use
627      * in validating document content, and checks validity constraints
628      * that are applicable to attribute declarations.
629      * Passed to the next consumer, unless this one was
630      * preloaded with a particular DTD.
631      */
attributeDecl( String eName, String aName, String type, String mode, String value )632     public void attributeDecl (
633 	String eName,
634 	String aName,
635 	String type,
636 	String mode,
637 	String value
638     ) throws SAXException
639     {
640 	if (disableDeclarations)
641 	    return;
642 
643 	ElementInfo	info = (ElementInfo) elements.get (eName);
644 	AttributeInfo	ainfo = new AttributeInfo ();
645 	boolean		checkOne = false;
646 	boolean		interned = false;
647 
648 	// cheap interning of type names and #FIXED, #REQUIRED
649 	// for faster startElement (we can use "==")
650 	for (int i = 0; i < types.length; i++) {
651 	    if (types [i].equals (type)) {
652 		type = types [i];
653 		interned = true;
654 		break;
655 	    }
656 	}
657 	if ("#FIXED".equals (mode))
658 	    mode = "#FIXED";
659 	else if ("#REQUIRED".equals (mode))
660 	    mode = "#REQUIRED";
661 
662 	ainfo.type = type;
663 	ainfo.mode = mode;
664 	ainfo.value = value;
665 
666 	// we might not have seen the content model yet
667 	if (info == null) {
668 	    info = new ElementInfo (eName);
669 	    elements.put (eName, info);
670 	}
671 	if ("ID" == type) {
672 	    checkOne = true;
673 	    if (!("#REQUIRED" == mode || "#IMPLIED".equals (mode))) {
674 		// VC: ID Attribute Default
675 		error ("ID attribute '" + aName
676 		    + "' must be #IMPLIED or #REQUIRED");
677 	    }
678 
679 	} else if (!interned && type.startsWith ("NOTATION ")) {
680 	    checkOne = true;
681 
682 	    // VC: Notation Attributes (notations must be declared)
683 	    StringTokenizer	tokens = new StringTokenizer (
684 		type.substring (10, type.lastIndexOf (')')),
685 		"|");
686 	    while (tokens.hasMoreTokens ()) {
687 		String	token = tokens.nextToken ();
688 		if (!notations.contains (token))
689 		    nDeferred.addElement (token);
690 	    }
691 	}
692 	if (checkOne) {
693 	    for (Enumeration e = info.attributes.keys ();
694 		    e.hasMoreElements ();
695 		    /* NOP */) {
696 		String		name;
697 		AttributeInfo	ainfo2;
698 
699 		name = (String) e.nextElement ();
700 		ainfo2 = (AttributeInfo) info.attributes.get (name);
701 		if (type == ainfo2.type || !interned /* NOTATION */) {
702 		    // VC: One ID per Element Type
703 		    // VC: One Notation per Element TYpe
704 		    error ("Element '" + eName
705 			+ "' already has an attribute of type "
706 			+ (interned ? "NOTATION" : type)
707 			+ " ('" + name
708 			+ "') so '" + aName
709 			+ "' is a validity error");
710 		}
711 	    }
712 	}
713 
714 	// VC: Attribute Default Legal
715 	if (value != null) {
716 
717 	    if ("CDATA" == type) {
718 		// event source rejected '<'
719 
720 	    } else if ("NMTOKEN" == type) {
721 		// VC: Name Token (is a nmtoken)
722 		isNmtoken (value, "attribute default", aName);
723 
724 	    } else if ("NMTOKENS" == type) {
725 		// VC: Name Token (is a nmtoken; at least one value)
726 		StringTokenizer	tokens = new StringTokenizer (value);
727 		if (!tokens.hasMoreTokens ())
728 		    error ("Default for attribute '" + aName
729 			+ "' must have at least one name token.");
730 		else do {
731 		    String token = tokens.nextToken ();
732 		    isNmtoken (token, "attribute default", aName);
733 		} while (tokens.hasMoreTokens ());
734 
735 	    } else if ("IDREF" == type || "ENTITY" == type) {
736 		// VC: Entity Name (is a name)
737 		// VC: IDREF (is a name) (is declared)
738 		isName (value, "attribute default", aName);
739 		if ("ENTITY" == type && !unparsed.contains (value))
740 		    uDeferred.addElement (value);
741 
742 	    } else if ("IDREFS" == type || "ENTITIES" == type) {
743 		// VC: Entity Name (is a name; at least one value)
744 		// VC: IDREF (is a name; at least one value)
745 		StringTokenizer	names = new StringTokenizer (value);
746 		if (!names.hasMoreTokens ())
747 		    error ("Default for attribute '" + aName
748 			+ "' must have at least one name.");
749 		else do {
750 		    String name = names.nextToken ();
751 		    isName (name, "attribute default", aName);
752 		    if ("ENTITIES" == type && !unparsed.contains (name))
753 			uDeferred.addElement (value);
754 		} while (names.hasMoreTokens ());
755 
756 	    } else if (type.charAt (0) == '(' /*)*/ ) {
757 		// VC: Enumeration (must match)
758 		checkEnumeration (value, type, aName);
759 
760 	    } else if (!interned && checkOne) {	/* NOTATION */
761 		// VC: Notation attributes (must be names)
762 		isName (value, "attribute default", aName);
763 
764 		// VC: Notation attributes (must be declared)
765 		if (!notations.contains (value))
766 		    nDeferred.addElement (value);
767 
768 		// VC: Enumeration (must match)
769 		checkEnumeration (value, type, aName);
770 
771 	    } else if ("ID" != type)
772 		throw new RuntimeException ("illegal attribute type: " + type);
773 	}
774 
775 	if (info.attributes.get (aName) == null)
776 	    info.attributes.put (aName, ainfo);
777 	/*
778 	else
779 	    warning ("Element '" + eName
780 		+ "' already has an attribute named '" + aName + "'");
781 	*/
782 
783 	if ("xml:space".equals (aName)) {
784 	    if (!("(default|preserve)".equals (type)
785 		    || "(preserve|default)".equals (type)
786 			// these next two are arguable; XHTML's DTD doesn't
787 			// deserve errors.  After all, it's not like any
788 			// illegal _value_ could pass ...
789 		    || "(preserve)".equals (type)
790 		    || "(default)".equals (type)
791 		    ))
792 		error (
793 		    "xml:space attribute type must be like '(default|preserve)'"
794 		    + " not '" + type + "'"
795 		    );
796 
797 	}
798 	super.attributeDecl (eName, aName, type, mode, value);
799     }
800 
801     /**
802      * <b>DecllHandler</b> Records the element declaration for later use
803      * when checking document content, and checks validity constraints that
804      * apply to element declarations.  Passed to the next consumer, unless
805      * this one was preloaded with a particular DTD.
806      */
elementDecl(String name, String model)807     public void elementDecl (String name, String model)
808     throws SAXException
809     {
810 	if (disableDeclarations)
811 	    return;
812 
813 	ElementInfo	info = (ElementInfo) elements.get (name);
814 
815 	// we might have seen an attribute decl already
816 	if (info == null) {
817 	    info = new ElementInfo (name);
818 	    elements.put (name, info);
819 	}
820 	if (info.model != null) {
821 	    // NOTE:  not all parsers can report such duplicates.
822 	    // VC: Unique Element Type Declaration
823 	    error ("Element type '" + name
824 		+ "' was already declared.");
825 	} else {
826 	    info.model = model;
827 
828 	    // VC: No Duplicate Types (in mixed content models)
829 	    if (model.charAt (1) == '#') 	// (#PCDATA...
830 		info.getRecognizer (this);
831 	}
832 	super.elementDecl (name, model);
833     }
834 
835     /**
836      * <b>DecllHandler</b> passed to the next consumer, unless this
837      * one was preloaded with a particular DTD
838      */
internalEntityDecl(String name, String value)839     public void internalEntityDecl (String name, String value)
840     throws SAXException
841     {
842 	if (!disableDeclarations)
843 	    super.internalEntityDecl (name, value);
844     }
845 
846     /**
847      * <b>DecllHandler</b> passed to the next consumer, unless this
848      * one was preloaded with a particular DTD
849      */
externalEntityDecl(String name, String publicId, String systemId)850     public void externalEntityDecl (String name,
851     	String publicId, String systemId)
852     throws SAXException
853     {
854 	if (!disableDeclarations)
855 	    super.externalEntityDecl (name, publicId, systemId);
856     }
857 
858 
859     /**
860      * <b>DTDHandler</b> Records the notation name, for checking
861      * NOTATIONS attribute values and declararations of unparsed
862      * entities.  Passed to the next consumer, unless this one was
863      * preloaded with a particular DTD.
864      */
notationDecl(String name, String publicId, String systemId)865     public void notationDecl (String name, String publicId, String systemId)
866     throws SAXException
867     {
868 	if (disableDeclarations)
869 	    return;
870 
871 	notations.addElement (name);
872 	super.notationDecl (name, publicId, systemId);
873     }
874 
875     /**
876      * <b>DTDHandler</b> Records the entity name, for checking
877      * ENTITY and ENTITIES attribute values; records the notation
878      * name if it hasn't yet been declared.  Passed to the next consumer,
879      * unless this one was preloaded with a particular DTD.
880      */
unparsedEntityDecl( String name, String publicId, String systemId, String notationName )881     public void unparsedEntityDecl (
882 	String name,
883 	String publicId,
884 	String systemId,
885 	String notationName
886     ) throws SAXException
887     {
888 	if (disableDeclarations)
889 	    return;
890 
891 	unparsed.addElement (name);
892 	if (!notations.contains (notationName))
893 	    nDeferred.addElement (notationName);
894 	super.unparsedEntityDecl (name, publicId, systemId, notationName);
895     }
896 
897 
898     /**
899      * <b>ContentHandler</b> Ensures that state from any previous parse
900      * has been deleted.
901      * Passed to the next consumer.
902      */
startDocument()903     public void startDocument ()
904     throws SAXException
905     {
906 	resetState ();
907 	super.startDocument ();
908     }
909 
910 
isAsciiLetter(char c)911     private static boolean isAsciiLetter (char c)
912     {
913 	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
914     }
915 
916 
917     /**
918      * <b>ContentHandler</b> Reports a fatal exception.  Validating
919      * XML processors may not skip any entities.
920      */
skippedEntity(String name)921     public void skippedEntity (String name)
922     throws SAXException
923     {
924 	fatalError ("may not skip entities");
925     }
926 
927     /*
928      * SAX2 doesn't expand non-PE refs in attribute defaults...
929      */
expandDefaultRefs(String s)930     private String expandDefaultRefs (String s)
931     throws SAXException
932     {
933 	if (s.indexOf ('&') < 0)
934 	    return s;
935 
936 // FIXME: handle &#nn; &#xnn; &name;
937 	String message = "Can't expand refs in attribute default: " + s;
938 	warning (message);
939 
940 	return s;
941     }
942 
943     /**
944      * <b>ContentHandler</b> Performs validity checks against element
945      * (and document) content models, and attribute values.
946      * Passed to the next consumer.
947      */
startElement( String uri, String localName, String qName, Attributes atts )948     public void startElement (
949 	String		uri,
950 	String		localName,
951 	String		qName,
952 	Attributes	atts
953     ) throws SAXException
954     {
955 	//
956 	// First check content model for the enclosing scope.
957 	//
958 	if (contentStack.isEmpty ()) {
959 	    // VC:  Root Element Type
960 	    if (!qName.equals (rootName)) {
961 		if (rootName == null)
962 		    warning ("This document has no DTD, can't be valid");
963 		else
964 		    error ("Root element type '" + qName
965 			+ "' was declared to be '" + rootName + "'");
966 	    }
967 	} else {
968 	    Recognizer state = (Recognizer) contentStack.peek ();
969 
970 	    if (state != null) {
971 		Recognizer newstate = state.acceptElement (qName);
972 
973 		if (newstate == null)
974 		    error ("Element type '" + qName
975 			+ "' in element '" + state.type.name
976 			+ "' violates content model " + state.type.model
977 			);
978 		if (newstate != state) {
979 		    contentStack.pop ();
980 		    contentStack.push (newstate);
981 		}
982 	    }
983 	}
984 
985 	//
986 	// Then check that this element was declared, and push the
987 	// object used to validate its content model onto our stack.
988 	//
989 	// This is where the recognizer gets created, if needed; if
990 	// it's a "children" (elements) content model, an NDFA is
991 	// created.  (One recognizer is used per content type, no
992 	// matter how complex that recognizer is.)
993 	//
994 	ElementInfo		info;
995 
996 	info = (ElementInfo) elements.get (qName);
997 	if (info == null || info.model == null) {
998 	    // VC: Element Valid (base clause)
999 	    error ("Element type '" + qName + "' was not declared");
1000 	    contentStack.push (null);
1001 
1002 	    // for less diagnostic noise, fake a declaration.
1003 	    elementDecl (qName, "ANY");
1004 	} else
1005 	    contentStack.push (info.getRecognizer (this));
1006 
1007 	//
1008 	// Then check each attribute present
1009 	//
1010 	int			len;
1011 	String			aname;
1012 	AttributeInfo		ainfo;
1013 
1014 	if (atts != null)
1015 	    len = atts.getLength ();
1016 	else
1017 	    len = 0;
1018 
1019 	for (int i = 0; i < len; i++) {
1020 	    aname = atts.getQName (i);
1021 
1022 	    if (info == null
1023 		    || (ainfo = (AttributeInfo) info.attributes.get (aname))
1024 			    == null) {
1025 		// VC: Attribute Value Type
1026 		error ("Attribute '" + aname
1027 		    + "' was not declared for element type " + qName);
1028 		continue;
1029 	    }
1030 
1031 	    String value = atts.getValue (i);
1032 
1033 	    // note that "==" for type names and "#FIXED" is correct
1034 	    // (and fast) since we've interned those literals.
1035 
1036 	    if ("#FIXED" == ainfo.mode) {
1037 		String expanded = expandDefaultRefs (ainfo.value);
1038 
1039 		// VC: Fixed Attribute Default
1040 		if (!value.equals (expanded)) {
1041 		    error ("Attribute '" + aname
1042 			+ "' must match " + expanded
1043 			);
1044 		    continue;
1045 		}
1046 	    }
1047 
1048 	    if ("CDATA" == ainfo.type)
1049 		continue;
1050 
1051 	    //
1052 	    // For all other attribute types, there are various
1053 	    // rules to follow.
1054 	    //
1055 
1056 	    if ("ID" == ainfo.type) {
1057 		// VC: ID (must be a name)
1058 		if (isName (value, "ID attribute", aname)) {
1059 		    if (Boolean.TRUE == ids.get (value))
1060 			// VC: ID (appears once)
1061 			error ("ID attribute " + aname
1062 			    + " uses an ID value '" + value
1063 			    + "' which was already declared.");
1064 		    else
1065 			// any forward refs are no longer problems
1066 			ids.put (value, Boolean.TRUE);
1067 		}
1068 		continue;
1069 	    }
1070 
1071 	    if ("IDREF" == ainfo.type) {
1072 		// VC: IDREF (value must be a name)
1073 		if (isName (value, "IDREF attribute", aname)) {
1074 		    // VC: IDREF (must match some ID attribute)
1075 		    if (ids.get (value) == null)
1076 			// new -- assume it's a forward ref
1077 			ids.put (value, Boolean.FALSE);
1078 		}
1079 		continue;
1080 	    }
1081 
1082 	    if ("IDREFS" == ainfo.type) {
1083 		StringTokenizer	tokens = new StringTokenizer (value, " ");
1084 
1085 		if (!tokens.hasMoreTokens ()) {
1086 		    // VC: IDREF (one or more values)
1087 		    error ("IDREFS attribute " + aname
1088 			+ " must have at least one ID ref");
1089 		} else do {
1090 		    String id = tokens.nextToken ();
1091 
1092 		    // VC: IDREF (value must be a name)
1093 		    if (isName (id, "IDREFS attribute", aname)) {
1094 			// VC: IDREF (must match some ID attribute)
1095 			if (ids.get (id) == null)
1096 			    // new -- assume it's a forward ref
1097 			    ids.put (id, Boolean.FALSE);
1098 		    }
1099 		} while (tokens.hasMoreTokens ());
1100 		continue;
1101 	    }
1102 
1103 	    if ("NMTOKEN" == ainfo.type) {
1104 		// VC: Name Token (is a name token)
1105 		isNmtoken (value, "NMTOKEN attribute", aname);
1106 		continue;
1107 	    }
1108 
1109 	    if ("NMTOKENS" == ainfo.type) {
1110 		StringTokenizer	tokens = new StringTokenizer (value, " ");
1111 
1112 		if (!tokens.hasMoreTokens ()) {
1113 		    // VC: Name Token (one or more values)
1114 		    error ("NMTOKENS attribute " + aname
1115 			+ " must have at least one name token");
1116 		} else do {
1117 		    String token = tokens.nextToken ();
1118 
1119 		    // VC: Name Token (is a name token)
1120 		    isNmtoken (token, "NMTOKENS attribute", aname);
1121 		} while (tokens.hasMoreTokens ());
1122 		continue;
1123 	    }
1124 
1125 	    if ("ENTITY" == ainfo.type) {
1126 		if (!unparsed.contains (value))
1127 		    // VC: Entity Name
1128 		    error ("Value of attribute '" + aname
1129 			+ "' refers to unparsed entity '" + value
1130 			+ "' which was not declared.");
1131 		continue;
1132 	    }
1133 
1134 	    if ("ENTITIES" == ainfo.type) {
1135 		StringTokenizer	tokens = new StringTokenizer (value, " ");
1136 
1137 		if (!tokens.hasMoreTokens ()) {
1138 		    // VC: Entity Name (one or more values)
1139 		    error ("ENTITIES attribute " + aname
1140 			+ " must have at least one name token");
1141 		} else do {
1142 		    String entity = tokens.nextToken ();
1143 
1144 		    if (!unparsed.contains (entity))
1145 			// VC: Entity Name
1146 			error ("Value of attribute '" + aname
1147 			    + "' refers to unparsed entity '" + entity
1148 			    + "' which was not declared.");
1149 		} while (tokens.hasMoreTokens ());
1150 		continue;
1151 	    }
1152 
1153 	    //
1154 	    // check for enumerations last; more expensive
1155 	    //
1156 	    if (ainfo.type.charAt (0) == '(' /*)*/
1157 		    || ainfo.type.startsWith ("NOTATION ")
1158 		    ) {
1159 		// VC: Enumeration (value must be defined)
1160 		checkEnumeration (value, ainfo.type, aname);
1161 		continue;
1162 	    }
1163 	}
1164 
1165 	//
1166 	// Last, check that all #REQUIRED attributes were provided
1167 	//
1168 	if (info != null) {
1169 	    Hashtable	table = info.attributes;
1170 
1171 	    if (table.size () != 0) {
1172 		Enumeration	e = table.keys ();
1173 
1174 		// XXX table.keys uses the heap, bleech -- slows things
1175 
1176 		while (e.hasMoreElements ()) {
1177 		    aname = (String) e.nextElement ();
1178 		    ainfo = (AttributeInfo) table.get (aname);
1179 
1180 		    // "#REQUIRED" mode was interned in attributeDecl
1181 		    if ("#REQUIRED" == ainfo.mode
1182 			    && atts.getValue (aname) == null) {
1183 			// VC: Required Attribute
1184 			error ("Attribute '" + aname + "' must be specified "
1185 			    + "for element type " + qName);
1186 		    }
1187 		}
1188 	    }
1189 	}
1190 	super.startElement (uri, localName, qName, atts);
1191     }
1192 
1193     /**
1194      * <b>ContentHandler</b> Reports a validity error if the element's content
1195      * model does not permit character data.
1196      * Passed to the next consumer.
1197      */
characters(char ch [], int start, int length)1198     public void characters (char ch [], int start, int length)
1199     throws SAXException
1200     {
1201 	Recognizer state;
1202 
1203 	if (contentStack.empty ())
1204 	    state = null;
1205 	else
1206 	    state = (Recognizer) contentStack.peek ();
1207 
1208 	// NOTE:  if this ever supports with SAX parsers that don't
1209 	// report ignorable whitespace as such (only XP?), this class
1210 	// needs to morph it into ignorableWhitespace() as needed ...
1211 
1212 	if (state != null && !state.acceptCharacters ())
1213 	    // VC: Element Valid (clauses three, four -- see recognizer)
1214 	    error ("Character content not allowed in element "
1215 		+ state.type.name);
1216 
1217 	super.characters (ch, start, length);
1218     }
1219 
1220 
1221     /**
1222      * <b>ContentHandler</b> Reports a validity error if the element's content
1223      * model does not permit end-of-element yet, or a well formedness error
1224      * if there was no matching startElement call.
1225      * Passed to the next consumer.
1226      */
endElement(String uri, String localName, String qName)1227     public void endElement (String uri, String localName, String qName)
1228     throws SAXException
1229     {
1230 	try {
1231 	    Recognizer state = (Recognizer) contentStack.pop ();
1232 
1233 	    if (state != null && !state.completed ())
1234 		// VC: Element valid (clauses two, three, four; see Recognizer)
1235 		error ("Premature end for element '"
1236 		    + state.type.name
1237 		    + "', content model "
1238 		    + state.type.model);
1239 
1240 	    // could insist on match of start element, but that's
1241 	    // something the input stream must to guarantee.
1242 
1243 	} catch (EmptyStackException e) {
1244 	    fatalError ("endElement without startElement: " + qName
1245 		+ ((uri == null)
1246 		    ? ""
1247 		    : ( " { '" + uri + "', " + localName + " }")));
1248 	}
1249 	super.endElement (uri, localName, qName);
1250     }
1251 
1252     /**
1253      * <b>ContentHandler</b> Checks whether all ID values that were
1254      * referenced have been declared, and releases all resources.
1255      * Passed to the next consumer.
1256      *
1257      * @see #setDocumentLocator
1258      */
endDocument()1259     public void endDocument ()
1260     throws SAXException
1261     {
1262 	for (Enumeration idNames = ids.keys ();
1263 		idNames.hasMoreElements ();
1264 		/* NOP */) {
1265 	    String id = (String) idNames.nextElement ();
1266 
1267 	    if (Boolean.FALSE == ids.get (id)) {
1268 		// VC: IDREF (must match ID)
1269 		error ("Undeclared ID value '" + id
1270 		    + "' was referred to by an IDREF/IDREFS attribute");
1271 	    }
1272 	}
1273 
1274 	resetState ();
1275 	super.endDocument ();
1276     }
1277 
1278 
1279     /** Holds per-element declarations */
1280     static private final class ElementInfo
1281     {
1282 	String			name;
1283 	String			model;
1284 
1285 	// key = attribute name; value = AttributeInfo
1286 	Hashtable		attributes = new Hashtable (11);
1287 
ElementInfo(String n)1288 	ElementInfo (String n) { name = n; }
1289 
1290 	private Recognizer	recognizer;
1291 
1292 	// for validating content models:  one per type, shared,
1293 	// and constructed only on demand ... so unused elements do
1294 	// not need to consume resources.
getRecognizer(ValidationConsumer consumer)1295 	Recognizer	getRecognizer (ValidationConsumer consumer)
1296 	throws SAXException
1297 	{
1298 	    if (recognizer == null) {
1299 		if ("ANY".equals (model))
1300 		    recognizer = ANY;
1301 		else if ("EMPTY".equals (model))
1302 		    recognizer = new EmptyRecognizer (this);
1303 		else if ('#' == model.charAt (1))
1304 		    // n.b. this constructor does a validity check
1305 		    recognizer = new MixedRecognizer (this, consumer);
1306 		else
1307 		    recognizer = new ChildrenRecognizer (this, consumer);
1308 	    }
1309 	    return recognizer;
1310 	}
1311     }
1312 
1313     /** Holds per-attribute declarations */
1314     static private final class AttributeInfo
1315     {
1316 	String	type;
1317 	String	mode;		// #REQUIRED, etc (or null)
1318 	String	value;		// or null
1319     }
1320 
1321 
1322     //
1323     // Content model validation
1324     //
1325 
1326     static private final Recognizer	ANY = new Recognizer (null);
1327 
1328 
1329     // Base class defines the calls used to validate content,
1330     // and supports the "ANY" content model
1331     static private class Recognizer
1332     {
1333 	final ElementInfo	type;
1334 
Recognizer(ElementInfo t)1335 	Recognizer (ElementInfo t) { type = t; }
1336 
1337 	// return true iff character data is legal here
acceptCharacters()1338 	boolean acceptCharacters ()
1339 	throws SAXException
1340 	    // VC: Element Valid (third and fourth clauses)
1341 	    { return true; }
1342 
1343 	// null return = failure
1344 	// otherwise, next state (like an FSM)
1345 	// prerequisite: tested that name was declared
acceptElement(String name)1346 	Recognizer acceptElement (String name)
1347 	throws SAXException
1348 	    // VC: Element Valid (fourth clause)
1349 	    { return this; }
1350 
1351 	// return true iff model is completed, can finish
completed()1352 	boolean completed ()
1353 	throws SAXException
1354 	    // VC: Element Valid (fourth clause)
1355 	    { return true; }
1356 
toString()1357 	public String toString ()
1358 	    // n.b. "children" is the interesting case!
1359 	    { return (type == null) ? "ANY" : type.model; }
1360     }
1361 
1362     // "EMPTY" content model -- no characters or elements
1363     private static final class EmptyRecognizer extends Recognizer
1364     {
EmptyRecognizer(ElementInfo type)1365 	public EmptyRecognizer (ElementInfo type)
1366 	    { super (type); }
1367 
1368 	// VC: Element Valid (first clause)
acceptCharacters()1369 	boolean acceptCharacters ()
1370 	    { return false; }
1371 
1372 	// VC: Element Valid (first clause)
acceptElement(String name)1373 	Recognizer acceptElement (String name)
1374 	    { return null; }
1375     }
1376 
1377     // "Mixed" content model -- ANY, but restricts elements
1378     private static final class MixedRecognizer extends Recognizer
1379     {
1380 	private String	permitted [];
1381 
1382 	// N.B. constructor tests for duplicated element names (VC)
MixedRecognizer(ElementInfo t, ValidationConsumer v)1383 	public MixedRecognizer (ElementInfo t, ValidationConsumer v)
1384 	throws SAXException
1385 	{
1386 	    super (t);
1387 
1388 	    // (#PCDATA...)* or (#PCDATA) ==> ... or empty
1389 	    // with the "..." being "|elname|..."
1390 	    StringTokenizer	tokens = new StringTokenizer (
1391 		t.model.substring (8, t.model.lastIndexOf (')')),
1392 		"|");
1393 	    Vector		vec = new Vector ();
1394 
1395 	    while (tokens.hasMoreTokens ()) {
1396 		String token = tokens.nextToken ();
1397 
1398 		if (vec.contains (token))
1399 		    v.error ("element " + token
1400 			+ " is repeated in mixed content model: "
1401 			+ t.model);
1402 		else
1403 		    vec.addElement (token.intern ());
1404 	    }
1405 	    permitted = new String [vec.size ()];
1406 	    for (int i = 0; i < permitted.length; i++)
1407 		permitted [i] = (String) vec.elementAt (i);
1408 
1409 	    // in one large machine-derived DTD sample, most of about
1410 	    // 250 mixed content models were empty, and 25 had ten or
1411 	    // more entries.  2 had over a hundred elements.  Linear
1412 	    // search isn't obviously wrong.
1413 	}
1414 
1415 	// VC: Element Valid (third clause)
acceptElement(String name)1416 	Recognizer acceptElement (String name)
1417 	{
1418 	    int		length = permitted.length;
1419 
1420 	    // first pass -- optimistic w.r.t. event source interning
1421 	    // (and document validity)
1422 	    for (int i = 0; i < length; i++)
1423 		if (permitted [i] == name)
1424 		    return this;
1425 	    // second pass -- pessimistic w.r.t. event source interning
1426 	    for (int i = 0; i < length; i++)
1427 		if (permitted [i].equals (name))
1428 		    return this;
1429 	    return null;
1430 	}
1431     }
1432 
1433 
1434     // recognizer loop flags, see later
1435     private static final int		F_LOOPHEAD = 0x01;
1436     private static final int		F_LOOPNEXT = 0x02;
1437 
1438     // for debugging -- used to label/count nodes in toString()
1439     private static int			nodeCount;
1440 
1441     /**
1442      * "Children" content model -- these are nodes in NDFA state graphs.
1443      * They work in fixed space.  Note that these graphs commonly have
1444      * cycles, handling features such as zero-or-more and one-or-more.
1445      *
1446      * <p>It's readonly, so only one copy is ever needed.  The content model
1447      * stack may have any number of pointers into each graph, when a model
1448      * happens to be needed more than once due to element nesting.  Since
1449      * traversing the graph just moves to another node, and never changes
1450      * it, traversals never interfere with each other.
1451      *
1452      * <p>There is an option to report non-deterministic models.  These are
1453      * always XML errors, but ones which are not often reported despite the
1454      * fact that they can lead to different validating parsers giving
1455      * different results for the same input.  (The XML spec doesn't require
1456      * them to be reported.)
1457      *
1458      * <p><b>FIXME</b> There's currently at least one known bug here, in that
1459      * it's not actually detecting the non-determinism it tries to detect.
1460      * (Of the "optional.xml" test, the once-or-twice-2* tests are all non-D;
1461      * maybe some others.)  This may relate to the issue flagged below as
1462      * "should not" happen (but it was), which showed up when patching the
1463      * graph to have one exit node (or more EMPTY nodes).
1464      */
1465     private static final class ChildrenRecognizer extends Recognizer
1466 	implements Cloneable
1467     {
1468 	// for reporting non-deterministic content models
1469 	// ... a waste of space if we're not reporting those!
1470 	// ... along with the 'model' member (in base class)
1471 	private ValidationConsumer	consumer;
1472 
1473 	// for CHOICE nodes -- each component is an arc that
1474 	// accepts a different NAME (or is EMPTY indicating
1475 	// NDFA termination).
1476 	private Recognizer		components [];
1477 
1478 	// for NAME/SEQUENCE nodes -- accepts that NAME and
1479 	// then goes to the next node (CHOICE, NAME, EMPTY).
1480 	private String			name;
1481 	private Recognizer		next;
1482 
1483 	// loops always point back to a CHOICE node. we mark such choice
1484 	// nodes (F_LOOPHEAD) for diagnostics and faster deep cloning.
1485 	// We also mark nodes before back pointers (F_LOOPNEXT), to ensure
1486 	// termination when we patch sequences and loops.
1487 	private int			flags;
1488 
1489 
1490 	// prevent a needless indirection between 'this' and 'node'
copyIn(ChildrenRecognizer node)1491 	private void copyIn (ChildrenRecognizer node)
1492 	{
1493 	    // model & consumer are already set
1494 	    components = node.components;
1495 	    name = node.name;
1496 	    next = node.next;
1497 	    flags = node.flags;
1498 	}
1499 
1500 	// used to construct top level "children" content models,
ChildrenRecognizer(ElementInfo type, ValidationConsumer vc)1501 	public ChildrenRecognizer (ElementInfo type, ValidationConsumer vc)
1502 	{
1503 	    this (vc, type);
1504 	    populate (type.model.toCharArray (), 0);
1505 	    patchNext (new EmptyRecognizer (type), null);
1506 	}
1507 
1508 	// used internally; populating is separate
ChildrenRecognizer(ValidationConsumer vc, ElementInfo type)1509 	private ChildrenRecognizer (ValidationConsumer vc, ElementInfo type)
1510 	{
1511 	    super (type);
1512 	    consumer = vc;
1513 	}
1514 
1515 
1516 	//
1517 	// When rewriting some graph nodes we need deep clones in one case;
1518 	// mostly shallow clones (what the JVM handles for us) are fine.
1519 	//
shallowClone()1520 	private ChildrenRecognizer shallowClone ()
1521 	{
1522 	    try {
1523 		return (ChildrenRecognizer) clone ();
1524 	    } catch (CloneNotSupportedException e) {
1525 		throw new Error ("clone");
1526 	    }
1527 	}
1528 
deepClone()1529 	private ChildrenRecognizer deepClone ()
1530 	{
1531 	    return deepClone (new Hashtable (37));
1532 	}
1533 
deepClone(Hashtable table)1534 	private ChildrenRecognizer deepClone (Hashtable table)
1535 	{
1536 	    ChildrenRecognizer retval;
1537 
1538 	    if ((flags & F_LOOPHEAD) != 0) {
1539 		retval = (ChildrenRecognizer) table.get (this);
1540 		if (retval != null)
1541 		    return this;
1542 
1543 		retval = shallowClone ();
1544 		table.put (this, retval);
1545 	    } else
1546 		retval = shallowClone ();
1547 
1548 	    if (next != null) {
1549 		if (next instanceof ChildrenRecognizer)
1550 		    retval.next = ((ChildrenRecognizer)next)
1551 			    .deepClone (table);
1552 		else if (!(next instanceof EmptyRecognizer))
1553 		    throw new RuntimeException ("deepClone");
1554 	    }
1555 
1556 	    if (components != null) {
1557 		retval.components = new Recognizer [components.length];
1558 		for (int i = 0; i < components.length; i++) {
1559 		    Recognizer temp = components [i];
1560 
1561 		    if (temp == null)
1562 			retval.components [i] = null;
1563 		    else if (temp instanceof ChildrenRecognizer)
1564 			retval.components [i] = ((ChildrenRecognizer)temp)
1565 				.deepClone (table);
1566 		    else if (!(temp instanceof EmptyRecognizer))
1567 			throw new RuntimeException ("deepClone");
1568 		}
1569 	    }
1570 
1571 	    return retval;
1572 	}
1573 
1574 	// connect subgraphs, first to next (sequencing)
patchNext(Recognizer theNext, Hashtable table)1575 	private void patchNext (Recognizer theNext, Hashtable table)
1576 	{
1577 	    // backpointers must not be repatched or followed
1578 	    if ((flags & F_LOOPNEXT) != 0)
1579 		return;
1580 
1581 	    // XXX this table "shouldn't" be needed, right?
1582 	    // but some choice nodes looped if it isn't there.
1583 	    if (table != null && table.get (this) != null)
1584 		return;
1585 	    if (table == null)
1586 		table = new Hashtable ();
1587 
1588 	    // NAME/SEQUENCE
1589 	    if (name != null) {
1590 		if (next == null)
1591 		    next = theNext;
1592 		else if (next instanceof ChildrenRecognizer) {
1593 		    ((ChildrenRecognizer)next).patchNext (theNext, table);
1594 		} else if (!(next instanceof EmptyRecognizer))
1595 		    throw new RuntimeException ("patchNext");
1596 		return;
1597 	    }
1598 
1599 	    // CHOICE
1600 	    for (int i = 0; i < components.length; i++) {
1601 		if (components [i] == null)
1602 		    components [i] = theNext;
1603 		else if (components [i] instanceof ChildrenRecognizer) {
1604 		    ((ChildrenRecognizer)components [i])
1605 			    .patchNext (theNext, table);
1606 		} else if (!(components [i] instanceof EmptyRecognizer))
1607 		    throw new RuntimeException ("patchNext");
1608 	    }
1609 
1610 	    if (table != null && (flags & F_LOOPHEAD) != 0)
1611 		table.put (this, this);
1612 	}
1613 
1614 	/**
1615 	 * Parses a 'children' spec (or recursively 'cp') and makes this
1616 	 * become a regular graph node.
1617 	 *
1618 	 * @return index after this particle
1619 	 */
populate(char parseBuf [], int startPos)1620 	private int populate (char parseBuf [], int startPos)
1621 	{
1622 	    int		nextPos = startPos + 1;
1623 	    char	c;
1624 
1625 	    if (nextPos < 0 || nextPos >= parseBuf.length)
1626 		throw new IndexOutOfBoundsException ();
1627 
1628 	    // Grammar of the string is from the XML spec, but
1629 	    // with whitespace removed by the SAX parser.
1630 
1631 	    // children ::= (choice | seq) ('?' | '*' | '+')?
1632 	    // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1633 	    // choice ::= '(' cp ('|' choice)* ')'
1634 	    // seq ::= '(' cp (',' choice)* ')'
1635 
1636 	    // interior nodes only
1637 	    //   cp ::= name ...
1638 	    if (parseBuf [startPos] != '('/*)*/) {
1639 		boolean		done = false;
1640 		do {
1641 		    switch (c = parseBuf [nextPos]) {
1642 			case '?': case '*': case '+':
1643 			case '|': case ',':
1644 			case /*(*/ ')':
1645 			    done = true;
1646 			    continue;
1647 			default:
1648 			    nextPos++;
1649 			    continue;
1650 		    }
1651 		} while (!done);
1652 		name = new String (parseBuf, startPos, nextPos - startPos);
1653 
1654 	    // interior OR toplevel nodes
1655 	    //   cp ::= choice ..
1656 	    //   cp ::= seq ..
1657 	    } else {
1658 		// collect everything as a separate list, and merge it
1659 		// into "this" later if we can (SEQUENCE or singleton)
1660 		ChildrenRecognizer	first;
1661 
1662 		first = new ChildrenRecognizer (consumer, type);
1663 		nextPos = first.populate (parseBuf, nextPos);
1664 		c = parseBuf [nextPos++];
1665 
1666 		if (c == ',' || c == '|') {
1667 		    ChildrenRecognizer	current = first;
1668 		    char		separator = c;
1669 		    Vector		v = null;
1670 
1671 		    if (separator == '|') {
1672 			v = new Vector ();
1673 			v.addElement (first);
1674 		    }
1675 
1676 		    do {
1677 			ChildrenRecognizer link;
1678 
1679 			link = new ChildrenRecognizer (consumer, type);
1680 			nextPos = link.populate (parseBuf, nextPos);
1681 
1682 			if (separator == ',') {
1683 			    current.patchNext (link, null);
1684 			    current = link;
1685 			} else
1686 			    v.addElement (link);
1687 
1688 			c = parseBuf [nextPos++];
1689 		    } while (c == separator);
1690 
1691 		    // choice ... collect everything into one array.
1692 		    if (separator == '|') {
1693 			// assert v.size() > 1
1694 			components = new Recognizer [v.size ()];
1695 			for (int i = 0; i < components.length; i++) {
1696 			    components [i] = (Recognizer)
1697 				    v.elementAt (i);
1698 			}
1699 			// assert flags == 0
1700 
1701 		    // sequence ... merge into "this" to be smaller.
1702 		    } else
1703 			copyIn (first);
1704 
1705 		// treat singletons like one-node sequences.
1706 		} else
1707 		    copyIn (first);
1708 
1709 		if (c != /*(*/ ')')
1710 		    throw new RuntimeException ("corrupt content model");
1711 	    }
1712 
1713 	    //
1714 	    // Arity is optional, and the root of all fun.  We keep the
1715 	    // FSM state graph simple by only having NAME/SEQUENCE and
1716 	    // CHOICE nodes (or EMPTY to terminate a model), easily
1717 	    // evaluated.  So we rewrite each node that has arity, using
1718 	    // those primitives.  We create loops here, if needed.
1719 	    //
1720 	    if (nextPos < parseBuf.length) {
1721 		c = parseBuf [nextPos];
1722 		if (c == '?' || c == '*' || c == '+') {
1723 		    nextPos++;
1724 
1725 		    // Rewrite 'zero-or-one' "?" arity to a CHOICE:
1726 		    //   - SEQUENCE (clone, what's next)
1727 		    //   - or, what's next
1728 		    // Size cost: N --> N + 1
1729 		    if (c == '?') {
1730 			Recognizer		once = shallowClone ();
1731 
1732 			components = new Recognizer [2];
1733 			components [0] = once;
1734 			// components [1] initted to null
1735 			name = null;
1736 			next = null;
1737 			flags = 0;
1738 
1739 
1740 		    // Rewrite 'zero-or-more' "*" arity to a CHOICE.
1741 		    //   - LOOP (clone, back to this CHOICE)
1742 		    //   - or, what's next
1743 		    // Size cost: N --> N + 1
1744 		    } else if (c == '*') {
1745 			ChildrenRecognizer	loop = shallowClone ();
1746 
1747 			loop.patchNext (this, null);
1748 			loop.flags |= F_LOOPNEXT;
1749 			flags = F_LOOPHEAD;
1750 
1751 			components = new Recognizer [2];
1752 			components [0] = loop;
1753 			// components [1] initted to null
1754 			name = null;
1755 			next = null;
1756 
1757 
1758 		    // Rewrite 'one-or-more' "+" arity to a SEQUENCE.
1759 		    // Basically (a)+ --> ((a),(a)*).
1760 		    //   - this
1761 		    //   - CHOICE
1762 		    //	    * LOOP (clone, back to the CHOICE)
1763 		    //	    * or, whatever's next
1764 		    // Size cost: N --> 2N + 1
1765 		    } else if (c == '+') {
1766 			ChildrenRecognizer loop = deepClone ();
1767 			ChildrenRecognizer choice;
1768 
1769 			choice = new ChildrenRecognizer (consumer, type);
1770 			loop.patchNext (choice, null);
1771 			loop.flags |= F_LOOPNEXT;
1772 			choice.flags = F_LOOPHEAD;
1773 
1774 			choice.components = new Recognizer [2];
1775 			choice.components [0] = loop;
1776 			// choice.components [1] initted to null
1777 			// choice.name, choice.next initted to null
1778 
1779 			patchNext (choice, null);
1780 		    }
1781 		}
1782 	    }
1783 
1784 	    return nextPos;
1785 	}
1786 
1787 	// VC: Element Valid (second clause)
acceptCharacters()1788 	boolean acceptCharacters ()
1789 	    { return false; }
1790 
1791 	// VC: Element Valid (second clause)
acceptElement(String type)1792 	Recognizer acceptElement (String type)
1793 	throws SAXException
1794 	{
1795 	    // NAME/SEQUENCE
1796 	    if (name != null) {
1797 		if (name.equals (type))
1798 		    return next;
1799 		return null;
1800 	    }
1801 
1802 	    // CHOICE ... optionally reporting nondeterminism we
1803 	    // run across.  we won't check out every transition
1804 	    // for nondeterminism; only the ones we follow.
1805 	    Recognizer	retval = null;
1806 
1807 	    for (int i = 0; i < components.length; i++) {
1808 		Recognizer temp = components [i].acceptElement (type);
1809 
1810 		if (temp == null)
1811 		    continue;
1812 		else if (!warnNonDeterministic)
1813 		    return temp;
1814 		else if (retval == null)
1815 		    retval = temp;
1816 		else if (retval != temp)
1817 		    consumer.error ("Content model " + this.type.model
1818 			+ " is non-deterministic for " + type);
1819 	    }
1820 	    return retval;
1821 	}
1822 
1823 	// VC: Element Valid (second clause)
completed()1824 	boolean completed ()
1825 	throws SAXException
1826 	{
1827 	    // expecting a specific element
1828 	    if (name != null)
1829 		return false;
1830 
1831 	    // choice, some sequences
1832 	    for (int i = 0; i < components.length; i++) {
1833 		if (components [i].completed ())
1834 		    return true;
1835 	    }
1836 
1837 	    return false;
1838 	}
1839 
1840 /** /
1841 	// FOR DEBUGGING ... flattens the graph for printing.
1842 
1843 	public String toString ()
1844 	{
1845 	    StringBuffer buf = new StringBuffer ();
1846 
1847 	    // only one set of loop labels can be generated
1848 	    // at a time...
1849 	    synchronized (ANY) {
1850 		nodeCount = 0;
1851 
1852 		toString (buf, new Hashtable ());
1853 		return buf.toString ();
1854 	    }
1855 	}
1856 
1857 	private void toString (StringBuffer buf, Hashtable table)
1858 	{
1859 	    // When we visit a node, label and count it.
1860 	    // Nodes are never visited/counted more than once.
1861 	    // For small models labels waste space, but if arity
1862 	    // mappings were used the savings are substantial.
1863 	    // (Plus, the output can be more readily understood.)
1864 	    String temp = (String) table.get (this);
1865 
1866 	    if (temp != null) {
1867 		buf.append ('{');
1868 		buf.append (temp);
1869 		buf.append ('}');
1870 		return;
1871 	    } else {
1872 		StringBuffer scratch = new StringBuffer (15);
1873 
1874 		if ((flags & F_LOOPHEAD) != 0)
1875 		    scratch.append ("loop");
1876 		else
1877 		    scratch.append ("node");
1878 		scratch.append ('-');
1879 		scratch.append (++nodeCount);
1880 		temp = scratch.toString ();
1881 
1882 		table.put (this, temp);
1883 		buf.append ('[');
1884 		buf.append (temp);
1885 		buf.append (']');
1886 		buf.append (':');
1887 	    }
1888 
1889 	    // NAME/SEQUENCE
1890 	    if (name != null) {
1891 		// n.b. some output encodings turn some name chars into '?'
1892 		// e.g. with Japanese names and ASCII output
1893 		buf.append (name);
1894 		if (components != null)		// bug!
1895 		    buf.append ('$');
1896 		if (next == null)
1897 		    buf.append (",*");
1898 		else if (next instanceof EmptyRecognizer) // patch-to-next
1899 		    buf.append (",{}");
1900 		else if (next instanceof ChildrenRecognizer) {
1901 		    buf.append (',');
1902 		    ((ChildrenRecognizer)next).toString (buf, table);
1903 		} else				// bug!
1904 		    buf.append (",+");
1905 		return;
1906 	    }
1907 
1908 	    // CHOICE
1909 	    buf.append ("<");
1910 	    for (int i = 0; i < components.length; i++) {
1911 		if (i != 0)
1912 		    buf.append ("|");
1913 		if (components [i] instanceof EmptyRecognizer) {
1914 		    buf.append ("{}");
1915 		} else if (components [i] == null) {	// patch-to-next
1916 		    buf.append ('*');
1917 		} else {
1918 		    ChildrenRecognizer r;
1919 
1920 		    r = (ChildrenRecognizer) components [i];
1921 		    r.toString (buf, table);
1922 		}
1923 	    }
1924 	    buf.append (">");
1925 	}
1926 /**/
1927     }
1928 }
1929