1 /* ValidationConsumer.java --
2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 package gnu.xml.pipeline;
39 
40 import java.io.IOException;
41 import java.io.StringReader;
42 import java.io.StringWriter;
43 import java.util.EmptyStackException;
44 import java.util.Enumeration;
45 import java.util.Hashtable;
46 import java.util.Stack;
47 import java.util.StringTokenizer;
48 import java.util.Vector;
49 
50 import org.xml.sax.Attributes;
51 import org.xml.sax.EntityResolver;
52 import org.xml.sax.ErrorHandler;
53 import org.xml.sax.InputSource;
54 import org.xml.sax.Locator;
55 import org.xml.sax.SAXException;
56 import org.xml.sax.SAXParseException;
57 import org.xml.sax.XMLReader;
58 import org.xml.sax.helpers.XMLReaderFactory;
59 
60 /**
61  * This class checks SAX2 events to report validity errors; it works as
62  * both a filter and a terminus on an event pipeline.  It relies on the
63  * producer of SAX events to:  </p> <ol>
64  *
65  *      <li> Conform to the specification of a non-validating XML parser that
66  *      reads all external entities, reported using SAX2 events. </li>
67  *
68  *      <li> Report ignorable whitespace as such (through the ContentHandler
69  *      interface).  This is, strictly speaking, optional for nonvalidating
70  *      XML processors.  </li>
71  *
72  *      <li> Make SAX2 DeclHandler callbacks, with default
73  *      attribute values already normalized (and without "&lt;").</li>
74  *
75  *      <li> Make SAX2 LexicalHandler startDTD() and endDTD ()
76  *      callbacks. </li>
77  *
78  *      <li> Act as if the <em>(URI)/namespace-prefixes</em> property were
79  *      set to true, by providing XML 1.0 names and all <code>xmlns*</code>
80  *      attributes (rather than omitting either or both). </li>
81  *
82  *      </ol>
83  *
84  * <p> At this writing, the major SAX2 parsers (such as &AElig;lfred2,
85  * Crimson, and Xerces) meet these requirements, and this validation
86  * module is used by the optional &AElig;lfred2 validation support.
87  * </p>
88  *
89  * <p> Note that because this is a layered validator, it has to duplicate some
90  * work that the parser is doing; there are also other cost to layering.
91  * However, <em>because of layering it doesn't need a parser</em> in order
92  * to work! You can use it with anything that generates SAX events, such
93  * as an application component that wants to detect invalid content in
94  * a changed area without validating an entire document, or which wants to
95  * ensure that it doesn't write invalid data to a communications partner.</p>
96  *
97  * <p> Also, note that because this is a layered validator, the line numbers
98  * reported for some errors may seem strange.  For example, if an element does
99  * not permit character content, the validator
100  * will use the locator provided to it.
101  * That might reflect the last character of a <em>characters</em> event
102  * callback, rather than the first non-whitespace character. </p>
103  *
104  * <hr />
105  *
106  * <!--
107  * <p> Of interest is the fact that unlike most currently known XML validators,
108  * this one can report some cases of non-determinism in element content models.
109  * It is a compile-time option, enabled by default.  This will only report
110  * such XML errors if they relate to content actually appearing in a document;
111  * content models aren't aggressively scanned for non-deterministic structure.
112  * Documents which trigger such non-deterministic transitions may be handled
113  * differently by different validating parsers, without losing conformance
114  * to the XML specification. </p>
115  * -->
116  *
117  * <p> Current limitations of the validation performed are in roughly three
118  * categories.  </p>
119  *
120  * <p> The first category represents constraints which demand violations
121  * of software layering:  exposing lexical details, one of the first things
122  * that <em>application</em> programming interfaces (APIs) hide.  These
123  * invariably relate to XML entity handling, and to historical oddities
124  * of the XML validation semantics.  Curiously,
125  * recent (Autumn 1999) conformance testing showed that these constraints are
126  * among those handled worst by existing XML validating parsers.  Arguments
127  * have been made that each of these VCs should be turned into WFCs (most
128  * of them) or discarded (popular for the standalone declaration); in short,
129  * that these are bugs in the XML specification (not all via SGML): </p><ul>
130  *
131  *      <li> The <em>Proper Declaration/PE Nesting</em> and
132  *      <em>Proper Group/PE Nesting</em> VCs can't be tested because they
133  *      require access to particularly low level lexical level information.
134  *      In essence, the reason XML isn't a simple thing to parse is that
135  *      it's not a context free grammar, and these constraints elevate that
136  *      SGML-derived context sensitivity to the level of a semantic rule.
137  *
138  *      <li> The <em>Standalone Document Declaration</em> VC can't be
139  *      tested.  This is for two reasons.  First, this flag isn't made
140  *      available through SAX2.  Second, it also requires breaking that
141  *      lexical layering boundary.  (If you ever wondered why classes
142  *      in compiler construction or language design barely mention the
143  *      existence of context-sensitive grammars, it's because of messy
144  *      issues like these.)
145  *
146  *      <li> The <em>Entity Declared</em> VC can't be tested, because it
147  *      also requires breaking that lexical layering boundary!  There's also
148  *      another issue: the VC wording (and seemingly intent) is ambiguous.
149  *      (This is still true in the "Second edition" XML spec.)
150  *      Since there is a WFC of the same name, everyone's life would be
151  *      easier if references to undeclared parsed entities were always well
152  *      formedness errors, regardless of whether they're parameter entities
153  *      or not.  (Note that nonvalidating parsers are not required
154  *      to report all such well formedness errors if they don't read external
155  *      parameter entities, although currently most XML parsers read them
156  *      in an attempt to avoid problems from inconsistent parser behavior.)
157  *
158  *      </ul>
159  *
160  * <p> The second category of limitations on this validation represent
161  * constraints associated with information that is not guaranteed to be
162  * available (or in one case, <em>is guaranteed not to be available</em>,
163  * through the SAX2 API: </p><ul>
164  *
165  *      <li> The <em>Unique Element Type Declaration</em> VC may not be
166  *      reportable, if the underlying parser happens not to expose
167  *      multiple declarations.   (&AElig;lfred2 reports these validity
168  *      errors directly.)</li>
169  *
170  *      <li> Similarly, the <em>Unique Notation Name</em> VC, added in the
171  *      14-January-2000 XML spec errata to restrict typing models used by
172  *      elements, may not be reportable.  (&AElig;lfred reports these
173  *      validity errors directly.) </li>
174  *
175  *      </ul>
176  *
177  * <p> A third category relates to ease of implementation.  (Think of this
178  * as "bugs".)  The most notable issue here is character handling.  Rather
179  * than attempting to implement the voluminous character tables in the XML
180  * specification (Appendix B), Unicode rules are used directly from
181  * the java.lang.Character class.  Recent JVMs have begun to diverge from
182  * the original specification for that class (Unicode 2.0), meaning that
183  * different JVMs may handle that aspect of conformance differently.
184  * </p>
185  *
186  * <p> Note that for some of the validity errors that SAX2 does not
187  * expose, a nonvalidating parser is permitted (by the XML specification)
188  * to report validity errors.  When used with a parser that does so for
189  * the validity constraints mentioned above (or any other SAX2 event
190  * stream producer that does the same thing), overall conformance is
191  * substantially improved.
192  *
193  * @see gnu.xml.aelfred2.SAXDriver
194  * @see gnu.xml.aelfred2.XmlReader
195  *
196  * @author David Brownell
197  */
198 public final class ValidationConsumer extends EventFilter
199 {
200     // report error if we happen to notice a non-deterministic choice?
201     // we won't report buggy content models; just buggy instances
202     private static final boolean        warnNonDeterministic = false;
203 
204     // for tracking active content models
205     private String              rootName;
206     private Stack               contentStack = new Stack ();
207 
208     // flags for "saved DTD" processing
209     private boolean             disableDeclarations;
210     private boolean             disableReset;
211 
212     //
213     // most VCs get tested when we see element start tags.  the per-element
214     // info (including attributes) recorded here duplicates that found inside
215     // many nonvalidating parsers, hence dual lookups etc ... that's why a
216     // layered validator isn't going to be as fast as a non-layered one.
217     //
218 
219     // key = element name; value = ElementInfo
220     private Hashtable           elements = new Hashtable ();
221 
222     // some VCs relate to ID/IDREF/IDREFS attributes
223     // key = id; value = boolean true (defd) or false (refd)
224     private Hashtable           ids = new Hashtable ();
225 
226     // we just record declared notation and unparsed entity names.
227     // the implementation here is simple/slow; these features
228     // are seldom used, one hopes they'll wither away soon
229     private Vector              notations = new Vector (5, 5);
230     private Vector              nDeferred = new Vector (5, 5);
231     private Vector              unparsed = new Vector (5, 5);
232     private Vector              uDeferred = new Vector (5, 5);
233 
234         // note: DocBk 3.1.7 XML defines over 2 dozen notations,
235         // used when defining unparsed entities for graphics
236         // (and maybe in other places)
237 
238 
239 
240     /**
241      * Creates a pipeline terminus which consumes all events passed to
242      * it; this will report validity errors as if they were fatal errors,
243      * unless an error handler is assigned.
244      *
245      * @see #setErrorHandler
246      */
247         // constructor used by PipelineFactory
248             // ... and want one taking system ID of an external subset
ValidationConsumer()249     public ValidationConsumer ()
250     {
251         this (null);
252     }
253 
254     /**
255      * Creates a pipeline filter which reports validity errors and then
256      * passes events on to the next consumer if they were not fatal.
257      *
258      * @see #setErrorHandler
259      */
260         // constructor used by PipelineFactory
261             // ... and want one taking system ID of an external subset
262             // (which won't send declaration events)
ValidationConsumer(EventConsumer next)263     public ValidationConsumer (EventConsumer next)
264     {
265         super (next);
266 
267         setContentHandler (this);
268         setDTDHandler (this);
269         try { setProperty (DECL_HANDLER, this); }
270         catch (Exception e) { /* "can't happen" */ }
271         try { setProperty (LEXICAL_HANDLER, this); }
272         catch (Exception e) { /* "can't happen" */ }
273     }
274 
275 
276     private static final String fakeRootName
277         = ":Nobody:in:their_Right.Mind_would:use:this-name:1x:";
278 
279     /**
280      * Creates a validation consumer which is preloaded with the DTD provided.
281      * It does this by constructing a document with that DTD, then parsing
282      * that document and recording its DTD declarations.  Then it arranges
283      * not to modify that information.
284      *
285      * <p> The resulting validation consumer will only validate against
286      * the specified DTD, regardless of whether some other DTD is found
287      * in a document being parsed.
288      *
289      * @param rootName The name of the required root element; if this is
290      *  null, any root element name will be accepted.
291      * @param publicId If non-null and there is a non-null systemId, this
292      *  identifier provides an alternate access identifier for the DTD's
293      *  external subset.
294      * @param systemId If non-null, this is a URI (normally URL) that
295      *  may be used to access the DTD's external subset.
296      * @param internalSubset If non-null, holds literal markup declarations
297      *  comprising the DTD's internal subset.
298      * @param resolver If non-null, this will be provided to the parser for
299      *  use when resolving parameter entities (including any external subset).
300      * @param resolver If non-null, this will be provided to the parser for
301      *  use when resolving parameter entities (including any external subset).
302      * @param minimalElement If non-null, a minimal valid document.
303      *
304      * @exception SAXNotSupportedException If the default SAX parser does
305      *  not support the standard lexical or declaration handlers.
306      * @exception SAXParseException If the specified DTD has either
307      *  well-formedness or validity errors
308      * @exception IOException If the specified DTD can't be read for
309      *  some reason
310      */
ValidationConsumer( String rootName, String publicId, String systemId, String internalSubset, EntityResolver resolver, String minimalDocument )311     public ValidationConsumer (
312         String          rootName,
313         String          publicId,
314         String          systemId,
315         String          internalSubset,
316         EntityResolver  resolver,
317         String          minimalDocument
318     ) throws SAXException, IOException
319     {
320         this (null);
321 
322         disableReset = true;
323         if (rootName == null)
324             rootName = fakeRootName;
325 
326         //
327         // Synthesize document with that DTD; is it possible to do
328         // better for the declaration of the root element?
329         //
330         // NOTE:  can't use SAX2 to write internal subsets.
331         //
332         StringWriter    writer = new StringWriter ();
333 
334         writer.write ("<!DOCTYPE ");
335         writer.write (rootName);
336         if (systemId != null) {
337             writer.write ("\n  ");
338             if (publicId != null) {
339                 writer.write ("PUBLIC '");
340                 writer.write (publicId);
341                 writer.write ("'\n\t'");
342             } else
343                 writer.write ("SYSTEM '");
344             writer.write (systemId);
345             writer.write ("'");
346         }
347         writer.write (" [ ");
348         if (rootName == fakeRootName) {
349             writer.write ("\n<!ELEMENT ");
350             writer.write (rootName);
351             writer.write (" EMPTY>");
352         }
353         if (internalSubset != null)
354             writer.write (internalSubset);
355         writer.write ("\n ]>");
356 
357         if (minimalDocument != null) {
358             writer.write ("\n");
359             writer.write (minimalDocument);
360             writer.write ("\n");
361         } else {
362             writer.write (" <");
363             writer.write (rootName);
364             writer.write ("/>\n");
365         }
366         minimalDocument = writer.toString ();
367 
368         //
369         // OK, load it
370         //
371         XMLReader       producer;
372 
373         producer = XMLReaderFactory.createXMLReader ();
374         bind (producer, this);
375 
376         if (resolver != null)
377             producer.setEntityResolver (resolver);
378 
379         InputSource     in;
380 
381         in = new InputSource (new StringReader (minimalDocument));
382         producer.parse (in);
383 
384         disableDeclarations = true;
385         if (rootName == fakeRootName)
386             this.rootName = null;
387     }
388 
resetState()389     private void resetState ()
390     {
391         if (!disableReset) {
392             rootName = null;
393             contentStack.removeAllElements ();
394             elements.clear ();
395             ids.clear ();
396 
397             notations.removeAllElements ();
398             nDeferred.removeAllElements ();
399             unparsed.removeAllElements ();
400             uDeferred.removeAllElements ();
401         }
402     }
403 
404 
warning(String description)405     private void warning (String description)
406     throws SAXException
407     {
408         ErrorHandler            errHandler = getErrorHandler ();
409         Locator                 locator = getDocumentLocator ();
410         SAXParseException       err;
411 
412         if (errHandler == null)
413             return;
414 
415         if (locator == null)
416             err = new SAXParseException (description, null, null, -1, -1);
417         else
418             err = new SAXParseException (description, locator);
419         errHandler.warning (err);
420     }
421 
422     // package private (for ChildrenRecognizer)
error(String description)423     private void error (String description)
424     throws SAXException
425     {
426         ErrorHandler            errHandler = getErrorHandler ();
427         Locator                 locator = getDocumentLocator ();
428         SAXParseException       err;
429 
430         if (locator == null)
431             err = new SAXParseException (description, null, null, -1, -1);
432         else
433             err = new SAXParseException (description, locator);
434         if (errHandler != null)
435             errHandler.error (err);
436         else    // else we always treat it as fatal!
437             throw err;
438     }
439 
fatalError(String description)440     private void fatalError (String description)
441     throws SAXException
442     {
443         ErrorHandler            errHandler = getErrorHandler ();
444         Locator                 locator = getDocumentLocator ();
445         SAXParseException       err;
446 
447         if (locator != null)
448             err = new SAXParseException (description, locator);
449         else
450             err = new SAXParseException (description, null, null, -1, -1);
451         if (errHandler != null)
452             errHandler.fatalError (err);
453         // we always treat this as fatal, regardless of the handler
454         throw err;
455     }
456 
457 
isExtender(char c)458     private static boolean isExtender (char c)
459     {
460         // [88] Extender ::= ...
461         return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
462                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
463                || (c >= 0x3031 && c <= 0x3035)
464                || (c >= 0x309d && c <= 0x309e)
465                || (c >= 0x30fc && c <= 0x30fe);
466     }
467 
468 
469     // use augmented Unicode rules, not full XML rules
isName(String name, String context, String id)470     private boolean isName (String name, String context, String id)
471     throws SAXException
472     {
473         char    buf [] = name.toCharArray ();
474         boolean pass = true;
475 
476         if (!Character.isUnicodeIdentifierStart (buf [0])
477                 && ":_".indexOf (buf [0]) == -1)
478             pass = false;
479         else {
480             int max = buf.length;
481             for (int i = 1; pass && i < max; i++) {
482                 char c = buf [i];
483                 if (!Character.isUnicodeIdentifierPart (c)
484                         && ":-_.".indexOf (c) == -1
485                         && !isExtender (c))
486                     pass = false;
487             }
488         }
489 
490         if (!pass)
491             error ("In " + context + " for " + id
492                 + ", '" + name + "' is not a name");
493         return pass;    // true == OK
494     }
495 
496     // use augmented Unicode rules, not full XML rules
isNmtoken(String nmtoken, String context, String id)497     private boolean isNmtoken (String nmtoken, String context, String id)
498     throws SAXException
499     {
500         char    buf [] = nmtoken.toCharArray ();
501         boolean pass = true;
502         int     max = buf.length;
503 
504         // XXX make this share code with isName
505 
506         for (int i = 0; pass && i < max; i++) {
507                 char c = buf [i];
508             if (!Character.isUnicodeIdentifierPart (c)
509                     && ":-_.".indexOf (c) == -1
510                     && !isExtender (c))
511                 pass = false;
512         }
513 
514         if (!pass)
515             error ("In " + context + " for " + id
516                 + ", '" + nmtoken + "' is not a name token");
517         return pass;    // true == OK
518     }
519 
checkEnumeration(String value, String type, String name)520     private void checkEnumeration (String value, String type, String name)
521     throws SAXException
522     {
523         if (!hasMatch (value, type))
524             // VC: Enumeration
525             error ("Value '" + value
526                 + "' for attribute '" + name
527                 + "' is not permitted: " + type);
528     }
529 
530     // used to test enumerated attributes and mixed content models
531     // package private
hasMatch(String value, String orList)532     static boolean hasMatch (String value, String orList)
533     {
534         int len = value.length ();
535         int max = orList.length () - len;
536 
537         for (int start = 0;
538                 (start = orList.indexOf (value, start)) != -1;
539                 start++) {
540             char c;
541 
542             if (start > max)
543                 break;
544             c = orList.charAt (start - 1);
545             if (c != '|' && c != '('/*)*/)
546                 continue;
547             c = orList.charAt (start + len);
548             if (c != '|' && /*(*/ c != ')')
549                 continue;
550             return true;
551         }
552         return false;
553     }
554 
555     /**
556      * <b>LexicalHandler</b> Records the declaration of the root
557      * element, so it can be verified later.
558      * Passed to the next consumer, unless this one was
559      * preloaded with a particular DTD.
560      */
startDTD(String name, String publicId, String systemId)561     public void startDTD (String name, String publicId, String systemId)
562     throws SAXException
563     {
564         if (disableDeclarations)
565             return;
566 
567         rootName = name;
568         super.startDTD (name, publicId, systemId);
569     }
570 
571     /**
572      * <b>LexicalHandler</b> Verifies that all referenced notations
573      * and unparsed entities have been declared.
574      * Passed to the next consumer, unless this one was
575      * preloaded with a particular DTD.
576      */
endDTD()577     public void endDTD ()
578     throws SAXException
579     {
580         if (disableDeclarations)
581             return;
582 
583         // this is a convenient hook for end-of-dtd checks, but we
584         // could also trigger it in the first startElement call.
585         // locator info is more appropriate here though.
586 
587         // VC: Notation Declared (NDATA can refer to them before decls,
588         //      as can NOTATION attribute enumerations and defaults)
589         int length = nDeferred.size ();
590         for (int i = 0; i < length; i++) {
591             String notation = (String) nDeferred.elementAt (i);
592             if (!notations.contains (notation)) {
593                 error ("A declaration referred to notation '" + notation
594                         + "' which was never declared");
595             }
596         }
597         nDeferred.removeAllElements ();
598 
599         // VC: Entity Name (attribute values can refer to them
600         //      before they're declared); VC Attribute Default Legal
601         length = uDeferred.size ();
602         for (int i = 0; i < length; i++) {
603             String entity = (String) uDeferred.elementAt (i);
604             if (!unparsed.contains (entity)) {
605                 error ("An attribute default referred to entity '" + entity
606                         + "' which was never declared");
607             }
608         }
609         uDeferred.removeAllElements ();
610         super.endDTD ();
611     }
612 
613 
614     // These are interned, so we can rely on "==" to find the type of
615     // all attributes except enumerations ...
616     // "(this|or|that|...)" and "NOTATION (this|or|that|...)"
617     static final String types [] = {
618         "CDATA",
619         "ID", "IDREF", "IDREFS",
620         "NMTOKEN", "NMTOKENS",
621         "ENTITY", "ENTITIES"
622     };
623 
624 
625     /**
626      * <b>DecllHandler</b> Records attribute declaration for later use
627      * in validating document content, and checks validity constraints
628      * that are applicable to attribute declarations.
629      * Passed to the next consumer, unless this one was
630      * preloaded with a particular DTD.
631      */
attributeDecl( String eName, String aName, String type, String mode, String value )632     public void attributeDecl (
633         String eName,
634         String aName,
635         String type,
636         String mode,
637         String value
638     ) throws SAXException
639     {
640         if (disableDeclarations)
641             return;
642 
643         ElementInfo     info = (ElementInfo) elements.get (eName);
644         AttributeInfo   ainfo = new AttributeInfo ();
645         boolean         checkOne = false;
646         boolean         interned = false;
647 
648         // cheap interning of type names and #FIXED, #REQUIRED
649         // for faster startElement (we can use "==")
650         for (int i = 0; i < types.length; i++) {
651             if (types [i].equals (type)) {
652                 type = types [i];
653                 interned = true;
654                 break;
655             }
656         }
657         if ("#FIXED".equals (mode))
658             mode = "#FIXED";
659         else if ("#REQUIRED".equals (mode))
660             mode = "#REQUIRED";
661 
662         ainfo.type = type;
663         ainfo.mode = mode;
664         ainfo.value = value;
665 
666         // we might not have seen the content model yet
667         if (info == null) {
668             info = new ElementInfo (eName);
669             elements.put (eName, info);
670         }
671         if ("ID" == type) {
672             checkOne = true;
673             if (!("#REQUIRED" == mode || "#IMPLIED".equals (mode))) {
674                 // VC: ID Attribute Default
675                 error ("ID attribute '" + aName
676                     + "' must be #IMPLIED or #REQUIRED");
677             }
678 
679         } else if (!interned && type.startsWith ("NOTATION ")) {
680             checkOne = true;
681 
682             // VC: Notation Attributes (notations must be declared)
683             StringTokenizer     tokens = new StringTokenizer (
684                 type.substring (10, type.lastIndexOf (')')),
685                 "|");
686             while (tokens.hasMoreTokens ()) {
687                 String  token = tokens.nextToken ();
688                 if (!notations.contains (token))
689                     nDeferred.addElement (token);
690             }
691         }
692         if (checkOne) {
693             for (Enumeration e = info.attributes.keys ();
694                     e.hasMoreElements ();
695                     /* NOP */) {
696                 String          name;
697                 AttributeInfo   ainfo2;
698 
699                 name = (String) e.nextElement ();
700                 ainfo2 = (AttributeInfo) info.attributes.get (name);
701                 if (type == ainfo2.type || !interned /* NOTATION */) {
702                     // VC: One ID per Element Type
703                     // VC: One Notation per Element TYpe
704                     error ("Element '" + eName
705                         + "' already has an attribute of type "
706                         + (interned ? "NOTATION" : type)
707                         + " ('" + name
708                         + "') so '" + aName
709                         + "' is a validity error");
710                 }
711             }
712         }
713 
714         // VC: Attribute Default Legal
715         if (value != null) {
716 
717             if ("CDATA" == type) {
718                 // event source rejected '<'
719 
720             } else if ("NMTOKEN" == type) {
721                 // VC: Name Token (is a nmtoken)
722                 isNmtoken (value, "attribute default", aName);
723 
724             } else if ("NMTOKENS" == type) {
725                 // VC: Name Token (is a nmtoken; at least one value)
726                 StringTokenizer tokens = new StringTokenizer (value);
727                 if (!tokens.hasMoreTokens ())
728                     error ("Default for attribute '" + aName
729                         + "' must have at least one name token.");
730                 else do {
731                     String token = tokens.nextToken ();
732                     isNmtoken (token, "attribute default", aName);
733                 } while (tokens.hasMoreTokens ());
734 
735             } else if ("IDREF" == type || "ENTITY" == type) {
736                 // VC: Entity Name (is a name)
737                 // VC: IDREF (is a name) (is declared)
738                 isName (value, "attribute default", aName);
739                 if ("ENTITY" == type && !unparsed.contains (value))
740                     uDeferred.addElement (value);
741 
742             } else if ("IDREFS" == type || "ENTITIES" == type) {
743                 // VC: Entity Name (is a name; at least one value)
744                 // VC: IDREF (is a name; at least one value)
745                 StringTokenizer names = new StringTokenizer (value);
746                 if (!names.hasMoreTokens ())
747                     error ("Default for attribute '" + aName
748                         + "' must have at least one name.");
749                 else do {
750                     String name = names.nextToken ();
751                     isName (name, "attribute default", aName);
752                     if ("ENTITIES" == type && !unparsed.contains (name))
753                         uDeferred.addElement (value);
754                 } while (names.hasMoreTokens ());
755 
756             } else if (type.charAt (0) == '(' /*)*/ ) {
757                 // VC: Enumeration (must match)
758                 checkEnumeration (value, type, aName);
759 
760             } else if (!interned && checkOne) { /* NOTATION */
761                 // VC: Notation attributes (must be names)
762                 isName (value, "attribute default", aName);
763 
764                 // VC: Notation attributes (must be declared)
765                 if (!notations.contains (value))
766                     nDeferred.addElement (value);
767 
768                 // VC: Enumeration (must match)
769                 checkEnumeration (value, type, aName);
770 
771             } else if ("ID" != type)
772                 throw new RuntimeException ("illegal attribute type: " + type);
773         }
774 
775         if (info.attributes.get (aName) == null)
776             info.attributes.put (aName, ainfo);
777         /*
778         else
779             warning ("Element '" + eName
780                 + "' already has an attribute named '" + aName + "'");
781         */
782 
783         if ("xml:space".equals (aName)) {
784             if (!("(default|preserve)".equals (type)
785                     || "(preserve|default)".equals (type)
786                         // these next two are arguable; XHTML's DTD doesn't
787                         // deserve errors.  After all, it's not like any
788                         // illegal _value_ could pass ...
789                     || "(preserve)".equals (type)
790                     || "(default)".equals (type)
791                     ))
792                 error (
793                     "xml:space attribute type must be like '(default|preserve)'"
794                     + " not '" + type + "'"
795                     );
796 
797         }
798         super.attributeDecl (eName, aName, type, mode, value);
799     }
800 
801     /**
802      * <b>DecllHandler</b> Records the element declaration for later use
803      * when checking document content, and checks validity constraints that
804      * apply to element declarations.  Passed to the next consumer, unless
805      * this one was preloaded with a particular DTD.
806      */
elementDecl(String name, String model)807     public void elementDecl (String name, String model)
808     throws SAXException
809     {
810         if (disableDeclarations)
811             return;
812 
813         ElementInfo     info = (ElementInfo) elements.get (name);
814 
815         // we might have seen an attribute decl already
816         if (info == null) {
817             info = new ElementInfo (name);
818             elements.put (name, info);
819         }
820         if (info.model != null) {
821             // NOTE:  not all parsers can report such duplicates.
822             // VC: Unique Element Type Declaration
823             error ("Element type '" + name
824                 + "' was already declared.");
825         } else {
826             info.model = model;
827 
828             // VC: No Duplicate Types (in mixed content models)
829             if (model.charAt (1) == '#')        // (#PCDATA...
830                 info.getRecognizer (this);
831         }
832         super.elementDecl (name, model);
833     }
834 
835     /**
836      * <b>DecllHandler</b> passed to the next consumer, unless this
837      * one was preloaded with a particular DTD
838      */
internalEntityDecl(String name, String value)839     public void internalEntityDecl (String name, String value)
840     throws SAXException
841     {
842         if (!disableDeclarations)
843             super.internalEntityDecl (name, value);
844     }
845 
846     /**
847      * <b>DecllHandler</b> passed to the next consumer, unless this
848      * one was preloaded with a particular DTD
849      */
externalEntityDecl(String name, String publicId, String systemId)850     public void externalEntityDecl (String name,
851         String publicId, String systemId)
852     throws SAXException
853     {
854         if (!disableDeclarations)
855             super.externalEntityDecl (name, publicId, systemId);
856     }
857 
858 
859     /**
860      * <b>DTDHandler</b> Records the notation name, for checking
861      * NOTATIONS attribute values and declararations of unparsed
862      * entities.  Passed to the next consumer, unless this one was
863      * preloaded with a particular DTD.
864      */
notationDecl(String name, String publicId, String systemId)865     public void notationDecl (String name, String publicId, String systemId)
866     throws SAXException
867     {
868         if (disableDeclarations)
869             return;
870 
871         notations.addElement (name);
872         super.notationDecl (name, publicId, systemId);
873     }
874 
875     /**
876      * <b>DTDHandler</b> Records the entity name, for checking
877      * ENTITY and ENTITIES attribute values; records the notation
878      * name if it hasn't yet been declared.  Passed to the next consumer,
879      * unless this one was preloaded with a particular DTD.
880      */
unparsedEntityDecl( String name, String publicId, String systemId, String notationName )881     public void unparsedEntityDecl (
882         String name,
883         String publicId,
884         String systemId,
885         String notationName
886     ) throws SAXException
887     {
888         if (disableDeclarations)
889             return;
890 
891         unparsed.addElement (name);
892         if (!notations.contains (notationName))
893             nDeferred.addElement (notationName);
894         super.unparsedEntityDecl (name, publicId, systemId, notationName);
895     }
896 
897 
898     /**
899      * <b>ContentHandler</b> Ensures that state from any previous parse
900      * has been deleted.
901      * Passed to the next consumer.
902      */
startDocument()903     public void startDocument ()
904     throws SAXException
905     {
906         resetState ();
907         super.startDocument ();
908     }
909 
910 
isAsciiLetter(char c)911     private static boolean isAsciiLetter (char c)
912     {
913         return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
914     }
915 
916 
917     /**
918      * <b>ContentHandler</b> Reports a fatal exception.  Validating
919      * XML processors may not skip any entities.
920      */
skippedEntity(String name)921     public void skippedEntity (String name)
922     throws SAXException
923     {
924         fatalError ("may not skip entities");
925     }
926 
927     /*
928      * SAX2 doesn't expand non-PE refs in attribute defaults...
929      */
expandDefaultRefs(String s)930     private String expandDefaultRefs (String s)
931     throws SAXException
932     {
933         if (s.indexOf ('&') < 0)
934             return s;
935 
936 // FIXME: handle &#nn; &#xnn; &name;
937         String message = "Can't expand refs in attribute default: " + s;
938         warning (message);
939 
940         return s;
941     }
942 
943     /**
944      * <b>ContentHandler</b> Performs validity checks against element
945      * (and document) content models, and attribute values.
946      * Passed to the next consumer.
947      */
startElement( String uri, String localName, String qName, Attributes atts )948     public void startElement (
949         String          uri,
950         String          localName,
951         String          qName,
952         Attributes      atts
953     ) throws SAXException
954     {
955         //
956         // First check content model for the enclosing scope.
957         //
958         if (contentStack.isEmpty ()) {
959             // VC:  Root Element Type
960             if (!qName.equals (rootName)) {
961                 if (rootName == null)
962                     warning ("This document has no DTD, can't be valid");
963                 else
964                     error ("Root element type '" + qName
965                         + "' was declared to be '" + rootName + "'");
966             }
967         } else {
968             Recognizer state = (Recognizer) contentStack.peek ();
969 
970             if (state != null) {
971                 Recognizer newstate = state.acceptElement (qName);
972 
973                 if (newstate == null)
974                     error ("Element type '" + qName
975                         + "' in element '" + state.type.name
976                         + "' violates content model " + state.type.model
977                         );
978                 if (newstate != state) {
979                     contentStack.pop ();
980                     contentStack.push (newstate);
981                 }
982             }
983         }
984 
985         //
986         // Then check that this element was declared, and push the
987         // object used to validate its content model onto our stack.
988         //
989         // This is where the recognizer gets created, if needed; if
990         // it's a "children" (elements) content model, an NDFA is
991         // created.  (One recognizer is used per content type, no
992         // matter how complex that recognizer is.)
993         //
994         ElementInfo             info;
995 
996         info = (ElementInfo) elements.get (qName);
997         if (info == null || info.model == null) {
998             // VC: Element Valid (base clause)
999             error ("Element type '" + qName + "' was not declared");
1000             contentStack.push (null);
1001 
1002             // for less diagnostic noise, fake a declaration.
1003             elementDecl (qName, "ANY");
1004         } else
1005             contentStack.push (info.getRecognizer (this));
1006 
1007         //
1008         // Then check each attribute present
1009         //
1010         int                     len;
1011         String                  aname;
1012         AttributeInfo           ainfo;
1013 
1014         if (atts != null)
1015             len = atts.getLength ();
1016         else
1017             len = 0;
1018 
1019         for (int i = 0; i < len; i++) {
1020             aname = atts.getQName (i);
1021 
1022             if (info == null
1023                     || (ainfo = (AttributeInfo) info.attributes.get (aname))
1024                             == null) {
1025                 // VC: Attribute Value Type
1026                 error ("Attribute '" + aname
1027                     + "' was not declared for element type " + qName);
1028                 continue;
1029             }
1030 
1031             String value = atts.getValue (i);
1032 
1033             // note that "==" for type names and "#FIXED" is correct
1034             // (and fast) since we've interned those literals.
1035 
1036             if ("#FIXED" == ainfo.mode) {
1037                 String expanded = expandDefaultRefs (ainfo.value);
1038 
1039                 // VC: Fixed Attribute Default
1040                 if (!value.equals (expanded)) {
1041                     error ("Attribute '" + aname
1042                         + "' must match " + expanded
1043                         );
1044                     continue;
1045                 }
1046             }
1047 
1048             if ("CDATA" == ainfo.type)
1049                 continue;
1050 
1051             //
1052             // For all other attribute types, there are various
1053             // rules to follow.
1054             //
1055 
1056             if ("ID" == ainfo.type) {
1057                 // VC: ID (must be a name)
1058                 if (isName (value, "ID attribute", aname)) {
1059                     if (Boolean.TRUE == ids.get (value))
1060                         // VC: ID (appears once)
1061                         error ("ID attribute " + aname
1062                             + " uses an ID value '" + value
1063                             + "' which was already declared.");
1064                     else
1065                         // any forward refs are no longer problems
1066                         ids.put (value, Boolean.TRUE);
1067                 }
1068                 continue;
1069             }
1070 
1071             if ("IDREF" == ainfo.type) {
1072                 // VC: IDREF (value must be a name)
1073                 if (isName (value, "IDREF attribute", aname)) {
1074                     // VC: IDREF (must match some ID attribute)
1075                     if (ids.get (value) == null)
1076                         // new -- assume it's a forward ref
1077                         ids.put (value, Boolean.FALSE);
1078                 }
1079                 continue;
1080             }
1081 
1082             if ("IDREFS" == ainfo.type) {
1083                 StringTokenizer tokens = new StringTokenizer (value, " ");
1084 
1085                 if (!tokens.hasMoreTokens ()) {
1086                     // VC: IDREF (one or more values)
1087                     error ("IDREFS attribute " + aname
1088                         + " must have at least one ID ref");
1089                 } else do {
1090                     String id = tokens.nextToken ();
1091 
1092                     // VC: IDREF (value must be a name)
1093                     if (isName (id, "IDREFS attribute", aname)) {
1094                         // VC: IDREF (must match some ID attribute)
1095                         if (ids.get (id) == null)
1096                             // new -- assume it's a forward ref
1097                             ids.put (id, Boolean.FALSE);
1098                     }
1099                 } while (tokens.hasMoreTokens ());
1100                 continue;
1101             }
1102 
1103             if ("NMTOKEN" == ainfo.type) {
1104                 // VC: Name Token (is a name token)
1105                 isNmtoken (value, "NMTOKEN attribute", aname);
1106                 continue;
1107             }
1108 
1109             if ("NMTOKENS" == ainfo.type) {
1110                 StringTokenizer tokens = new StringTokenizer (value, " ");
1111 
1112                 if (!tokens.hasMoreTokens ()) {
1113                     // VC: Name Token (one or more values)
1114                     error ("NMTOKENS attribute " + aname
1115                         + " must have at least one name token");
1116                 } else do {
1117                     String token = tokens.nextToken ();
1118 
1119                     // VC: Name Token (is a name token)
1120                     isNmtoken (token, "NMTOKENS attribute", aname);
1121                 } while (tokens.hasMoreTokens ());
1122                 continue;
1123             }
1124 
1125             if ("ENTITY" == ainfo.type) {
1126                 if (!unparsed.contains (value))
1127                     // VC: Entity Name
1128                     error ("Value of attribute '" + aname
1129                         + "' refers to unparsed entity '" + value
1130                         + "' which was not declared.");
1131                 continue;
1132             }
1133 
1134             if ("ENTITIES" == ainfo.type) {
1135                 StringTokenizer tokens = new StringTokenizer (value, " ");
1136 
1137                 if (!tokens.hasMoreTokens ()) {
1138                     // VC: Entity Name (one or more values)
1139                     error ("ENTITIES attribute " + aname
1140                         + " must have at least one name token");
1141                 } else do {
1142                     String entity = tokens.nextToken ();
1143 
1144                     if (!unparsed.contains (entity))
1145                         // VC: Entity Name
1146                         error ("Value of attribute '" + aname
1147                             + "' refers to unparsed entity '" + entity
1148                             + "' which was not declared.");
1149                 } while (tokens.hasMoreTokens ());
1150                 continue;
1151             }
1152 
1153             //
1154             // check for enumerations last; more expensive
1155             //
1156             if (ainfo.type.charAt (0) == '(' /*)*/
1157                     || ainfo.type.startsWith ("NOTATION ")
1158                     ) {
1159                 // VC: Enumeration (value must be defined)
1160                 checkEnumeration (value, ainfo.type, aname);
1161                 continue;
1162             }
1163         }
1164 
1165         //
1166         // Last, check that all #REQUIRED attributes were provided
1167         //
1168         if (info != null) {
1169             Hashtable   table = info.attributes;
1170 
1171             if (table.size () != 0) {
1172                 Enumeration     e = table.keys ();
1173 
1174                 // XXX table.keys uses the heap, bleech -- slows things
1175 
1176                 while (e.hasMoreElements ()) {
1177                     aname = (String) e.nextElement ();
1178                     ainfo = (AttributeInfo) table.get (aname);
1179 
1180                     // "#REQUIRED" mode was interned in attributeDecl
1181                     if ("#REQUIRED" == ainfo.mode
1182                             && atts.getValue (aname) == null) {
1183                         // VC: Required Attribute
1184                         error ("Attribute '" + aname + "' must be specified "
1185                             + "for element type " + qName);
1186                     }
1187                 }
1188             }
1189         }
1190         super.startElement (uri, localName, qName, atts);
1191     }
1192 
1193     /**
1194      * <b>ContentHandler</b> Reports a validity error if the element's content
1195      * model does not permit character data.
1196      * Passed to the next consumer.
1197      */
characters(char ch [], int start, int length)1198     public void characters (char ch [], int start, int length)
1199     throws SAXException
1200     {
1201         Recognizer state;
1202 
1203         if (contentStack.empty ())
1204             state = null;
1205         else
1206             state = (Recognizer) contentStack.peek ();
1207 
1208         // NOTE:  if this ever supports with SAX parsers that don't
1209         // report ignorable whitespace as such (only XP?), this class
1210         // needs to morph it into ignorableWhitespace() as needed ...
1211 
1212         if (state != null && !state.acceptCharacters ())
1213             // VC: Element Valid (clauses three, four -- see recognizer)
1214             error ("Character content not allowed in element "
1215                 + state.type.name);
1216 
1217         super.characters (ch, start, length);
1218     }
1219 
1220 
1221     /**
1222      * <b>ContentHandler</b> Reports a validity error if the element's content
1223      * model does not permit end-of-element yet, or a well formedness error
1224      * if there was no matching startElement call.
1225      * Passed to the next consumer.
1226      */
endElement(String uri, String localName, String qName)1227     public void endElement (String uri, String localName, String qName)
1228     throws SAXException
1229     {
1230         try {
1231             Recognizer state = (Recognizer) contentStack.pop ();
1232 
1233             if (state != null && !state.completed ())
1234                 // VC: Element valid (clauses two, three, four; see Recognizer)
1235                 error ("Premature end for element '"
1236                     + state.type.name
1237                     + "', content model "
1238                     + state.type.model);
1239 
1240             // could insist on match of start element, but that's
1241             // something the input stream must to guarantee.
1242 
1243         } catch (EmptyStackException e) {
1244             fatalError ("endElement without startElement: " + qName
1245                 + ((uri == null)
1246                     ? ""
1247                     : ( " { '" + uri + "', " + localName + " }")));
1248         }
1249         super.endElement (uri, localName, qName);
1250     }
1251 
1252     /**
1253      * <b>ContentHandler</b> Checks whether all ID values that were
1254      * referenced have been declared, and releases all resources.
1255      * Passed to the next consumer.
1256      *
1257      * @see #setDocumentLocator
1258      */
endDocument()1259     public void endDocument ()
1260     throws SAXException
1261     {
1262         for (Enumeration idNames = ids.keys ();
1263                 idNames.hasMoreElements ();
1264                 /* NOP */) {
1265             String id = (String) idNames.nextElement ();
1266 
1267             if (Boolean.FALSE == ids.get (id)) {
1268                 // VC: IDREF (must match ID)
1269                 error ("Undeclared ID value '" + id
1270                     + "' was referred to by an IDREF/IDREFS attribute");
1271             }
1272         }
1273 
1274         resetState ();
1275         super.endDocument ();
1276     }
1277 
1278 
1279     /** Holds per-element declarations */
1280     static private final class ElementInfo
1281     {
1282         String                  name;
1283         String                  model;
1284 
1285         // key = attribute name; value = AttributeInfo
1286         Hashtable               attributes = new Hashtable (11);
1287 
ElementInfo(String n)1288         ElementInfo (String n) { name = n; }
1289 
1290         private Recognizer      recognizer;
1291 
1292         // for validating content models:  one per type, shared,
1293         // and constructed only on demand ... so unused elements do
1294         // not need to consume resources.
getRecognizer(ValidationConsumer consumer)1295         Recognizer      getRecognizer (ValidationConsumer consumer)
1296         throws SAXException
1297         {
1298             if (recognizer == null) {
1299                 if ("ANY".equals (model))
1300                     recognizer = ANY;
1301                 else if ("EMPTY".equals (model))
1302                     recognizer = new EmptyRecognizer (this);
1303                 else if ('#' == model.charAt (1))
1304                     // n.b. this constructor does a validity check
1305                     recognizer = new MixedRecognizer (this, consumer);
1306                 else
1307                     recognizer = new ChildrenRecognizer (this, consumer);
1308             }
1309             return recognizer;
1310         }
1311     }
1312 
1313     /** Holds per-attribute declarations */
1314     static private final class AttributeInfo
1315     {
1316         String  type;
1317         String  mode;           // #REQUIRED, etc (or null)
1318         String  value;          // or null
1319     }
1320 
1321 
1322     //
1323     // Content model validation
1324     //
1325 
1326     static private final Recognizer     ANY = new Recognizer (null);
1327 
1328 
1329     // Base class defines the calls used to validate content,
1330     // and supports the "ANY" content model
1331     static private class Recognizer
1332     {
1333         final ElementInfo       type;
1334 
Recognizer(ElementInfo t)1335         Recognizer (ElementInfo t) { type = t; }
1336 
1337         // return true iff character data is legal here
acceptCharacters()1338         boolean acceptCharacters ()
1339         throws SAXException
1340             // VC: Element Valid (third and fourth clauses)
1341             { return true; }
1342 
1343         // null return = failure
1344         // otherwise, next state (like an FSM)
1345         // prerequisite: tested that name was declared
acceptElement(String name)1346         Recognizer acceptElement (String name)
1347         throws SAXException
1348             // VC: Element Valid (fourth clause)
1349             { return this; }
1350 
1351         // return true iff model is completed, can finish
completed()1352         boolean completed ()
1353         throws SAXException
1354             // VC: Element Valid (fourth clause)
1355             { return true; }
1356 
toString()1357         public String toString ()
1358             // n.b. "children" is the interesting case!
1359             { return (type == null) ? "ANY" : type.model; }
1360     }
1361 
1362     // "EMPTY" content model -- no characters or elements
1363     private static final class EmptyRecognizer extends Recognizer
1364     {
EmptyRecognizer(ElementInfo type)1365         public EmptyRecognizer (ElementInfo type)
1366             { super (type); }
1367 
1368         // VC: Element Valid (first clause)
acceptCharacters()1369         boolean acceptCharacters ()
1370             { return false; }
1371 
1372         // VC: Element Valid (first clause)
acceptElement(String name)1373         Recognizer acceptElement (String name)
1374             { return null; }
1375     }
1376 
1377     // "Mixed" content model -- ANY, but restricts elements
1378     private static final class MixedRecognizer extends Recognizer
1379     {
1380         private String  permitted [];
1381 
1382         // N.B. constructor tests for duplicated element names (VC)
MixedRecognizer(ElementInfo t, ValidationConsumer v)1383         public MixedRecognizer (ElementInfo t, ValidationConsumer v)
1384         throws SAXException
1385         {
1386             super (t);
1387 
1388             // (#PCDATA...)* or (#PCDATA) ==> ... or empty
1389             // with the "..." being "|elname|..."
1390             StringTokenizer     tokens = new StringTokenizer (
1391                 t.model.substring (8, t.model.lastIndexOf (')')),
1392                 "|");
1393             Vector              vec = new Vector ();
1394 
1395             while (tokens.hasMoreTokens ()) {
1396                 String token = tokens.nextToken ();
1397 
1398                 if (vec.contains (token))
1399                     v.error ("element " + token
1400                         + " is repeated in mixed content model: "
1401                         + t.model);
1402                 else
1403                     vec.addElement (token.intern ());
1404             }
1405             permitted = new String [vec.size ()];
1406             for (int i = 0; i < permitted.length; i++)
1407                 permitted [i] = (String) vec.elementAt (i);
1408 
1409             // in one large machine-derived DTD sample, most of about
1410             // 250 mixed content models were empty, and 25 had ten or
1411             // more entries.  2 had over a hundred elements.  Linear
1412             // search isn't obviously wrong.
1413         }
1414 
1415         // VC: Element Valid (third clause)
acceptElement(String name)1416         Recognizer acceptElement (String name)
1417         {
1418             int         length = permitted.length;
1419 
1420             // first pass -- optimistic w.r.t. event source interning
1421             // (and document validity)
1422             for (int i = 0; i < length; i++)
1423                 if (permitted [i] == name)
1424                     return this;
1425             // second pass -- pessimistic w.r.t. event source interning
1426             for (int i = 0; i < length; i++)
1427                 if (permitted [i].equals (name))
1428                     return this;
1429             return null;
1430         }
1431     }
1432 
1433 
1434     // recognizer loop flags, see later
1435     private static final int            F_LOOPHEAD = 0x01;
1436     private static final int            F_LOOPNEXT = 0x02;
1437 
1438     // for debugging -- used to label/count nodes in toString()
1439     private static int                  nodeCount;
1440 
1441     /**
1442      * "Children" content model -- these are nodes in NDFA state graphs.
1443      * They work in fixed space.  Note that these graphs commonly have
1444      * cycles, handling features such as zero-or-more and one-or-more.
1445      *
1446      * <p>It's readonly, so only one copy is ever needed.  The content model
1447      * stack may have any number of pointers into each graph, when a model
1448      * happens to be needed more than once due to element nesting.  Since
1449      * traversing the graph just moves to another node, and never changes
1450      * it, traversals never interfere with each other.
1451      *
1452      * <p>There is an option to report non-deterministic models.  These are
1453      * always XML errors, but ones which are not often reported despite the
1454      * fact that they can lead to different validating parsers giving
1455      * different results for the same input.  (The XML spec doesn't require
1456      * them to be reported.)
1457      *
1458      * <p><b>FIXME</b> There's currently at least one known bug here, in that
1459      * it's not actually detecting the non-determinism it tries to detect.
1460      * (Of the "optional.xml" test, the once-or-twice-2* tests are all non-D;
1461      * maybe some others.)  This may relate to the issue flagged below as
1462      * "should not" happen (but it was), which showed up when patching the
1463      * graph to have one exit node (or more EMPTY nodes).
1464      */
1465     private static final class ChildrenRecognizer extends Recognizer
1466         implements Cloneable
1467     {
1468         // for reporting non-deterministic content models
1469         // ... a waste of space if we're not reporting those!
1470         // ... along with the 'model' member (in base class)
1471         private ValidationConsumer      consumer;
1472 
1473         // for CHOICE nodes -- each component is an arc that
1474         // accepts a different NAME (or is EMPTY indicating
1475         // NDFA termination).
1476         private Recognizer              components [];
1477 
1478         // for NAME/SEQUENCE nodes -- accepts that NAME and
1479         // then goes to the next node (CHOICE, NAME, EMPTY).
1480         private String                  name;
1481         private Recognizer              next;
1482 
1483         // loops always point back to a CHOICE node. we mark such choice
1484         // nodes (F_LOOPHEAD) for diagnostics and faster deep cloning.
1485         // We also mark nodes before back pointers (F_LOOPNEXT), to ensure
1486         // termination when we patch sequences and loops.
1487         private int                     flags;
1488 
1489 
1490         // prevent a needless indirection between 'this' and 'node'
copyIn(ChildrenRecognizer node)1491         private void copyIn (ChildrenRecognizer node)
1492         {
1493             // model & consumer are already set
1494             components = node.components;
1495             name = node.name;
1496             next = node.next;
1497             flags = node.flags;
1498         }
1499 
1500         // used to construct top level "children" content models,
ChildrenRecognizer(ElementInfo type, ValidationConsumer vc)1501         public ChildrenRecognizer (ElementInfo type, ValidationConsumer vc)
1502         {
1503             this (vc, type);
1504             populate (type.model.toCharArray (), 0);
1505             patchNext (new EmptyRecognizer (type), null);
1506         }
1507 
1508         // used internally; populating is separate
ChildrenRecognizer(ValidationConsumer vc, ElementInfo type)1509         private ChildrenRecognizer (ValidationConsumer vc, ElementInfo type)
1510         {
1511             super (type);
1512             consumer = vc;
1513         }
1514 
1515 
1516         //
1517         // When rewriting some graph nodes we need deep clones in one case;
1518         // mostly shallow clones (what the JVM handles for us) are fine.
1519         //
shallowClone()1520         private ChildrenRecognizer shallowClone ()
1521         {
1522             try {
1523                 return (ChildrenRecognizer) clone ();
1524             } catch (CloneNotSupportedException e) {
1525                 throw new Error ("clone");
1526             }
1527         }
1528 
deepClone()1529         private ChildrenRecognizer deepClone ()
1530         {
1531             return deepClone (new Hashtable (37));
1532         }
1533 
deepClone(Hashtable table)1534         private ChildrenRecognizer deepClone (Hashtable table)
1535         {
1536             ChildrenRecognizer retval;
1537 
1538             if ((flags & F_LOOPHEAD) != 0) {
1539                 retval = (ChildrenRecognizer) table.get (this);
1540                 if (retval != null)
1541                     return this;
1542 
1543                 retval = shallowClone ();
1544                 table.put (this, retval);
1545             } else
1546                 retval = shallowClone ();
1547 
1548             if (next != null) {
1549                 if (next instanceof ChildrenRecognizer)
1550                     retval.next = ((ChildrenRecognizer)next)
1551                             .deepClone (table);
1552                 else if (!(next instanceof EmptyRecognizer))
1553                     throw new RuntimeException ("deepClone");
1554             }
1555 
1556             if (components != null) {
1557                 retval.components = new Recognizer [components.length];
1558                 for (int i = 0; i < components.length; i++) {
1559                     Recognizer temp = components [i];
1560 
1561                     if (temp == null)
1562                         retval.components [i] = null;
1563                     else if (temp instanceof ChildrenRecognizer)
1564                         retval.components [i] = ((ChildrenRecognizer)temp)
1565                                 .deepClone (table);
1566                     else if (!(temp instanceof EmptyRecognizer))
1567                         throw new RuntimeException ("deepClone");
1568                 }
1569             }
1570 
1571             return retval;
1572         }
1573 
1574         // connect subgraphs, first to next (sequencing)
patchNext(Recognizer theNext, Hashtable table)1575         private void patchNext (Recognizer theNext, Hashtable table)
1576         {
1577             // backpointers must not be repatched or followed
1578             if ((flags & F_LOOPNEXT) != 0)
1579                 return;
1580 
1581             // XXX this table "shouldn't" be needed, right?
1582             // but some choice nodes looped if it isn't there.
1583             if (table != null && table.get (this) != null)
1584                 return;
1585             if (table == null)
1586                 table = new Hashtable ();
1587 
1588             // NAME/SEQUENCE
1589             if (name != null) {
1590                 if (next == null)
1591                     next = theNext;
1592                 else if (next instanceof ChildrenRecognizer) {
1593                     ((ChildrenRecognizer)next).patchNext (theNext, table);
1594                 } else if (!(next instanceof EmptyRecognizer))
1595                     throw new RuntimeException ("patchNext");
1596                 return;
1597             }
1598 
1599             // CHOICE
1600             for (int i = 0; i < components.length; i++) {
1601                 if (components [i] == null)
1602                     components [i] = theNext;
1603                 else if (components [i] instanceof ChildrenRecognizer) {
1604                     ((ChildrenRecognizer)components [i])
1605                             .patchNext (theNext, table);
1606                 } else if (!(components [i] instanceof EmptyRecognizer))
1607                     throw new RuntimeException ("patchNext");
1608             }
1609 
1610             if (table != null && (flags & F_LOOPHEAD) != 0)
1611                 table.put (this, this);
1612         }
1613 
1614         /**
1615          * Parses a 'children' spec (or recursively 'cp') and makes this
1616          * become a regular graph node.
1617          *
1618          * @return index after this particle
1619          */
populate(char parseBuf [], int startPos)1620         private int populate (char parseBuf [], int startPos)
1621         {
1622             int         nextPos = startPos + 1;
1623             char        c;
1624 
1625             if (nextPos < 0 || nextPos >= parseBuf.length)
1626                 throw new IndexOutOfBoundsException ();
1627 
1628             // Grammar of the string is from the XML spec, but
1629             // with whitespace removed by the SAX parser.
1630 
1631             // children ::= (choice | seq) ('?' | '*' | '+')?
1632             // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1633             // choice ::= '(' cp ('|' choice)* ')'
1634             // seq ::= '(' cp (',' choice)* ')'
1635 
1636             // interior nodes only
1637             //   cp ::= name ...
1638             if (parseBuf [startPos] != '('/*)*/) {
1639                 boolean         done = false;
1640                 do {
1641                     switch (c = parseBuf [nextPos]) {
1642                         case '?': case '*': case '+':
1643                         case '|': case ',':
1644                         case /*(*/ ')':
1645                             done = true;
1646                             continue;
1647                         default:
1648                             nextPos++;
1649                             continue;
1650                     }
1651                 } while (!done);
1652                 name = new String (parseBuf, startPos, nextPos - startPos);
1653 
1654             // interior OR toplevel nodes
1655             //   cp ::= choice ..
1656             //   cp ::= seq ..
1657             } else {
1658                 // collect everything as a separate list, and merge it
1659                 // into "this" later if we can (SEQUENCE or singleton)
1660                 ChildrenRecognizer      first;
1661 
1662                 first = new ChildrenRecognizer (consumer, type);
1663                 nextPos = first.populate (parseBuf, nextPos);
1664                 c = parseBuf [nextPos++];
1665 
1666                 if (c == ',' || c == '|') {
1667                     ChildrenRecognizer  current = first;
1668                     char                separator = c;
1669                     Vector              v = null;
1670 
1671                     if (separator == '|') {
1672                         v = new Vector ();
1673                         v.addElement (first);
1674                     }
1675 
1676                     do {
1677                         ChildrenRecognizer link;
1678 
1679                         link = new ChildrenRecognizer (consumer, type);
1680                         nextPos = link.populate (parseBuf, nextPos);
1681 
1682                         if (separator == ',') {
1683                             current.patchNext (link, null);
1684                             current = link;
1685                         } else
1686                             v.addElement (link);
1687 
1688                         c = parseBuf [nextPos++];
1689                     } while (c == separator);
1690 
1691                     // choice ... collect everything into one array.
1692                     if (separator == '|') {
1693                         // assert v.size() > 1
1694                         components = new Recognizer [v.size ()];
1695                         for (int i = 0; i < components.length; i++) {
1696                             components [i] = (Recognizer)
1697                                     v.elementAt (i);
1698                         }
1699                         // assert flags == 0
1700 
1701                     // sequence ... merge into "this" to be smaller.
1702                     } else
1703                         copyIn (first);
1704 
1705                 // treat singletons like one-node sequences.
1706                 } else
1707                     copyIn (first);
1708 
1709                 if (c != /*(*/ ')')
1710                     throw new RuntimeException ("corrupt content model");
1711             }
1712 
1713             //
1714             // Arity is optional, and the root of all fun.  We keep the
1715             // FSM state graph simple by only having NAME/SEQUENCE and
1716             // CHOICE nodes (or EMPTY to terminate a model), easily
1717             // evaluated.  So we rewrite each node that has arity, using
1718             // those primitives.  We create loops here, if needed.
1719             //
1720             if (nextPos < parseBuf.length) {
1721                 c = parseBuf [nextPos];
1722                 if (c == '?' || c == '*' || c == '+') {
1723                     nextPos++;
1724 
1725                     // Rewrite 'zero-or-one' "?" arity to a CHOICE:
1726                     //   - SEQUENCE (clone, what's next)
1727                     //   - or, what's next
1728                     // Size cost: N --> N + 1
1729                     if (c == '?') {
1730                         Recognizer              once = shallowClone ();
1731 
1732                         components = new Recognizer [2];
1733                         components [0] = once;
1734                         // components [1] initted to null
1735                         name = null;
1736                         next = null;
1737                         flags = 0;
1738 
1739 
1740                     // Rewrite 'zero-or-more' "*" arity to a CHOICE.
1741                     //   - LOOP (clone, back to this CHOICE)
1742                     //   - or, what's next
1743                     // Size cost: N --> N + 1
1744                     } else if (c == '*') {
1745                         ChildrenRecognizer      loop = shallowClone ();
1746 
1747                         loop.patchNext (this, null);
1748                         loop.flags |= F_LOOPNEXT;
1749                         flags = F_LOOPHEAD;
1750 
1751                         components = new Recognizer [2];
1752                         components [0] = loop;
1753                         // components [1] initted to null
1754                         name = null;
1755                         next = null;
1756 
1757 
1758                     // Rewrite 'one-or-more' "+" arity to a SEQUENCE.
1759                     // Basically (a)+ --> ((a),(a)*).
1760                     //   - this
1761                     //   - CHOICE
1762                     //      * LOOP (clone, back to the CHOICE)
1763                     //      * or, whatever's next
1764                     // Size cost: N --> 2N + 1
1765                     } else if (c == '+') {
1766                         ChildrenRecognizer loop = deepClone ();
1767                         ChildrenRecognizer choice;
1768 
1769                         choice = new ChildrenRecognizer (consumer, type);
1770                         loop.patchNext (choice, null);
1771                         loop.flags |= F_LOOPNEXT;
1772                         choice.flags = F_LOOPHEAD;
1773 
1774                         choice.components = new Recognizer [2];
1775                         choice.components [0] = loop;
1776                         // choice.components [1] initted to null
1777                         // choice.name, choice.next initted to null
1778 
1779                         patchNext (choice, null);
1780                     }
1781                 }
1782             }
1783 
1784             return nextPos;
1785         }
1786 
1787         // VC: Element Valid (second clause)
acceptCharacters()1788         boolean acceptCharacters ()
1789             { return false; }
1790 
1791         // VC: Element Valid (second clause)
acceptElement(String type)1792         Recognizer acceptElement (String type)
1793         throws SAXException
1794         {
1795             // NAME/SEQUENCE
1796             if (name != null) {
1797                 if (name.equals (type))
1798                     return next;
1799                 return null;
1800             }
1801 
1802             // CHOICE ... optionally reporting nondeterminism we
1803             // run across.  we won't check out every transition
1804             // for nondeterminism; only the ones we follow.
1805             Recognizer  retval = null;
1806 
1807             for (int i = 0; i < components.length; i++) {
1808                 Recognizer temp = components [i].acceptElement (type);
1809 
1810                 if (temp == null)
1811                     continue;
1812                 else if (!warnNonDeterministic)
1813                     return temp;
1814                 else if (retval == null)
1815                     retval = temp;
1816                 else if (retval != temp)
1817                     consumer.error ("Content model " + this.type.model
1818                         + " is non-deterministic for " + type);
1819             }
1820             return retval;
1821         }
1822 
1823         // VC: Element Valid (second clause)
completed()1824         boolean completed ()
1825         throws SAXException
1826         {
1827             // expecting a specific element
1828             if (name != null)
1829                 return false;
1830 
1831             // choice, some sequences
1832             for (int i = 0; i < components.length; i++) {
1833                 if (components [i].completed ())
1834                     return true;
1835             }
1836 
1837             return false;
1838         }
1839 
1840 /** /
1841         // FOR DEBUGGING ... flattens the graph for printing.
1842 
1843         public String toString ()
1844         {
1845             StringBuffer buf = new StringBuffer ();
1846 
1847             // only one set of loop labels can be generated
1848             // at a time...
1849             synchronized (ANY) {
1850                 nodeCount = 0;
1851 
1852                 toString (buf, new Hashtable ());
1853                 return buf.toString ();
1854             }
1855         }
1856 
1857         private void toString (StringBuffer buf, Hashtable table)
1858         {
1859             // When we visit a node, label and count it.
1860             // Nodes are never visited/counted more than once.
1861             // For small models labels waste space, but if arity
1862             // mappings were used the savings are substantial.
1863             // (Plus, the output can be more readily understood.)
1864             String temp = (String) table.get (this);
1865 
1866             if (temp != null) {
1867                 buf.append ('{');
1868                 buf.append (temp);
1869                 buf.append ('}');
1870                 return;
1871             } else {
1872                 StringBuffer scratch = new StringBuffer (15);
1873 
1874                 if ((flags & F_LOOPHEAD) != 0)
1875                     scratch.append ("loop");
1876                 else
1877                     scratch.append ("node");
1878                 scratch.append ('-');
1879                 scratch.append (++nodeCount);
1880                 temp = scratch.toString ();
1881 
1882                 table.put (this, temp);
1883                 buf.append ('[');
1884                 buf.append (temp);
1885                 buf.append (']');
1886                 buf.append (':');
1887             }
1888 
1889             // NAME/SEQUENCE
1890             if (name != null) {
1891                 // n.b. some output encodings turn some name chars into '?'
1892                 // e.g. with Japanese names and ASCII output
1893                 buf.append (name);
1894                 if (components != null)         // bug!
1895                     buf.append ('$');
1896                 if (next == null)
1897                     buf.append (",*");
1898                 else if (next instanceof EmptyRecognizer) // patch-to-next
1899                     buf.append (",{}");
1900                 else if (next instanceof ChildrenRecognizer) {
1901                     buf.append (',');
1902                     ((ChildrenRecognizer)next).toString (buf, table);
1903                 } else                          // bug!
1904                     buf.append (",+");
1905                 return;
1906             }
1907 
1908             // CHOICE
1909             buf.append ("<");
1910             for (int i = 0; i < components.length; i++) {
1911                 if (i != 0)
1912                     buf.append ("|");
1913                 if (components [i] instanceof EmptyRecognizer) {
1914                     buf.append ("{}");
1915                 } else if (components [i] == null) {    // patch-to-next
1916                     buf.append ('*');
1917                 } else {
1918                     ChildrenRecognizer r;
1919 
1920                     r = (ChildrenRecognizer) components [i];
1921                     r.toString (buf, table);
1922                 }
1923             }
1924             buf.append (">");
1925         }
1926 /**/
1927     }
1928 }
1929