1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 /*
19  * $Id: DGXMLScanner.cpp 833045 2009-11-05 13:21:27Z borisk $
20  */
21 
22 
23 // ---------------------------------------------------------------------------
24 //  Includes
25 // ---------------------------------------------------------------------------
26 #include <xercesc/internal/DGXMLScanner.hpp>
27 #include <xercesc/util/Janitor.hpp>
28 #include <xercesc/util/RuntimeException.hpp>
29 #include <xercesc/util/UnexpectedEOFException.hpp>
30 #include <xercesc/util/XMLUri.hpp>
31 #include <xercesc/framework/URLInputSource.hpp>
32 #include <xercesc/framework/LocalFileInputSource.hpp>
33 #include <xercesc/framework/XMLDocumentHandler.hpp>
34 #include <xercesc/framework/XMLEntityHandler.hpp>
35 #include <xercesc/framework/XMLPScanToken.hpp>
36 #include <xercesc/framework/XMLGrammarPool.hpp>
37 #include <xercesc/framework/XMLDTDDescription.hpp>
38 #include <xercesc/internal/EndOfEntityException.hpp>
39 #include <xercesc/validators/common/GrammarResolver.hpp>
40 #include <xercesc/validators/DTD/DocTypeHandler.hpp>
41 #include <xercesc/validators/DTD/DTDScanner.hpp>
42 #include <xercesc/validators/DTD/DTDValidator.hpp>
43 #include <xercesc/util/OutOfMemoryException.hpp>
44 #include <xercesc/util/XMLResourceIdentifier.hpp>
45 
46 XERCES_CPP_NAMESPACE_BEGIN
47 
48 
49 typedef JanitorMemFunCall<DGXMLScanner> CleanupType;
50 typedef JanitorMemFunCall<ReaderMgr>    ReaderMgrResetType;
51 
52 
53 // ---------------------------------------------------------------------------
54 //  DGXMLScanner: Constructors and Destructor
55 // ---------------------------------------------------------------------------
DGXMLScanner(XMLValidator * const valToAdopt,GrammarResolver * const grammarResolver,MemoryManager * const manager)56 DGXMLScanner::DGXMLScanner(XMLValidator* const valToAdopt
57                          , GrammarResolver* const grammarResolver
58                          , MemoryManager* const manager) :
59 
60     XMLScanner(valToAdopt, grammarResolver, manager)
61     , fAttrNSList(0)
62     , fDTDValidator(0)
63     , fDTDGrammar(0)
64     , fDTDElemNonDeclPool(0)
65     , fElemCount(0)
66     , fAttDefRegistry(0)
67     , fUndeclaredAttrRegistry(0)
68 {
69     CleanupType cleanup(this, &DGXMLScanner::cleanUp);
70 
71     try
72     {
73         commonInit();
74     }
75     catch(const OutOfMemoryException&)
76     {
77         // Don't cleanup when out of memory, since executing the
78         // code can cause problems.
79         cleanup.release();
80 
81         throw;
82     }
83 
84     cleanup.release();
85 }
86 
DGXMLScanner(XMLDocumentHandler * const docHandler,DocTypeHandler * const docTypeHandler,XMLEntityHandler * const entityHandler,XMLErrorReporter * const errHandler,XMLValidator * const valToAdopt,GrammarResolver * const grammarResolver,MemoryManager * const manager)87 DGXMLScanner::DGXMLScanner( XMLDocumentHandler* const docHandler
88                           , DocTypeHandler* const     docTypeHandler
89                           , XMLEntityHandler* const   entityHandler
90                           , XMLErrorReporter* const   errHandler
91                           , XMLValidator* const       valToAdopt
92                           , GrammarResolver* const    grammarResolver
93                           , MemoryManager* const      manager) :
94 
95     XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager)
96     , fAttrNSList(0)
97     , fDTDValidator(0)
98     , fDTDGrammar(0)
99     , fDTDElemNonDeclPool(0)
100     , fElemCount(0)
101     , fAttDefRegistry(0)
102     , fUndeclaredAttrRegistry(0)
103 {
104     CleanupType cleanup(this, &DGXMLScanner::cleanUp);
105 
106     try
107     {
108         commonInit();
109     }
110     catch(const OutOfMemoryException&)
111     {
112         // Don't cleanup when out of memory, since executing the
113         // code can cause problems.
114         cleanup.release();
115 
116         throw;
117     }
118 
119     cleanup.release();
120 }
121 
~DGXMLScanner()122 DGXMLScanner::~DGXMLScanner()
123 {
124     cleanUp();
125 }
126 
127 // ---------------------------------------------------------------------------
128 //  XMLScanner: Getter methods
129 // ---------------------------------------------------------------------------
getEntityDeclPool()130 NameIdPool<DTDEntityDecl>* DGXMLScanner::getEntityDeclPool()
131 {
132     if(!fGrammar)
133         return 0;
134     return ((DTDGrammar*)fGrammar)->getEntityDeclPool();
135 }
136 
getEntityDeclPool() const137 const NameIdPool<DTDEntityDecl>* DGXMLScanner::getEntityDeclPool() const
138 {
139     if(!fGrammar)
140         return 0;
141     return ((DTDGrammar*)fGrammar)->getEntityDeclPool();
142 }
143 
144 // ---------------------------------------------------------------------------
145 //  DGXMLScanner: Main entry point to scan a document
146 // ---------------------------------------------------------------------------
scanDocument(const InputSource & src)147 void DGXMLScanner::scanDocument(const InputSource& src)
148 {
149     //  Bump up the sequence id for this parser instance. This will invalidate
150     //  any previous progressive scan tokens.
151     fSequenceId++;
152 
153     ReaderMgrResetType  resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
154 
155     try
156     {
157         //  Reset the scanner and its plugged in stuff for a new run. This
158         //  resets all the data structures, creates the initial reader and
159         //  pushes it on the stack, and sets up the base document path.
160         scanReset(src);
161 
162         // If we have a document handler, then call the start document
163         if (fDocHandler)
164             fDocHandler->startDocument();
165 
166         //  Scan the prolog part, which is everything before the root element
167         //  including the DTD subsets.
168         scanProlog();
169 
170         //  If we got to the end of input, then its not a valid XML file.
171         //  Else, go on to scan the content.
172         if (fReaderMgr.atEOF())
173         {
174             emitError(XMLErrs::EmptyMainEntity);
175         }
176         else
177         {
178             // Scan content, and tell it its not an external entity
179             if (scanContent())
180             {
181                 // Do post-parse validation if required
182                 if (fValidate)
183                 {
184                     //  We handle ID reference semantics at this level since
185                     //  its required by XML 1.0.
186                     checkIDRefs();
187 
188                     // Then allow the validator to do any extra stuff it wants
189 //                    fValidator->postParseValidation();
190                 }
191 
192                 // That went ok, so scan for any miscellaneous stuff
193                 if (!fReaderMgr.atEOF())
194                     scanMiscellaneous();
195             }
196         }
197 
198         // If we have a document handler, then call the end document
199         if (fDocHandler)
200             fDocHandler->endDocument();
201     }
202     //  NOTE:
203     //
204     //  In all of the error processing below, the emitError() call MUST come
205     //  before the flush of the reader mgr, or it will fail because it tries
206     //  to find out the position in the XML source of the error.
207     catch(const XMLErrs::Codes)
208     {
209         // This is a 'first failure' exception, so fall through
210     }
211     catch(const XMLValid::Codes)
212     {
213         // This is a 'first fatal error' type exit, so fall through
214     }
215     catch(const XMLException& excToCatch)
216     {
217         //  Emit the error and catch any user exception thrown from here. Make
218         //  sure in all cases we flush the reader manager.
219         fInException = true;
220         try
221         {
222             if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
223                 emitError
224                 (
225                     XMLErrs::XMLException_Warning
226                     , excToCatch.getCode()
227                     , excToCatch.getMessage()
228                 );
229             else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
230                 emitError
231                 (
232                     XMLErrs::XMLException_Fatal
233                     , excToCatch.getCode()
234                     , excToCatch.getMessage()
235                 );
236             else
237                 emitError
238                 (
239                     XMLErrs::XMLException_Error
240                     , excToCatch.getCode()
241                     , excToCatch.getMessage()
242                 );
243         }
244         catch(const OutOfMemoryException&)
245         {
246             // This is a special case for out-of-memory
247             // conditions, because resetting the ReaderMgr
248             // can be problematic.
249             resetReaderMgr.release();
250 
251             throw;
252         }
253     }
254     catch(const OutOfMemoryException&)
255     {
256         // This is a special case for out-of-memory
257         // conditions, because resetting the ReaderMgr
258         // can be problematic.
259         resetReaderMgr.release();
260 
261         throw;
262     }
263 }
264 
265 
scanNext(XMLPScanToken & token)266 bool DGXMLScanner::scanNext(XMLPScanToken& token)
267 {
268     // Make sure this token is still legal
269     if (!isLegalToken(token))
270         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);
271 
272     // Find the next token and remember the reader id
273     XMLSize_t orgReader;
274     XMLTokens curToken;
275 
276     ReaderMgrResetType  resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
277 
278     bool retVal = true;
279 
280     try
281     {
282         while (true)
283         {
284             //  We have to handle any end of entity exceptions that happen here.
285             //  We could be at the end of X nested entities, each of which will
286             //  generate an end of entity exception as we try to move forward.
287             try
288             {
289                 curToken = senseNextToken(orgReader);
290                 break;
291             }
292             catch(const EndOfEntityException& toCatch)
293             {
294                 // Send an end of entity reference event
295                 if (fDocHandler)
296                     fDocHandler->endEntityReference(toCatch.getEntity());
297             }
298         }
299 
300         if (curToken == Token_CharData)
301         {
302             scanCharData(fCDataBuf);
303         }
304         else if (curToken == Token_EOF)
305         {
306             if (!fElemStack.isEmpty())
307             {
308                 const ElemStack::StackElem* topElem = fElemStack.popTop();
309                 emitError
310                 (
311                     XMLErrs::EndedWithTagsOnStack
312                     , topElem->fThisElement->getFullName()
313                 );
314             }
315 
316             retVal = false;
317         }
318         else
319         {
320             // Its some sort of markup
321             bool gotData = true;
322             switch(curToken)
323             {
324                 case Token_CData :
325                     // Make sure we are within content
326                     if (fElemStack.isEmpty())
327                         emitError(XMLErrs::CDATAOutsideOfContent);
328                     scanCDSection();
329                     break;
330 
331                 case Token_Comment :
332                     scanComment();
333                     break;
334 
335                 case Token_EndTag :
336                     scanEndTag(gotData);
337                     break;
338 
339                 case Token_PI :
340                     scanPI();
341                     break;
342 
343                 case Token_StartTag :
344                     if (fDoNamespaces)
345                         scanStartTagNS(gotData);
346                     else
347                         scanStartTag(gotData);
348                     break;
349 
350                 default :
351                     fReaderMgr.skipToChar(chOpenAngle);
352                     break;
353             }
354 
355             if (orgReader != fReaderMgr.getCurrentReaderNum())
356                 emitError(XMLErrs::PartialMarkupInEntity);
357 
358             // If we hit the end, then do the miscellaneous part
359             if (!gotData)
360             {
361                 // Do post-parse validation if required
362                 if (fValidate)
363                 {
364                     //  We handle ID reference semantics at this level since
365                     //  its required by XML 1.0.
366                     checkIDRefs();
367 
368                     // Then allow the validator to do any extra stuff it wants
369 //                    fValidator->postParseValidation();
370                 }
371 
372                 // That went ok, so scan for any miscellaneous stuff
373                 scanMiscellaneous();
374 
375                 if (fDocHandler)
376                     fDocHandler->endDocument();
377             }
378         }
379     }
380     //  NOTE:
381     //
382     //  In all of the error processing below, the emitError() call MUST come
383     //  before the flush of the reader mgr, or it will fail because it tries
384     //  to find out the position in the XML source of the error.
385     catch(const XMLErrs::Codes)
386     {
387         // This is a 'first failure' exception, so return failure
388         retVal = false;
389     }
390     catch(const XMLValid::Codes)
391     {
392         // This is a 'first fatal error' type exit, so return failure
393         retVal = false;
394     }
395     catch(const XMLException& excToCatch)
396     {
397         //  Emit the error and catch any user exception thrown from here. Make
398         //  sure in all cases we flush the reader manager.
399         fInException = true;
400         try
401         {
402             if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
403                 emitError
404                 (
405                     XMLErrs::XMLException_Warning
406                     , excToCatch.getCode()
407                     , excToCatch.getMessage()
408                 );
409             else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
410                 emitError
411                 (
412                     XMLErrs::XMLException_Fatal
413                     , excToCatch.getCode()
414                     , excToCatch.getMessage()
415                 );
416             else
417                 emitError
418                 (
419                     XMLErrs::XMLException_Error
420                     , excToCatch.getCode()
421                     , excToCatch.getMessage()
422                 );
423         }
424         catch(const OutOfMemoryException&)
425         {
426             // This is a special case for out-of-memory
427             // conditions, because resetting the ReaderMgr
428             // can be problematic.
429             resetReaderMgr.release();
430 
431             throw;
432         }
433 
434         retVal = false;
435     }
436     catch(const OutOfMemoryException&)
437     {
438         // This is a special case for out-of-memory
439         // conditions, because resetting the ReaderMgr
440         // can be problematic.
441         resetReaderMgr.release();
442 
443         throw;
444     }
445 
446     // If we are not at the end, release the object that will
447     // reset the ReaderMgr.
448     if (retVal)
449         resetReaderMgr.release();
450 
451     return retVal;
452 }
453 
454 
455 // ---------------------------------------------------------------------------
456 //  DGXMLScanner: Private scanning methods
457 // ---------------------------------------------------------------------------
458 
459 //  This method will kick off the scanning of the primary content of the
460 //  document, i.e. the elements.
scanContent()461 bool DGXMLScanner::scanContent()
462 {
463     //  Go into a loop until we hit the end of the root element, or we fall
464     //  out because there is no root element.
465     //
466     //  We have to do kind of a deeply nested double loop here in order to
467     //  avoid doing the setup/teardown of the exception handler on each
468     //  round. Doing it this way we only do it when an exception actually
469     //  occurs.
470     bool gotData = true;
471     bool inMarkup = false;
472     while (gotData)
473     {
474         try
475         {
476             while (gotData)
477             {
478                 //  Sense what the next top level token is. According to what
479                 //  this tells us, we will call something to handle that kind
480                 //  of thing.
481                 XMLSize_t orgReader;
482                 const XMLTokens curToken = senseNextToken(orgReader);
483 
484                 //  Handle character data and end of file specially. Char data
485                 //  is not markup so we don't want to handle it in the loop
486                 //  below.
487                 if (curToken == Token_CharData)
488                 {
489                     //  Scan the character data and call appropriate events. Let
490                     //  him use our local character data buffer for efficiency.
491                     scanCharData(fCDataBuf);
492                     continue;
493                 }
494                 else if (curToken == Token_EOF)
495                 {
496                     //  The element stack better be empty at this point or we
497                     //  ended prematurely before all elements were closed.
498                     if (!fElemStack.isEmpty())
499                     {
500                         const ElemStack::StackElem* topElem = fElemStack.popTop();
501                         emitError
502                         (
503                             XMLErrs::EndedWithTagsOnStack
504                             , topElem->fThisElement->getFullName()
505                         );
506                     }
507 
508                     // Its the end of file, so clear the got data flag
509                     gotData = false;
510                     continue;
511                 }
512 
513                 // We are in some sort of markup now
514                 inMarkup = true;
515 
516                 //  According to the token we got, call the appropriate
517                 //  scanning method.
518                 switch(curToken)
519                 {
520                     case Token_CData :
521                         // Make sure we are within content
522                         if (fElemStack.isEmpty())
523                             emitError(XMLErrs::CDATAOutsideOfContent);
524                         scanCDSection();
525                         break;
526 
527                     case Token_Comment :
528                         scanComment();
529                         break;
530 
531                     case Token_EndTag :
532                         scanEndTag(gotData);
533                         break;
534 
535                     case Token_PI :
536                         scanPI();
537                         break;
538 
539                     case Token_StartTag :
540                         if (fDoNamespaces)
541                             scanStartTagNS(gotData);
542                         else
543                             scanStartTag(gotData);
544                         break;
545 
546                     default :
547                         fReaderMgr.skipToChar(chOpenAngle);
548                         break;
549                 }
550 
551                 if (orgReader != fReaderMgr.getCurrentReaderNum())
552                     emitError(XMLErrs::PartialMarkupInEntity);
553 
554                 // And we are back out of markup again
555                 inMarkup = false;
556             }
557         }
558         catch(const EndOfEntityException& toCatch)
559         {
560             //  If we were in some markup when this happened, then its a
561             //  partial markup error.
562             if (inMarkup)
563                 emitError(XMLErrs::PartialMarkupInEntity);
564 
565             // Send an end of entity reference event
566             if (fDocHandler)
567                 fDocHandler->endEntityReference(toCatch.getEntity());
568 
569             inMarkup = false;
570         }
571     }
572 
573     // It went ok, so return success
574     return true;
575 }
576 
577 
scanEndTag(bool & gotData)578 void DGXMLScanner::scanEndTag(bool& gotData)
579 {
580     //  Assume we will still have data until proven otherwise. It will only
581     //  ever be false if this is the end of the root element.
582     gotData = true;
583 
584     //  Check if the element stack is empty. If so, then this is an unbalanced
585     //  element (i.e. more ends than starts, perhaps because of bad text
586     //  causing one to be skipped.)
587     if (fElemStack.isEmpty())
588     {
589         emitError(XMLErrs::MoreEndThanStartTags);
590         fReaderMgr.skipPastChar(chCloseAngle);
591         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager);
592     }
593 
594     //  Pop the stack of the element we are supposed to be ending. Remember
595     //  that we don't own this. The stack just keeps them and reuses them.
596     unsigned int uriId = (fDoNamespaces)
597         ? fElemStack.getCurrentURI() : fEmptyNamespaceId;
598 
599     //  Pop the stack of the element we are supposed to be ending. Remember
600     //  that we don't own this. The stack just keeps them and reuses them.
601     const ElemStack::StackElem* topElem = fElemStack.popTop();
602     XMLElementDecl *tempElement = topElem->fThisElement;
603 
604     // See if it was the root element, to avoid multiple calls below
605     const bool isRoot = fElemStack.isEmpty();
606 
607     // Make sure that its the end of the element that we expect
608     if (!fReaderMgr.skippedStringLong(tempElement->getFullName()))
609     {
610         emitError
611         (
612             XMLErrs::ExpectedEndOfTagX
613             , tempElement->getFullName()
614         );
615         fReaderMgr.skipPastChar(chCloseAngle);
616         return;
617     }
618 
619     // Make sure we are back on the same reader as where we started
620     if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum())
621         emitError(XMLErrs::PartialTagMarkupError);
622 
623     // Skip optional whitespace
624     fReaderMgr.skipPastSpaces();
625 
626     // Make sure we find the closing bracket
627     if (!fReaderMgr.skippedChar(chCloseAngle))
628     {
629         emitError
630         (
631             XMLErrs::UnterminatedEndTag
632             , topElem->fThisElement->getFullName()
633         );
634     }
635 
636     //  If validation is enabled, then lets pass him the list of children and
637     //  this element and let him validate it.
638     if (fValidate)
639     {
640 
641        //
642        // XML1.0-3rd
643        // Validity Constraint:
644        // The declaration matches EMPTY and the element has no content (not even
645        // entity references, comments, PIs or white space).
646        //
647        if ( (topElem->fCommentOrPISeen)               &&
648             (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Empty))
649        {
650            fValidator->emitError
651                (
652                XMLValid::EmptyElemHasContent
653                , topElem->fThisElement->getFullName()
654                );
655        }
656 
657        //
658        // XML1.0-3rd
659        // Validity Constraint:
660        //
661        // The declaration matches children and the sequence of child elements
662        // belongs to the language generated by the regular expression in the
663        // content model, with optional white space, comments and PIs
664        // (i.e. markup matching production [27] Misc) between the start-tag and
665        // the first child element, between child elements, or between the last
666        // child element and the end-tag.
667        //
668        // Note that
669        //    a CDATA section containing only white space or
670        //    a reference to an entity whose replacement text is character references
671        //       expanding to white space do not match the nonterminal S, and hence
672        //       cannot appear in these positions; however,
673        //    a reference to an internal entity with a literal value consisting
674        //       of character references expanding to white space does match S,
675        //       since its replacement text is the white space resulting from expansion
676        //       of the character references.
677        //
678        if ( (topElem->fReferenceEscaped)               &&
679             (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Children))
680        {
681            fValidator->emitError
682                (
683                XMLValid::ElemChildrenHasInvalidWS
684                , topElem->fThisElement->getFullName()
685                );
686        }
687 
688         XMLSize_t failure;
689         bool res = fValidator->checkContent
690         (
691             topElem->fThisElement
692             , topElem->fChildren
693             , topElem->fChildCount
694             , &failure
695         );
696 
697         if (!res)
698         {
699             //  One of the elements is not valid for the content. NOTE that
700             //  if no children were provided but the content model requires
701             //  them, it comes back with a zero value. But we cannot use that
702             //  to index the child array in this case, and have to put out a
703             //  special message.
704             if (!topElem->fChildCount)
705             {
706                 fValidator->emitError
707                 (
708                     XMLValid::EmptyNotValidForContent
709                     , topElem->fThisElement->getFormattedContentModel()
710                 );
711             }
712             else if (failure >= topElem->fChildCount)
713             {
714                 fValidator->emitError
715                 (
716                     XMLValid::NotEnoughElemsForCM
717                     , topElem->fThisElement->getFormattedContentModel()
718                 );
719             }
720             else
721             {
722                 fValidator->emitError
723                 (
724                     XMLValid::ElementNotValidForContent
725                     , topElem->fChildren[failure]->getRawName()
726                     , topElem->fThisElement->getFormattedContentModel()
727                 );
728             }
729         }
730     }
731 
732     // If we have a doc handler, tell it about the end tag
733     if (fDocHandler)
734     {
735         fDocHandler->endElement
736         (
737             *topElem->fThisElement
738             , uriId
739             , isRoot
740             , (fDoNamespaces)
741                 ? topElem->fThisElement->getElementName()->getPrefix()
742                 : XMLUni::fgZeroLenString
743         );
744     }
745 
746     // If this was the root, then done with content
747     gotData = !isRoot;
748 }
749 
750 
751 //  This method handles the high level logic of scanning the DOCType
752 //  declaration. This calls the DTDScanner and kicks off both the scanning of
753 //  the internal subset and the scanning of the external subset, if any.
754 //
755 //  When we get here the '<!DOCTYPE' part has already been scanned, which is
756 //  what told us that we had a doc type decl to parse.
scanDocTypeDecl()757 void DGXMLScanner::scanDocTypeDecl()
758 {
759     if (fDocTypeHandler)
760         fDocTypeHandler->resetDocType();
761 
762     // There must be some space after DOCTYPE
763     bool skippedSomething;
764     fReaderMgr.skipPastSpaces(skippedSomething);
765     if (!skippedSomething)
766     {
767         emitError(XMLErrs::ExpectedWhitespace);
768 
769         // Just skip the Doctype declaration and return
770         fReaderMgr.skipPastChar(chCloseAngle);
771         return;
772     }
773 
774     // Get a buffer for the root element
775     XMLBufBid bbRootName(&fBufMgr);
776 
777     //  Get a name from the input, which should be the name of the root
778     //  element of the upcoming content.
779     int  colonPosition;
780     bool validName = fDoNamespaces ? fReaderMgr.getQName(bbRootName.getBuffer(), &colonPosition) :
781                                      fReaderMgr.getName(bbRootName.getBuffer());
782     if (!validName)
783     {
784         if (bbRootName.isEmpty())
785             emitError(XMLErrs::NoRootElemInDOCTYPE);
786         else
787             emitError(XMLErrs::InvalidRootElemInDOCTYPE, bbRootName.getRawBuffer());
788         fReaderMgr.skipPastChar(chCloseAngle);
789         return;
790     }
791 
792     //  Store the root element name for later check
793     setRootElemName(bbRootName.getRawBuffer());
794 
795     //  This element obviously is not going to exist in the element decl
796     //  pool yet, but we need to call docTypeDecl. So force it into
797     //  the element decl pool, marked as being there because it was in
798     //  the DOCTYPE. Later, when its declared, the status will be updated.
799     //
800     //  Only do this if we are not reusing the validator! If we are reusing,
801     //  then look it up instead. It has to exist!
802     MemoryManager* const  rootDeclMgr =
803         fUseCachedGrammar ? fMemoryManager : fGrammarPoolMemoryManager;
804 
805     DTDElementDecl* rootDecl = new (rootDeclMgr) DTDElementDecl
806     (
807         bbRootName.getRawBuffer()
808         , fEmptyNamespaceId
809         , DTDElementDecl::Any
810         , rootDeclMgr
811     );
812 
813     Janitor<DTDElementDecl> rootDeclJanitor(rootDecl);
814     rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
815     rootDecl->setExternalElemDeclaration(true);
816     if(!fUseCachedGrammar)
817     {
818         fGrammar->putElemDecl(rootDecl);
819         rootDeclJanitor.release();
820     } else
821     {
822         // put this in the undeclared pool so it gets deleted...
823         XMLElementDecl* elemDecl = fDTDElemNonDeclPool->getByKey(bbRootName.getRawBuffer());
824         if (elemDecl)
825         {
826             rootDecl->setId(elemDecl->getId());
827         }
828         else
829         {
830             rootDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)rootDecl));
831             rootDeclJanitor.release();
832         }
833     }
834 
835     // Skip any spaces after the name
836     fReaderMgr.skipPastSpaces();
837 
838     //  And now if we are looking at a >, then we are done. It is not
839     //  required to have an internal or external subset, though why you
840     //  would not escapes me.
841     if (fReaderMgr.skippedChar(chCloseAngle)) {
842 
843         //  If we have a doc type handler and advanced callbacks are enabled,
844         //  call the doctype event.
845         if (fDocTypeHandler)
846             fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false);
847         return;
848     }
849 
850     // either internal/external subset
851     if (fValScheme == Val_Auto && !fValidate)
852         fValidate = true;
853 
854     bool    hasIntSubset = false;
855     bool    hasExtSubset = false;
856     XMLCh*  sysId = 0;
857     XMLCh*  pubId = 0;
858 
859     DTDScanner dtdScanner
860     (
861         (DTDGrammar*) fGrammar
862         , fDocTypeHandler
863         , fGrammarPoolMemoryManager
864         , fMemoryManager
865     );
866     dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr);
867 
868     //  If the next character is '[' then we have no external subset cause
869     //  there is no system id, just the opening character of the internal
870     //  subset. Else, has to be an id.
871     //
872     // Just look at the next char, don't eat it.
873     if (fReaderMgr.peekNextChar() == chOpenSquare)
874     {
875         hasIntSubset = true;
876     }
877     else
878     {
879         // Indicate we have an external subset
880         hasExtSubset = true;
881         fHasNoDTD = false;
882 
883         // Get buffers for the ids
884         XMLBufBid bbPubId(&fBufMgr);
885         XMLBufBid bbSysId(&fBufMgr);
886 
887         // Get the external subset id
888         if (!dtdScanner.scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), DTDScanner::IDType_External))
889         {
890             fReaderMgr.skipPastChar(chCloseAngle);
891             return;
892         }
893 
894         // Get copies of the ids we got
895         pubId = XMLString::replicate(bbPubId.getRawBuffer(), fMemoryManager);
896         sysId = XMLString::replicate(bbSysId.getRawBuffer(), fMemoryManager);
897 
898         // Skip spaces and check again for the opening of an internal subset
899         fReaderMgr.skipPastSpaces();
900 
901         // Just look at the next char, don't eat it.
902         if (fReaderMgr.peekNextChar() == chOpenSquare) {
903             hasIntSubset = true;
904         }
905     }
906 
907     // Insure that the ids get cleaned up, if they got allocated
908     ArrayJanitor<XMLCh> janSysId(sysId, fMemoryManager);
909     ArrayJanitor<XMLCh> janPubId(pubId, fMemoryManager);
910 
911     //  If we have a doc type handler and advanced callbacks are enabled,
912     //  call the doctype event.
913     if (fDocTypeHandler)
914         fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset, hasExtSubset);
915 
916     //  Ok, if we had an internal subset, we are just past the [ character
917     //  and need to parse that first.
918     if (hasIntSubset)
919     {
920         // Eat the opening square bracket
921         fReaderMgr.getNextChar();
922 
923         checkInternalDTD(hasExtSubset, sysId, pubId);
924 
925         //  And try to scan the internal subset. If we fail, try to recover
926         //  by skipping forward tot he close angle and returning.
927         if (!dtdScanner.scanInternalSubset())
928         {
929             fReaderMgr.skipPastChar(chCloseAngle);
930             return;
931         }
932 
933         //  Do a sanity check that some expanded PE did not propogate out of
934         //  the doctype. This could happen if it was terminated early by bad
935         //  syntax.
936         if (fReaderMgr.getReaderDepth() > 1)
937         {
938             emitError(XMLErrs::PEPropogated);
939 
940             // Ask the reader manager to pop back down to the main level
941             fReaderMgr.cleanStackBackTo(1);
942         }
943 
944         fReaderMgr.skipPastSpaces();
945     }
946 
947     // And that should leave us at the closing > of the DOCTYPE line
948     if (!fReaderMgr.skippedChar(chCloseAngle))
949     {
950         //  Do a special check for the common scenario of an extra ] char at
951         //  the end. This is easy to recover from.
952         if (fReaderMgr.skippedChar(chCloseSquare)
953         &&  fReaderMgr.skippedChar(chCloseAngle))
954         {
955             emitError(XMLErrs::ExtraCloseSquare);
956         }
957          else
958         {
959             emitError(XMLErrs::UnterminatedDOCTYPE);
960             fReaderMgr.skipPastChar(chCloseAngle);
961         }
962     }
963 
964     //  If we had an external subset, then we need to deal with that one
965     //  next. If we are reusing the validator, then don't scan it.
966     if (hasExtSubset) {
967 
968         InputSource* srcUsed=0;
969         Janitor<InputSource> janSrc(srcUsed);
970         // If we had an internal subset and we're using the cached grammar, it
971         // means that the ignoreCachedDTD is set, so we ignore the cached
972         // grammar
973         if (fUseCachedGrammar && !hasIntSubset)
974         {
975             srcUsed = resolveSystemId(sysId, pubId);
976             if (srcUsed) {
977                 janSrc.reset(srcUsed);
978                 Grammar* grammar = fGrammarResolver->getGrammar(srcUsed->getSystemId());
979 
980                 if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) {
981 
982                     fDTDGrammar = (DTDGrammar*) grammar;
983                     fGrammar = fDTDGrammar;
984                     fValidator->setGrammar(fGrammar);
985                     // If we don't report at least the external subset boundaries,
986                     // an advanced document handler cannot know when the DTD end,
987                     // since we've already sent a doctype decl that indicates there's
988                     // there's an external subset.
989                     if (fDocTypeHandler)
990                     {
991                         fDocTypeHandler->startExtSubset();
992                         fDocTypeHandler->endExtSubset();
993                     }
994 
995                     return;
996                 }
997             }
998         }
999 
1000         if (fLoadExternalDTD || fValidate)
1001         {
1002             // And now create a reader to read this entity
1003             XMLReader* reader;
1004             if(srcUsed) {
1005                 reader = fReaderMgr.createReader
1006                         (
1007                             *srcUsed
1008                             , false
1009                             , XMLReader::RefFrom_NonLiteral
1010                             , XMLReader::Type_General
1011                             , XMLReader::Source_External
1012                             , fCalculateSrcOfs
1013                             , fLowWaterMark
1014                         );
1015             }
1016             else {
1017                 reader = fReaderMgr.createReader
1018                         (
1019                             sysId
1020                             , pubId
1021                             , false
1022                             , XMLReader::RefFrom_NonLiteral
1023                             , XMLReader::Type_General
1024                             , XMLReader::Source_External
1025                             , srcUsed
1026                             , fCalculateSrcOfs
1027                             , fLowWaterMark
1028                             , fDisableDefaultEntityResolution
1029                         );
1030                 janSrc.reset(srcUsed);
1031             }
1032             //  If it failed then throw an exception
1033             if (!reader)
1034                 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed ? srcUsed->getSystemId() : sysId, fMemoryManager);
1035 
1036             if (fToCacheGrammar) {
1037 
1038                 unsigned int stringId = fGrammarResolver->getStringPool()->addOrFind(srcUsed->getSystemId());
1039                 const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(stringId);
1040 
1041                 fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString);
1042                 ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr);
1043                 fGrammarResolver->putGrammar(fGrammar);
1044             }
1045 
1046             //  In order to make the processing work consistently, we have to
1047             //  make this look like an external entity. So create an entity
1048             //  decl and fill it in and push it with the reader, as happens
1049             //  with an external entity. Put a janitor on it to insure it gets
1050             //  cleaned up. The reader manager does not adopt them.
1051             const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
1052             DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager);
1053             declDTD->setSystemId(sysId);
1054             declDTD->setIsExternal(true);
1055             Janitor<DTDEntityDecl> janDecl(declDTD);
1056 
1057             // Mark this one as a throw at end
1058             reader->setThrowAtEnd(true);
1059 
1060             // And push it onto the stack, with its pseudo name
1061             fReaderMgr.pushReader(reader, declDTD);
1062 
1063             // Tell it its not in an include section
1064             dtdScanner.scanExtSubsetDecl(false, true);
1065         }
1066     }
1067 }
1068 
scanStartTag(bool & gotData)1069 bool DGXMLScanner::scanStartTag(bool& gotData)
1070 {
1071     //  Assume we will still have data until proven otherwise. It will only
1072     //  ever be false if this is the root and its empty.
1073     gotData = true;
1074 
1075     //  Get the QName. In this case, we are not doing namespaces, so we just
1076     //  use it as is and don't have to break it into parts.
1077 
1078     bool validName = fReaderMgr.getName(fQNameBuf);
1079     if (!validName)
1080     {
1081         if (fQNameBuf.isEmpty())
1082             emitError(XMLErrs::ExpectedElementName);
1083         else
1084             emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer());
1085         fReaderMgr.skipToChar(chOpenAngle);
1086         return false;
1087     }
1088 
1089     // Assume it won't be an empty tag
1090     bool isEmpty = false;
1091 
1092     // See if its the root element
1093     const bool isRoot = fElemStack.isEmpty();
1094 
1095     //  Lets try to look up the element in the validator's element decl pool
1096     //  We can pass bogus values for the URI id and the base name. We know that
1097     //  this can only be called if we are doing a DTD style validator and that
1098     //  he will only look at the QName.
1099     //
1100     //  We *do not* tell him to fault in a decl if he does not find one - NG.
1101     bool wasAdded = false;
1102     const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
1103 
1104     XMLElementDecl* elemDecl = fGrammar->getElemDecl
1105     (
1106         fEmptyNamespaceId
1107         , 0
1108         , qnameRawBuf
1109         , Grammar::TOP_LEVEL_SCOPE
1110     );
1111     // look in the undeclared pool:
1112     if(!elemDecl)
1113     {
1114         elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf);
1115     }
1116     if(!elemDecl)
1117     {
1118         wasAdded = true;
1119         elemDecl = new (fMemoryManager) DTDElementDecl
1120         (
1121             qnameRawBuf
1122             , fEmptyNamespaceId
1123             , DTDElementDecl::Any
1124             , fMemoryManager
1125         );
1126         elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl));
1127     }
1128 
1129     if (fValidate) {
1130 
1131         if (wasAdded)
1132         {
1133             // This is to tell the reuse Validator that this element was
1134             // faulted-in, was not an element in the validator pool originally
1135             elemDecl->setCreateReason(XMLElementDecl::JustFaultIn);
1136 
1137             fValidator->emitError
1138             (
1139                 XMLValid::ElementNotDefined
1140                 , qnameRawBuf
1141             );
1142         }
1143         // If its not marked declared, then emit an error
1144         else if (!elemDecl->isDeclared())
1145         {
1146             fValidator->emitError
1147             (
1148                 XMLValid::ElementNotDefined
1149                 , qnameRawBuf
1150             );
1151         }
1152 
1153 
1154         fValidator->validateElement(elemDecl);
1155     }
1156 
1157     // Expand the element stack and add the new element
1158     fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
1159 
1160     //  If this is the first element and we are validating, check the root
1161     //  element.
1162     if (isRoot)
1163     {
1164         fRootGrammar = fGrammar;
1165 
1166         if (fValidate)
1167         {
1168             //  If a DocType exists, then check if it matches the root name there.
1169             if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName))
1170                 fValidator->emitError(XMLValid::RootElemNotLikeDocType);
1171         }
1172     }
1173     else if (fValidate)
1174     {
1175         //  If the element stack is not empty, then add this element as a
1176         //  child of the previous top element. If its empty, this is the root
1177         //  elem and is not the child of anything.
1178         fElemStack.addChild(elemDecl->getElementName(), true);
1179     }
1180 
1181     // Skip any whitespace after the name
1182     fReaderMgr.skipPastSpaces();
1183 
1184     //  We loop until we either see a /> or >, handling attribute/value
1185     //  pairs until we get there.
1186     XMLSize_t    attCount = 0;
1187     XMLSize_t    curAttListSize = fAttrList->size();
1188     wasAdded = false;
1189 
1190     fElemCount++;
1191 
1192     while (true)
1193     {
1194         // And get the next non-space character
1195         XMLCh nextCh = fReaderMgr.peekNextChar();
1196 
1197         //  If the next character is not a slash or closed angle bracket,
1198         //  then it must be whitespace, since whitespace is required
1199         //  between the end of the last attribute and the name of the next
1200         //  one.
1201         if (attCount)
1202         {
1203             if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
1204             {
1205                 if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
1206                 {
1207                     // Ok, skip by them and peek another char
1208                     fReaderMgr.skipPastSpaces();
1209                     nextCh = fReaderMgr.peekNextChar();
1210                 }
1211                  else
1212                 {
1213                     // Emit the error but keep on going
1214                     emitError(XMLErrs::ExpectedWhitespace);
1215                 }
1216             }
1217         }
1218 
1219         //  Ok, here we first check for any of the special case characters.
1220         //  If its not one, then we do the normal case processing, which
1221         //  assumes that we've hit an attribute value, Otherwise, we do all
1222         //  the special case checks.
1223         if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
1224         {
1225             //  Assume its going to be an attribute, so get a name from
1226             //  the input.
1227 
1228             validName = fReaderMgr.getName(fAttNameBuf);
1229             if (!validName)
1230             {
1231                 if (fAttNameBuf.isEmpty())
1232                     emitError(XMLErrs::ExpectedAttrName);
1233                 else
1234                     emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer());
1235                 fReaderMgr.skipPastChar(chCloseAngle);
1236                 return false;
1237             }
1238 
1239             // And next must be an equal sign
1240             if (!scanEq())
1241             {
1242                 static const XMLCh tmpList[] =
1243                 {
1244                     chSingleQuote, chDoubleQuote, chCloseAngle
1245                     , chOpenAngle, chForwardSlash, chNull
1246                 };
1247 
1248                 emitError(XMLErrs::ExpectedEqSign);
1249 
1250                 //  Try to sync back up by skipping forward until we either
1251                 //  hit something meaningful.
1252                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1253 
1254                 if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
1255                 {
1256                     // Jump back to top for normal processing of these
1257                     continue;
1258                 }
1259                 else if ((chFound == chSingleQuote)
1260                       ||  (chFound == chDoubleQuote)
1261                       ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1262                 {
1263                     // Just fall through assuming that the value is to follow
1264                 }
1265                 else if (chFound == chOpenAngle)
1266                 {
1267                     // Assume a malformed tag and that new one is starting
1268                     emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1269                     return false;
1270                 }
1271                 else
1272                 {
1273                     // Something went really wrong
1274                     return false;
1275                 }
1276             }
1277 
1278             //  See if this attribute is declared for this element. If we are
1279             //  not validating of course it will not be at first, but we will
1280             //  fault it into the pool (to avoid lots of redundant errors.)
1281             XMLCh * namePtr = fAttNameBuf.getRawBuffer();
1282             XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr);
1283 
1284             //  Skip any whitespace before the value and then scan the att
1285             //  value. This will come back normalized with entity refs and
1286             //  char refs expanded.
1287             fReaderMgr.skipPastSpaces();
1288             if (!scanAttValue(attDef, namePtr, fAttValueBuf))
1289             {
1290                 static const XMLCh tmpList[] =
1291                 {
1292                     chCloseAngle, chOpenAngle, chForwardSlash, chNull
1293                 };
1294 
1295                 emitError(XMLErrs::ExpectedAttrValue);
1296 
1297                 //  It failed, so lets try to get synced back up. We skip
1298                 //  forward until we find some whitespace or one of the
1299                 //  chars in our list.
1300                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1301 
1302                 if ((chFound == chCloseAngle)
1303                 ||  (chFound == chForwardSlash)
1304                 ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1305                 {
1306                     //  Just fall through and process this attribute, though
1307                     //  the value will be "".
1308                 }
1309                 else if (chFound == chOpenAngle)
1310                 {
1311                     // Assume a malformed tag and that new one is starting
1312                     emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1313                     return false;
1314                 }
1315                 else
1316                 {
1317                     // Something went really wrong
1318                     return false;
1319                 }
1320             }
1321 
1322             //  Add this attribute to the attribute list that we use to
1323             //  pass them to the handler. We reuse its existing elements
1324             //  but expand it as required.
1325             // Note that we want to this first since this will
1326             // make a copy of the namePtr; we can then make use of
1327             // that copy in the hashtable lookup that checks
1328             // for duplicates.  This will mean we may have to update
1329             // the type of the XMLAttr later.
1330             XMLAttr* curAtt;
1331             const XMLCh* attrValue = fAttValueBuf.getRawBuffer();
1332 
1333             if (attCount >= curAttListSize) {
1334                 curAtt = new (fMemoryManager) XMLAttr(fMemoryManager);
1335                 fAttrList->addElement(curAtt);
1336             }
1337             else {
1338                 curAtt = fAttrList->elementAt(attCount);
1339             }
1340 
1341             curAtt->setSpecified(true);
1342 
1343             // NO NAMESPACE CODE
1344             {
1345                 curAtt->set(
1346                     0, namePtr, XMLUni::fgZeroLenString, XMLUni::fgZeroLenString
1347                     , (attDef)?attDef->getType():XMLAttDef::CData
1348                 );
1349 
1350                 // now need to prepare for duplicate detection
1351                 if (attDef) {
1352                     unsigned int *curCountPtr = fAttDefRegistry->get(attDef);
1353                     if (!curCountPtr) {
1354                         curCountPtr = getNewUIntPtr();
1355                         *curCountPtr = fElemCount;
1356                         fAttDefRegistry->put(attDef, curCountPtr);
1357                     }
1358                     else if (*curCountPtr < fElemCount) {
1359                         *curCountPtr = fElemCount;
1360                     }
1361                     else {
1362                         emitError(
1363                             XMLErrs::AttrAlreadyUsedInSTag
1364                             , attDef->getFullName(), elemDecl->getFullName()
1365                         );
1366                     }
1367                 }
1368                 else
1369                 {
1370                     // reset namePtr so it refers to newly-allocated memory
1371                     namePtr = (XMLCh *)curAtt->getQName();
1372                     if (!fUndeclaredAttrRegistry->putIfNotPresent(namePtr, 0))
1373                     {
1374                         emitError(
1375                             XMLErrs::AttrAlreadyUsedInSTag
1376                             , namePtr, elemDecl->getFullName()
1377                         );
1378                     }
1379                 }
1380             }
1381 
1382             if (fValidate)
1383             {
1384                 if (attDef) {
1385                     // Let the validator pass judgement on the attribute value
1386                     fValidator->validateAttrValue(
1387                         attDef, fAttValueBuf.getRawBuffer(), false, elemDecl
1388                     );
1389                 }
1390                 else
1391                 {
1392                     fValidator->emitError
1393                     (
1394                         XMLValid::AttNotDefinedForElement
1395                         , fAttNameBuf.getRawBuffer(), qnameRawBuf
1396                     );
1397                 }
1398             }
1399 
1400             // must set the newly-minted value on the XMLAttr:
1401             curAtt->setValue(attrValue);
1402             attCount++;
1403 
1404             // And jump back to the top of the loop
1405             continue;
1406         }
1407 
1408         //  It was some special case character so do all of the checks and
1409         //  deal with it.
1410         if (!nextCh)
1411             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1412 
1413         if (nextCh == chForwardSlash)
1414         {
1415             fReaderMgr.getNextChar();
1416             isEmpty = true;
1417             if (!fReaderMgr.skippedChar(chCloseAngle))
1418                 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1419             break;
1420         }
1421         else if (nextCh == chCloseAngle)
1422         {
1423             fReaderMgr.getNextChar();
1424             break;
1425         }
1426         else if (nextCh == chOpenAngle)
1427         {
1428             //  Check for this one specially, since its going to be common
1429             //  and it is kind of auto-recovering since we've already hit the
1430             //  next open bracket, which is what we would have seeked to (and
1431             //  skipped this whole tag.)
1432             emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1433             break;
1434         }
1435         else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
1436         {
1437             //  Check for this one specially, which is probably a missing
1438             //  attribute name, e.g. ="value". Just issue expected name
1439             //  error and eat the quoted string, then jump back to the
1440             //  top again.
1441             emitError(XMLErrs::ExpectedAttrName);
1442             fReaderMgr.getNextChar();
1443             fReaderMgr.skipQuotedString(nextCh);
1444             fReaderMgr.skipPastSpaces();
1445             continue;
1446         }
1447     }
1448 
1449     if(attCount)
1450     {
1451         // clean up after ourselves:
1452         // clear the map used to detect duplicate attributes
1453         fUndeclaredAttrRegistry->removeAll();
1454     }
1455 
1456     //  Now lets get the fAttrList filled in. This involves faulting in any
1457     //  defaulted and fixed attributes and normalizing the values of any that
1458     //  we got explicitly.
1459     //
1460     //  We update the attCount value with the total number of attributes, but
1461     //  it goes in with the number of values we got during the raw scan of
1462     //  explictly provided attrs above.
1463     attCount = buildAttList(attCount, elemDecl, *fAttrList);
1464 
1465     //  If we have a document handler, then tell it about this start tag. We
1466     //  don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
1467     //  any prefix since its just one big name if we are not doing namespaces.
1468     unsigned int uriId = fEmptyNamespaceId;
1469     if (fDocHandler)
1470     {
1471         fDocHandler->startElement
1472         (
1473             *elemDecl
1474             , uriId
1475             , 0
1476             , *fAttrList
1477             , attCount
1478             , isEmpty
1479             , isRoot
1480         );
1481     }
1482 
1483     //  If empty, validate content right now if we are validating and then
1484     //  pop the element stack top. Else, we have to update the current stack
1485     //  top's namespace mapping elements.
1486     if (isEmpty)
1487     {
1488         // If validating, then insure that its legal to have no content
1489         if (fValidate)
1490         {
1491             XMLSize_t failure;
1492             bool res = fValidator->checkContent(elemDecl, 0, 0, &failure);
1493             if (!res)
1494             {
1495                 fValidator->emitError
1496                 (
1497                     XMLValid::ElementNotValidForContent
1498                     , qnameRawBuf
1499                     , elemDecl->getFormattedContentModel()
1500                 );
1501             }
1502         }
1503 
1504         // Pop the element stack back off since it'll never be used now
1505         fElemStack.popTop();
1506 
1507         // If the elem stack is empty, then it was an empty root
1508         if (isRoot)
1509             gotData = false;
1510     }
1511 
1512     return true;
1513 }
1514 
1515 
scanStartTagNS(bool & gotData)1516 bool DGXMLScanner::scanStartTagNS(bool& gotData)
1517 {
1518     //  Assume we will still have data until proven otherwise. It will only
1519     //  ever be false if this is the root and its empty.
1520     gotData = true;
1521 
1522     //  Get the QName. In this case, we are not doing namespaces, so we just
1523     //  use it as is and don't have to break it into parts.
1524 
1525     int  colonPosition;
1526     bool validName = fReaderMgr.getQName(fQNameBuf, &colonPosition);
1527     if (!validName)
1528     {
1529         if (fQNameBuf.isEmpty())
1530             emitError(XMLErrs::ExpectedElementName);
1531         else
1532             emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer());
1533         fReaderMgr.skipToChar(chOpenAngle);
1534         return false;
1535     }
1536 
1537     // Assume it won't be an empty tag
1538     bool isEmpty = false;
1539 
1540     // See if its the root element
1541     const bool isRoot = fElemStack.isEmpty();
1542 
1543     //  Lets try to look up the element in the validator's element decl pool
1544     //  We can pass bogus values for the URI id and the base name. We know that
1545     //  this can only be called if we are doing a DTD style validator and that
1546     //  he will only look at the QName.
1547     //
1548     //  We *do not* tell him to fault in a decl if he does not find one - NG.
1549     bool wasAdded = false;
1550     const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
1551 
1552     XMLElementDecl* elemDecl = fGrammar->getElemDecl
1553     (
1554         fEmptyNamespaceId
1555         , 0
1556         , qnameRawBuf
1557         , Grammar::TOP_LEVEL_SCOPE
1558     );
1559     // look in the undeclared pool:
1560     if(!elemDecl)
1561     {
1562         elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf);
1563     }
1564     if(!elemDecl)
1565     {
1566         wasAdded = true;
1567         elemDecl = new (fMemoryManager) DTDElementDecl
1568         (
1569             qnameRawBuf
1570             , fEmptyNamespaceId
1571             , DTDElementDecl::Any
1572             , fMemoryManager
1573         );
1574         elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl));
1575     }
1576 
1577     if (fValidate) {
1578 
1579         if (wasAdded)
1580         {
1581             // This is to tell the reuse Validator that this element was
1582             // faulted-in, was not an element in the validator pool originally
1583             elemDecl->setCreateReason(XMLElementDecl::JustFaultIn);
1584 
1585             fValidator->emitError
1586             (
1587                 XMLValid::ElementNotDefined
1588                 , qnameRawBuf
1589             );
1590         }
1591         // If its not marked declared, then emit an error
1592         else if (!elemDecl->isDeclared())
1593         {
1594             fValidator->emitError
1595             (
1596                 XMLValid::ElementNotDefined
1597                 , qnameRawBuf
1598             );
1599         }
1600 
1601 
1602         fValidator->validateElement(elemDecl);
1603     }
1604 
1605     // Expand the element stack and add the new element
1606     fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
1607 
1608     //  If this is the first element and we are validating, check the root
1609     //  element.
1610     if (isRoot)
1611     {
1612         fRootGrammar = fGrammar;
1613 
1614         if (fValidate)
1615         {
1616             //  If a DocType exists, then check if it matches the root name there.
1617             if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName))
1618                 fValidator->emitError(XMLValid::RootElemNotLikeDocType);
1619         }
1620     }
1621     else if (fValidate)
1622     {
1623         //  If the element stack is not empty, then add this element as a
1624         //  child of the previous top element. If its empty, this is the root
1625         //  elem and is not the child of anything.
1626         fElemStack.addChild(elemDecl->getElementName(), true);
1627     }
1628 
1629     // Skip any whitespace after the name
1630     fReaderMgr.skipPastSpaces();
1631 
1632     //  We loop until we either see a /> or >, handling attribute/value
1633     //  pairs until we get there.
1634     XMLSize_t    attCount = 0;
1635     XMLSize_t    curAttListSize = fAttrList->size();
1636     wasAdded = false;
1637 
1638     fElemCount++;
1639 
1640     while (true)
1641     {
1642         // And get the next non-space character
1643         XMLCh nextCh = fReaderMgr.peekNextChar();
1644 
1645         //  If the next character is not a slash or closed angle bracket,
1646         //  then it must be whitespace, since whitespace is required
1647         //  between the end of the last attribute and the name of the next
1648         //  one.
1649         if (attCount)
1650         {
1651             if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
1652             {
1653                 if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
1654                 {
1655                     // Ok, skip by them and peek another char
1656                     fReaderMgr.skipPastSpaces();
1657                     nextCh = fReaderMgr.peekNextChar();
1658                 }
1659                  else
1660                 {
1661                     // Emit the error but keep on going
1662                     emitError(XMLErrs::ExpectedWhitespace);
1663                 }
1664             }
1665         }
1666 
1667         //  Ok, here we first check for any of the special case characters.
1668         //  If its not one, then we do the normal case processing, which
1669         //  assumes that we've hit an attribute value, Otherwise, we do all
1670         //  the special case checks.
1671         if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
1672         {
1673             //  Assume its going to be an attribute, so get a name from
1674             //  the input.
1675 
1676             validName = fReaderMgr.getQName(fAttNameBuf, &colonPosition);
1677             if (!validName)
1678             {
1679                 if (fAttNameBuf.isEmpty())
1680                     emitError(XMLErrs::ExpectedAttrName);
1681                 else
1682                     emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer());
1683                 fReaderMgr.skipPastChar(chCloseAngle);
1684                 return false;
1685             }
1686 
1687             // And next must be an equal sign
1688             if (!scanEq())
1689             {
1690                 static const XMLCh tmpList[] =
1691                 {
1692                     chSingleQuote, chDoubleQuote, chCloseAngle
1693                     , chOpenAngle, chForwardSlash, chNull
1694                 };
1695 
1696                 emitError(XMLErrs::ExpectedEqSign);
1697 
1698                 //  Try to sync back up by skipping forward until we either
1699                 //  hit something meaningful.
1700                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1701 
1702                 if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
1703                 {
1704                     // Jump back to top for normal processing of these
1705                     continue;
1706                 }
1707                 else if ((chFound == chSingleQuote)
1708                       ||  (chFound == chDoubleQuote)
1709                       ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1710                 {
1711                     // Just fall through assuming that the value is to follow
1712                 }
1713                 else if (chFound == chOpenAngle)
1714                 {
1715                     // Assume a malformed tag and that new one is starting
1716                     emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1717                     return false;
1718                 }
1719                 else
1720                 {
1721                     // Something went really wrong
1722                     return false;
1723                 }
1724             }
1725 
1726             //  See if this attribute is declared for this element. If we are
1727             //  not validating of course it will not be at first, but we will
1728             //  fault it into the pool (to avoid lots of redundant errors.)
1729             XMLCh * namePtr = fAttNameBuf.getRawBuffer();
1730             XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr);
1731 
1732             //  Skip any whitespace before the value and then scan the att
1733             //  value. This will come back normalized with entity refs and
1734             //  char refs expanded.
1735             fReaderMgr.skipPastSpaces();
1736             if (!scanAttValue(attDef, namePtr, fAttValueBuf))
1737             {
1738                 static const XMLCh tmpList[] =
1739                 {
1740                     chCloseAngle, chOpenAngle, chForwardSlash, chNull
1741                 };
1742 
1743                 emitError(XMLErrs::ExpectedAttrValue);
1744 
1745                 //  It failed, so lets try to get synced back up. We skip
1746                 //  forward until we find some whitespace or one of the
1747                 //  chars in our list.
1748                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1749 
1750                 if ((chFound == chCloseAngle)
1751                 ||  (chFound == chForwardSlash)
1752                 ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1753                 {
1754                     //  Just fall through and process this attribute, though
1755                     //  the value will be "".
1756                 }
1757                 else if (chFound == chOpenAngle)
1758                 {
1759                     // Assume a malformed tag and that new one is starting
1760                     emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1761                     return false;
1762                 }
1763                 else
1764                 {
1765                     // Something went really wrong
1766                     return false;
1767                 }
1768             }
1769 
1770             //  Add this attribute to the attribute list that we use to
1771             //  pass them to the handler. We reuse its existing elements
1772             //  but expand it as required.
1773             // Note that we want to this first since this will
1774             // make a copy of the namePtr; we can then make use of
1775             // that copy in the hashtable lookup that checks
1776             // for duplicates.  This will mean we may have to update
1777             // the type of the XMLAttr later.
1778             XMLAttr* curAtt;
1779             const XMLCh* attrValue = fAttValueBuf.getRawBuffer();
1780 
1781             if (attCount >= curAttListSize) {
1782                 curAtt = new (fMemoryManager) XMLAttr(fMemoryManager);
1783                 fAttrList->addElement(curAtt);
1784             }
1785             else {
1786                 curAtt = fAttrList->elementAt(attCount);
1787             }
1788 
1789             curAtt->setSpecified(true);
1790             // DO NAMESPACES
1791             {
1792                 curAtt->set(
1793                     fEmptyNamespaceId, namePtr, XMLUni::fgZeroLenString
1794                     , (attDef)? attDef->getType() : XMLAttDef::CData
1795                 );
1796 
1797                 // each attribute has the prefix:suffix="value"
1798                 const XMLCh* attPrefix = curAtt->getPrefix();
1799                 const XMLCh* attLocalName = curAtt->getName();
1800 
1801                 if (attPrefix && *attPrefix) {
1802                     if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
1803                         curAtt->setURIId(fXMLNamespaceId);
1804                     }
1805                     else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {
1806                         curAtt->setURIId(fXMLNSNamespaceId);
1807                         updateNSMap(attPrefix, attLocalName, attrValue);
1808                     }
1809                     else {
1810                         fAttrNSList->addElement(curAtt);
1811                     }
1812                 }
1813                 else if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName))
1814                 {
1815                     updateNSMap(attPrefix, XMLUni::fgZeroLenString, attrValue);
1816                 }
1817 
1818                 // NOTE: duplicate attribute check will be done, when we map
1819                 //       namespaces to all attributes
1820                 if (attDef) {
1821                     unsigned int *curCountPtr = fAttDefRegistry->get(attDef);
1822                     if (!curCountPtr) {
1823                         curCountPtr = getNewUIntPtr();
1824                         *curCountPtr = fElemCount;
1825                         fAttDefRegistry->put(attDef, curCountPtr);
1826                    }
1827                     else if (*curCountPtr < fElemCount) {
1828                         *curCountPtr = fElemCount;
1829                     }
1830                 }
1831             }
1832 
1833             if (fValidate)
1834             {
1835                 if (attDef) {
1836                     // Let the validator pass judgement on the attribute value
1837                     fValidator->validateAttrValue(
1838                         attDef, fAttValueBuf.getRawBuffer(), false, elemDecl
1839                     );
1840                 }
1841                 else
1842                 {
1843                     fValidator->emitError
1844                     (
1845                         XMLValid::AttNotDefinedForElement
1846                         , fAttNameBuf.getRawBuffer(), qnameRawBuf
1847                     );
1848                 }
1849             }
1850 
1851             // must set the newly-minted value on the XMLAttr:
1852             curAtt->setValue(attrValue);
1853             attCount++;
1854 
1855             // And jump back to the top of the loop
1856             continue;
1857         }
1858 
1859         //  It was some special case character so do all of the checks and
1860         //  deal with it.
1861         if (!nextCh)
1862             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1863 
1864         if (nextCh == chForwardSlash)
1865         {
1866             fReaderMgr.getNextChar();
1867             isEmpty = true;
1868             if (!fReaderMgr.skippedChar(chCloseAngle))
1869                 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1870             break;
1871         }
1872         else if (nextCh == chCloseAngle)
1873         {
1874             fReaderMgr.getNextChar();
1875             break;
1876         }
1877         else if (nextCh == chOpenAngle)
1878         {
1879             //  Check for this one specially, since its going to be common
1880             //  and it is kind of auto-recovering since we've already hit the
1881             //  next open bracket, which is what we would have seeked to (and
1882             //  skipped this whole tag.)
1883             emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1884             break;
1885         }
1886         else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
1887         {
1888             //  Check for this one specially, which is probably a missing
1889             //  attribute name, e.g. ="value". Just issue expected name
1890             //  error and eat the quoted string, then jump back to the
1891             //  top again.
1892             emitError(XMLErrs::ExpectedAttrName);
1893             fReaderMgr.getNextChar();
1894             fReaderMgr.skipQuotedString(nextCh);
1895             fReaderMgr.skipPastSpaces();
1896             continue;
1897         }
1898     }
1899 
1900     //  Make an initial pass through the list and find any xmlns attributes.
1901     if (attCount)
1902       scanAttrListforNameSpaces(fAttrList, attCount, elemDecl);
1903 
1904     if(attCount)
1905     {
1906         // clean up after ourselves:
1907         // clear the map used to detect duplicate attributes
1908         fUndeclaredAttrRegistry->removeAll();
1909     }
1910 
1911     //  Now lets get the fAttrList filled in. This involves faulting in any
1912     //  defaulted and fixed attributes and normalizing the values of any that
1913     //  we got explicitly.
1914     //
1915     //  We update the attCount value with the total number of attributes, but
1916     //  it goes in with the number of values we got during the raw scan of
1917     //  explictly provided attrs above.
1918     attCount = buildAttList(attCount, elemDecl, *fAttrList);
1919 
1920     //  If we have a document handler, then tell it about this start tag. We
1921     //  don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
1922     //  any prefix since its just one big name if we are not doing namespaces.
1923     if (fDocHandler)
1924     {
1925         unsigned int uriId = resolvePrefix
1926             (
1927                 elemDecl->getElementName()->getPrefix()
1928                 , ElemStack::Mode_Element
1929             );
1930 
1931         fDocHandler->startElement
1932         (
1933             *elemDecl
1934             , uriId
1935             , elemDecl->getElementName()->getPrefix()
1936             , *fAttrList
1937             , attCount
1938             , isEmpty
1939             , isRoot
1940         );
1941     }
1942 
1943     //  If empty, validate content right now if we are validating and then
1944     //  pop the element stack top. Else, we have to update the current stack
1945     //  top's namespace mapping elements.
1946     if (isEmpty)
1947     {
1948         // If validating, then insure that its legal to have no content
1949         if (fValidate)
1950         {
1951             XMLSize_t failure;
1952             bool res = fValidator->checkContent(elemDecl, 0, 0, &failure);
1953             if (!res)
1954             {
1955                 fValidator->emitError
1956                 (
1957                     XMLValid::ElementNotValidForContent
1958                     , qnameRawBuf
1959                     , elemDecl->getFormattedContentModel()
1960                 );
1961             }
1962         }
1963 
1964         // Pop the element stack back off since it'll never be used now
1965         fElemStack.popTop();
1966 
1967         // If the elem stack is empty, then it was an empty root
1968         if (isRoot)
1969             gotData = false;
1970     }
1971 
1972     return true;
1973 }
1974 
1975 // ---------------------------------------------------------------------------
1976 //  DGXMLScanner: Grammar preparsing
1977 // ---------------------------------------------------------------------------
loadGrammar(const InputSource & src,const short grammarType,const bool toCache)1978 Grammar* DGXMLScanner::loadGrammar(const   InputSource& src
1979                                    , const short        grammarType
1980                                    , const bool         toCache)
1981 {
1982     Grammar* loadedGrammar = 0;
1983 
1984     ReaderMgrResetType  resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
1985 
1986     try
1987     {
1988         fGrammarResolver->cacheGrammarFromParse(false);
1989         fGrammarResolver->useCachedGrammarInParse(false);
1990         fRootGrammar = 0;
1991 
1992         if (fValScheme == Val_Auto) {
1993             fValidate = true;
1994         }
1995 
1996         // Reset some status flags
1997         fInException = false;
1998         fStandalone = false;
1999         fErrorCount = 0;
2000         fHasNoDTD = true;
2001 
2002         if (grammarType == Grammar::DTDGrammarType) {
2003             loadedGrammar = loadDTDGrammar(src, toCache);
2004         }
2005     }
2006     //  NOTE:
2007     //
2008     //  In all of the error processing below, the emitError() call MUST come
2009     //  before the flush of the reader mgr, or it will fail because it tries
2010     //  to find out the position in the XML source of the error.
2011     catch(const XMLErrs::Codes)
2012     {
2013         // This is a 'first failure' exception, so fall through
2014     }
2015     catch(const XMLValid::Codes)
2016     {
2017         // This is a 'first fatal error' type exit, so fall through
2018     }
2019     catch(const XMLException& excToCatch)
2020     {
2021         //  Emit the error and catch any user exception thrown from here. Make
2022         //  sure in all cases we flush the reader manager.
2023         fInException = true;
2024         try
2025         {
2026             if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
2027                 emitError
2028                 (
2029                     XMLErrs::XMLException_Warning
2030                     , excToCatch.getCode()
2031                     , excToCatch.getMessage()
2032                 );
2033             else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
2034                 emitError
2035                 (
2036                     XMLErrs::XMLException_Fatal
2037                     , excToCatch.getCode()
2038                     , excToCatch.getMessage()
2039                 );
2040             else
2041                 emitError
2042                 (
2043                     XMLErrs::XMLException_Error
2044                     , excToCatch.getCode()
2045                     , excToCatch.getMessage()
2046                 );
2047         }
2048         catch(const OutOfMemoryException&)
2049         {
2050             // This is a special case for out-of-memory
2051             // conditions, because resetting the ReaderMgr
2052             // can be problematic.
2053             resetReaderMgr.release();
2054 
2055             throw;
2056         }
2057     }
2058     catch(const OutOfMemoryException&)
2059     {
2060         // This is a special case for out-of-memory
2061         // conditions, because resetting the ReaderMgr
2062         // can be problematic.
2063         resetReaderMgr.release();
2064 
2065         throw;
2066     }
2067 
2068     return loadedGrammar;
2069 }
2070 
loadDTDGrammar(const InputSource & src,const bool toCache)2071 Grammar* DGXMLScanner::loadDTDGrammar(const InputSource& src,
2072                                       const bool toCache)
2073 {
2074     // Reset the validators
2075     fDTDValidator->reset();
2076     if (fValidatorFromUser)
2077         fValidator->reset();
2078 
2079     fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager);
2080     fGrammarResolver->putGrammar(fDTDGrammar);
2081     fGrammar = fDTDGrammar;
2082     fValidator->setGrammar(fGrammar);
2083 
2084     //  And for all installed handlers, send reset events. This gives them
2085     //  a chance to flush any cached data.
2086     if (fDocHandler)
2087         fDocHandler->resetDocument();
2088     if (fEntityHandler)
2089         fEntityHandler->resetEntities();
2090     if (fErrorReporter)
2091         fErrorReporter->resetErrors();
2092 
2093     // Clear out the id reference list
2094     resetValidationContext();
2095 
2096     if (toCache) {
2097 
2098         unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId());
2099         const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId);
2100 
2101         fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString);
2102         ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr);
2103         fGrammarResolver->putGrammar(fGrammar);
2104     }
2105 
2106     //  Handle the creation of the XML reader object for this input source.
2107     //  This will provide us with transcoding and basic lexing services.
2108     XMLReader* newReader = fReaderMgr.createReader
2109     (
2110         src
2111         , false
2112         , XMLReader::RefFrom_NonLiteral
2113         , XMLReader::Type_General
2114         , XMLReader::Source_External
2115         , fCalculateSrcOfs
2116         , fLowWaterMark
2117     );
2118     if (!newReader) {
2119         if (src.getIssueFatalErrorIfNotFound())
2120             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager);
2121         else
2122             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager);
2123     }
2124 
2125     //  In order to make the processing work consistently, we have to
2126     //  make this look like an external entity. So create an entity
2127     //  decl and fill it in and push it with the reader, as happens
2128     //  with an external entity. Put a janitor on it to insure it gets
2129     //  cleaned up. The reader manager does not adopt them.
2130     const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
2131     DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager);
2132     declDTD->setSystemId(src.getSystemId());
2133     declDTD->setIsExternal(true);
2134     Janitor<DTDEntityDecl> janDecl(declDTD);
2135 
2136     // Mark this one as a throw at end
2137     newReader->setThrowAtEnd(true);
2138 
2139     // And push it onto the stack, with its pseudo name
2140     fReaderMgr.pushReader(newReader, declDTD);
2141 
2142     //  If we have a doc type handler and advanced callbacks are enabled,
2143     //  call the doctype event.
2144     if (fDocTypeHandler) {
2145 
2146         // Create a dummy root
2147         DTDElementDecl* rootDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
2148         (
2149             gDTDStr
2150             , fEmptyNamespaceId
2151             , DTDElementDecl::Any
2152             , fGrammarPoolMemoryManager
2153         );
2154         rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
2155         rootDecl->setExternalElemDeclaration(true);
2156         Janitor<DTDElementDecl> janSrc(rootDecl);
2157 
2158         fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false, true);
2159     }
2160 
2161     // Create DTDScanner
2162     DTDScanner dtdScanner
2163     (
2164         (DTDGrammar*)fGrammar
2165         , fDocTypeHandler
2166         , fGrammarPoolMemoryManager
2167         , fMemoryManager
2168     );
2169     dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr);
2170 
2171     // Tell it its not in an include section
2172     dtdScanner.scanExtSubsetDecl(false, true);
2173 
2174     if (fValidate) {
2175         //  validate the DTD scan so far
2176         fValidator->preContentValidation(false, true);
2177     }
2178 
2179     if (toCache)
2180         fGrammarResolver->cacheGrammars();
2181 
2182     return fDTDGrammar;
2183 }
2184 
2185 
2186 // ---------------------------------------------------------------------------
2187 //  DGXMLScanner: Private helper methods
2188 // ---------------------------------------------------------------------------
2189 //  This method handles the common initialization, to avoid having to do
2190 //  it redundantly in multiple constructors.
commonInit()2191 void DGXMLScanner::commonInit()
2192 {
2193     //  And we need one for the raw attribute scan. This just stores key/
2194     //  value string pairs (prior to any processing.)
2195     fAttrNSList = new (fMemoryManager) ValueVectorOf<XMLAttr*>(8, fMemoryManager);
2196 
2197     //  Create the Validator and init them
2198     fDTDValidator = new (fMemoryManager) DTDValidator();
2199     initValidator(fDTDValidator);
2200     fDTDElemNonDeclPool = new (fMemoryManager) NameIdPool<DTDElementDecl>(29, 128, fMemoryManager);
2201     fAttDefRegistry = new (fMemoryManager) RefHashTableOf<unsigned int, PtrHasher>
2202     (
2203         131, false, fMemoryManager
2204     );
2205     fUndeclaredAttrRegistry = new (fMemoryManager) Hash2KeysSetOf<StringHasher>(7, fMemoryManager);
2206 
2207     if (fValidator)
2208     {
2209         if (!fValidator->handlesDTD())
2210            ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager);
2211     }
2212     else
2213     {
2214         fValidator = fDTDValidator;
2215     }
2216 }
2217 
//  Releases the helper objects that commonInit() allocates: the prefixed
//  attribute list, the scanner's own DTD validator, the pool of element
//  decls, the attribute def registry and the undeclared attribute
//  registry. fValidator itself is not deleted here.
void DGXMLScanner::cleanUp()
{
    delete fAttrNSList;
    delete fDTDValidator;
    delete fDTDElemNonDeclPool;
    delete fAttDefRegistry;
    delete fUndeclaredAttrRegistry;
}
2226 
2227 
2228 //  This method is called from scanStartTagNS() to build up the list of
2229 //  XMLAttr objects that will be passed out in the start tag callout. We
2230 //  get the key/value pairs from the raw scan of explicitly provided attrs,
2231 //  which have not been normalized. And we get the element declaration from
2232 //  which we will get any defaulted or fixed attribute defs and add those
2233 //  in as well.
2234 XMLSize_t
buildAttList(const XMLSize_t attCount,XMLElementDecl * elemDecl,RefVectorOf<XMLAttr> & toFill)2235 DGXMLScanner::buildAttList(const XMLSize_t              attCount
2236                           ,       XMLElementDecl*       elemDecl
2237                           ,       RefVectorOf<XMLAttr>& toFill)
2238 {
2239     //  Ask the element to clear the 'provided' flag on all of the att defs
2240     //  that it owns, and to return us a boolean indicating whether it has
2241     //  any defs.
2242     const bool hasDefs = elemDecl->hasAttDefs();
2243 
2244     //  If there are no expliclitily provided attributes and there are no
2245     //  defined attributes for the element, the we don't have anything to do.
2246     //  So just return zero in this case.
2247     if (!hasDefs && !attCount)
2248         return 0;
2249 
2250     // Keep up with how many attrs we end up with total
2251     XMLSize_t retCount = attCount;
2252 
2253     //  And get the current size of the output vector. This lets us use
2254     //  existing elements until we fill it, then start adding new ones.
2255     const XMLSize_t curAttListSize = toFill.size();
2256 
2257     //  Ok, so lets get an enumerator for the attributes of this element
2258     //  and run through them for well formedness and validity checks. But
2259     //  make sure that we had any attributes before we do it, since the list
2260     //  would have have gotten faulted in anyway.
2261     if (hasDefs)
2262     {
2263         XMLAttDefList& attDefList = elemDecl->getAttDefList();
2264         for(XMLSize_t i=0; i<attDefList.getAttDefCount(); i++)
2265         {
2266             // Get the current att def, for convenience and its def type
2267             XMLAttDef& curDef = attDefList.getAttDef(i);
2268 
2269             unsigned int *attCountPtr = fAttDefRegistry->get(&curDef);
2270             if (!attCountPtr || *attCountPtr < fElemCount)
2271             { // did not occur
2272                 const XMLAttDef::DefAttTypes defType = curDef.getDefaultType();
2273 
2274                 if (fValidate)
2275                 {
2276                     // If we are validating and its required, then an error
2277                     if (defType == XMLAttDef::Required)
2278                     {
2279                         fValidator->emitError
2280                         (
2281                             XMLValid::RequiredAttrNotProvided
2282                             , curDef.getFullName()
2283                         );
2284                     }
2285                     else if ((defType == XMLAttDef::Default) ||
2286 		                       (defType == XMLAttDef::Fixed)  )
2287                     {
2288                         if (fStandalone && curDef.isExternal())
2289                         {
2290                             // XML 1.0 Section 2.9
2291                             // Document is standalone, so attributes must not be defaulted.
2292                             fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName());
2293                         }
2294                     }
2295                 }
2296 
2297                 // Fault in the value if needed, and bump the att count
2298                 if ((defType == XMLAttDef::Default)
2299                 ||  (defType == XMLAttDef::Fixed))
2300                 {
2301                     // Let the validator pass judgement on the attribute value
2302                     if (fValidate)
2303                     {
2304                         fValidator->validateAttrValue
2305                         (
2306                             &curDef
2307                             , curDef.getValue()
2308                             , false
2309                             , elemDecl
2310                         );
2311                     }
2312 
2313                     XMLAttr* curAtt;
2314                     if (retCount >= curAttListSize)
2315                     {
2316                         if (fDoNamespaces)
2317                         {
2318                             curAtt = new (fMemoryManager) XMLAttr
2319                             (
2320                                 fEmptyNamespaceId
2321                                 , curDef.getFullName()
2322                                 , curDef.getValue()
2323                                 , curDef.getType()
2324                                 , false
2325                                 , fMemoryManager
2326                             );
2327                         }
2328                         else
2329                         {
2330                             curAtt = new (fMemoryManager) XMLAttr
2331                             (
2332                                 0
2333                                 , curDef.getFullName()
2334                                 , XMLUni::fgZeroLenString
2335                                 , curDef.getValue()
2336                                 , curDef.getType()
2337                                 , false
2338                                 , fMemoryManager
2339                             );
2340                         }
2341 
2342                         fAttrList->addElement(curAtt);
2343                     }
2344                     else
2345                     {
2346                         curAtt = fAttrList->elementAt(retCount);
2347                         if (fDoNamespaces)
2348                         {
2349                             curAtt->set
2350                             (
2351                                 fEmptyNamespaceId
2352                                 , curDef.getFullName()
2353                                 , curDef.getValue()
2354                                 , curDef.getType()
2355                             );
2356                         }
2357                         else
2358                         {
2359                             curAtt->set
2360                             (
2361                                 0
2362                                 , curDef.getFullName()
2363                                 , XMLUni::fgZeroLenString
2364                                 , curDef.getValue()
2365                                 , curDef.getType()
2366                             );
2367                         }
2368                         curAtt->setSpecified(false);
2369                     }
2370 
2371                     if (fDoNamespaces)
2372                     {
2373                         //  Map the new attribute's prefix to a URI id and store
2374                         //  that in the attribute object.
2375                         const XMLCh* attPrefix = curAtt->getPrefix();
2376                         if (attPrefix && *attPrefix) {
2377                             curAtt->setURIId
2378                             (
2379                                 resolvePrefix(attPrefix, ElemStack::Mode_Attribute)
2380                             );
2381                         }
2382                     }
2383 
2384                     retCount++;
2385                 }
2386             }
2387         }
2388     }
2389 
2390     return retCount;
2391 }
2392 
2393 
2394 //  This method will reset the scanner data structures, and related plugged
2395 //  in stuff, for a new scan session. We get the input source for the primary
2396 //  XML entity, create the reader for it, and push it on the stack so that
2397 //  upon successful return from here we are ready to go.
scanReset(const InputSource & src)2398 void DGXMLScanner::scanReset(const InputSource& src)
2399 {
2400 
2401     //  This call implicitly tells us that we are going to reuse the scanner
2402     //  if it was previously used. So tell the validator to reset itself.
2403     //
2404     //  But, if the fUseCacheGrammar flag is set, then don't reset it.
2405     //
2406     //  NOTE:   The ReaderMgr is flushed on the way out, because that is
2407     //          required to insure that files are closed.
2408     fGrammarResolver->cacheGrammarFromParse(fToCacheGrammar);
2409     fGrammarResolver->useCachedGrammarInParse(fUseCachedGrammar);
2410 
2411     fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager);
2412     fGrammarResolver->putGrammar(fDTDGrammar);
2413     fGrammar = fDTDGrammar;
2414     fRootGrammar = 0;
2415     fValidator->setGrammar(fGrammar);
2416 
2417     // Reset validation
2418     fValidate = (fValScheme == Val_Always) ? true : false;
2419 
2420     //  And for all installed handlers, send reset events. This gives them
2421     //  a chance to flush any cached data.
2422     if (fDocHandler)
2423         fDocHandler->resetDocument();
2424     if (fEntityHandler)
2425         fEntityHandler->resetEntities();
2426     if (fErrorReporter)
2427         fErrorReporter->resetErrors();
2428 
2429     // Clear out the id reference list
2430     resetValidationContext();
2431 
2432     // Reset the Root Element Name
2433     fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName;
2434     fRootElemName = 0;
2435 
2436     //  Reset the element stack, and give it the latest ids for the special
2437     //  URIs it has to know about.
2438     fElemStack.reset
2439     (
2440         fEmptyNamespaceId
2441         , fUnknownNamespaceId
2442         , fXMLNamespaceId
2443         , fXMLNSNamespaceId
2444     );
2445 
2446     // Reset some status flags
2447     fInException = false;
2448     fStandalone = false;
2449     fErrorCount = 0;
2450     fHasNoDTD = true;
2451 
2452     // Reset the validators
2453     fDTDValidator->reset();
2454     fDTDValidator->setErrorReporter(fErrorReporter);
2455     if (fValidatorFromUser)
2456         fValidator->reset();
2457 
2458     //  Handle the creation of the XML reader object for this input source.
2459     //  This will provide us with transcoding and basic lexing services.
2460     XMLReader* newReader = fReaderMgr.createReader
2461     (
2462         src
2463         , true
2464         , XMLReader::RefFrom_NonLiteral
2465         , XMLReader::Type_General
2466         , XMLReader::Source_External
2467         , fCalculateSrcOfs
2468         , fLowWaterMark
2469     );
2470 
2471     if (!newReader) {
2472         if (src.getIssueFatalErrorIfNotFound())
2473             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager);
2474         else
2475             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager);
2476     }
2477 
2478     // Push this read onto the reader manager
2479     fReaderMgr.pushReader(newReader, 0);
2480 
2481     // and reset security-related things if necessary:
2482     if(fSecurityManager != 0)
2483     {
2484         fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit();
2485         fEntityExpansionCount = 0;
2486     }
2487     if(fUIntPoolRowTotal >= 32)
2488     { // 8 KB tied up with validating attributes...
2489         fAttDefRegistry->removeAll();
2490         recreateUIntPool();
2491     }
2492     else
2493     {
2494         // note that this will implicitly reset the values of the hashtables,
2495         // though their buckets will still be tied up
2496         resetUIntPool();
2497     }
2498     fUndeclaredAttrRegistry->removeAll();
2499     fAttrNSList->removeAllElements();
2500 }
2501 
2502 
2503 //  This method is called between markup in content. It scans for character
2504 //  data that is sent to the document handler. It watches for any markup
2505 //  characters that would indicate that the character data has ended. It also
2506 //  handles expansion of general and character entities.
2507 //
2508 //  sendData() is a local static helper for this method which handles some
2509 //  code that must be done in three different places here.
sendCharData(XMLBuffer & toSend)2510 void DGXMLScanner::sendCharData(XMLBuffer& toSend)
2511 {
2512     // If no data in the buffer, then nothing to do
2513     if (toSend.isEmpty())
2514         return;
2515 
2516     //  We do different things according to whether we are validating or
2517     //  not. If not, its always just characters; else, it depends on the
2518     //  current element's content model.
2519     if (fValidate)
2520     {
2521         // Get the raw data we need for the callback
2522         const XMLCh* const rawBuf = toSend.getRawBuffer();
2523         const XMLSize_t len = toSend.getLen();
2524 
2525         // And see if the current element is a 'Children' style content model
2526         const ElemStack::StackElem* topElem = fElemStack.topElement();
2527 
2528         // Get the character data opts for the current element
2529         XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();
2530 
2531         if (charOpts == XMLElementDecl::NoCharData)
2532         {
2533             // They definitely cannot handle any type of char data
2534             fValidator->emitError(XMLValid::NoCharDataInCM);
2535         }
2536         else if (fReaderMgr.getCurrentReader()->isAllSpaces(rawBuf, len))
2537         {
2538             //  Its all spaces. So, if they can take spaces, then send it
2539             //  as ignorable whitespace. If they can handle any char data
2540             //  send it as characters.
2541             if (charOpts == XMLElementDecl::SpacesOk) {
2542                 if (fDocHandler)
2543                     fDocHandler->ignorableWhitespace(rawBuf, len, false);
2544             }
2545             else if (charOpts == XMLElementDecl::AllCharData)
2546             {
2547                 if (fDocHandler)
2548                     fDocHandler->docCharacters(rawBuf, len, false);
2549             }
2550         }
2551         else
2552         {
2553             //  If they can take any char data, then send it. Otherwise, they
2554             //  can only handle whitespace and can't handle this stuff so
2555             //  issue an error.
2556             if (charOpts == XMLElementDecl::AllCharData)
2557             {
2558                 if (fDocHandler)
2559                     fDocHandler->docCharacters(rawBuf, len, false);
2560             }
2561             else
2562             {
2563                 fValidator->emitError(XMLValid::NoCharDataInCM);
2564             }
2565         }
2566     }
2567     else
2568     {
2569         // Always assume its just char data if not validating
2570         if (fDocHandler)
2571             fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);
2572     }
2573 
2574     // Reset buffer
2575     toSend.reset();
2576 }
2577 
2578 
2579 
2580 //  This method is called with a key/value string pair that represents an
2581 //  xmlns="yyy" or xmlns:xxx="yyy" attribute. This method will update the
2582 //  current top of the element stack based on this data. We know that when
2583 //  we get here, that it is one of these forms, so we don't bother confirming
2584 //  it.
2585 //
2586 //  But we have to ensure
2587 //      1. xxx is not xmlns
2588 //      2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa
2589 //      3. yyy is not XMLUni::fgXMLNSURIName
2590 //      4. if xxx is not null, then yyy cannot be an empty string.
updateNSMap(const XMLCh * const attrPrefix,const XMLCh * const attrLocalName,const XMLCh * const attrValue)2591 void DGXMLScanner::updateNSMap(const    XMLCh* const attrPrefix
2592                                , const  XMLCh* const attrLocalName
2593                                , const  XMLCh* const attrValue)
2594 {
2595     //  We either have the default prefix (""), or we point it into the attr
2596     //  name parameter. Note that the xmlns is not the prefix we care about
2597     //  here. To us, the 'prefix' is really the local part of the attrName
2598     //  parameter.
2599     //
2600     //  Check 1. xxx is not xmlns
2601     //        2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa
2602     //        3. yyy is not XMLUni::fgXMLNSURIName
2603     //        4. if xxx is not null, then yyy cannot be an empty string.
2604     if (attrPrefix && *attrPrefix) {
2605 
2606         if (XMLString::equals(attrLocalName, XMLUni::fgXMLNSString))
2607             emitError(XMLErrs::NoUseOfxmlnsAsPrefix);
2608         else if (XMLString::equals(attrLocalName, XMLUni::fgXMLString)) {
2609             if (!XMLString::equals(attrValue, XMLUni::fgXMLURIName))
2610                 emitError(XMLErrs::PrefixXMLNotMatchXMLURI);
2611         }
2612 
2613         if (!attrValue)
2614             emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName);
2615         else if(!*attrValue && fXMLVersion == XMLReader::XMLV1_0)
2616             emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName);
2617     }
2618 
2619     if (XMLString::equals(attrValue, XMLUni::fgXMLNSURIName))
2620         emitError(XMLErrs::NoUseOfxmlnsURI);
2621     else if (XMLString::equals(attrValue, XMLUni::fgXMLURIName)) {
2622         if (!XMLString::equals(attrLocalName, XMLUni::fgXMLString))
2623             emitError(XMLErrs::XMLURINotMatchXMLPrefix);
2624     }
2625 
2626     //  Ok, we have to get the unique id for the attribute value, which is the
2627     //  URI that this value should be mapped to. The validator has the
2628     //  namespace string pool, so we ask him to find or add this new one. Then
2629     //  we ask the element stack to add this prefix to URI Id mapping.
2630     fElemStack.addPrefix
2631     (
2632         attrLocalName
2633         , fURIStringPool->addOrFind(attrValue)
2634     );
2635 }
2636 
scanAttrListforNameSpaces(RefVectorOf<XMLAttr> * theAttrList,XMLSize_t attCount,XMLElementDecl * elemDecl)2637 void DGXMLScanner::scanAttrListforNameSpaces(RefVectorOf<XMLAttr>* theAttrList, XMLSize_t attCount,
2638                                                 XMLElementDecl*       elemDecl)
2639 {
2640     // Map prefixes to uris
2641     for (XMLSize_t i=0; i < fAttrNSList->size(); i++) {
2642         XMLAttr* providedAttr = fAttrNSList->elementAt(i);
2643         providedAttr->setURIId(
2644             resolvePrefix(providedAttr->getPrefix(), ElemStack::Mode_Attribute)
2645         );
2646     }
2647 
2648     fAttrNSList->removeAllElements();
2649 
2650      // Decide if to use hash table to do duplicate checking
2651     bool toUseHashTable = false;
2652 
2653 	setAttrDupChkRegistry(attCount, toUseHashTable);
2654     for (XMLSize_t index = 0; index < attCount; index++)
2655     {
2656         // check for duplicate namespace attributes:
2657         // by checking for qualified names with the same local part and with prefixes
2658         // which have been bound to namespace names that are identical.
2659         XMLAttr* curAttr = theAttrList->elementAt(index);
2660         if (!toUseHashTable)
2661         {
2662             XMLAttr* loopAttr;
2663             for (XMLSize_t attrIndex=0; attrIndex < index; attrIndex++) {
2664                 loopAttr = theAttrList->elementAt(attrIndex);
2665                 if (loopAttr->getURIId() == curAttr->getURIId() &&
2666                     XMLString::equals(loopAttr->getName(), curAttr->getName())) {
2667                     emitError(
2668                         XMLErrs::AttrAlreadyUsedInSTag, curAttr->getName()
2669                         , elemDecl->getFullName()
2670                     );
2671                 }
2672             }
2673         }
2674         else
2675         {
2676             if (fAttrDupChkRegistry->containsKey((void*)curAttr->getName(), curAttr->getURIId()))
2677             {
2678                 emitError(
2679                     XMLErrs::AttrAlreadyUsedInSTag
2680                     , curAttr->getName(), elemDecl->getFullName()
2681                 );
2682             }
2683 
2684             fAttrDupChkRegistry->put((void*)curAttr->getName(), curAttr->getURIId(), curAttr);
2685         }
2686     }
2687 }
2688 
resolveSystemId(const XMLCh * const sysId,const XMLCh * const pubId)2689 InputSource* DGXMLScanner::resolveSystemId(const XMLCh* const sysId
2690                                           ,const XMLCh* const pubId)
2691 {
2692     //Normalize sysId
2693     XMLBufBid nnSys(&fBufMgr);
2694     XMLBuffer& normalizedSysId = nnSys.getBuffer();
2695     XMLString::removeChar(sysId, 0xFFFF, normalizedSysId);
2696     const XMLCh* normalizedURI = normalizedSysId.getRawBuffer();
2697 
2698     // Create a buffer for expanding the normalized system id
2699     XMLBufBid bbSys(&fBufMgr);
2700     XMLBuffer& expSysId = bbSys.getBuffer();
2701 
2702     //  Allow the entity handler to expand the system id if they choose
2703     //  to do so.
2704     InputSource* srcToFill = 0;
2705     if (fEntityHandler)
2706     {
2707         if (!fEntityHandler->expandSystemId(normalizedURI, expSysId))
2708             expSysId.set(normalizedURI);
2709 
2710         ReaderMgr::LastExtEntityInfo lastInfo;
2711         fReaderMgr.getLastExtEntityInfo(lastInfo);
2712         XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
2713                             expSysId.getRawBuffer(), 0, pubId, lastInfo.systemId,
2714                             &fReaderMgr);
2715         srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier);
2716     }
2717     else
2718     {
2719         expSysId.set(normalizedURI);
2720     }
2721 
2722     //  If they didn't create a source via the entity handler, then we
2723     //  have to create one on our own.
2724     if (!srcToFill)
2725     {
2726         if (fDisableDefaultEntityResolution)
2727             return srcToFill;
2728 
2729         ReaderMgr::LastExtEntityInfo lastInfo;
2730         fReaderMgr.getLastExtEntityInfo(lastInfo);
2731 
2732         XMLURL urlTmp(fMemoryManager);
2733         if ((!urlTmp.setURL(lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)) ||
2734             (urlTmp.isRelative()))
2735         {
2736             if (!fStandardUriConformant)
2737             {
2738                 XMLBufBid  ddSys(&fBufMgr);
2739                 XMLBuffer& resolvedSysId = ddSys.getBuffer();
2740                 XMLUri::normalizeURI(expSysId.getRawBuffer(), resolvedSysId);
2741 
2742                 srcToFill = new (fMemoryManager) LocalFileInputSource
2743                 (
2744                     lastInfo.systemId
2745                     , resolvedSysId.getRawBuffer()
2746                     , fMemoryManager
2747                 );
2748             }
2749             else
2750                 ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
2751         }
2752         else
2753         {
2754             if (fStandardUriConformant && urlTmp.hasInvalidChar())
2755                 ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
2756             srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager);
2757         }
2758     }
2759 
2760     return srcToFill;
2761 }
2762 
2763 // ---------------------------------------------------------------------------
2764 //  DGXMLScanner: Private parsing methods
2765 // ---------------------------------------------------------------------------
scanAttValue(const XMLAttDef * const attDef,const XMLCh * const attrName,XMLBuffer & toFill)2766 bool DGXMLScanner::scanAttValue(  const   XMLAttDef* const    attDef
2767                                   , const XMLCh *const attrName
2768                                   ,       XMLBuffer&          toFill)
2769 {
2770     enum States
2771     {
2772         InWhitespace
2773         , InContent
2774     };
2775 
2776     // Get the type and name
2777     const XMLAttDef::AttTypes type = (attDef)
2778                         ?attDef->getType()
2779                         :XMLAttDef::CData;
2780 
2781     // Reset the target buffer
2782     toFill.reset();
2783 
2784     // Get the next char which must be a single or double quote
2785     XMLCh quoteCh;
2786     if (!fReaderMgr.skipIfQuote(quoteCh))
2787         return false;
2788 
2789     //  We have to get the current reader because we have to ignore closing
2790     //  quotes until we hit the same reader again.
2791     const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum();
2792 
2793     // Get attribute def - to check to see if it's declared externally or not
2794     bool  isAttExternal = (attDef)
2795                         ?attDef->isExternal()
2796                         :false;
2797 
2798     //  Loop until we get the attribute value. Note that we use a double
2799     //  loop here to avoid the setup/teardown overhead of the exception
2800     //  handler on every round.
2801     XMLCh   nextCh;
2802     XMLCh   secondCh = 0;
2803     States  curState = InContent;
2804     bool    firstNonWS = false;
2805     bool    gotLeadingSurrogate = false;
2806     bool    escaped;
2807     while (true)
2808     {
2809     try
2810     {
2811         while(true)
2812         {
2813             nextCh = fReaderMgr.getNextChar();
2814 
2815             if (!nextCh)
2816                 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
2817 
2818             // Check for our ending quote in the same entity
2819             if (nextCh == quoteCh)
2820             {
2821                 if (curReader == fReaderMgr.getCurrentReaderNum())
2822                     return true;
2823 
2824                 // Watch for spillover into a previous entity
2825                 if (curReader > fReaderMgr.getCurrentReaderNum())
2826                 {
2827                     emitError(XMLErrs::PartialMarkupInEntity);
2828                     return false;
2829                 }
2830             }
2831 
2832             //  Check for an entity ref now, before we let it affect our
2833             //  whitespace normalization logic below. We ignore the empty flag
2834             //  in this one.
2835             escaped = false;
2836             if (nextCh == chAmpersand)
2837             {
2838                 if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
2839                 {
2840                     gotLeadingSurrogate = false;
2841                     continue;
2842                 }
2843             }
2844             else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
2845             {
2846                 // Deal with surrogate pairs
2847                 //  Its a leading surrogate. If we already got one, then
2848                 //  issue an error, else set leading flag to make sure that
2849                 //  we look for a trailing next time.
2850                 if (gotLeadingSurrogate)
2851                     emitError(XMLErrs::Expected2ndSurrogateChar);
2852                 else
2853                     gotLeadingSurrogate = true;
2854             }
2855             else
2856             {
2857                 //  If its a trailing surrogate, make sure that we are
2858                 //  prepared for that. Else, its just a regular char so make
2859                 //  sure that we were not expected a trailing surrogate.
2860                 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
2861                 {
2862                     // Its trailing, so make sure we were expecting it
2863                     if (!gotLeadingSurrogate)
2864                         emitError(XMLErrs::Unexpected2ndSurrogateChar);
2865                 }
2866                 else
2867                 {
2868                     //  Its just a char, so make sure we were not expecting a
2869                     //  trailing surrogate.
2870                     if (gotLeadingSurrogate)
2871                         emitError(XMLErrs::Expected2ndSurrogateChar);
2872 
2873                     // Its got to at least be a valid XML character
2874                     if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
2875                     {
2876                         XMLCh tmpBuf[9];
2877                         XMLString::binToText
2878                         (
2879                             nextCh
2880                             , tmpBuf
2881                             , 8
2882                             , 16
2883                             , fMemoryManager
2884                         );
2885                         emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
2886                     }
2887                 }
2888                 gotLeadingSurrogate = false;
2889             }
2890 
2891             //  If its not escaped, then make sure its not a < character, which
2892             //  is not allowed in attribute values.
2893             if (!escaped && (nextCh == chOpenAngle))
2894                 emitError(XMLErrs::BracketInAttrValue, attrName);
2895 
2896             //  If the attribute is a CDATA type we do simple replacement of
2897             //  tabs and new lines with spaces, if the character is not escaped
2898             //  by way of a char ref.
2899             //
2900             //  Otherwise, we do the standard non-CDATA normalization of
2901             //  compressing whitespace to single spaces and getting rid of leading
2902             //  and trailing whitespace.
2903             if (type == XMLAttDef::CData)
2904             {
2905                 if (!escaped)
2906                 {
2907                     if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
2908                     {
2909                         // Check Validity Constraint for Standalone document declaration
2910                         // XML 1.0, Section 2.9
2911                         if (fStandalone && fValidate && isAttExternal)
2912                         {
2913                              // Can't have a standalone document declaration of "yes" if  attribute
2914                              // values are subject to normalisation
2915                              fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
2916                         }
2917                         nextCh = chSpace;
2918                     }
2919                 }
2920             }
2921             else
2922             {
2923                 if (curState == InWhitespace)
2924                 {
2925                     if ((escaped && nextCh != chSpace) || !fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
2926                     {
2927                         if (firstNonWS)
2928                             toFill.append(chSpace);
2929                         curState = InContent;
2930                         firstNonWS = true;
2931                     }
2932                     else
2933                     {
2934                         continue;
2935                     }
2936                 }
2937                 else if (curState == InContent)
2938                 {
2939                     if ((nextCh == chSpace) ||
2940                         (fReaderMgr.getCurrentReader()->isWhitespace(nextCh) && !escaped))
2941                     {
2942                         curState = InWhitespace;
2943 
2944                         // Check Validity Constraint for Standalone document declaration
2945                         // XML 1.0, Section 2.9
2946                         if (fStandalone && fValidate && isAttExternal)
2947                         {
2948                             if (!firstNonWS || (nextCh != chSpace) || (fReaderMgr.lookingAtSpace()))
2949                             {
2950                                  // Can't have a standalone document declaration of "yes" if  attribute
2951                                  // values are subject to normalisation
2952                                  fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
2953                             }
2954                         }
2955                         continue;
2956                     }
2957                     firstNonWS = true;
2958                 }
2959             }
2960 
2961             // Else add it to the buffer
2962             toFill.append(nextCh);
2963 
2964             if (secondCh)
2965             {
2966                 toFill.append(secondCh);
2967                 secondCh=0;
2968             }
2969         }
2970     }
2971     catch(const EndOfEntityException&)
2972     {
2973         // Just eat it and continue.
2974         gotLeadingSurrogate = false;
2975         escaped = false;
2976     }
2977     }
2978     return true;
2979 }
2980 
2981 
2982 //  This method scans a CDATA section. It collects the character into one
2983 //  of the temp buffers and calls the document handler, if any, with the
2984 //  characters. It assumes that the <![CDATA string has been scanned before
2985 //  this call.
scanCDSection()2986 void DGXMLScanner::scanCDSection()
2987 {
2988     static const XMLCh CDataClose[] =
2989     {
2990             chCloseSquare, chCloseAngle, chNull
2991     };
2992 
2993     //  The next character should be the opening square bracket. If not
2994     //  issue an error, but then try to recover by skipping any whitespace
2995     //  and checking again.
2996     if (!fReaderMgr.skippedChar(chOpenSquare))
2997     {
2998         emitError(XMLErrs::ExpectedOpenSquareBracket);
2999         fReaderMgr.skipPastSpaces();
3000 
3001         // If we still don't find it, then give up, else keep going
3002         if (!fReaderMgr.skippedChar(chOpenSquare))
3003             return;
3004     }
3005 
3006     // Get a buffer for this
3007     XMLBufBid bbCData(&fBufMgr);
3008 
3009     //  We just scan forward until we hit the end of CDATA section sequence.
3010     //  CDATA is effectively a big escape mechanism so we don't treat markup
3011     //  characters specially here.
3012     bool            emittedError = false;
3013     bool     gotLeadingSurrogate = false;
3014 
3015     // Get the character data opts for the current element
3016     const ElemStack::StackElem* topElem = fElemStack.topElement();
3017     XMLElementDecl::CharDataOpts charOpts =  topElem->fThisElement->getCharDataOpts();
3018 
3019     while (true)
3020     {
3021         const XMLCh nextCh = fReaderMgr.getNextChar();
3022 
3023         // Watch for unexpected end of file
3024         if (!nextCh)
3025         {
3026             emitError(XMLErrs::UnterminatedCDATASection);
3027             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3028         }
3029 
3030         if (fValidate && fStandalone && (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)))
3031         {
3032             // This document is standalone; this ignorable CDATA whitespace is forbidden.
3033             // XML 1.0, Section 2.9
3034             // And see if the current element is a 'Children' style content model
3035             if (topElem->fThisElement->isExternal()) {
3036 
3037                 if (charOpts == XMLElementDecl::SpacesOk) // Element Content
3038                 {
3039                     // Error - standalone should have a value of "no" as whitespace detected in an
3040                     // element type with element content whose element declaration was external
3041                     fValidator->emitError(XMLValid::NoWSForStandalone);
3042                 }
3043             }
3044         }
3045 
3046         //  If this is a close square bracket it could be our closing
3047         //  sequence.
3048         if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
3049         {
3050             //  make sure we were not expecting a trailing surrogate.
3051             if (gotLeadingSurrogate)
3052                 emitError(XMLErrs::Expected2ndSurrogateChar);
3053 
3054             if (fValidate) {
3055 
3056                 if (charOpts != XMLElementDecl::AllCharData)
3057                 {
3058                     // They definitely cannot handle any type of char data
3059                     fValidator->emitError(XMLValid::NoCharDataInCM);
3060                 }
3061             }
3062 
3063             // If we have a doc handler, call it
3064             if (fDocHandler)
3065             {
3066                 fDocHandler->docCharacters
3067                     (
3068                     bbCData.getRawBuffer()
3069                     , bbCData.getLen()
3070                     , true
3071                     );
3072             }
3073 
3074             // And we are done
3075             break;
3076         }
3077 
3078         //  Make sure its a valid character. But if we've emitted an error
3079         //  already, don't bother with the overhead since we've already told
3080         //  them about it.
3081         if (!emittedError)
3082         {
3083             // Deal with surrogate pairs
3084             if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
3085             {
3086                 //  Its a leading surrogate. If we already got one, then
3087                 //  issue an error, else set leading flag to make sure that
3088                 //  we look for a trailing next time.
3089                 if (gotLeadingSurrogate)
3090                     emitError(XMLErrs::Expected2ndSurrogateChar);
3091                 else
3092                     gotLeadingSurrogate = true;
3093             }
3094             else
3095             {
3096                 //  If its a trailing surrogate, make sure that we are
3097                 //  prepared for that. Else, its just a regular char so make
3098                 //  sure that we were not expected a trailing surrogate.
3099                 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
3100                 {
3101                     // Its trailing, so make sure we were expecting it
3102                     if (!gotLeadingSurrogate)
3103                         emitError(XMLErrs::Unexpected2ndSurrogateChar);
3104                 }
3105                 else
3106                 {
3107                     //  Its just a char, so make sure we were not expecting a
3108                     //  trailing surrogate.
3109                     if (gotLeadingSurrogate)
3110                         emitError(XMLErrs::Expected2ndSurrogateChar);
3111 
3112                     // Its got to at least be a valid XML character
3113                     else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
3114                     {
3115                         XMLCh tmpBuf[9];
3116                         XMLString::binToText
3117                         (
3118                             nextCh
3119                             , tmpBuf
3120                             , 8
3121                             , 16
3122                             , fMemoryManager
3123                         );
3124                         emitError(XMLErrs::InvalidCharacter, tmpBuf);
3125                         emittedError = true;
3126                     }
3127                 }
3128                 gotLeadingSurrogate = false;
3129             }
3130         }
3131 
3132         // Add it to the buffer
3133         bbCData.append(nextCh);
3134     }
3135 }
3136 
3137 
scanCharData(XMLBuffer & toUse)3138 void DGXMLScanner::scanCharData(XMLBuffer& toUse)
3139 {
3140     //  We have to watch for the stupid ]]> sequence, which is illegal in
3141     //  character data. So this is a little state machine that handles that.
3142     enum States
3143     {
3144         State_Waiting
3145         , State_GotOne
3146         , State_GotTwo
3147     };
3148 
3149     // Reset the buffer before we start
3150     toUse.reset();
3151 
3152     // Turn on the 'throw at end' flag of the reader manager
3153     ThrowEOEJanitor jan(&fReaderMgr, true);
3154 
3155     //  In order to be more efficient we have to use kind of a deeply nested
3156     //  set of blocks here. The outer block puts on a try and catches end of
3157     //  entity exceptions. The inner loop is the per-character loop. If we
3158     //  put the try inside the inner loop, it would work but would require
3159     //  the exception handling code setup/teardown code to be invoked for
3160     //  each character.
3161     XMLCh   nextCh;
3162     XMLCh   secondCh = 0;
3163     States  curState = State_Waiting;
3164     bool    escaped = false;
3165     bool    gotLeadingSurrogate = false;
3166     bool    notDone = true;
3167     while (notDone)
3168     {
3169         try
3170         {
3171             while (true)
3172             {
3173                 //  Eat through as many plain content characters as possible without
3174                 //  needing special handling.  Moving most content characters here,
3175                 //  in this one call, rather than running the overall loop once
3176                 //  per content character, is a speed optimization.
3177                 if (curState == State_Waiting  &&  !gotLeadingSurrogate)
3178                 {
3179                      fReaderMgr.movePlainContentChars(toUse);
3180                 }
3181 
3182                 // Try to get another char from the source
3183                 //   The code from here on down covers all contengencies,
3184                 if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
3185                 {
3186                     // If we were waiting for a trailing surrogate, its an error
3187                     if (gotLeadingSurrogate)
3188                         emitError(XMLErrs::Expected2ndSurrogateChar);
3189 
3190                     notDone = false;
3191                     break;
3192                 }
3193 
3194                 //  Watch for a reference. Note that the escapement mechanism
3195                 //  is ignored in this content.
3196                 escaped = false;
3197                 if (nextCh == chAmpersand)
3198                 {
3199                     sendCharData(toUse);
3200 
3201                     // Turn off the throwing at the end of entity during this
3202                     ThrowEOEJanitor jan(&fReaderMgr, false);
3203 
3204                     if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
3205                     {
3206                         gotLeadingSurrogate = false;
3207                         continue;
3208                     }
3209                     else
3210                     {
3211                         if (escaped && !fElemStack.isEmpty())
3212                             fElemStack.setReferenceEscaped();
3213                     }
3214                 }
3215                 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
3216                 {
3217                     // Deal with surrogate pairs
3218                     //  Its a leading surrogate. If we already got one, then
3219                     //  issue an error, else set leading flag to make sure that
3220                     //  we look for a trailing next time.
3221                     if (gotLeadingSurrogate)
3222                         emitError(XMLErrs::Expected2ndSurrogateChar);
3223                     else
3224                         gotLeadingSurrogate = true;
3225                 }
3226                 else
3227                 {
3228                     //  If its a trailing surrogate, make sure that we are
3229                     //  prepared for that. Else, its just a regular char so make
3230                     //  sure that we were not expected a trailing surrogate.
3231                     if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
3232                     {
3233                         // Its trailing, so make sure we were expecting it
3234                         if (!gotLeadingSurrogate)
3235                             emitError(XMLErrs::Unexpected2ndSurrogateChar);
3236                     }
3237                     else
3238                     {
3239                         //  Its just a char, so make sure we were not expecting a
3240                         //  trailing surrogate.
3241                         if (gotLeadingSurrogate)
3242                             emitError(XMLErrs::Expected2ndSurrogateChar);
3243 
3244                         // Make sure the returned char is a valid XML char
3245                         if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
3246                         {
3247                             XMLCh tmpBuf[9];
3248                             XMLString::binToText
3249                             (
3250                                 nextCh
3251                                 , tmpBuf
3252                                 , 8
3253                                 , 16
3254                                 , fMemoryManager
3255                             );
3256                             emitError(XMLErrs::InvalidCharacter, tmpBuf);
3257                         }
3258                     }
3259                     gotLeadingSurrogate = false;
3260                 }
3261 
3262                  // Keep the state machine up to date
3263                 if (!escaped)
3264                 {
3265                     if (nextCh == chCloseSquare)
3266                     {
3267                         if (curState == State_Waiting)
3268                             curState = State_GotOne;
3269                         else if (curState == State_GotOne)
3270                             curState = State_GotTwo;
3271                     }
3272                     else if (nextCh == chCloseAngle)
3273                     {
3274                         if (curState == State_GotTwo)
3275                             emitError(XMLErrs::BadSequenceInCharData);
3276                         curState = State_Waiting;
3277                     }
3278                     else
3279                     {
3280                         curState = State_Waiting;
3281                     }
3282                 }
3283                 else
3284                 {
3285                     curState = State_Waiting;
3286                 }
3287 
3288                 // Add this char to the buffer
3289                 toUse.append(nextCh);
3290 
3291                 if (secondCh)
3292                 {
3293                     toUse.append(secondCh);
3294                     secondCh=0;
3295                 }
3296             }
3297         }
3298         catch(const EndOfEntityException& toCatch)
3299         {
3300             //  Some entity ended, so we have to send any accumulated
3301             //  chars and send an end of entity event.
3302             sendCharData(toUse);
3303             gotLeadingSurrogate = false;
3304 
3305             if (fDocHandler)
3306                 fDocHandler->endEntityReference(toCatch.getEntity());
3307         }
3308     }
3309 
3310     // Check the validity constraints as per XML 1.0 Section 2.9
3311     if (fValidate && fStandalone)
3312     {
3313         // See if the text contains whitespace
3314         // Get the raw data we need for the callback
3315         const XMLCh* rawBuf = toUse.getRawBuffer();
3316         const XMLSize_t len = toUse.getLen();
3317         const bool isSpaces = fReaderMgr.getCurrentReader()->containsWhiteSpace(rawBuf, len);
3318 
3319         if (isSpaces)
3320         {
3321             // And see if the current element is a 'Children' style content model
3322             const ElemStack::StackElem* topElem = fElemStack.topElement();
3323 
3324             if (topElem->fThisElement->isExternal()) {
3325 
3326                 // Get the character data opts for the current element
3327                 XMLElementDecl::CharDataOpts charOpts =  topElem->fThisElement->getCharDataOpts();
3328 
3329                 if (charOpts == XMLElementDecl::SpacesOk)  // => Element Content
3330                 {
3331                     // Error - standalone should have a value of "no" as whitespace detected in an
3332                     // element type with element content whose element declaration was external
3333                     //
3334                     fValidator->emitError(XMLValid::NoWSForStandalone);
3335                 }
3336             }
3337         }
3338     }
3339     // Send any char data that we accumulated into the buffer
3340     sendCharData(toUse);
3341 }
3342 
3343 
3344 //  This method will scan a general/character entity ref. It will either
3345 //  expand a char ref and return it directly, or push a reader for a general
3346 //  entity.
3347 //
3348 //  The return value indicates whether the char parameters hold the value
3349 //  or whether the value was pushed as a reader, or that it failed.
3350 //
3351 //  The escaped flag tells the caller whether the returned parameter resulted
3352 //  from a character reference, which escapes the character in some cases. It
3353 //  only makes any difference if the return value indicates the value was
3354 //  returned directly.
3355 DGXMLScanner::EntityExpRes
scanEntityRef(const bool inAttVal,XMLCh & firstCh,XMLCh & secondCh,bool & escaped)3356 DGXMLScanner::scanEntityRef(  const   bool    inAttVal
3357                             ,       XMLCh&  firstCh
3358                             ,       XMLCh&  secondCh
3359                             ,       bool&   escaped)
3360 {
3361     // Assume no escape
3362     secondCh = 0;
3363     escaped = false;
3364 
3365     // We have to insure that its all in one entity
3366     const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum();
3367 
3368     //  If the next char is a pound, then its a character reference and we
3369     //  need to expand it always.
3370     if (fReaderMgr.skippedChar(chPound))
3371     {
3372         //  Its a character reference, so scan it and get back the numeric
3373         //  value it represents.
3374         if (!scanCharRef(firstCh, secondCh))
3375             return EntityExp_Failed;
3376 
3377         escaped = true;
3378 
3379         if (curReader != fReaderMgr.getCurrentReaderNum())
3380             emitError(XMLErrs::PartialMarkupInEntity);
3381 
3382         return EntityExp_Returned;
3383     }
3384 
3385     // Expand it since its a normal entity ref
3386     XMLBufBid bbName(&fBufMgr);
3387 
3388     int  colonPosition;
3389     bool validName = fDoNamespaces ? fReaderMgr.getQName(bbName.getBuffer(), &colonPosition) :
3390                                      fReaderMgr.getName(bbName.getBuffer());
3391     if (!validName)
3392     {
3393         if (bbName.isEmpty())
3394             emitError(XMLErrs::ExpectedEntityRefName);
3395         else
3396             emitError(XMLErrs::InvalidEntityRefName, bbName.getRawBuffer());
3397         return EntityExp_Failed;
3398     }
3399 
3400     //  Next char must be a semi-colon. But if its not, just emit
3401     //  an error and try to continue.
3402     if (!fReaderMgr.skippedChar(chSemiColon))
3403         emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
3404 
3405     // Make sure we ended up on the same entity reader as the & char
3406     if (curReader != fReaderMgr.getCurrentReaderNum())
3407         emitError(XMLErrs::PartialMarkupInEntity);
3408 
3409     // Look up the name in the general entity pool
3410     XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
3411 
3412     // If it does not exist, then obviously an error
3413     if (!decl)
3414     {
3415         // XML 1.0 Section 4.1
3416         // Well-formedness Constraint for entity not found:
3417         //   In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references,
3418         //      or a document with "standalone='yes'", for an entity reference that does not occur within the external subset
3419         //      or a parameter entity
3420         //
3421         // Else it's Validity Constraint
3422         if (fStandalone || fHasNoDTD)
3423             emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
3424         else {
3425             if (fValidate)
3426                 fValidator->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
3427         }
3428 
3429         return EntityExp_Failed;
3430     }
3431 
3432     // XML 1.0 Section 4.1
3433     //  If we are a standalone document, then it has to have been declared
3434     //  in the internal subset.
3435     if (fStandalone && !decl->getDeclaredInIntSubset())
3436         emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer());
3437 
3438     if (decl->isExternal())
3439     {
3440         // If its unparsed, then its not valid here
3441         if (decl->isUnparsed())
3442         {
3443             emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
3444             return EntityExp_Failed;
3445         }
3446 
3447         // If we are in an attribute value, then not valid but keep going
3448         if (inAttVal)
3449             emitError(XMLErrs::NoExtRefsInAttValue);
3450 
3451         // And now create a reader to read this entity
3452         InputSource* srcUsed;
3453         XMLReader* reader = fReaderMgr.createReader
3454         (
3455             decl->getBaseURI()
3456             , decl->getSystemId()
3457             , decl->getPublicId()
3458             , false
3459             , XMLReader::RefFrom_NonLiteral
3460             , XMLReader::Type_General
3461             , XMLReader::Source_External
3462             , srcUsed
3463             , fCalculateSrcOfs
3464             , fLowWaterMark
3465             , fDisableDefaultEntityResolution
3466         );
3467 
3468         // Put a janitor on the source so it gets cleaned up on exit
3469         Janitor<InputSource> janSrc(srcUsed);
3470 
3471         //  If the creation failed, and its not because the source was empty,
3472         //  then emit an error and return.
3473         if (!reader)
3474             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
3475 
3476         //  Push the reader. If its a recursive expansion, then emit an error
3477         //  and return an failure.
3478         if (!fReaderMgr.pushReader(reader, decl))
3479         {
3480             emitError(XMLErrs::RecursiveEntity, decl->getName());
3481             return EntityExp_Failed;
3482         }
3483 
3484         // here's where we need to check if there's a SecurityManager,
3485         // how many entity references we've had
3486         if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) {
3487             XMLCh expLimStr[32];
3488             XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager);
3489             emitError
3490             (
3491                 XMLErrs::EntityExpansionLimitExceeded
3492                 , expLimStr
3493             );
3494             // there seems nothing better to do than reset the entity expansion counter
3495             fEntityExpansionCount = 0;
3496         }
3497 
3498         //  Do a start entity reference event.
3499         //
3500         //  <TBD> For now, we supress them in att values. Later, when
3501         //  the stuff is in place to correctly allow DOM to handle them
3502         //  we'll turn this back on.
3503         if (fDocHandler && !inAttVal)
3504             fDocHandler->startEntityReference(*decl);
3505 
3506         // If it starts with the XML string, then parse a text decl
3507         if (checkXMLDecl(true))
3508             scanXMLDecl(Decl_Text);
3509     }
3510     else
3511     {
3512         //  If its one of the special char references, then we can return
3513         //  it as a character, and its considered escaped.
3514         if (decl->getIsSpecialChar())
3515         {
3516             firstCh = decl->getValue()[0];
3517             escaped = true;
3518             return EntityExp_Returned;
3519         }
3520 
3521         //  Create a reader over a memory stream over the entity value
3522         //  We force it to assume UTF-16 by passing in an encoding
3523         //  string. This way it won't both trying to predecode the
3524         //  first line, looking for an XML/TextDecl.
3525         XMLReader* valueReader = fReaderMgr.createIntEntReader
3526         (
3527             decl->getName()
3528             , XMLReader::RefFrom_NonLiteral
3529             , XMLReader::Type_General
3530             , decl->getValue()
3531             , decl->getValueLen()
3532             , false
3533         );
3534 
3535         //  Try to push the entity reader onto the reader manager stack,
3536         //  where it will become the subsequent input. If it fails, that
3537         //  means the entity is recursive, so issue an error. The reader
3538         //  will have just been discarded, but we just keep going.
3539         if (!fReaderMgr.pushReader(valueReader, decl))
3540             emitError(XMLErrs::RecursiveEntity, decl->getName());
3541 
3542         // here's where we need to check if there's a SecurityManager,
3543         // how many entity references we've had
3544         if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) {
3545             XMLCh expLimStr[32];
3546             XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager);
3547             emitError
3548             (
3549                 XMLErrs::EntityExpansionLimitExceeded
3550                 , expLimStr
3551             );
3552         }
3553 
3554         //  Do a start entity reference event.
3555         //
3556         //  <TBD> For now, we supress them in att values. Later, when
3557         //  the stuff is in place to correctly allow DOM to handle them
3558         //  we'll turn this back on.
3559         if (fDocHandler && !inAttVal)
3560             fDocHandler->startEntityReference(*decl);
3561 
3562         // If it starts with the XML string, then it's an error
3563         if (checkXMLDecl(true)) {
3564             emitError(XMLErrs::TextDeclNotLegalHere);
3565             fReaderMgr.skipPastChar(chCloseAngle);
3566         }
3567     }
3568     return EntityExp_Pushed;
3569 }
3570 
3571 
3572 XERCES_CPP_NAMESPACE_END
3573