1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 /*
19  * $Id$
20  */
21 
22 
23 // ---------------------------------------------------------------------------
24 //  Includes
25 // ---------------------------------------------------------------------------
26 #include <xercesc/util/BinMemInputStream.hpp>
27 #include <xercesc/util/FlagJanitor.hpp>
28 #include <xercesc/util/Janitor.hpp>
29 #include <xercesc/util/XMLUniDefs.hpp>
30 #include <xercesc/util/ValueStackOf.hpp>
31 #include <xercesc/util/UnexpectedEOFException.hpp>
32 #include <xercesc/util/OutOfMemoryException.hpp>
33 #include <xercesc/sax/InputSource.hpp>
34 #include <xercesc/framework/XMLDocumentHandler.hpp>
35 #include <xercesc/framework/XMLEntityHandler.hpp>
36 #include <xercesc/framework/XMLValidator.hpp>
37 #include <xercesc/internal/EndOfEntityException.hpp>
38 #include <xercesc/internal/XMLScanner.hpp>
39 #include <xercesc/validators/common/ContentSpecNode.hpp>
40 #include <xercesc/validators/common/MixedContentModel.hpp>
41 #include <xercesc/validators/DTD/DTDEntityDecl.hpp>
42 #include <xercesc/validators/DTD/DocTypeHandler.hpp>
43 #include <xercesc/validators/DTD/DTDScanner.hpp>
44 
45 XERCES_CPP_NAMESPACE_BEGIN
46 
47 #define CONTENTSPEC_DEPTH_LIMIT 1000
48 
49 // ---------------------------------------------------------------------------
50 //  Local methods
51 // ---------------------------------------------------------------------------
52 //
//  This method automates the grunt work of looking at a char and seeing if
//  it's a repetition suffix. If so, it creates a new, correct rep node and
//  wraps the passed node in it. Otherwise, it returns the passed node.
56 //
makeRepNode(const XMLCh testCh,ContentSpecNode * const prevNode,MemoryManager * const manager)57 static ContentSpecNode* makeRepNode(const XMLCh testCh,
58                                     ContentSpecNode* const prevNode,
59                                     MemoryManager* const manager)
60 {
61     if (testCh == chQuestion)
62     {
63         return new (manager) ContentSpecNode
64         (
65             ContentSpecNode::ZeroOrOne
66             , prevNode
67             , 0
68             , true
69             , true
70             , manager
71         );
72     }
73      else if (testCh == chPlus)
74     {
75         return new (manager) ContentSpecNode
76         (
77             ContentSpecNode::OneOrMore
78             , prevNode
79             , 0
80             , true
81             , true
82             , manager
83         );
84     }
85      else if (testCh == chAsterisk)
86     {
87         return new (manager) ContentSpecNode
88         (
89             ContentSpecNode::ZeroOrMore
90             , prevNode
91             , 0
92             , true
93             , true
94             , manager
95         );
96     }
97 
98     // Just return the incoming node
99     return prevNode;
100 }
101 
102 // ---------------------------------------------------------------------------
//  DTDScanner: Constructors and Destructor
104 // ---------------------------------------------------------------------------
DTDScanner(DTDGrammar * dtdGrammar,DocTypeHandler * const docTypeHandler,MemoryManager * const grammarPoolMemoryManager,MemoryManager * const manager)105 DTDScanner::DTDScanner( DTDGrammar*           dtdGrammar
106                       , DocTypeHandler* const docTypeHandler
107                       , MemoryManager* const  grammarPoolMemoryManager
108                       , MemoryManager* const  manager) :
109     fMemoryManager(manager)
110     , fGrammarPoolMemoryManager(grammarPoolMemoryManager)
111     , fDocTypeHandler(docTypeHandler)
112     , fDumAttDef(0)
113     , fDumElemDecl(0)
114     , fDumEntityDecl(0)
115     , fInternalSubset(false)
116     , fNextAttrId(1)
117     , fDTDGrammar(dtdGrammar)
118     , fBufMgr(0)
119     , fReaderMgr(0)
120     , fScanner(0)
121     , fPEntityDeclPool(0)
122     , fEmptyNamespaceId(0)
123     , fDocTypeReaderId(0)
124 {
125     fPEntityDeclPool = new (fMemoryManager) NameIdPool<DTDEntityDecl>(109, 128, fMemoryManager);
126 }
127 
~DTDScanner()128 DTDScanner::~DTDScanner()
129 {
130     delete fDumAttDef;
131     delete fDumElemDecl;
132     delete fDumEntityDecl;
133     delete fPEntityDeclPool;
134 }
135 
136 // -----------------------------------------------------------------------
137 //  Setter methods
138 // -----------------------------------------------------------------------
setScannerInfo(XMLScanner * const owningScanner,ReaderMgr * const readerMgr,XMLBufferMgr * const bufMgr)139 void DTDScanner::setScannerInfo(XMLScanner* const      owningScanner
140                             , ReaderMgr* const      readerMgr
141                             , XMLBufferMgr* const   bufMgr)
142 {
143     // We don't own any of these, we just reference them
144     fScanner = owningScanner;
145     fReaderMgr = readerMgr;
146     fBufMgr = bufMgr;
147 
148     if (fScanner->getDoNamespaces())
149         fEmptyNamespaceId = fScanner->getEmptyNamespaceId();
150     else
151         fEmptyNamespaceId = 0;
152 
153     fDocTypeReaderId = fReaderMgr->getCurrentReaderNum();
154 }
155 
156 
157 // ---------------------------------------------------------------------------
158 //  DTDScanner: Private scanning methods
159 // ---------------------------------------------------------------------------
checkForPERef(const bool inLiteral,const bool inMarkup)160 bool DTDScanner::checkForPERef(   const bool    inLiteral
161                                 , const bool    inMarkup)
162 {
163     bool gotSpace = false;
164 
165     //
166     //  See if we have any spaces up front. If so, then skip them and set
167     //  the gotSpaces flag.
168     //
169     if (fReaderMgr->skippedSpace())
170     {
171         fReaderMgr->skipPastSpaces();
172         gotSpace = true;
173     }
174 
175     // If the next char is a percent, then expand the PERef
176     if (!fReaderMgr->skippedChar(chPercent))
177        return gotSpace;
178 
179     while (true)
180     {
181        if (!expandPERef(false, inLiteral, inMarkup, false))
182           fScanner->emitError(XMLErrs::ExpectedEntityRefName);
183        // And skip any more spaces in the expanded value
184        if (fReaderMgr->skippedSpace())
185        {
186           fReaderMgr->skipPastSpaces();
187           gotSpace = true;
188        }
189        if (!fReaderMgr->skippedChar(chPercent))
190           break;
191     }
192     return gotSpace;
193 }
194 
195 
expandPERef(const bool scanExternal,const bool inLiteral,const bool inMarkup,const bool throwEndOfExt)196 bool DTDScanner::expandPERef( const   bool    scanExternal
197                                 , const bool    inLiteral
198                                 , const bool    inMarkup
199                                 , const bool    throwEndOfExt)
200 {
201     fScanner->setHasNoDTD(false);
202     XMLBufBid bbName(fBufMgr);
203 
204     //
205     //  If we are in the internal subset and in markup, then this is
206     //  an error but we go ahead and do it anyway.
207     //
208     if (fInternalSubset && inMarkup)
209         fScanner->emitError(XMLErrs::PERefInMarkupInIntSubset);
210 
211     if (!fReaderMgr->getName(bbName.getBuffer()))
212     {
213         fScanner->emitError(XMLErrs::ExpectedPEName);
214 
215         // Skip the semicolon if that's what we ended up on
216         fReaderMgr->skippedChar(chSemiColon);
217         return false;
218     }
219 
220     // If no terminating semicolon, emit an error but try to keep going
221     if (!fReaderMgr->skippedChar(chSemiColon))
222         fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
223 
224     //
225     //  Look it up in the PE decl pool and see if it exists. If not, just
226     //  emit an error and continue.
227     //
228     XMLEntityDecl* decl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
229     if (!decl)
230     {
231         // XML 1.0 Section 4.1
232         if (fScanner->getStandalone()) {
233             // no need to check fScanner->fHasNoDTD which is for sure false
234             // since we are in expandPERef already
235             fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
236         }
237         else {
238             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
239                 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
240         }
241 
242         return false;
243     }
244 
245     //
246     // XML 1.0 Section 2.9
247     //  If we are a standalone document, then it has to have been declared
248     //  in the internal subset. Keep going though.
249     //
250     if (fScanner->getValidationScheme() == XMLScanner::Val_Always && fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
251         fScanner->getValidator()->emitError(XMLValid::VC_IllegalRefInStandalone, bbName.getRawBuffer());
252 
253     //
254     //  Okee dokee, we found it. So create either a memory stream with
255     //  the entity value contents, or a file stream if its an external
256     //  entity.
257     //
258     if (decl->isExternal())
259     {
260         // And now create a reader to read this entity
261         InputSource* srcUsed;
262         XMLReader* reader = fReaderMgr->createReader
263         (
264             decl->getBaseURI()
265             , decl->getSystemId()
266             , decl->getPublicId()
267             , false
268             , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
269             , XMLReader::Type_PE
270             , XMLReader::Source_External
271             , srcUsed
272             , fScanner->getCalculateSrcOfs()
273             , fScanner->getLowWaterMark()
274             , fScanner->getDisableDefaultEntityResolution()
275         );
276 
277         // Put a janitor on the source so its cleaned up on exit
278         Janitor<InputSource> janSrc(srcUsed);
279 
280         // If the creation failed then throw an exception
281         if (!reader)
282             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
283 
284         // Set the 'throw at end' flag, to the one we were given
285         reader->setThrowAtEnd(throwEndOfExt);
286 
287         //
288         //  Push the reader. If its a recursive expansion, then emit an error
289         //  and return an failure.
290         //
291         if (!fReaderMgr->pushReader(reader, decl))
292         {
293             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
294             return false;
295         }
296 
297         //
298         //  If the caller wants us to scan the external entity, then lets
299         //  do that now.
300         //
301         if (scanExternal)
302         {
303             XMLEntityHandler* entHandler = fScanner->getEntityHandler();
304 
305             // If we have an entity handler, tell it we are starting this entity
306             if (entHandler)
307                 entHandler->startInputSource(*srcUsed);
308 
309             //
310             //  Scan the external entity now. The parameter tells it that
311             //  it is not in an include section. Get the current reader
312             //  level so we can catch partial markup errors and be sure
313             //  to get back to here if we get an exception out of the
314             //  ext subset scan.
315             //
316             const XMLSize_t readerNum = fReaderMgr->getCurrentReaderNum();
317             try
318             {
319                 scanExtSubsetDecl(false, false);
320             }
321             catch(const OutOfMemoryException&)
322             {
323                 throw;
324             }
325             catch(...)
326             {
327                 // Pop the reader back to the original level
328                 fReaderMgr->cleanStackBackTo(readerNum);
329 
330                 // End the input source, even though its not happy
331                 if (entHandler)
332                     entHandler->endInputSource(*srcUsed);
333                 throw;
334             }
335 
336             // If we have an entity handler, tell it we are ending this entity
337             if (entHandler)
338                 entHandler->endInputSource(*srcUsed);
339         }
340         else {
341             // If it starts with the XML string, then parse a text decl
342             if (fScanner->checkXMLDecl(true))
343                 scanTextDecl();
344         }
345     }
346      else
347     {
348         // Create a reader over a memory stream over the entity value
349         XMLReader* valueReader = fReaderMgr->createIntEntReader
350         (
351             decl->getName()
352             , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
353             , XMLReader::Type_PE
354             , decl->getValue()
355             , decl->getValueLen()
356             , false
357         );
358 
359         //
360         //  Trt to push the entity reader onto the reader manager stack,
361         //  where it will become the subsequent input. If it fails, that
362         //  means the entity is recursive, so issue an error. The reader
363         //  will have just been discarded, but we just keep going.
364         //
365         if (!fReaderMgr->pushReader(valueReader, decl))
366             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
367     }
368 
369     return true;
370 }
371 
372 
getQuotedString(XMLBuffer & toFill)373 bool DTDScanner::getQuotedString(XMLBuffer& toFill)
374 {
375     // Reset the target buffer
376     toFill.reset();
377 
378     // Get the next char which must be a single or double quote
379     XMLCh quoteCh;
380     if (!fReaderMgr->skipIfQuote(quoteCh))
381         return false;
382 
383 	XMLCh nextCh;
384     // Get another char and see if it matches the starting quote char
385     while ((nextCh=fReaderMgr->getNextChar())!=quoteCh)
386     {
387         //
388         //  We should never get either an end of file null char here. If we
389         //  do, just fail. It will be handled more gracefully in the higher
390         //  level code that called us.
391         //
392         if (!nextCh)
393             return false;
394 
395         // Else add it to the buffer
396         toFill.append(nextCh);
397     }
398     return true;
399 }
400 
401 
402 XMLAttDef*
scanAttDef(DTDElementDecl & parentElem,XMLBuffer & bufToUse)403 DTDScanner::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse)
404 {
405     // Check for PE ref or optional whitespace
406     checkForPERef(false, true);
407 
408     // Get the name of the attribute
409     if (!fReaderMgr->getName(bufToUse))
410     {
411         fScanner->emitError(XMLErrs::ExpectedAttrName);
412         return 0;
413     }
414 
415     //
416     //  Look up this attribute in the parent element's attribute list. If
417     //  it already exists, then use the dummy.
418     //
419     DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer());
420     if (decl)
421     {
422         // It already exists, so put out a warning
423         fScanner->emitError
424         (
425             XMLErrs::AttListAlreadyExists
426             , bufToUse.getRawBuffer()
427             , parentElem.getFullName()
428         );
429 
430         // Use the dummy decl to parse into and set its name to the name we got
431         if (!fDumAttDef)
432         {
433             fDumAttDef = new (fMemoryManager) DTDAttDef(fMemoryManager);
434             fDumAttDef->setId(fNextAttrId++);
435         }
436         fDumAttDef->setName(bufToUse.getRawBuffer());
437         decl = fDumAttDef;
438     }
439      else
440     {
441         //
442         //  It does not already exist so create a new one, give it the next
443         //  available unique id, and add it
444         //
445         decl = new (fGrammarPoolMemoryManager) DTDAttDef
446         (
447             bufToUse.getRawBuffer()
448             , XMLAttDef::CData
449             , XMLAttDef::Implied
450             , fGrammarPoolMemoryManager
451         );
452         decl->setId(fNextAttrId++);
453         decl->setExternalAttDeclaration(isReadingExternalEntity());
454         parentElem.addAttDef(decl);
455     }
456 
457     // Set a flag to indicate whether we are doing a dummy parse
458     const bool isIgnored = (decl == fDumAttDef);
459 
460     // Space is required here, so check for PE ref, and require space
461     if (!checkForPERef(false, true))
462         fScanner->emitError(XMLErrs::ExpectedWhitespace);
463 
464     //
465     //  Next has to be one of the attribute type strings. This tells us what
466     //  is to follow.
467     //
468     if (fReaderMgr->skippedString(XMLUni::fgCDATAString))
469     {
470         decl->setType(XMLAttDef::CData);
471     }
472      else if (fReaderMgr->skippedString(XMLUni::fgIDString))
473     {
474         if (!fReaderMgr->skippedString(XMLUni::fgRefString))
475             decl->setType(XMLAttDef::ID);
476         else if (!fReaderMgr->skippedChar(chLatin_S))
477             decl->setType(XMLAttDef::IDRef);
478         else
479             decl->setType(XMLAttDef::IDRefs);
480     }
481      else if (fReaderMgr->skippedString(XMLUni::fgEntitString))
482     {
483         if (fReaderMgr->skippedChar(chLatin_Y))
484         {
485             decl->setType(XMLAttDef::Entity);
486         }
487          else if (fReaderMgr->skippedString(XMLUni::fgIESString))
488         {
489             decl->setType(XMLAttDef::Entities);
490         }
491          else
492         {
493             fScanner->emitError
494             (
495                 XMLErrs::ExpectedAttributeType
496                 , decl->getFullName()
497                 , parentElem.getFullName()
498             );
499             return 0;
500         }
501     }
502      else if (fReaderMgr->skippedString(XMLUni::fgNmTokenString))
503     {
504         if (fReaderMgr->skippedChar(chLatin_S))
505             decl->setType(XMLAttDef::NmTokens);
506         else
507             decl->setType(XMLAttDef::NmToken);
508     }
509      else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
510     {
511         // Check for PE ref and require space
512         if (!checkForPERef(false, true))
513             fScanner->emitError(XMLErrs::ExpectedWhitespace);
514 
515         decl->setType(XMLAttDef::Notation);
516         if (!scanEnumeration(*decl, bufToUse, true))
517             return 0;
518 
519         // Set the value as the enumeration for this decl
520         decl->setEnumeration(bufToUse.getRawBuffer());
521     }
522      else if (fReaderMgr->skippedChar(chOpenParen))
523     {
524         decl->setType(XMLAttDef::Enumeration);
525         if (!scanEnumeration(*decl, bufToUse, false))
526             return 0;
527 
528         // Set the value as the enumeration for this decl
529         decl->setEnumeration(bufToUse.getRawBuffer());
530     }
531      else
532     {
533         fScanner->emitError
534         (
535             XMLErrs::ExpectedAttributeType
536             , decl->getFullName()
537             , parentElem.getFullName()
538         );
539         return 0;
540     }
541 
542     // Space is required here, so check for PE ref, and require space
543     if (!checkForPERef(false, true))
544         fScanner->emitError(XMLErrs::ExpectedWhitespace);
545 
546     // And then scan for the optional default value declaration
547     scanDefaultDecl(*decl);
548 
549     // If validating, then do a couple of validation constraints
550     if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
551     {
552         if (decl->getType() == XMLAttDef::ID)
553         {
554             if ((decl->getDefaultType() != XMLAttDef::Implied)
555             &&  (decl->getDefaultType() != XMLAttDef::Required))
556             {
557                 fScanner->getValidator()->emitError(XMLValid::BadIDAttrDefType, decl->getFullName());
558             }
559         }
560 
561         // if attdef is xml:space, check correct enumeration (default|preserve)
562         const XMLCh fgXMLSpace[] = { chLatin_x, chLatin_m, chLatin_l, chColon, chLatin_s, chLatin_p, chLatin_a, chLatin_c, chLatin_e, chNull };
563 
564         if (XMLString::equals(decl->getFullName(),fgXMLSpace)) {
565             const XMLCh fgPreserve[] = { chLatin_p, chLatin_r, chLatin_e, chLatin_s, chLatin_e, chLatin_r, chLatin_v, chLatin_e, chNull };
566             const XMLCh fgDefault[] = { chLatin_d, chLatin_e, chLatin_f, chLatin_a, chLatin_u, chLatin_l, chLatin_t, chNull };
567             bool ok = false;
568             if (decl->getType() == XMLAttDef::Enumeration) {
569                 BaseRefVectorOf<XMLCh>* enumVector = XMLString::tokenizeString(decl->getEnumeration(), fMemoryManager);
570                 XMLSize_t size = enumVector->size();
571                 ok = (size == 1 &&
572                      (XMLString::equals(enumVector->elementAt(0), fgDefault) ||
573                       XMLString::equals(enumVector->elementAt(0), fgPreserve))) ||
574                      (size == 2 &&
575                      (XMLString::equals(enumVector->elementAt(0), fgDefault) &&
576                       XMLString::equals(enumVector->elementAt(1), fgPreserve))) ||
577                      (size == 2 &&
578                      (XMLString::equals(enumVector->elementAt(1), fgDefault) &&
579                       XMLString::equals(enumVector->elementAt(0), fgPreserve)));
580                 delete enumVector;
581             }
582             if (!ok)
583                 fScanner->getValidator()->emitError(XMLValid::IllegalXMLSpace);
584         }
585     }
586 
587     // If we have a doc type handler, tell it about this attdef.
588     if (fDocTypeHandler)
589         fDocTypeHandler->attDef(parentElem, *decl, isIgnored);
590     return decl;
591 }
592 
593 
scanAttListDecl()594 void DTDScanner::scanAttListDecl()
595 {
596     // Space is required here, so check for a PE ref
597     if (!checkForPERef(false, true))
598     {
599         fScanner->emitError(XMLErrs::ExpectedWhitespace);
600         fReaderMgr->skipPastChar(chCloseAngle);
601         return;
602     }
603 
604     //
605     //  Next should be the name of the element it belongs to, so get a buffer
606     //  and get the name into it.
607     //
608     XMLBufBid bbName(fBufMgr);
609     if (!fReaderMgr->getName(bbName.getBuffer()))
610     {
611         fScanner->emitError(XMLErrs::ExpectedElementName);
612         fReaderMgr->skipPastChar(chCloseAngle);
613         return;
614     }
615 
616     //
617     //  Find this element's declaration. If it has not been declared yet,
618     //  we will force one into the list, but not mark it as declared.
619     //
620     DTDElementDecl* elemDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
621     if (!elemDecl)
622     {
623         //
624         //  Lets fault in a declaration and add it to the pool. We mark
625         //  it having been created because of an attlist. Later, if its
626         //  declared, this will be updated.
627         //
628         elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
629         (
630             bbName.getRawBuffer()
631             , fEmptyNamespaceId
632             , DTDElementDecl::Any
633             , fGrammarPoolMemoryManager
634         );
635         elemDecl->setCreateReason(XMLElementDecl::AttList);
636         elemDecl->setExternalElemDeclaration(isReadingExternalEntity());
637         fDTDGrammar->putElemDecl((XMLElementDecl*) elemDecl);
638     }
639 
640     // If we have a doc type handler, tell it the att list is starting
641     if (fDocTypeHandler)
642         fDocTypeHandler->startAttList(*elemDecl);
643 
644     //
645     //  Now we loop until we are done with all of the attributes in this
646     //  list. We need a buffer to use for local processing.
647     //
648     XMLBufBid   bbTmp(fBufMgr);
649     XMLBuffer&  tmpBuf = bbTmp.getBuffer();
650     bool        seenAnId = false;
651     while (true)
652     {
653         // Get the next char out and see what it tells us to do
654         const XMLCh nextCh = fReaderMgr->peekNextChar();
655 
656         // Watch for EOF
657         if (!nextCh)
658             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
659 
660         if (nextCh == chCloseAngle)
661         {
662             // We are done with this attribute list
663             fReaderMgr->getNextChar();
664             break;
665         }
666          else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
667         {
668             //
669             //  If advanced callbacks are enabled and we have a doc
670             //  type handler, then gather up the white space and call
671             //  back on the doctype handler. Otherwise, just skip
672             //  whitespace.
673             //
674             if (fDocTypeHandler)
675             {
676                 fReaderMgr->getSpaces(tmpBuf);
677                 fDocTypeHandler->doctypeWhitespace
678                 (
679                     tmpBuf.getRawBuffer()
680                     , tmpBuf.getLen()
681                 );
682             }
683              else
684             {
685                 fReaderMgr->skipPastSpaces();
686             }
687         }
688          else if (nextCh == chPercent)
689         {
690             // Eat the percent and expand the ref
691             fReaderMgr->getNextChar();
692             expandPERef(false, false, true);
693         }
694          else
695         {
696             //
697             //  It must be an attribute name, so scan it. We let
698             //  it use our local buffer for its name scanning.
699             //
700             XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf);
701 
702             if (!attDef)
703             {
704                 fReaderMgr->skipPastChar(chCloseAngle);
705                 break;
706             }
707 
708             //
709             //  If we are validating and its an ID type, then we have to
710             //  make sure that we have not seen an id attribute yet. Set
711             //  the flag to say that we've seen one now also.
712             //
713             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
714             {
715                 if (attDef->getType() == XMLAttDef::ID)
716                 {
717                     if (seenAnId)
718                         fScanner->getValidator()->emitError(XMLValid::MultipleIdAttrs, elemDecl->getFullName());
719                     seenAnId = true;
720                 }
721             }
722         }
723     }
724 
725     // If we have a doc type handler, tell it the att list is ending
726     if (fDocTypeHandler)
727         fDocTypeHandler->endAttList(*elemDecl);
728 }
729 
730 
731 //
732 //  This method is called to scan the value of an attribute in content. This
733 //  involves some normalization and replacement of general entity and
734 //  character references.
735 //
//  Ends of entities must be dealt with here. During DTD scan, they can come
737 //  from external entities. During content, they can come from any entity.
738 //  We just eat the end of entity and continue with our scan until we come
739 //  to the closing quote. If an unterminated value causes us to go through
740 //  subsequent entities, that will cause errors back in the calling code,
741 //  but there's little we can do about it here.
742 //
scanAttValue(const XMLCh * const attrName,XMLBuffer & toFill,const XMLAttDef::AttTypes type)743 bool DTDScanner::scanAttValue(const   XMLCh* const        attrName
744                                 ,       XMLBuffer&          toFill
745                                 , const XMLAttDef::AttTypes type)
746 {
747     enum States
748     {
749         InWhitespace
750         , InContent
751     };
752 
753     // Reset the target buffer
754     toFill.reset();
755 
756     // Get the next char which must be a single or double quote
757     XMLCh quoteCh;
758     if (!fReaderMgr->skipIfQuote(quoteCh))
759         return false;
760 
761     //
762     //  We have to get the current reader because we have to ignore closing
763     //  quotes until we hit the same reader again.
764     //
765     const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
766 
767     //
768     //  Loop until we get the attribute value. Note that we use a double
769     //  loop here to avoid the setup/teardown overhead of the exception
770     //  handler on every round.
771     //
772     XMLCh   nextCh;
773     XMLCh   secondCh = 0;
774     States  curState = InContent;
775     bool    firstNonWS = false;
776     bool    gotLeadingSurrogate = false;
777     bool    escaped;
778     while (true)
779     {
780     try
781     {
782         while(true)
783         {
784             nextCh = fReaderMgr->getNextChar();
785 
786             if (!nextCh)
787                 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
788 
789             // Check for our ending quote in the same entity
790             if (nextCh == quoteCh)
791             {
792                 if (curReader == fReaderMgr->getCurrentReaderNum())
793                     return true;
794 
795                 // Watch for spillover into a previous entity
796                 if (curReader > fReaderMgr->getCurrentReaderNum())
797                 {
798                     fScanner->emitError(XMLErrs::PartialMarkupInEntity);
799                     return false;
800                 }
801             }
802 
803             //
804             //  Check for an entity ref now, before we let it affect our
805             //  whitespace normalization logic below. We ignore the empty flag
806             //  in this one.
807             //
808             escaped = false;
809             if (nextCh == chAmpersand)
810             {
811                 if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned)
812                 {
813                     gotLeadingSurrogate = false;
814                     continue;
815                 }
816             }
817             else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
818             {
819                 // Check for correct surrogate pairs
820                 if (gotLeadingSurrogate)
821                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
822                 else
823                     gotLeadingSurrogate = true;
824             }
825              else
826             {
827                 if (gotLeadingSurrogate)
828                 {
829                     if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
830                         fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
831                 }
832                 // Its got to at least be a valid XML character
833                 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
834                 {
835                     XMLCh tmpBuf[9];
836                     XMLString::binToText
837                     (
838                         nextCh
839                         , tmpBuf
840                         , 8
841                         , 16
842                         , fMemoryManager
843                     );
844                     fScanner->emitError
845                     (
846                         XMLErrs::InvalidCharacterInAttrValue
847                         , attrName
848                         , tmpBuf
849                     );
850                 }
851 
852                 gotLeadingSurrogate = false;
853             }
854 
855             //
856             //  If its not escaped, then make sure its not a < character, which
857             //  is not allowed in attribute values.
858             //
859             if (!escaped && (nextCh == chOpenAngle))
860                 fScanner->emitError(XMLErrs::BracketInAttrValue, attrName);
861 
862             //
863             //  If the attribute is a CDATA type we do simple replacement of
864             //  tabs and new lines with spaces, if the character is not escaped
865             //  by way of a char ref.
866             //
867             //  Otherwise, we do the standard non-CDATA normalization of
868             //  compressing whitespace to single spaces and getting rid of
869             //  leading and trailing whitespace.
870             //
871             if (type == XMLAttDef::CData)
872             {
873                 if (!escaped)
874                 {
875                     if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
876                         nextCh = chSpace;
877                 }
878             }
879              else
880             {
881                 if (curState == InWhitespace)
882                 {
883                     if (!fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
884                     {
885                         if (firstNonWS)
886                             toFill.append(chSpace);
887                         curState = InContent;
888                         firstNonWS = true;
889                     }
890                      else
891                     {
892                         continue;
893                     }
894                 }
895                  else if (curState == InContent)
896                 {
897                     if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
898                     {
899                         curState = InWhitespace;
900                         continue;
901                     }
902                     firstNonWS = true;
903                 }
904             }
905 
906             // Else add it to the buffer
907             toFill.append(nextCh);
908 
909             if (secondCh)
910             {
911                 toFill.append(secondCh);
912                 secondCh=0;
913             }
914         }
915     }
916 
917     catch(const EndOfEntityException&)
918     {
919         // Just eat it and continue.
920         gotLeadingSurrogate = false;
921         escaped = false;
922     }
923     }
924     return true;
925 }
926 
927 
scanCharRef(XMLCh & first,XMLCh & second)928 bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second)
929 {
930     bool gotOne = false;
931     unsigned int value = 0;
932 
933     //
934     //  Set the radix. Its supposed to be a lower case x if hex. But, in
935     //  order to recover well, we check for an upper and put out an error
936     //  for that.
937     //
938     unsigned int radix = 10;
939 
940     if (fReaderMgr->skippedChar(chLatin_x))
941     {
942         radix = 16;
943     }
944      else if (fReaderMgr->skippedChar(chLatin_X))
945     {
946         fScanner->emitError(XMLErrs::HexRadixMustBeLowerCase);
947         radix = 16;
948     }
949 
950     while (true)
951     {
952         const XMLCh nextCh = fReaderMgr->peekNextChar();
953 
954         // Watch for EOF
955         if (!nextCh)
956             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
957 
958         // Break out on the terminating semicolon
959         if (nextCh == chSemiColon)
960         {
961             fReaderMgr->getNextChar();
962             break;
963         }
964 
965         //
966         //  Convert this char to a binary value, or bail out if its not
967         //  one.
968         //
969         unsigned int nextVal;
970         if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
971             nextVal = (unsigned int)(nextCh - chDigit_0);
972         else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
973             nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
974         else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
975             nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
976         else
977         {
978             //
979             //  If we got at least a sigit, then do an unterminated ref
980             //  error. Else, do an expected a numerical ref thing.
981             //
982             if (gotOne)
983                 fScanner->emitError(XMLErrs::UnterminatedCharRef);
984             else
985                 fScanner->emitError(XMLErrs::ExpectedNumericalCharRef);
986 
987             return false;
988         }
989 
990         //
991         //  Make sure its valid for the radix. If not, then just eat the
992         //  digit and go on after issueing an error. Else, update the
993         //  running value with this new digit.
994         //
995         if (nextVal >= radix)
996         {
997             XMLCh tmpStr[2];
998             tmpStr[0] = nextCh;
999             tmpStr[1] = chNull;
1000             fScanner->emitError(XMLErrs::BadDigitForRadix, tmpStr);
1001         }
1002          else
1003         {
1004             value = (value * radix) + nextVal;
1005         }
1006 
1007         // Indicate that we got at least one good digit
1008         gotOne = true;
1009 
1010         // Eat the char we just processed
1011         fReaderMgr->getNextChar();
1012     }
1013 
1014     // Return the char (or chars)
1015     // And check if the character expanded is valid or not
1016     if (value >= 0x10000 && value <= 0x10FFFF)
1017     {
1018         value -= 0x10000;
1019         first  = XMLCh((value >> 10) + 0xD800);
1020         second = XMLCh((value & 0x3FF) + 0xDC00);
1021     }
1022     else if (value <= 0xFFFD)
1023     {
1024         first  = XMLCh(value);
1025         second = 0;
1026         if (!fReaderMgr->getCurrentReader()->isXMLChar(first) && !fReaderMgr->getCurrentReader()->isControlChar(first)) {
1027             // Character reference was not in the valid range
1028             fScanner->emitError(XMLErrs::InvalidCharacterRef);
1029             return false;
1030         }
1031     }
1032     else {
1033         // Character reference was not in the valid range
1034         fScanner->emitError(XMLErrs::InvalidCharacterRef);
1035         return false;
1036     }
1037 
1038     return true;
1039 }
1040 
1041 
1042 ContentSpecNode*
scanChildren(const DTDElementDecl & elemDecl,XMLBuffer & bufToUse,unsigned int & depth)1043 DTDScanner::scanChildren(const DTDElementDecl& elemDecl, XMLBuffer& bufToUse, unsigned int& depth)
1044 {
1045     if (depth++ > CONTENTSPEC_DEPTH_LIMIT) {
1046         fScanner->emitError(XMLErrs::UnterminatedDOCTYPE);
1047         return 0;
1048     }
1049 
1050     // Check for a PE ref here, but don't require spaces
1051     checkForPERef(false, true);
1052 
1053     ValueStackOf<XMLSize_t>* arrNestedDecl=NULL;
1054     //
1055     //  We know that the caller just saw an opening parenthesis, so we need
1056     //  to parse until we hit the end of it; if we find several parenthesis,
1057     //  store them in an array to be processed later.
1058     //
1059     //  We have to check for one up front, since it could be something like
1060     //  (((a)*)) etc...
1061     //
1062     ContentSpecNode* curNode = 0;
1063     while(fReaderMgr->skippedChar(chOpenParen))
1064     {
1065         // to check entity nesting
1066         const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1067         if(arrNestedDecl==NULL)
1068             arrNestedDecl=new (fMemoryManager) ValueStackOf<XMLSize_t>(5, fMemoryManager);
1069         arrNestedDecl->push(curReader);
1070 
1071         // Check for a PE ref here, but don't require spaces
1072         checkForPERef(false, true);
1073     }
1074 
1075     // We must find a leaf node here, either standalone or nested in the parenthesis
1076     if (!fReaderMgr->getName(bufToUse))
1077     {
1078         fScanner->emitError(XMLErrs::ExpectedElementName);
1079         return 0;
1080     }
1081 
1082     //
1083     //  Create a leaf node for it. If we can find the element id for
1084     //  this element, then use it. Else, we have to fault in an element
1085     //  decl, marked as created because of being in a content model.
1086     //
1087     XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
1088     if (!decl)
1089     {
1090         decl = new (fGrammarPoolMemoryManager) DTDElementDecl
1091         (
1092             bufToUse.getRawBuffer()
1093             , fEmptyNamespaceId
1094             , DTDElementDecl::Any
1095             , fGrammarPoolMemoryManager
1096         );
1097         decl->setCreateReason(XMLElementDecl::InContentModel);
1098         decl->setExternalElemDeclaration(isReadingExternalEntity());
1099         fDTDGrammar->putElemDecl(decl);
1100     }
1101     curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
1102     (
1103         decl->getElementName()
1104         , fGrammarPoolMemoryManager
1105     );
1106 
1107     // Check for a PE ref here, but don't require spaces
1108     const bool gotSpaces = checkForPERef(false, true);
1109 
1110     // Check for a repetition character after the leaf
1111     XMLCh repCh = fReaderMgr->peekNextChar();
1112     ContentSpecNode* tmpNode = makeRepNode(repCh, curNode, fGrammarPoolMemoryManager);
1113     if (tmpNode != curNode)
1114     {
1115         if (gotSpaces)
1116         {
1117             if (fScanner->emitErrorWillThrowException(XMLErrs::UnexpectedWhitespace))
1118             {
1119                 delete tmpNode;
1120             }
1121             fScanner->emitError(XMLErrs::UnexpectedWhitespace);
1122         }
1123         fReaderMgr->getNextChar();
1124         curNode = tmpNode;
1125     }
1126 
1127     while(arrNestedDecl==NULL || !arrNestedDecl->empty())
1128     {
1129         // Check for a PE ref here, but don't require spaces
1130         checkForPERef(false, true);
1131 
1132         //
1133         //  Ok, the next character tells us what kind of content this particular
1134         //  model this particular parentesized section is. Its either a choice if
1135         //  we see ',', a sequence if we see '|', or a single leaf node if we see
1136         //  a closing paren.
1137         //
1138         const XMLCh opCh = fReaderMgr->peekNextChar();
1139 
1140         if ((opCh != chComma)
1141         &&  (opCh != chPipe)
1142         &&  (opCh != chCloseParen))
1143         {
1144             // Not a legal char, so delete our node and return failure
1145             delete curNode;
1146             fScanner->emitError(XMLErrs::ExpectedSeqChoiceLeaf);
1147             return 0;
1148         }
1149 
1150         //
1151         //  Create the head node of the correct type. We need this to remember
1152         //  the top of the local tree. If it was a single subexpr, then just
1153         //  set the head node to the current node. For the others, we'll build
1154         //  the tree off the second child as we move across.
1155         //
1156         ContentSpecNode* headNode = 0;
1157         ContentSpecNode::NodeTypes curType = ContentSpecNode::UnknownType;
1158         if (opCh == chComma)
1159         {
1160             curType = ContentSpecNode::Sequence;
1161             headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
1162             (
1163                 curType
1164                 , curNode
1165                 , 0
1166                 , true
1167                 , true
1168                 , fGrammarPoolMemoryManager
1169             );
1170             curNode = headNode;
1171         }
1172          else if (opCh == chPipe)
1173         {
1174             curType = ContentSpecNode::Choice;
1175             headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
1176             (
1177                 curType
1178                 , curNode
1179                 , 0
1180                 , true
1181                 , true
1182                 , fGrammarPoolMemoryManager
1183             );
1184             curNode = headNode;
1185         }
1186          else
1187         {
1188             headNode = curNode;
1189             fReaderMgr->getNextChar();
1190         }
1191 
1192         //
1193         //  If it was a sequence or choice, we just loop until we get to the
1194         //  end of our section, adding each new leaf or sub expression to the
1195         //  right child of the current node, and making that new node the current
1196         //  node.
1197         //
1198         if ((opCh == chComma) || (opCh == chPipe))
1199         {
1200             ContentSpecNode* lastNode = 0;
1201             while (true)
1202             {
1203                 //
1204                 //  The next thing must either be another | or , character followed
1205                 //  by another leaf or subexpression, or a closing parenthesis, or a
1206                 //  PE ref.
1207                 //
1208                 if (fReaderMgr->lookingAtChar(chPercent))
1209                 {
1210                     checkForPERef(false, true);
1211                 }
1212                  else if (fReaderMgr->skippedSpace())
1213                 {
1214                     // Just skip whitespace
1215                     fReaderMgr->skipPastSpaces();
1216                 }
1217                  else if (fReaderMgr->skippedChar(chCloseParen))
1218                 {
1219                     //
1220                     //  We've hit the end of this section, so break out. But, we
1221                     //  need to see if we left a partial sequence of choice node
1222                     //  without a second node. If so, we have to undo that and
1223                     //  put its left child into the right node of the previous
1224                     //  node.
1225                     //
1226                     if ((curNode->getType() == ContentSpecNode::Choice)
1227                     ||  (curNode->getType() == ContentSpecNode::Sequence))
1228                     {
1229                         if (!curNode->getSecond() && lastNode)
1230                         {
1231                             ContentSpecNode* saveFirst = curNode->orphanFirst();
1232                             lastNode->setSecond(saveFirst);
1233                             curNode = lastNode;
1234                         }
1235                     }
1236                     break;
1237                 }
1238                  else if (fReaderMgr->skippedChar(opCh))
1239                 {
1240                     // Check for a PE ref here, but don't require spaces
1241                     checkForPERef(false, true);
1242 
1243                     if (fReaderMgr->skippedChar(chOpenParen))
1244                     {
1245                         const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1246 
1247                         // Recurse to handle this new guy
1248                         ContentSpecNode* subNode;
1249                         try {
1250                             subNode = scanChildren(elemDecl, bufToUse, depth);
1251                         }
1252                         catch (const XMLErrs::Codes)
1253                         {
1254                             delete headNode;
1255                             throw;
1256                         }
1257 
1258                         // If it failed, we are done, clean up here and return failure
1259                         if (!subNode)
1260                         {
1261                             delete headNode;
1262                             return 0;
1263                         }
1264 
1265                         if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
1266                             fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
1267 
1268                         // Else patch it in and make it the new current
1269                         ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode
1270                         (
1271                             curType
1272                             , subNode
1273                             , 0
1274                             , true
1275                             , true
1276                             , fGrammarPoolMemoryManager
1277                         );
1278                         curNode->setSecond(newCur);
1279                         lastNode = curNode;
1280                         curNode = newCur;
1281                     }
1282                      else
1283                     {
1284                         //
1285                         //  Got to be a leaf node, so get a name. If we cannot get
1286                         //  one, then clean up and get outa here.
1287                         //
1288                         if (!fReaderMgr->getName(bufToUse))
1289                         {
1290                             delete headNode;
1291                             fScanner->emitError(XMLErrs::ExpectedElementName);
1292                             return 0;
1293                         }
1294 
1295                         //
1296                         //  Create a leaf node for it. If we can find the element
1297                         //  id for this element, then use it. Else, we have to
1298                         //  fault in an element decl, marked as created because
1299                         //  of being in a content model.
1300                         //
1301                         XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
1302                         if (!decl)
1303                         {
1304                             decl = new (fGrammarPoolMemoryManager) DTDElementDecl
1305                             (
1306                                 bufToUse.getRawBuffer()
1307                                 , fEmptyNamespaceId
1308                                 , DTDElementDecl::Any
1309                                 , fGrammarPoolMemoryManager
1310                             );
1311                             decl->setCreateReason(XMLElementDecl::InContentModel);
1312                             decl->setExternalElemDeclaration(isReadingExternalEntity());
1313                             fDTDGrammar->putElemDecl(decl);
1314                         }
1315 
1316                         ContentSpecNode* tmpLeaf = new (fGrammarPoolMemoryManager) ContentSpecNode
1317                         (
1318                             decl->getElementName()
1319                             , fGrammarPoolMemoryManager
1320                         );
1321 
1322                         // Check for a repetition character after the leaf
1323                         const XMLCh repCh = fReaderMgr->peekNextChar();
1324                         ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf, fGrammarPoolMemoryManager);
1325                         if (tmpLeaf != tmpLeaf2)
1326                             fReaderMgr->getNextChar();
1327 
1328                         //
1329                         //  Create a new sequence or choice node, with the leaf
1330                         //  (or rep surrounding it) we just got as its first node.
1331                         //  Make the new node the second node of the current node,
1332                         //  and then make it the current node.
1333                         //
1334                         ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode
1335                         (
1336                             curType
1337                             , tmpLeaf2
1338                             , 0
1339                             , true
1340                             , true
1341                             , fGrammarPoolMemoryManager
1342                         );
1343                         curNode->setSecond(newCur);
1344                         lastNode = curNode;
1345                         curNode = newCur;
1346                     }
1347                 }
1348                  else
1349                 {
1350                     // Cannot be valid
1351                     delete headNode;  // emitError may do a throw so need to clean-up first
1352                     if (opCh == chComma)
1353                     {
1354                         fScanner->emitError(XMLErrs::ExpectedChoiceOrCloseParen);
1355                     }
1356                      else
1357                     {
1358                         fScanner->emitError
1359                         (
1360                             XMLErrs::ExpectedSeqOrCloseParen
1361                             , elemDecl.getFullName()
1362                         );
1363                     }
1364                     return 0;
1365                 }
1366             }
1367         }
1368 
1369         //
1370         //  We saw the terminating parenthesis so lets check for any repetition
1371         //  character, and create a node for that, making the head node the child
1372         //  of it.
1373         //
1374         const XMLCh repCh = fReaderMgr->peekNextChar();
1375         curNode = makeRepNode(repCh, headNode, fGrammarPoolMemoryManager);
1376         if (curNode != headNode)
1377             fReaderMgr->getNextChar();
1378 
1379         // prepare for recursion
1380         if(arrNestedDecl==NULL)
1381             break;
1382         else
1383         {
1384             // If that failed, no need to go further, return failure
1385             if (!curNode)
1386                 return 0;
1387 
1388             const XMLSize_t curReader = arrNestedDecl->pop();
1389             if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
1390                 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
1391 
1392             if(arrNestedDecl->empty())
1393             {
1394                 delete arrNestedDecl;
1395                 arrNestedDecl=NULL;
1396             }
1397         }
1398     }
1399 
1400     return curNode;
1401 }
1402 
1403 
1404 //
1405 //  We get here after the '<!--' part of the comment. We scan past the
1406 //  terminating '-->' It will calls the appropriate handler with the comment
1407 //  text, if one is provided. A comment can be in either the document or
1408 //  the DTD, so the fInDocument flag is used to know which handler to send
1409 //  it to.
1410 //
scanComment()1411 void DTDScanner::scanComment()
1412 {
1413     enum States
1414     {
1415         InText
1416         , OneDash
1417         , TwoDashes
1418     };
1419 
1420     // Get a buffer for this
1421     XMLBufBid bbComment(fBufMgr);
1422 
1423     //
1424     //  Get the comment text into a temp buffer. Be sure to use temp buffer
1425     //  two here, since its to be used for stuff that is potentially longer
1426     //  than just a name.
1427     //
1428     bool   gotLeadingSurrogate = false;
1429     States curState = InText;
1430     while (true)
1431     {
1432         // Get the next character
1433         const XMLCh nextCh = fReaderMgr->getNextChar();
1434 
1435         //  Watch for an end of file
1436         if (!nextCh)
1437         {
1438             fScanner->emitError(XMLErrs::UnterminatedComment);
1439             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1440         }
1441 
1442         // Check for correct surrogate pairs
1443         if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
1444         {
1445             if (gotLeadingSurrogate)
1446                 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
1447             else
1448                 gotLeadingSurrogate = true;
1449         }
1450         else
1451         {
1452             if (gotLeadingSurrogate)
1453             {
1454                 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
1455                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
1456             }
1457             // Its got to at least be a valid XML character
1458             else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
1459 
1460                 XMLCh tmpBuf[9];
1461                 XMLString::binToText
1462                 (
1463                     nextCh
1464                     , tmpBuf
1465                     , 8
1466                     , 16
1467                     , fMemoryManager
1468                 );
1469                 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
1470             }
1471 
1472             gotLeadingSurrogate = false;
1473         }
1474 
1475         if (curState == InText)
1476         {
1477             // If its a dash, go to OneDash state. Otherwise take as text
1478             if (nextCh == chDash)
1479                 curState = OneDash;
1480             else
1481                 bbComment.append(nextCh);
1482         }
1483         else if (curState == OneDash)
1484         {
1485             //
1486             //  If its another dash, then we change to the two dashes states.
1487             //  Otherwise, we have to put in the deficit dash and the new
1488             //  character and go back to InText.
1489             //
1490             if (nextCh == chDash)
1491             {
1492                 curState = TwoDashes;
1493             }
1494             else
1495             {
1496                 bbComment.append(chDash);
1497                 bbComment.append(nextCh);
1498                 curState = InText;
1499             }
1500         }
1501         else if (curState == TwoDashes)
1502         {
1503             // The next character must be the closing bracket
1504             if (nextCh != chCloseAngle)
1505             {
1506                 fScanner->emitError(XMLErrs::IllegalSequenceInComment);
1507                 fReaderMgr->skipPastChar(chCloseAngle);
1508                 return;
1509             }
1510             break;
1511         }
1512     }
1513 
1514     // If there is a doc type handler, then pass on the comment stuff
1515     if (fDocTypeHandler)
1516         fDocTypeHandler->doctypeComment(bbComment.getRawBuffer());
1517 }
1518 
1519 
scanContentSpec(DTDElementDecl & toFill)1520 bool DTDScanner::scanContentSpec(DTDElementDecl& toFill)
1521 {
1522     //
1523     //  Check for for a couple of the predefined content type strings. If
1524     //  its not one of these, its got to be a parenthesized reg ex type
1525     //  expression.
1526     //
1527     if (fReaderMgr->skippedString(XMLUni::fgEmptyString))
1528     {
1529         toFill.setModelType(DTDElementDecl::Empty);
1530         return true;
1531     }
1532 
1533     if (fReaderMgr->skippedString(XMLUni::fgAnyString))
1534     {
1535         toFill.setModelType(DTDElementDecl::Any);
1536         return true;
1537     }
1538 
1539     // Its got to be a parenthesized regular expression
1540     if (!fReaderMgr->skippedChar(chOpenParen))
1541     {
1542         fScanner->emitError
1543         (
1544             XMLErrs::ExpectedContentSpecExpr
1545             , toFill.getFullName()
1546         );
1547         return false;
1548     }
1549 
1550     // Get the current reader id, so we can test for partial markup
1551     const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1552 
1553     // We could have a PE ref here, but don't require space
1554     checkForPERef(false, true);
1555 
1556     //
1557     //  Now we look for a PCDATA string. If its PCDATA, then it must be a
1558     //  MIXED model. Otherwise, it must be a regular list of children in
1559     //  a regular expression perhaps.
1560     //
1561     bool status;
1562     if (fReaderMgr->skippedString(XMLUni::fgPCDATAString))
1563     {
1564         // Set the model to mixed
1565         toFill.setModelType(DTDElementDecl::Mixed_Simple);
1566         status = scanMixed(toFill);
1567 
1568         //
1569         //  If we are validating we have to check that there are no multiple
1570         //  uses of any child elements.
1571         //
1572         if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
1573         {
1574             if (((const MixedContentModel*)toFill.getContentModel())->hasDups())
1575                 fScanner->getValidator()->emitError(XMLValid::RepElemInMixed);
1576         }
1577     }
1578      else
1579     {
1580         //
1581         //  We have to do a recursive scan of the content model. Create a
1582         //  buffer for it to use, for efficiency. It returns the top ofthe
1583         //  content spec node tree, which we set if successful.
1584         //
1585         toFill.setModelType(DTDElementDecl::Children);
1586         XMLBufBid bbTmp(fBufMgr);
1587         unsigned int depth = 0;
1588         ContentSpecNode* resNode = scanChildren(toFill, bbTmp.getBuffer(), depth);
1589         status = (resNode != 0);
1590         if (status)
1591             toFill.setContentSpec(resNode);
1592     }
1593 
1594     // Make sure we are on the same reader as where we started
1595     if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
1596         fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
1597 
1598     return status;
1599 }
1600 
1601 
scanDefaultDecl(DTDAttDef & toFill)1602 void DTDScanner::scanDefaultDecl(DTDAttDef& toFill)
1603 {
1604     if (fReaderMgr->skippedString(XMLUni::fgRequiredString))
1605     {
1606         toFill.setDefaultType(XMLAttDef::Required);
1607         return;
1608     }
1609 
1610     if (fReaderMgr->skippedString(XMLUni::fgImpliedString))
1611     {
1612         toFill.setDefaultType(XMLAttDef::Implied);
1613         return;
1614     }
1615 
1616     if (fReaderMgr->skippedString(XMLUni::fgFixedString))
1617     {
1618         //
1619         //  There must be space before the fixed value. If there is not, then
1620         //  emit an error but keep going.
1621         //
1622         if (!fReaderMgr->skippedSpace())
1623             fScanner->emitError(XMLErrs::ExpectedWhitespace);
1624         else
1625             fReaderMgr->skipPastSpaces();
1626         toFill.setDefaultType(XMLAttDef::Fixed);
1627     }
1628      else
1629     {
1630         toFill.setDefaultType(XMLAttDef::Default);
1631     }
1632 
1633     //
1634     //  If we got here, its fixed or default, so we need to get a value.
1635     //  If we don't, then emit an error but just set the default value to
1636     //  an empty string and try to keep going.
1637     //
1638     // Check for PE ref or optional whitespace
1639     checkForPERef(false, true);
1640 
1641     XMLBufBid bbValue(fBufMgr);
1642     if (!scanAttValue(toFill.getFullName(), bbValue.getBuffer(), toFill.getType()))
1643         fScanner->emitError(XMLErrs::ExpectedDefAttrDecl);
1644 
1645     toFill.setValue(bbValue.getRawBuffer());
1646 }
1647 
1648 
1649 //
1650 //  This is called after seeing '<!ELEMENT' which indicates that an element
1651 //  markup is starting. This guy scans the rest of it and adds it to the
1652 //  element decl pool if it has not already been declared.
1653 //
scanElementDecl()1654 void DTDScanner::scanElementDecl()
1655 {
1656     //
1657     //  Space is legal (required actually) here so check for a PE ref. If
1658     //  we don't get our whitespace, then issue and error, but try to keep
1659     //  going.
1660     //
1661     if (!checkForPERef(false, true))
1662         fScanner->emitError(XMLErrs::ExpectedWhitespace);
1663 
1664     // Get a buffer for the element name and scan in the name
1665     XMLBufBid bbName(fBufMgr);
1666     if (!fReaderMgr->getName(bbName.getBuffer()))
1667     {
1668         fScanner->emitError(XMLErrs::ExpectedElementName);
1669         fReaderMgr->skipPastChar(chCloseAngle);
1670         return;
1671     }
1672 
1673     // Look this guy up in the element decl pool
1674     DTDElementDecl* decl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
1675 
1676     //
1677     //  If it does not exist, then we need to create it. If it does and
1678     //  its marked as declared, then that's an error, but we still need to
1679     //  scan over the content model so use the dummy declaration that the
1680     //  parsing code can fill in.
1681     //
1682     if (decl)
1683     {
1684         if (decl->isDeclared())
1685         {
1686             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
1687                 fScanner->getValidator()->emitError(XMLValid::ElementAlreadyExists, bbName.getRawBuffer());
1688 
1689             if (!fDumElemDecl)
1690                 fDumElemDecl = new (fMemoryManager) DTDElementDecl
1691                 (
1692                     bbName.getRawBuffer()
1693                     , fEmptyNamespaceId
1694                     , DTDElementDecl::Any
1695                     , fMemoryManager
1696                 );
1697             else
1698                 fDumElemDecl->setElementName(bbName.getRawBuffer(),fEmptyNamespaceId);
1699         }
1700     }
1701      else
1702     {
1703         //
1704         //  Create the new empty declaration to fill in and put it into
1705         //  the decl pool.
1706         //
1707         decl = new (fGrammarPoolMemoryManager) DTDElementDecl
1708         (
1709             bbName.getRawBuffer()
1710             , fEmptyNamespaceId
1711             , DTDElementDecl::Any
1712             , fGrammarPoolMemoryManager
1713         );
1714         fDTDGrammar->putElemDecl(decl);
1715     }
1716 
1717     // Set a flag for whether we will ignore this one
1718     const bool isIgnored = (decl == fDumElemDecl);
1719 
1720     // Mark this one if being externally declared
1721     decl->setExternalElemDeclaration(isReadingExternalEntity());
1722 
1723     // Mark this one as being declared
1724     decl->setCreateReason(XMLElementDecl::Declared);
1725 
1726     // Another check for a PE ref, with at least required whitespace
1727     if (!checkForPERef(false, true))
1728         fScanner->emitError(XMLErrs::ExpectedWhitespace);
1729 
1730     // And now scan the content model for this guy.
1731     if (!scanContentSpec(*decl))
1732     {
1733         fReaderMgr->skipPastChar(chCloseAngle);
1734         return;
1735     }
1736 
1737     // Another check for a PE ref, but we don't require whitespace here
1738     checkForPERef(false, true);
1739 
1740     // And we should have the ending angle bracket
1741     if (!fReaderMgr->skippedChar(chCloseAngle))
1742     {
1743         fScanner->emitError(XMLErrs::UnterminatedElementDecl, bbName.getRawBuffer());
1744         fReaderMgr->skipPastChar(chCloseAngle);
1745     }
1746 
1747     //
1748     //  If we have a DTD handler tell it about the new element decl. We
1749     //  tell it if its one that can be ignored, cause its an override of a
1750     //  previously existing decl. If it is being ignored, only call back
1751     //  if advanced callbacks are enabled.
1752     //
1753     if (fDocTypeHandler)
1754         fDocTypeHandler->elementDecl(*decl, isIgnored);
1755 }
1756 
1757 
1758 //
1759 //  This method will process a general or parameter entity reference. The
1760 //  entity name and entity text will be stored in the entity pool. The value
1761 //  of the entity will be scanned for any other parameter entity or char
1762 //  references which will be expanded. So the stored value can only have
1763 //  general entity references when done.
1764 //
scanEntityDecl()1765 void DTDScanner::scanEntityDecl()
1766 {
1767     //
1768     //  Space is required here, but we cannot check for a PE Ref since
1769     //  there could be a legal (no-ref) percent sign here. Since any
1770     //  entity that ended here would be illegal, we just skip spaces
1771     //  and then check for a percent.
1772     //
1773     if (!fReaderMgr->lookingAtSpace())
1774         fScanner->emitError(XMLErrs::ExpectedWhitespace);
1775     else
1776         fReaderMgr->skipPastSpaces();
1777     bool isPEDecl = fReaderMgr->skippedChar(chPercent);
1778 
1779     //
1780     //  If a PE decl, then check if it is followed by a space; if it is so,
1781     //  eat the percent and check for spaces or a PE ref on the other side of it.
1782     //  Otherwise, it has to be an entity reference for a general entity.
1783     //
1784     if (isPEDecl)
1785     {
1786         if(!fReaderMgr->getCurrentReader()->isWhitespace(fReaderMgr->peekNextChar()))
1787         {
1788             isPEDecl=false;
1789             while (true)
1790             {
1791                if (!expandPERef(false, false, true, false))
1792                   fScanner->emitError(XMLErrs::ExpectedEntityRefName);
1793                // And skip any more spaces in the expanded value
1794                if (fReaderMgr->skippedSpace())
1795                   fReaderMgr->skipPastSpaces();
1796                if (!fReaderMgr->skippedChar(chPercent))
1797                   break;
1798             }
1799         }
1800         else if (!checkForPERef(false, true))
1801             fScanner->emitError(XMLErrs::ExpectedWhitespace);
1802     }
1803 
1804     //
1805     //  Now lets get a name, which should be the name of the entity. We
1806     //  have to get a buffer for this.
1807     //
1808     XMLBufBid bbName(fBufMgr);
1809     if (!fReaderMgr->getName(bbName.getBuffer()))
1810     {
1811         fScanner->emitError(XMLErrs::ExpectedPEName);
1812         fReaderMgr->skipPastChar(chCloseAngle);
1813         return;
1814     }
1815 
1816     // If namespaces are enabled, then no colons allowed
1817     if (fScanner->getDoNamespaces())
1818     {
1819         if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
1820             fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
1821     }
1822 
1823     //
1824     //  See if this entity already exists. If so, then the existing one
1825     //  takes precendence. So we use the local dummy decl to parse into
1826     //  and just ignore the results.
1827     //
1828     DTDEntityDecl* entityDecl;
1829     if (isPEDecl)
1830         entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
1831     else
1832         entityDecl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
1833 
1834     if (entityDecl)
1835     {
1836         if (!fDumEntityDecl)
1837             fDumEntityDecl = new (fMemoryManager) DTDEntityDecl(fMemoryManager);
1838         fDumEntityDecl->setName(bbName.getRawBuffer());
1839         entityDecl = fDumEntityDecl;
1840     }
1841      else
1842     {
1843         // Its not in existence already, then create an entity decl for it
1844         entityDecl = new (fGrammarPoolMemoryManager) DTDEntityDecl(bbName.getRawBuffer(), false, fGrammarPoolMemoryManager);
1845 
1846         //
1847         //  Set the declaration location. The parameter indicates whether its
1848         //  declared in the content/internal subset, so we know whether or not
1849         //  its in the external subset.
1850         //
1851         entityDecl->setDeclaredInIntSubset(fInternalSubset);
1852 
1853         // Add it to the appropriate entity decl pool
1854         if (isPEDecl)
1855             fPEntityDeclPool->put(entityDecl);
1856          else
1857             fDTDGrammar->putEntityDecl(entityDecl);
1858     }
1859 
1860     // Set a flag that indicates whether we are ignoring this one
1861     const bool isIgnored = (entityDecl == fDumEntityDecl);
1862 
1863     // Set the PE flag on it
1864     entityDecl->setIsParameter(isPEDecl);
1865 
1866     //
1867     //  Space is legal (required actually) here so check for a PE ref. If
1868     //  we don't get our whitespace, then issue an error, but try to keep
1869     //  going.
1870     //
1871     if (!checkForPERef(false, true))
1872         fScanner->emitError(XMLErrs::ExpectedWhitespace);
1873 
1874     // save the hasNoDTD status for Entity Constraint Checking
1875     bool hasNoDTD = fScanner->getHasNoDTD();
1876     if (hasNoDTD && isPEDecl)
1877         fScanner->setHasNoDTD(false);
1878 
1879     // According to the type call the value scanning method
1880     if (!scanEntityDef(*entityDecl, isPEDecl))
1881     {
1882         fReaderMgr->skipPastChar(chCloseAngle);
1883         fScanner->setHasNoDTD(true);
1884         fScanner->emitError(XMLErrs::ExpectedEntityValue);
1885         return;
1886     }
1887     if (hasNoDTD)
1888         fScanner->setHasNoDTD(true);
1889 
1890     // Space is legal (but not required) here so check for a PE ref
1891     checkForPERef(false, true);
1892 
1893     // And then we have to have the closing angle bracket
1894     if (!fReaderMgr->skippedChar(chCloseAngle))
1895     {
1896         fScanner->emitError(XMLErrs::UnterminatedEntityDecl, entityDecl->getName());
1897         fReaderMgr->skipPastChar(chCloseAngle);
1898     }
1899 
1900     //
1901     //  If we have a doc type handler, then call it. But only call it for
1902     //  ignored elements if advanced callbacks are enabled.
1903     //
1904     if (fDocTypeHandler)
1905         fDocTypeHandler->entityDecl(*entityDecl, isPEDecl, isIgnored);
1906 }
1907 
1908 
1909 //
1910 //  This method will scan a general/character entity ref. It will either
1911 //  expand a char ref and return the value directly, or it will expand
1912 //  a general entity and a reader for it onto the reader stack.
1913 //
1914 //  The return value indicates whether the value was returned directly or
1915 //  pushed as a reader or it failed.
1916 //
1917 //  The escaped flag tells the caller whether the returnd parameter resulted
1918 //  from a character reference, which escapes the character in some cases. It
1919 //  only makes any difference if the return indicates the value was returned
1920 //  directly.
1921 //
1922 //  NOTE: This is only called when scanning attribute values, so we always
1923 //  expand general entities.
1924 //
1925 DTDScanner::EntityExpRes
scanEntityRef(XMLCh & firstCh,XMLCh & secondCh,bool & escaped)1926 DTDScanner::scanEntityRef(XMLCh& firstCh, XMLCh& secondCh, bool& escaped)
1927 {
1928     // Assume no escape and no second char
1929     escaped = false;
1930     secondCh = 0;
1931 
1932     // We have to insure its all done in a single entity
1933     const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1934 
1935     //
1936     //  If the next char is a pound, then its a character reference and we
1937     //  need to expand it always.
1938     //
1939     if (fReaderMgr->skippedChar(chPound))
1940     {
1941         //
1942         //  Its a character reference, so scan it and get back the numeric
1943         //  value it represents. If it fails, just return immediately.
1944         //
1945         if (!scanCharRef(firstCh, secondCh))
1946             return EntityExp_Failed;
1947 
1948         if (curReader != fReaderMgr->getCurrentReaderNum())
1949             fScanner->emitError(XMLErrs::PartialMarkupInEntity);
1950 
1951         // Its now escaped since it was a char ref
1952         escaped = true;
1953         return EntityExp_Returned;
1954     }
1955 
1956     // Get the name of the general entity
1957     XMLBufBid bbName(fBufMgr);
1958     if (!fReaderMgr->getName(bbName.getBuffer()))
1959     {
1960         fScanner->emitError(XMLErrs::ExpectedEntityRefName);
1961         return EntityExp_Failed;
1962     }
1963 
1964     //
1965     //  Next char must be a semi-colon. But if its not, just emit
1966     //  an error and try to continue.
1967     //
1968     if (!fReaderMgr->skippedChar(chSemiColon))
1969         fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
1970 
1971     // Make sure it was all in one entity reader
1972     if (curReader != fReaderMgr->getCurrentReaderNum())
1973         fScanner->emitError(XMLErrs::PartialMarkupInEntity);
1974 
1975     // Look it up the name the general entity pool
1976     XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
1977 
1978     // If it does not exist, then obviously an error
1979     if (!decl)
1980     {
1981         // XML 1.0 Section 4.1
1982         if (fScanner->getStandalone() || fScanner->getHasNoDTD()) {
1983             fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
1984         }
1985         else {
1986             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
1987                 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
1988         }
1989 
1990         return EntityExp_Failed;
1991     }
1992 
1993 
1994     //
1995     // XML 1.0 Section 4.1
1996     //  If we are a standalone document, then it has to have been declared
1997     //  in the internal subset.
1998     //
1999     if (fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
2000         fScanner->emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer());
2001 
2002     //
2003     //  If its a special char reference, then its escaped and we can return
2004     //  it directly.
2005     //
2006     if (decl->getIsSpecialChar())
2007     {
2008         firstCh = decl->getValue()[0];
2009         escaped = true;
2010         return EntityExp_Returned;
2011     }
2012 
2013     if (decl->isExternal())
2014     {
2015         // If its unparsed, then its not valid here
2016         // XML 1.0 Section 4.4.4 the appearance of a reference to an unparsed entity is forbidden.
2017         if (decl->isUnparsed())
2018         {
2019             fScanner->emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
2020             return EntityExp_Failed;
2021         }
2022 
2023         // We are in an attribute value, so not valid.
2024         // XML 1.0 Section 4.4.4 a reference to an external entity in an attribute value is forbidden.
2025         fScanner->emitError(XMLErrs::NoExtRefsInAttValue);
2026 
2027         // And now create a reader to read this entity
2028         InputSource* srcUsed;
2029         XMLReader* reader = fReaderMgr->createReader
2030         (
2031             decl->getBaseURI()
2032             , decl->getSystemId()
2033             , decl->getPublicId()
2034             , false
2035             , XMLReader::RefFrom_NonLiteral
2036             , XMLReader::Type_General
2037             , XMLReader::Source_External
2038             , srcUsed
2039             , fScanner->getCalculateSrcOfs()
2040             , fScanner->getLowWaterMark()
2041             , fScanner->getDisableDefaultEntityResolution()
2042         );
2043 
2044         // Put a janitor on the source so it gets cleaned up on exit
2045         Janitor<InputSource> janSrc(srcUsed);
2046 
2047         //
2048         //  If the creation failed then throw an exception
2049         //
2050         if (!reader)
2051             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
2052 
2053         //
2054         //  Push the reader. If its a recursive expansion, then emit an error
2055         //  and return an failure.
2056         //
2057         if (!fReaderMgr->pushReader(reader, decl))
2058         {
2059             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
2060             return EntityExp_Failed;
2061         }
2062 
2063         // If it starts with the XML string, then parse a text decl
2064         if (fScanner->checkXMLDecl(true))
2065             scanTextDecl();
2066     }
2067      else
2068     {
2069         //
2070         //  Create a reader over a memory stream over the entity value
2071         //  We force it to assume UTF-16 by passing in an encoding
2072         //  string. This way it won't both trying to predecode the
2073         //  first line, looking for an XML/TextDecl.
2074         //
2075         XMLReader* valueReader = fReaderMgr->createIntEntReader
2076         (
2077             decl->getName()
2078             , XMLReader::RefFrom_NonLiteral
2079             , XMLReader::Type_General
2080             , decl->getValue()
2081             , decl->getValueLen()
2082             , false
2083         );
2084 
2085         //
2086         //  Trt to push the entity reader onto the reader manager stack,
2087         //  where it will become the subsequent input. If it fails, that
2088         //  means the entity is recursive, so issue an error. The reader
2089         //  will have just been discarded, but we just keep going.
2090         //
2091         if (!fReaderMgr->pushReader(valueReader, decl))
2092             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
2093     }
2094 
2095     return EntityExp_Pushed;
2096 }
2097 
2098 
2099 //
2100 //  This method will scan a quoted literal of an entity value. It has to
2101 //  deal with replacement of PE references; however, since this is a DTD
2102 //  scanner, all such entity literals are in entity decls and therefore
2103 //  general entities are not expanded.
2104 //
scanEntityLiteral(XMLBuffer & toFill)2105 bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill)
2106 {
2107     toFill.reset();
2108 
2109     // Get the next char which must be a single or double quote
2110     XMLCh quoteCh;
2111     if (!fReaderMgr->skipIfQuote(quoteCh))
2112         return false;
2113 
2114     // Get a buffer for pulling in entity names when we see GE refs
2115     XMLBufBid bbName(fBufMgr);
2116     XMLBuffer& nameBuf = bbName.getBuffer();
2117 
2118     // Remember the current reader
2119     const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
2120 
2121     //
2122     //  Loop until we see the ending quote character, handling any references
2123     //  in the process.
2124     //
2125     XMLCh   nextCh;
2126     XMLCh   secondCh = 0;
2127     bool    gotLeadingSurrogate = false;
2128     while (true)
2129     {
2130         nextCh = fReaderMgr->getNextChar();
2131 
2132         //
2133         //  Watch specifically for EOF and issue a more meaningful error
2134         //  if that occurs (since an unterminated quoted char can cause
2135         //  this easily.)
2136         //
2137         if (!nextCh)
2138         {
2139             fScanner->emitError(XMLErrs::UnterminatedEntityLiteral);
2140             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
2141         }
2142 
2143         //
2144         //  Break out on our terminating quote char when we are back in the
2145         //  same reader. Otherwise, we might trigger on a nested quote char
2146         //  in an expanded entity.
2147         //
2148         if ((nextCh == quoteCh)
2149         &&  (fReaderMgr->getCurrentReaderNum() == orgReader))
2150         {
2151             break;
2152         }
2153 
2154         if (nextCh == chPercent)
2155         {
2156             //
2157             //  Put the PE's value on the reader stack and then jump back
2158             //  to the top to start processing it. The parameter indicates
2159             //  that it should not scan the reference's content as an external
2160             //  subset.
2161             //
2162             expandPERef(false, true, true);
2163             continue;
2164         }
2165 
2166         //
2167         //  Ok, now that all the other special stuff is checked, we can
2168         //  look for a general entity. In here, we cannot have a naked &
2169         //  and will only expand numerical char refs or the intrinsic char
2170         //  refs. Others will be left alone.
2171         //
2172         if (nextCh == chAmpersand)
2173         {
2174             //
2175             //  Here, we only expand numeric char refs, but not any general
2176             //  entities. However, the stupid XML spec requires that we check
2177             //  and make sure it does refer to a general entity if its not
2178             //  a char ref (i.e. no naked '&' chars.)
2179             //
2180             if (fReaderMgr->skippedChar(chPound))
2181             {
2182                 // If it failed, then just jump back to the top and try to pick up
2183                 if (!scanCharRef(nextCh, secondCh))
2184                 {
2185                     gotLeadingSurrogate = false;
2186                     continue;
2187                 }
2188             }
2189              else
2190             {
2191                 if (!fReaderMgr->getName(nameBuf))
2192                 {
2193                     fScanner->emitError(XMLErrs::ExpectedEntityRefName);
2194                 }
2195                  else
2196                 {
2197                     //
2198                     //  Since we are not expanding any of this, we have to
2199                     //  put the amp and name into the target buffer as data.
2200                     //
2201                     toFill.append(chAmpersand);
2202                     toFill.append(nameBuf.getRawBuffer());
2203 
2204                     // Make sure we skipped a trailing semicolon
2205                     if (!fReaderMgr->skippedChar(chSemiColon))
2206                     {
2207                         fScanner->emitError
2208                         (
2209                             XMLErrs::UnterminatedEntityRef
2210                             , nameBuf.getRawBuffer()
2211                         );
2212                     }
2213 
2214                     // And make the new character the semicolon
2215                     nextCh = chSemiColon;
2216                 }
2217 
2218                 // Either way here we reset the surrogate flag
2219                 gotLeadingSurrogate = false;
2220             }
2221         }
2222         else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
2223         {
2224             if (gotLeadingSurrogate)
2225                 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2226             else
2227                 gotLeadingSurrogate = true;
2228         }
2229          else
2230         {
2231             if (gotLeadingSurrogate)
2232             {
2233                 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
2234                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2235             }
2236              else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
2237             {
2238                 XMLCh tmpBuf[9];
2239                 XMLString::binToText
2240                 (
2241                     nextCh
2242                     , tmpBuf
2243                     , 8
2244                     , 16
2245                     , fMemoryManager
2246                 );
2247                 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
2248                 fReaderMgr->skipPastChar(quoteCh);
2249                 return false;
2250             }
2251             gotLeadingSurrogate = false;
2252         }
2253 
2254         // Looks ok, so add it to the literal
2255         toFill.append(nextCh);
2256 
2257         if (secondCh)
2258         {
2259             toFill.append(secondCh);
2260             secondCh=0;
2261         }
2262     }
2263 
2264     //
2265     //  If we got here and did not get back to the original reader level,
2266     //  then we propogated some entity out of the literal, so issue an
2267     //  error, but don't fail.
2268     //
2269     if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
2270         fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
2271 
2272     return true;
2273 }
2274 
2275 
2276 //
2277 //  This method is called after the entity name has been scanned, and any
2278 //  PE referenced following the name is handled. The passed decl will be
2279 //  filled in with the info scanned.
2280 //
scanEntityDef(DTDEntityDecl & decl,const bool isPEDecl)2281 bool DTDScanner::scanEntityDef(DTDEntityDecl& decl, const bool isPEDecl)
2282 {
2283     // Its got to be an entity literal
2284     if (fReaderMgr->lookingAtChar(chSingleQuote)
2285     ||  fReaderMgr->lookingAtChar(chDoubleQuote))
2286     {
2287         // Get a buffer for the literal
2288         XMLBufBid bbValue(fBufMgr);
2289 
2290         if (!scanEntityLiteral(bbValue.getBuffer()))
2291             return false;
2292 
2293         // Set it on the entity decl
2294         decl.setValue(bbValue.getRawBuffer());
2295         return true;
2296     }
2297 
2298     //
2299     //  Its got to be an external entity, so there must be an external id.
2300     //  Get buffers for them and scan an external id into them.
2301     //
2302     XMLBufBid bbPubId(fBufMgr);
2303     XMLBufBid bbSysId(fBufMgr);
2304     if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
2305         return false;
2306 
2307     decl.setIsExternal(true);
2308     ReaderMgr::LastExtEntityInfo lastInfo;
2309     fReaderMgr->getLastExtEntityInfo(lastInfo);
2310 
2311     // Fill in the id fields of the decl with the info we got
2312     const XMLCh* publicId = bbPubId.getRawBuffer();
2313     const XMLCh* systemId = bbSysId.getRawBuffer();
2314     decl.setPublicId((publicId && *publicId) ? publicId : 0);
2315     decl.setSystemId((systemId && *systemId) ? systemId : 0);
2316     decl.setBaseURI((lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0);
2317 
2318     // If its a PE decl, we are done
2319     bool gotSpaces = checkForPERef(false, true);
2320     if (isPEDecl)
2321     {
2322         //
2323         //  Check for a common error here. NDATA is not allowed for PEs
2324         //  so check for the NDATA string. If found give a nice meaningful
2325         //  error and continue parsing to eat the NDATA text.
2326         //
2327         if (gotSpaces)
2328         {
2329             if (fReaderMgr->skippedString(XMLUni::fgNDATAString))
2330                 fScanner->emitError(XMLErrs::NDATANotValidForPE);
2331         }
2332          else
2333         {
2334             return true;
2335         }
2336     }
2337 
2338     // If looking at close angle now, we are done
2339     if (fReaderMgr->lookingAtChar(chCloseAngle))
2340         return true;
2341 
2342     // Else we had to have seem the whitespace
2343     if (!gotSpaces)
2344         fScanner->emitError(XMLErrs::ExpectedWhitespace);
2345 
2346     // We now have to see a notation data string
2347     if (!fReaderMgr->skippedString(XMLUni::fgNDATAString))
2348         fScanner->emitError(XMLErrs::ExpectedNDATA);
2349 
2350     // Space is required here, but try to go on if not
2351     if (!checkForPERef(false, true))
2352         fScanner->emitError(XMLErrs::ExpectedWhitespace);
2353 
2354     // Get a name
2355     XMLBufBid bbName(fBufMgr);
2356     if (!fReaderMgr->getName(bbName.getBuffer()))
2357     {
2358         fScanner->emitError(XMLErrs::ExpectedNotationName);
2359         return false;
2360     }
2361 
2362     // Set the decl's notation name
2363     decl.setNotationName(bbName.getRawBuffer());
2364 
2365     return true;
2366 }
2367 
2368 
2369 //
2370 //  This method is called after an attribute decl name or a notation decl has
2371 //  been scanned and then an opening parenthesis was see, indicating the list
2372 //  of values. It scans the enumeration values and creates a single string
2373 //  which has a single space between each value.
2374 //
2375 //  The terminating close paren ends this scan.
2376 //
scanEnumeration(const DTDAttDef & attDef,XMLBuffer & toFill,const bool notation)2377 bool DTDScanner::scanEnumeration( const   DTDAttDef&  attDef
2378                                     ,       XMLBuffer&  toFill
2379                                     , const bool        notation)
2380 {
2381     // Reset the passed buffer
2382     toFill.reset();
2383 
2384     // Check for PE ref but don't require space
2385     checkForPERef(false, true);
2386 
2387     // If this is a notation, we need an opening paren
2388     if (notation)
2389     {
2390         if (!fReaderMgr->skippedChar(chOpenParen))
2391             fScanner->emitError(XMLErrs::ExpectedOpenParen);
2392     }
2393 
2394     // We need a local buffer to use as well
2395     XMLBufBid bbTmp(fBufMgr);
2396 
2397     while (true)
2398     {
2399         // Space is allowed here for either type so check for PE ref
2400         checkForPERef(false, true);
2401 
2402         // And then get either a name or a name token
2403         bool success;
2404         if (notation)
2405             success = fReaderMgr->getName(bbTmp.getBuffer());
2406         else
2407             success = fReaderMgr->getNameToken(bbTmp.getBuffer());
2408 
2409         if (!success)
2410         {
2411             fScanner->emitError
2412             (
2413                 XMLErrs::ExpectedEnumValue
2414                 , attDef.getFullName()
2415             );
2416             return false;
2417         }
2418 
2419         // Append this value to the target value
2420         toFill.append(bbTmp.getRawBuffer(), bbTmp.getLen());
2421 
2422         // Space is allowed here for either type so check for PE ref
2423         checkForPERef(false, true);
2424 
2425         // Check for the terminating paren
2426         if (fReaderMgr->skippedChar(chCloseParen))
2427             break;
2428 
2429         // And append a space separator
2430         toFill.append(chSpace);
2431 
2432         // Check for the pipe character separator
2433         if (!fReaderMgr->skippedChar(chPipe))
2434         {
2435             fScanner->emitError(XMLErrs::ExpectedEnumSepOrParen);
2436             return false;
2437         }
2438     }
2439     return true;
2440 }
2441 
2442 
scanEq()2443 bool DTDScanner::scanEq()
2444 {
2445     fReaderMgr->skipPastSpaces();
2446     if (fReaderMgr->skippedChar(chEqual))
2447     {
2448         fReaderMgr->skipPastSpaces();
2449         return true;
2450     }
2451     return false;
2452 }
2453 
2454 
2455 //
2456 //  This method is called when an external entity reference is seen in the
2457 //  DTD or an external DTD subset is encountered, and their contents pushed
2458 //  onto the reader stack. This method will scan that contents.
2459 //
scanExtSubsetDecl(const bool inIncludeSect,const bool isDTD)2460 void DTDScanner::scanExtSubsetDecl(const bool inIncludeSect, const bool isDTD)
2461 {
2462     // Indicate we are in the external subset now
2463     FlagJanitor<bool> janContentFlag(&fInternalSubset, false);
2464 
2465 
2466     bool bAcceptDecl = !inIncludeSect;
2467 
2468     // Get a buffer for whitespace
2469     XMLBufBid bbSpace(fBufMgr);
2470 
2471     //
2472     //  If we have a doc type handler and we are not being called recursively
2473     //  to handle an include section, tell it the ext subset starts
2474     //
2475     if (fDocTypeHandler && isDTD && !inIncludeSect)
2476         fDocTypeHandler->startExtSubset();
2477 
2478     //
2479     //  We have to play a trick here if the current entity we are parsing
2480     //  is a PE. Because the spooling code will put out a whitespace before
2481     //  and after an expanded PE if its being scanned outside the context of
2482     //  a literal entity, this will confuse this external subset code.
2483     //
2484     //  So, we see if that is what is happening and, if so, eat the single
2485     //  space, a check for the <?xml string. If we find it, we parse that
2486     //  markup right now and put the space back.
2487     //
2488     if (fReaderMgr->isScanningPERefOutOfLiteral())
2489     {
2490         if (fReaderMgr->skippedSpace())
2491         {
2492             if (fScanner->checkXMLDecl(true))
2493             {
2494                 scanTextDecl();
2495                 bAcceptDecl = false;
2496 
2497                 // <TBD> Figure out how to do this
2498                 // fReaderMgr->unGet(chSpace);
2499             }
2500         }
2501     }
2502 
2503     // Get the current reader number
2504     const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
2505 
2506     //
2507     //  Loop until we hit the end of the external subset entity. Note that
2508     //  we use a double loop here in order to avoid the overhead of doing
2509     //  the exception setup/teardown work on every loop.
2510     //
2511     bool inMarkup = false;
2512     bool inCharData = false;
2513     while (true)
2514     {
2515         bool bDoBreak=false;    // workaround for Borland bug with 'break' in 'catch'
2516         try
2517         {
2518             while (true)
2519             {
2520                 XMLCh nextCh;
2521 
2522                 try {
2523                     nextCh = fReaderMgr->peekNextChar();
2524                 }
2525                 catch (XMLException& ex) {
2526                     fScanner->emitError(XMLErrs::XMLException_Fatal, ex.getCode(), ex.getMessage(), NULL, NULL);
2527                     nextCh = chNull;
2528                 }
2529 
2530                 if (!nextCh)
2531                 {
2532                     return; // nothing left
2533                 }
2534                 else if (nextCh == chOpenAngle)
2535                 {
2536                     // Get the reader we started this on
2537                     // XML 1.0 P28a Well-formedness constraint: PE Between Declarations
2538                     const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
2539                     bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);
2540 
2541                     //
2542                     //  Now scan the markup. Set the flag so that we will know that
2543                     //  we were in markup if an end of entity exception occurs.
2544                     //
2545                     fReaderMgr->getNextChar();
2546                     inMarkup = true;
2547                     scanMarkupDecl(bAcceptDecl);
2548                     inMarkup = false;
2549 
2550                     //
2551                     //  And see if we got back to the same level. If not, then its
2552                     //  a partial markup error.
2553                     //
2554                     if (fReaderMgr->getCurrentReaderNum() != orgReader){
2555                         if (wasInPE)
2556                             fScanner->emitError(XMLErrs::PEBetweenDecl);
2557                         else if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
2558                             fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
2559                     }
2560 
2561                 }
2562                 else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
2563                 {
2564                     //
2565                     //  If we have a doc type handler, and advanced callbacks are
2566                     //  enabled, then gather up whitespace and call back. Otherwise
2567                     //  just skip whitespaces.
2568                     //
2569                     if (fDocTypeHandler)
2570                     {
2571                         inCharData = true;
2572                         fReaderMgr->getSpaces(bbSpace.getBuffer());
2573                         inCharData = false;
2574 
2575                         fDocTypeHandler->doctypeWhitespace
2576                         (
2577                             bbSpace.getRawBuffer()
2578                             , bbSpace.getLen()
2579                         );
2580                     }
2581                     else
2582                     {
2583                         //
2584                         //  If we hit an end of entity in the middle of white
2585                         //  space, that's fine. We'll just come back in here
2586                         //  again on the next round and skip some more.
2587                         //
2588                         fReaderMgr->skipPastSpaces();
2589                     }
2590                 }
2591                 else if (nextCh == chPercent)
2592                 {
2593                     //
2594                     //  Expand (and scan if external) the reference value. Tell
2595                     //  it to throw an end of entity exception at the end of the
2596                     //  entity.
2597                     //
2598                     fReaderMgr->getNextChar();
2599                     expandPERef(true, false, false, true);
2600                 }
2601                 else if (inIncludeSect && (nextCh == chCloseSquare))
2602                 {
2603                     //
2604                     //  Its the end of a conditional include section. So scan it and
2605                     //  decrement the include depth counter.
2606                     //
2607                     fReaderMgr->getNextChar();
2608                     if (!fReaderMgr->skippedChar(chCloseSquare))
2609                     {
2610                         fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
2611                         fReaderMgr->skipPastChar(chCloseAngle);
2612                     }
2613                     else if (!fReaderMgr->skippedChar(chCloseAngle))
2614                     {
2615                         fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
2616                         fReaderMgr->skipPastChar(chCloseAngle);
2617                     }
2618                     return;
2619                 }
2620                 else
2621                 {
2622                     fReaderMgr->getNextChar();
2623                     if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
2624                     {
2625                         XMLCh tmpBuf[9];
2626                         XMLString::binToText
2627                         (
2628                             nextCh
2629                             , tmpBuf
2630                             , 8
2631                             , 16
2632                             , fMemoryManager
2633                         );
2634                         fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
2635                     }
2636                     else
2637                     {
2638                         fScanner->emitError(XMLErrs::InvalidDocumentStructure);
2639                     }
2640 
2641                     // Try to get realigned
2642                     static const XMLCh toSkip[] =
2643                     {
2644                         chPercent, chCloseSquare, chOpenAngle, chNull
2645                     };
2646                     fReaderMgr->skipUntilInOrWS(toSkip);
2647                 }
2648                 bAcceptDecl = false;
2649             }
2650         }
2651         catch(const EndOfEntityException& toCatch)
2652         {
2653             //
2654             //  If the external entity ended while we were in markup, then that's
2655             //  a partial markup error.
2656             //
2657             if (inMarkup)
2658             {
2659                 fScanner->emitError(XMLErrs::PartialMarkupInEntity);
2660                 inMarkup = false;
2661             }
2662 
2663             // If we were in char data, then send what we got
2664             if (inCharData)
2665             {
2666                 // Send what we got, then rethrow
2667                 if (fDocTypeHandler)
2668                 {
2669                     fDocTypeHandler->doctypeWhitespace
2670                     (
2671                         bbSpace.getRawBuffer()
2672                         , bbSpace.getLen()
2673                     );
2674                 }
2675                 inCharData = false;
2676             }
2677 
2678             //
2679             //  If the entity that just ended was the entity that we started
2680             //  on, then this is the end of the external subset.
2681             //
2682             if (orgReader == toCatch.getReaderNum())
2683                 bDoBreak=true;
2684         }
2685         if(bDoBreak)
2686             break;
2687     }
2688 
2689     // If we have a doc type handler, tell it the ext subset ends
2690     if (fDocTypeHandler && isDTD && !inIncludeSect)
2691         fDocTypeHandler->endExtSubset();
2692 }
2693 
2694 
2695 //
2696 //  This method will scan for an id, either public or external.
2697 //
2698 //
2699 // [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2700 //                     | 'PUBLIC' S PubidLiteral S SystemLiteral
2701 // [83] PublicID ::= 'PUBLIC' S PubidLiteral
2702 //
scanId(XMLBuffer & pubIdToFill,XMLBuffer & sysIdToFill,const IDTypes whatKind)2703 bool DTDScanner::scanId(          XMLBuffer&  pubIdToFill
2704                             ,       XMLBuffer&  sysIdToFill
2705                             , const IDTypes     whatKind)
2706 {
2707     // Clean out both return buffers
2708     pubIdToFill.reset();
2709     sysIdToFill.reset();
2710 
2711     //
2712     //  Check first for the system id first. If we find it, and system id
2713     //  is one of the legal values, then lets try to scan it.
2714     //
2715     // 'SYSTEM' S SystemLiteral
2716     if (fReaderMgr->skippedString(XMLUni::fgSysIDString))
2717     {
2718         // If they were looking for a public id, then we failed
2719         if (whatKind == IDType_Public)
2720         {
2721             fScanner->emitError(XMLErrs::ExpectedPublicId);
2722             return false;
2723         }
2724 
2725         // We must skip spaces
2726         bool skippedSomething;
2727         fReaderMgr->skipPastSpaces(skippedSomething);
2728         if (!skippedSomething)
2729         {
2730             fScanner->emitError(XMLErrs::ExpectedWhitespace);
2731             return false;
2732         }
2733 
2734         // Get the system literal value
2735         return scanSystemLiteral(sysIdToFill);
2736     }
2737 
2738     // Now scan for public id
2739     // 'PUBLIC' S PubidLiteral S SystemLiteral
2740     //  or
2741     // 'PUBLIC' S PubidLiteral
2742 
2743     // If we don't have any public id string => Error
2744     if (!fReaderMgr->skippedString(XMLUni::fgPubIDString)) {
2745         fScanner->emitError(XMLErrs::ExpectedSystemOrPublicId);
2746         return false;
2747     }
2748 
2749     //
2750     //  So following this we must have whitespace, a public literal, whitespace,
2751     //  and a system literal.
2752     //
2753     bool skippedSomething;
2754     fReaderMgr->skipPastSpaces(skippedSomething);
2755     if (!skippedSomething)
2756     {
2757         fScanner->emitError(XMLErrs::ExpectedWhitespace);
2758 
2759         //
2760         //  Just in case, if they just forgot the whitespace but the next char
2761         //  is a single or double quote, then keep going.
2762         //
2763         const XMLCh chPeek = fReaderMgr->peekNextChar();
2764         if ((chPeek != chDoubleQuote) && (chPeek != chSingleQuote))
2765             return false;
2766     }
2767 
2768     if (!scanPublicLiteral(pubIdToFill))
2769         return false;
2770 
2771     // If they wanted a public id, then this is all
2772     if (whatKind == IDType_Public)
2773         return true;
2774 
2775     // check if there is any space follows
2776     bool hasSpace;
2777     fReaderMgr->skipPastSpaces(hasSpace);
2778 
2779     //
2780     //  In order to recover best here we need to see if
2781     //  the next thing is a quote or not
2782     //
2783     const XMLCh chPeek = fReaderMgr->peekNextChar();
2784     const bool bIsQuote =  ((chPeek == chDoubleQuote)
2785                          || (chPeek == chSingleQuote));
2786 
2787     if (!hasSpace)
2788     {
2789         if (whatKind == IDType_External)
2790         {
2791             //
2792             //  If its an external Id, then we need to see the system id.
2793             //  So, emit the error. But, if the next char is a quote, don't
2794             //  give up since its probably going to work. The user just
2795             //  missed the separating space. Otherwise, fail.
2796             //
2797             fScanner->emitError(XMLErrs::ExpectedWhitespace);
2798             if (!bIsQuote)
2799                 return false;
2800         }
2801          else
2802         {
2803             //
2804             //  We can legally return here. But, if the next char is a quote,
2805             //  then that's probably not what was desired, since its probably
2806             //  just that space was forgotten and there really is a system
2807             //  id to follow.
2808             //
2809             //  So treat it like missing whitespace if so and keep going.
2810             //  Else, just return success.
2811             //
2812             if (bIsQuote)
2813                 fScanner->emitError(XMLErrs::ExpectedWhitespace);
2814              else
2815                 return true;
2816         }
2817     }
2818 
2819     if (bIsQuote) {
2820         // there is a quote coming, scan the system literal
2821         if (!scanSystemLiteral(sysIdToFill))
2822             return false;
2823     }
2824     else {
2825         // no quote, if expecting exteral id, this is an error
2826         if (whatKind == IDType_External)
2827             fScanner->emitError(XMLErrs::ExpectedQuotedString);
2828     }
2829 
2830     return true;
2831 }
2832 
2833 
2834 //
2835 //  This method will scan the contents of an ignored section. It assumes that
2836 //  we already are in the body, i.e. we've seen <![IGNORE[ at this point. So
2837 //  we have to just scan until we see a matching ]]> closing markup.
2838 //
scanIgnoredSection()2839 void DTDScanner::scanIgnoredSection()
2840 {
2841     //
2842     //  Depth starts at one because we are already in one section and want
2843     //  to parse until we hit its end.
2844     //
2845     unsigned long depth = 1;
2846     bool gotLeadingSurrogate = false;
2847     while (true)
2848     {
2849         const XMLCh nextCh = fReaderMgr->getNextChar();
2850 
2851         if (!nextCh)
2852             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
2853 
2854         if (nextCh == chOpenAngle)
2855         {
2856             if (fReaderMgr->skippedChar(chBang)
2857             &&  fReaderMgr->skippedChar(chOpenSquare))
2858             {
2859                 depth++;
2860             }
2861         }
2862          else if (nextCh == chCloseSquare)
2863         {
2864             if (fReaderMgr->skippedChar(chCloseSquare))
2865             {
2866                 while (fReaderMgr->skippedChar(chCloseSquare))
2867                 {
2868                     // Do nothing, just skip them
2869                 }
2870 
2871                 if (fReaderMgr->skippedChar(chCloseAngle))
2872                 {
2873                     depth--;
2874                     if (!depth)
2875                         break;
2876                 }
2877             }
2878         }
2879         // Deal with surrogate pairs
2880         else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
2881         {
2882             //  Its a leading surrogate. If we already got one, then
2883             //  issue an error, else set leading flag to make sure that
2884             //  we look for a trailing next time.
2885             if (gotLeadingSurrogate)
2886                 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2887             else
2888                 gotLeadingSurrogate = true;
2889         }
2890         else
2891         {
2892             //  If its a trailing surrogate, make sure that we are
2893             //  prepared for that. Else, its just a regular char so make
2894             //  sure that we were not expected a trailing surrogate.
2895             if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
2896             {
2897                 // Its trailing, so make sure we were expecting it
2898                 if (!gotLeadingSurrogate)
2899                     fScanner->emitError(XMLErrs::Unexpected2ndSurrogateChar);
2900             }
2901             else
2902             {
2903                 //  Its just a char, so make sure we were not expecting a
2904                 //  trailing surrogate.
2905                 if (gotLeadingSurrogate)
2906                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2907 
2908                 // Its got to at least be a valid XML character
2909                 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
2910                 {
2911                     XMLCh tmpBuf[9];
2912                     XMLString::binToText
2913                     (
2914                         nextCh
2915                         , tmpBuf
2916                         , 8
2917                         , 16
2918                         , fMemoryManager
2919                     );
2920                     fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
2921                 }
2922             }
2923             gotLeadingSurrogate = false;
2924         }
2925     }
2926 }
2927 
2928 
2929 //
2930 //  This method scans the entire internal subset. All we can have here is
2931 //  decl markup, and PE references. The expanded PE references must contain
2932 //  whole markup, so we don't have to worry about their content at this
2933 //  level. We just scan them, expand them, push them, and parse their content
2934 //  right there, via the expandERef() method.
2935 //
scanInternalSubset()2936 bool DTDScanner::scanInternalSubset()
2937 {
2938     // Indicate we are in the internal subset now
2939     FlagJanitor<bool> janContentFlag(&fInternalSubset, true);
2940 
2941     // If we have a doc type handler, tell it the internal subset starts
2942     if (fDocTypeHandler)
2943         fDocTypeHandler->startIntSubset();
2944 
2945     // Get a buffer for whitespace
2946     XMLBufBid bbSpace(fBufMgr);
2947 
2948     bool noErrors = true;
2949     while (true)
2950     {
2951         const XMLCh nextCh = fReaderMgr->peekNextChar();
2952 
2953         //
2954         //  If we get an end of file marker, just unget it and return a
2955         //  failure status. The caller will then see the end of file and
2956         //  faill out correctly.
2957         //
2958         if (!nextCh)
2959             return false;
2960 
2961         // Watch for the end of internal subset marker
2962         if (nextCh == chCloseSquare)
2963         {
2964             fReaderMgr->getNextChar();
2965             break;
2966         }
2967 
2968         if (nextCh == chPercent)
2969         {
2970             //
2971             //  Expand (and scan if external) the reference value. Tell
2972             //  it to set the reader to cause an end of entity exception
2973             //  when this reader dies, which is what the scanExtSubset
2974             //  method wants (who is called to scan this.)
2975             //
2976             fReaderMgr->getNextChar();
2977             expandPERef(true, false, false, true);
2978         }
2979          else if (nextCh == chOpenAngle)
2980         {
2981             // Remember this reader before we start the scan, for checking
2982             // XML 1.0 P28a Well-formedness constraint: PE Between Declarations
2983             const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
2984             bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);
2985 
2986             // And scan this markup
2987             fReaderMgr->getNextChar();
2988             scanMarkupDecl(false);
2989 
2990             // If we did not get back to entry level, then partial markup
2991             if (fReaderMgr->getCurrentReaderNum() != orgReader) {
2992                 if (wasInPE)
2993                     fScanner->emitError(XMLErrs::PEBetweenDecl);
2994                 else if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
2995                     fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
2996             }
2997         }
2998          else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
2999         {
3000             //
3001             //  IF we are doing advanced callbacks and have a doc type
3002             //  handler, then get the whitespace and call the doc type
3003             //  handler with it. Otherwise, just skip whitespace.
3004             //
3005             if (fDocTypeHandler)
3006             {
3007                 fReaderMgr->getSpaces(bbSpace.getBuffer());
3008                 fDocTypeHandler->doctypeWhitespace
3009                 (
3010                     bbSpace.getRawBuffer()
3011                     , bbSpace.getLen()
3012                 );
3013             }
3014              else
3015             {
3016                 fReaderMgr->skipPastSpaces();
3017             }
3018         }
3019          else
3020         {
3021             // Not valid, so emit an error
3022             XMLCh tmpBuf[9];
3023             XMLString::binToText
3024             (
3025                 fReaderMgr->getNextChar()
3026                 , tmpBuf
3027                 , 8
3028                 , 16
3029                 , fMemoryManager
3030             );
3031             fScanner->emitError
3032             (
3033                 XMLErrs::InvalidCharacterInIntSubset
3034                 , tmpBuf
3035             );
3036 
3037             //
3038             //  If an '>', then probably an abnormally terminated
3039             //  internal subset so just return.
3040             //
3041             if (nextCh == chCloseAngle)
3042             {
3043                 noErrors = false;
3044                 break;
3045             }
3046 
3047             //
3048             //  Otherwise, try to sync back up by scanning forward for
3049             //  a reasonable start character.
3050             //
3051             static const XMLCh toSkip[] =
3052             {
3053                 chPercent, chCloseSquare, chOpenAngle, chNull
3054             };
3055             fReaderMgr->skipUntilInOrWS(toSkip);
3056         }
3057     }
3058 
3059     // If we have a doc type handler, tell it the internal subset ends
3060     if (fDocTypeHandler)
3061         fDocTypeHandler->endIntSubset();
3062 
3063     return noErrors;
3064 }
3065 
3066 
3067 //
3068 //  This method is called once we see a < in the input of an int/ext subset,
3069 //  which indicates the start of some sort of markup.
3070 //
scanMarkupDecl(const bool parseTextDecl)3071 void DTDScanner::scanMarkupDecl(const bool parseTextDecl)
3072 {
3073     //
3074     //  We only have two valid first characters here. One is a ! which opens
3075     //  some markup decl. The other is a ?, which could begin either a PI
3076     //  or a text decl. If parseTextDecl is false, we cannot accept a text
3077     //  decl.
3078     //
3079     const XMLCh nextCh = fReaderMgr->getNextChar();
3080 
3081     if (nextCh == chBang)
3082     {
3083         if (fReaderMgr->skippedChar(chDash))
3084         {
3085             if (fReaderMgr->skippedChar(chDash))
3086             {
3087                 scanComment();
3088             }
3089              else
3090             {
3091                 fScanner->emitError(XMLErrs::CommentsMustStartWith);
3092                 fReaderMgr->skipPastChar(chCloseAngle);
3093             }
3094         }
3095          else if (fReaderMgr->skippedChar(chOpenSquare))
3096         {
3097             //
3098             //  Its a conditional section. This is only valid in the external
3099             //  subset, so issue an error if we aren't there.
3100             //
3101             if (fInternalSubset)
3102             {
3103                 fScanner->emitError(XMLErrs::ConditionalSectInIntSubset);
3104                 fReaderMgr->skipPastChar(chCloseAngle);
3105                 return;
3106             }
3107 
3108             // A PE ref can happen here, but space is not required
3109             checkForPERef(false, true);
3110 
3111             if (fReaderMgr->skippedString(XMLUni::fgIncludeString))
3112             {
3113                 checkForPERef(false, true);
3114 
3115                 // Check for the following open square bracket
3116                 if (!fReaderMgr->skippedChar(chOpenSquare))
3117                     fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
3118 
3119                 // Get the reader we started this on
3120                 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
3121 
3122                 checkForPERef(false, true);
3123 
3124                 //
3125                 //  Recurse back to the ext subset call again, telling it its
3126                 //  in an include section.
3127                 //
3128                 scanExtSubsetDecl(true, false);
3129 
3130                 //
3131                 //  And see if we got back to the same level. If not, then its
3132                 //  a partial markup error.
3133                 //
3134                 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
3135                     fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
3136 
3137             }
3138              else if (fReaderMgr->skippedString(XMLUni::fgIgnoreString))
3139             {
3140                 checkForPERef(false, true);
3141 
3142                 // Check for the following open square bracket
3143                 if (!fReaderMgr->skippedChar(chOpenSquare))
3144                     fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
3145 
3146                 // Get the reader we started this on
3147                 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
3148 
3149                 // And scan over the ignored part
3150                 scanIgnoredSection();
3151 
3152                 //
3153                 //  And see if we got back to the same level. If not, then its
3154                 //  a partial markup error.
3155                 //
3156                 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
3157                     fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
3158 
3159             }
3160              else
3161             {
3162                 fScanner->emitError(XMLErrs::ExpectedIncOrIgn);
3163                 fReaderMgr->skipPastChar(chCloseAngle);
3164             }
3165         }
3166          else if (fReaderMgr->skippedString(XMLUni::fgAttListString))
3167         {
3168             scanAttListDecl();
3169         }
3170          else if (fReaderMgr->skippedString(XMLUni::fgElemString))
3171         {
3172             scanElementDecl();
3173         }
3174          else if (fReaderMgr->skippedString(XMLUni::fgEntityString))
3175         {
3176             scanEntityDecl();
3177         }
3178          else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
3179         {
3180             scanNotationDecl();
3181         }
3182          else
3183         {
3184             fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
3185             fReaderMgr->skipPastChar(chCloseAngle);
3186         }
3187     }
3188      else if (nextCh == chQuestion)
3189     {
3190         // It could be a PI or the XML declaration. Check for Decl
3191         if (fScanner->checkXMLDecl(false))
3192         {
3193             // If we are not accepting text decls, its an error
3194             if (parseTextDecl)
3195             {
3196                 scanTextDecl();
3197             }
3198              else
3199             {
3200                 // Emit the error and skip past this markup
3201                 fScanner->emitError(XMLErrs::TextDeclNotLegalHere);
3202                 fReaderMgr->skipPastChar(chCloseAngle);
3203             }
3204         }
3205          else
3206         {
3207             // It has to be a PI
3208             scanPI();
3209         }
3210     }
3211      else
3212     {
3213         // Can't be valid so emit error and try to skip past end of this decl
3214         fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
3215         fReaderMgr->skipPastChar(chCloseAngle);
3216     }
3217 }
3218 
3219 
3220 //
3221 //  This method is called for a mixed model element's content mode. We've
3222 //  already scanned past the '(PCDATA' part by the time we get here. So
3223 //  everything else is element names separated by | characters until we
3224 //  hit the end. The passed element decl's content model is filled in with
3225 //  the information found.
3226 //
scanMixed(DTDElementDecl & toFill)3227 bool DTDScanner::scanMixed(DTDElementDecl& toFill)
3228 {
3229     //
3230     //  The terminating star is only required if there is something more
3231     //  than (PCDATA).
3232     //
3233     bool starRequired = false;
3234 
3235     // Get a buffer to be used below to get element names
3236     XMLBufBid bbName(fBufMgr);
3237     XMLBuffer& nameBuf = bbName.getBuffer();
3238 
3239     //
3240     //  Create an initial content spec node. Its just a leaf node with a
3241     //  PCDATA element id. This current node pointer will be pushed down the
3242     //  tree as we go.
3243     //
3244     ContentSpecNode* curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
3245     (
3246         new (fGrammarPoolMemoryManager) QName
3247         (
3248             XMLUni::fgZeroLenString
3249             , XMLUni::fgZeroLenString
3250             , XMLElementDecl::fgPCDataElemId
3251             , fGrammarPoolMemoryManager
3252         )
3253         , false
3254         , fGrammarPoolMemoryManager
3255     );
3256 
3257     //
3258     //  Set the initial leaf as the temporary head. If we hit the first choice
3259     //  node, it will be set up here. When done, this is the node that's set
3260     //  as the content spec for the element.
3261     //
3262     ContentSpecNode* headNode = curNode;
3263 
3264     // Remember the original node so we can sense the first choice node
3265     ContentSpecNode* orgNode = curNode;
3266 
3267     //
3268     //  We just loop around, getting the | character at the top and then
3269     //  looking for the next element name. We keep up with the last node
3270     //  and add each new one to its right node.
3271     //
3272     while (true)
3273     {
3274         //
3275         //  First of all we check for some grunt work details of skipping
3276         //  whitespace, expand PE refs, and catching invalid reps.
3277         //
3278         if (fReaderMgr->lookingAtChar(chPercent))
3279         {
3280             // Expand it and continue
3281             checkForPERef(false, true);
3282         }
3283          else if (fReaderMgr->skippedChar(chAsterisk))
3284         {
3285             //
3286             //  Tell them they can't have reps in mixed model, but eat
3287             //  it and keep going if we are allowed to.
3288             //
3289             if (fScanner->emitErrorWillThrowException(XMLErrs::NoRepInMixed))
3290             {
3291                 delete headNode;
3292             }
3293             fScanner->emitError(XMLErrs::NoRepInMixed);
3294         }
3295          else if (fReaderMgr->skippedSpace())
3296         {
3297             // Spaces are ok at this point, just eat them and continue
3298             fReaderMgr->skipPastSpaces();
3299         }
3300          else
3301         {
3302             if (!fReaderMgr->skippedChar(chPipe))
3303             {
3304                 // Has to be the closing paren now.
3305                 if (!fReaderMgr->skippedChar(chCloseParen))
3306                 {
3307                     delete headNode;
3308                     fScanner->emitError(XMLErrs::UnterminatedContentModel, toFill.getElementName()->getLocalPart());
3309                     return false;
3310                 }
3311 
3312                 bool starSkipped = true;
3313                 if (!fReaderMgr->skippedChar(chAsterisk)) {
3314 
3315                     starSkipped = false;
3316 
3317                     if (starRequired)
3318                     {
3319                         if (fScanner->emitErrorWillThrowException(XMLErrs::ExpectedAsterisk))
3320                         {
3321                             delete headNode;
3322                         }
3323                         fScanner->emitError(XMLErrs::ExpectedAsterisk);
3324                     }
3325                 }
3326 
3327                 //
3328                 //  Create a zero or more node and make the original head
3329                 //  node its first child.
3330                 //
3331                 if (starRequired || starSkipped) {
3332                     headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
3333                     (
3334                         ContentSpecNode::ZeroOrMore
3335                         , headNode
3336                         , 0
3337                         , true
3338                         , true
3339                         , fGrammarPoolMemoryManager
3340                     );
3341                 }
3342 
3343                 // Store the head node as the content spec of the element.
3344                 toFill.setContentSpec(headNode);
3345                 break;
3346             }
3347 
3348             // Its more than just a PCDATA, so an ending star will be required now
3349             starRequired = true;
3350 
3351             // Space is legal here so check for a PE ref, but don't require space
3352             checkForPERef(false, true);
3353 
3354             // Get a name token
3355             if (!fReaderMgr->getName(nameBuf))
3356             {
3357                 delete headNode;
3358                 fScanner->emitError(XMLErrs::ExpectedElementName);
3359                 return false;
3360             }
3361 
3362             //
3363             //  Create a leaf node for it. If we can find the element id for
3364             //  this element, then use it. Else, we have to fault in an element
3365             //  decl, marked as created because of being in a content model.
3366             //
3367             XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, nameBuf.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
3368             if (!decl)
3369             {
3370                 decl = new (fGrammarPoolMemoryManager) DTDElementDecl
3371                 (
3372                     nameBuf.getRawBuffer()
3373                     , fEmptyNamespaceId
3374                     , DTDElementDecl::Any
3375                     , fGrammarPoolMemoryManager
3376                 );
3377                 decl->setCreateReason(XMLElementDecl::InContentModel);
3378                 decl->setExternalElemDeclaration(isReadingExternalEntity());
3379                 fDTDGrammar->putElemDecl(decl);
3380             }
3381 
3382             //
3383             //  If the current node is the original node, this is the first choice
3384             //  node, so create an initial choice node with the current node and
3385             //  the new element id. Store this as the head node.
3386             //
3387             //  Otherwise, we have to steal the right node of the previous choice
3388             //  and weave in another choice node there, which has the old choice
3389             //  as its left and the new leaf as its right.
3390             //
3391             if (curNode == orgNode)
3392             {
3393                 curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
3394                 (
3395                     ContentSpecNode::Choice
3396                     , curNode
3397                     , new (fGrammarPoolMemoryManager) ContentSpecNode
3398                       (
3399                           decl->getElementName()
3400                           , fGrammarPoolMemoryManager
3401                       )
3402                     , true
3403                     , true
3404                     , fGrammarPoolMemoryManager
3405                 );
3406 
3407                 // Remember the top node
3408                 headNode = curNode;
3409             }
3410              else
3411             {
3412                 ContentSpecNode* oldRight = curNode->orphanSecond();
3413                 curNode->setSecond
3414                 (
3415                     new (fGrammarPoolMemoryManager) ContentSpecNode
3416                     (
3417                         ContentSpecNode::Choice
3418                         , oldRight
3419                         , new (fGrammarPoolMemoryManager) ContentSpecNode
3420                           (
3421                               decl->getElementName()
3422                               , fGrammarPoolMemoryManager
3423                           )
3424                         , true
3425                         , true
3426                         , fGrammarPoolMemoryManager
3427                     )
3428                 );
3429 
3430                 // Make the new right node the current node
3431                 curNode = curNode->getSecond();
3432             }
3433         }
3434     }
3435 
3436     return true;
3437 }
3438 
3439 
3440 //
3441 //  This method is called when we see a '<!NOTATION' string while scanning
3442 //  markup decl. It parses out the notation and its id and stores a new
3443 //  notation decl object in the notation decl pool.
3444 //
scanNotationDecl()3445 void DTDScanner::scanNotationDecl()
3446 {
3447     // Space is required here so check for a PE ref, and require space
3448     if (!checkForPERef(false, true))
3449     {
3450         fScanner->emitError(XMLErrs::ExpectedWhitespace);
3451         fReaderMgr->skipPastChar(chCloseAngle);
3452         return;
3453     }
3454 
3455     //
3456     //  And now we get a name, which is the name of the notation. Get a
3457     //  buffer for the name.
3458     //
3459     XMLBufBid bbName(fBufMgr);
3460     if (!fReaderMgr->getName(bbName.getBuffer()))
3461     {
3462         fScanner->emitError(XMLErrs::ExpectedNotationName);
3463         fReaderMgr->skipPastChar(chCloseAngle);
3464         return;
3465     }
3466 
3467     // If namespaces are enabled, then no colons allowed
3468     if (fScanner->getDoNamespaces())
3469     {
3470         if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
3471             fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
3472     }
3473 
3474     // Space is required here so check for a PE ref, and require space
3475     if (!checkForPERef(false, true))
3476     {
3477         fScanner->emitError(XMLErrs::ExpectedWhitespace);
3478         fReaderMgr->skipPastChar(chCloseAngle);
3479         return;
3480     }
3481 
3482     //
3483     //  And scan an external or public id. We need buffers to use for both
3484     //  of these.
3485     //
3486     XMLBufBid bbPubId(fBufMgr);
3487     XMLBufBid bbSysId(fBufMgr);
3488     if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_Either))
3489     {
3490         fReaderMgr->skipPastChar(chCloseAngle);
3491         return;
3492     }
3493 
3494     // We can have an optional space or PE ref here
3495     checkForPERef(false, true);
3496 
3497     //
3498     //  See if it already exists. If so, add it to the notatino decl pool.
3499     //  Otherwise, if advanced callbacks are on, create a temp one and
3500     //  call out for that one.
3501     //
3502     XMLNotationDecl* decl = fDTDGrammar->getNotationDecl(bbName.getRawBuffer());
3503     bool isIgnoring = (decl != 0);
3504     if (isIgnoring)
3505     {
3506         fScanner->emitError(XMLErrs::NotationAlreadyExists, bbName.getRawBuffer());
3507     }
3508      else
3509     {
3510         // Fill in a new notation declaration and add it to the pool
3511         const XMLCh* publicId = bbPubId.getRawBuffer();
3512         const XMLCh* systemId = bbSysId.getRawBuffer();
3513         ReaderMgr::LastExtEntityInfo lastInfo;
3514         fReaderMgr->getLastExtEntityInfo(lastInfo);
3515 
3516         decl = new (fGrammarPoolMemoryManager) XMLNotationDecl
3517         (
3518             bbName.getRawBuffer()
3519             , (publicId && *publicId) ? publicId : 0
3520             , (systemId && *systemId) ? systemId : 0
3521             , (lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0
3522             , fGrammarPoolMemoryManager
3523         );
3524         fDTDGrammar->putNotationDecl(decl);
3525     }
3526 
3527     //
3528     //  If we have a document type handler, then tell it about this. If we
3529     //  are ignoring it, only call out if advanced callbacks are enabled.
3530     //
3531     if (fDocTypeHandler)
3532     {
3533         fDocTypeHandler->notationDecl
3534         (
3535             *decl
3536             , isIgnoring
3537         );
3538     }
3539 
3540     // And one more optional space or PE ref
3541     checkForPERef(false, true);
3542 
3543     // And skip the terminating bracket
3544     if (!fReaderMgr->skippedChar(chCloseAngle))
3545         fScanner->emitError(XMLErrs::UnterminatedNotationDecl);
3546 }
3547 
3548 
//
//  Scans a PI and calls the appropriate callback. A PI can happen in either
//  the document or the DTD; this method handles one seen by the DTD scanner
//  and reports it through the document type handler's doctypePI() callback,
//  if a handler is installed.
//
//  At entry we have just scanned the <? part, and need to now start on the
//  PI target name.
//
scanPI()3557 void DTDScanner::scanPI()
3558 {
3559     const XMLCh* namePtr = 0;
3560     const XMLCh* targetPtr = 0;
3561 
3562     //
3563     //  If there are any spaces here, then warn about it. If we aren't in
3564     //  'first error' mode, then we'll come back and can easily pick up
3565     //  again by just skipping them.
3566     //
3567     if (fReaderMgr->lookingAtSpace())
3568     {
3569         fScanner->emitError(XMLErrs::PINameExpected);
3570         fReaderMgr->skipPastSpaces();
3571     }
3572 
3573     // Get a buffer for the PI name and scan it in
3574     XMLBufBid bbName(fBufMgr);
3575     if (!fReaderMgr->getName(bbName.getBuffer()))
3576     {
3577         fScanner->emitError(XMLErrs::PINameExpected);
3578         fReaderMgr->skipPastChar(chCloseAngle);
3579         return;
3580     }
3581 
3582     // Point the name pointer at the raw data
3583     namePtr = bbName.getRawBuffer();
3584 
3585     // See if it issome form of 'xml' and emit a warning
3586     //if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
3587     if (bbName.getLen() == 3 &&
3588         (((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) &&
3589          ((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) &&
3590          ((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L))))
3591         fScanner->emitError(XMLErrs::NoPIStartsWithXML);
3592 
3593     // If namespaces are enabled, then no colons allowed
3594     if (fScanner->getDoNamespaces())
3595     {
3596         if (XMLString::indexOf(namePtr, chColon) != -1)
3597             fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
3598     }
3599 
3600     //
3601     //  If we don't hit a space next, then the PI has no target. If we do
3602     //  then get out the target. Get a buffer for it as well
3603     //
3604     XMLBufBid bbTarget(fBufMgr);
3605     if (fReaderMgr->skippedSpace())
3606     {
3607         // Skip any leading spaces
3608         fReaderMgr->skipPastSpaces();
3609 
3610         bool gotLeadingSurrogate = false;
3611 
3612         // It does have a target, so lets move on to deal with that.
3613         while (1)
3614         {
3615             const XMLCh nextCh = fReaderMgr->getNextChar();
3616 
3617             // Watch for an end of file, which is always bad here
3618             if (!nextCh)
3619             {
3620                 fScanner->emitError(XMLErrs::UnterminatedPI);
3621                 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3622             }
3623 
3624             // Watch for potential terminating character
3625             if (nextCh == chQuestion)
3626             {
3627                 // It must be followed by '>' to be a termination of the target
3628                 if (fReaderMgr->skippedChar(chCloseAngle))
3629                     break;
3630             }
3631 
3632             // Check for correct surrogate pairs
3633             if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
3634             {
3635                 if (gotLeadingSurrogate)
3636                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
3637                 else
3638                     gotLeadingSurrogate = true;
3639             }
3640              else
3641             {
3642                 if (gotLeadingSurrogate)
3643                 {
3644                     if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
3645                         fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
3646                 }
3647                 // Its got to at least be a valid XML character
3648                 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
3649 
3650                     XMLCh tmpBuf[9];
3651                     XMLString::binToText
3652                     (
3653                         nextCh
3654                         , tmpBuf
3655                         , 8
3656                         , 16
3657                         , fMemoryManager
3658                     );
3659                     fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
3660                 }
3661 
3662                 gotLeadingSurrogate = false;
3663             }
3664             bbTarget.append(nextCh);
3665         }
3666     }
3667      else
3668     {
3669         // No target, but make sure its terminated ok
3670         if (!fReaderMgr->skippedChar(chQuestion))
3671         {
3672             fScanner->emitError(XMLErrs::UnterminatedPI);
3673             fReaderMgr->skipPastChar(chCloseAngle);
3674             return;
3675         }
3676 
3677         if (!fReaderMgr->skippedChar(chCloseAngle))
3678         {
3679             fScanner->emitError(XMLErrs::UnterminatedPI);
3680             fReaderMgr->skipPastChar(chCloseAngle);
3681             return;
3682         }
3683     }
3684 
3685     // Point the target pointer at the raw data
3686     targetPtr = bbTarget.getRawBuffer();
3687 
3688     //
3689     //  If we have a handler, then call it.
3690     //
3691     if (fDocTypeHandler)
3692     {
3693         fDocTypeHandler->doctypePI
3694         (
3695             namePtr
3696             , targetPtr
3697         );
3698     }
3699 }
3700 
3701 
3702 //
3703 //  This method scans a public literal. It must be quoted and all of its
3704 //  characters must be valid public id characters. The quotes are discarded
3705 //  and the results are returned.
3706 //
scanPublicLiteral(XMLBuffer & toFill)3707 bool DTDScanner::scanPublicLiteral(XMLBuffer& toFill)
3708 {
3709     toFill.reset();
3710 
3711     // Get the next char which must be a single or double quote
3712     XMLCh quoteCh;
3713     if (!fReaderMgr->skipIfQuote(quoteCh)) {
3714         fScanner->emitError(XMLErrs::ExpectedQuotedString);
3715         return false;
3716     }
3717 
3718     while (true)
3719     {
3720         const XMLCh nextCh = fReaderMgr->getNextChar();
3721 
3722         // Watch for EOF
3723         if (!nextCh)
3724             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3725 
3726         if (nextCh == quoteCh)
3727             break;
3728 
3729         //
3730         //  If its not a valid public id char, then report it but keep going
3731         //  since that's the best recovery scheme.
3732         //
3733         if (!fReaderMgr->getCurrentReader()->isPublicIdChar(nextCh))
3734         {
3735             XMLCh tmpBuf[9];
3736             XMLString::binToText
3737             (
3738                 nextCh
3739                 , tmpBuf
3740                 , 8
3741                 , 16
3742                 , fMemoryManager
3743             );
3744             fScanner->emitError(XMLErrs::InvalidPublicIdChar, tmpBuf);
3745         }
3746 
3747         toFill.append(nextCh);
3748     }
3749     return true;
3750 }
3751 
3752 
3753 //
3754 //  This method handles scanning in a quoted system literal. It expects to
3755 //  start on the open quote and returns after eating the ending quote. There
3756 //  are not really any restrictions on the contents of system literals.
3757 //
scanSystemLiteral(XMLBuffer & toFill)3758 bool DTDScanner::scanSystemLiteral(XMLBuffer& toFill)
3759 {
3760     toFill.reset();
3761 
3762     // Get the next char which must be a single or double quote
3763     XMLCh quoteCh;
3764     if (!fReaderMgr->skipIfQuote(quoteCh)) {
3765         fScanner->emitError(XMLErrs::ExpectedQuotedString);
3766         return false;
3767     }
3768 
3769 	XMLCh nextCh;
3770     // Break out on terminating quote
3771     while ((nextCh=fReaderMgr->getNextChar())!=quoteCh)
3772     {
3773         // Watch for EOF
3774         if (!nextCh)
3775             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3776         toFill.append(nextCh);
3777     }
3778     return true;
3779 }
3780 
3781 
3782 
3783 //
3784 //  This method is called to scan a text decl line, which can be the first
3785 //  line in an external entity or external subset.
3786 //
3787 //  On entry the <? has been scanned, and next should be 'xml' followed by
3788 //  some whitespace, version string, etc...
3789 //    [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
3790 //
scanTextDecl()3791 void DTDScanner::scanTextDecl()
3792 {
3793     // Skip any subsequent whitespace before the version string
3794     fReaderMgr->skipPastSpaces();
3795 
3796     // Next should be the version string
3797     XMLBufBid bbVersion(fBufMgr);
3798     if (fReaderMgr->skippedString(XMLUni::fgVersionString))
3799     {
3800         if (!scanEq())
3801         {
3802             fScanner->emitError(XMLErrs::ExpectedEqSign);
3803             fReaderMgr->skipPastChar(chCloseAngle);
3804             return;
3805         }
3806 
3807         //
3808         //  Followed by a single or double quoted version. Get a buffer for
3809         //  the string.
3810         //
3811         if (!getQuotedString(bbVersion.getBuffer()))
3812         {
3813             fScanner->emitError(XMLErrs::BadXMLVersion);
3814             fReaderMgr->skipPastChar(chCloseAngle);
3815             return;
3816         }
3817 
3818         // If its not our supported version, issue an error but continue
3819         if (XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_1)) {
3820             if (fScanner->getXMLVersion() != XMLReader::XMLV1_1)
3821         	    fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
3822         }
3823         else if (!XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_0))
3824             fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
3825     }
3826 
3827     // Ok, now we must have an encoding string
3828     XMLBufBid bbEncoding(fBufMgr);
3829     fReaderMgr->skipPastSpaces();
3830     bool gotEncoding = false;
3831     if (fReaderMgr->skippedString(XMLUni::fgEncodingString))
3832     {
3833         // There must be a equal sign next
3834         if (!scanEq())
3835         {
3836             fScanner->emitError(XMLErrs::ExpectedEqSign);
3837             fReaderMgr->skipPastChar(chCloseAngle);
3838             return;
3839         }
3840 
3841         // Followed by a single or double quoted version string
3842         getQuotedString(bbEncoding.getBuffer());
3843         if (bbEncoding.isEmpty() || !XMLString::isValidEncName(bbEncoding.getRawBuffer()))
3844         {
3845             fScanner->emitError(XMLErrs::BadXMLEncoding, bbEncoding.getRawBuffer());
3846             fReaderMgr->skipPastChar(chCloseAngle);
3847             return;
3848         }
3849 
3850         // Indicate that we got an encoding
3851         gotEncoding = true;
3852     }
3853 
3854     //
3855     // Encoding declarations are required in the external entity
3856     // if there is a text declaration present
3857     //
3858     if (!gotEncoding)
3859     {
3860       fScanner->emitError(XMLErrs::EncodingRequired);
3861       fReaderMgr->skipPastChar(chCloseAngle);
3862       return;
3863 
3864     }
3865 
3866     fReaderMgr->skipPastSpaces();
3867     if (!fReaderMgr->skippedChar(chQuestion))
3868     {
3869         fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
3870         fReaderMgr->skipPastChar(chCloseAngle);
3871     }
3872      else if (!fReaderMgr->skippedChar(chCloseAngle))
3873     {
3874         fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
3875         fReaderMgr->skipPastChar(chCloseAngle);
3876     }
3877 
3878     //
3879     //  If we have a document type handler and advanced callbacks are on,
3880     //  then call the TextDecl callback
3881     //
3882     if (fDocTypeHandler)
3883     {
3884         fDocTypeHandler->TextDecl
3885         (
3886             bbVersion.getRawBuffer()
3887             , bbEncoding.getRawBuffer()
3888         );
3889     }
3890 
3891     //
3892     //  If we got an encoding string, then we have to call back on the reader
3893     //  to tell it what the encoding is.
3894     //
3895     if (!bbEncoding.isEmpty())
3896     {
3897         if (!fReaderMgr->getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
3898             fScanner->emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
3899     }
3900 }
3901 
3902 XERCES_CPP_NAMESPACE_END
3903