1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * $Id: DGXMLScanner.cpp 833045 2009-11-05 13:21:27Z borisk $
20 */
21
22
23 // ---------------------------------------------------------------------------
24 // Includes
25 // ---------------------------------------------------------------------------
26 #include <xercesc/internal/DGXMLScanner.hpp>
27 #include <xercesc/util/Janitor.hpp>
28 #include <xercesc/util/RuntimeException.hpp>
29 #include <xercesc/util/UnexpectedEOFException.hpp>
30 #include <xercesc/util/XMLUri.hpp>
31 #include <xercesc/framework/URLInputSource.hpp>
32 #include <xercesc/framework/LocalFileInputSource.hpp>
33 #include <xercesc/framework/XMLDocumentHandler.hpp>
34 #include <xercesc/framework/XMLEntityHandler.hpp>
35 #include <xercesc/framework/XMLPScanToken.hpp>
36 #include <xercesc/framework/XMLGrammarPool.hpp>
37 #include <xercesc/framework/XMLDTDDescription.hpp>
38 #include <xercesc/internal/EndOfEntityException.hpp>
39 #include <xercesc/validators/common/GrammarResolver.hpp>
40 #include <xercesc/validators/DTD/DocTypeHandler.hpp>
41 #include <xercesc/validators/DTD/DTDScanner.hpp>
42 #include <xercesc/validators/DTD/DTDValidator.hpp>
43 #include <xercesc/util/OutOfMemoryException.hpp>
44 #include <xercesc/util/XMLResourceIdentifier.hpp>
45
46 XERCES_CPP_NAMESPACE_BEGIN
47
48
// Janitor that is bound, in the constructors below, to DGXMLScanner::cleanUp.
// The constructors release() it once initialization is over.
typedef JanitorMemFunCall<DGXMLScanner> CleanupType;
// Janitor that is bound, in scanDocument() and scanNext(), to ReaderMgr::reset.
// It is release()d on the paths where the reader manager must not be reset.
typedef JanitorMemFunCall<ReaderMgr> ReaderMgrResetType;
51
52
53 // ---------------------------------------------------------------------------
54 // DGXMLScanner: Constructors and Destructor
55 // ---------------------------------------------------------------------------
DGXMLScanner(XMLValidator * const valToAdopt,GrammarResolver * const grammarResolver,MemoryManager * const manager)56 DGXMLScanner::DGXMLScanner(XMLValidator* const valToAdopt
57 , GrammarResolver* const grammarResolver
58 , MemoryManager* const manager) :
59
60 XMLScanner(valToAdopt, grammarResolver, manager)
61 , fAttrNSList(0)
62 , fDTDValidator(0)
63 , fDTDGrammar(0)
64 , fDTDElemNonDeclPool(0)
65 , fElemCount(0)
66 , fAttDefRegistry(0)
67 , fUndeclaredAttrRegistry(0)
68 {
69 CleanupType cleanup(this, &DGXMLScanner::cleanUp);
70
71 try
72 {
73 commonInit();
74 }
75 catch(const OutOfMemoryException&)
76 {
77 // Don't cleanup when out of memory, since executing the
78 // code can cause problems.
79 cleanup.release();
80
81 throw;
82 }
83
84 cleanup.release();
85 }
86
DGXMLScanner(XMLDocumentHandler * const docHandler,DocTypeHandler * const docTypeHandler,XMLEntityHandler * const entityHandler,XMLErrorReporter * const errHandler,XMLValidator * const valToAdopt,GrammarResolver * const grammarResolver,MemoryManager * const manager)87 DGXMLScanner::DGXMLScanner( XMLDocumentHandler* const docHandler
88 , DocTypeHandler* const docTypeHandler
89 , XMLEntityHandler* const entityHandler
90 , XMLErrorReporter* const errHandler
91 , XMLValidator* const valToAdopt
92 , GrammarResolver* const grammarResolver
93 , MemoryManager* const manager) :
94
95 XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager)
96 , fAttrNSList(0)
97 , fDTDValidator(0)
98 , fDTDGrammar(0)
99 , fDTDElemNonDeclPool(0)
100 , fElemCount(0)
101 , fAttDefRegistry(0)
102 , fUndeclaredAttrRegistry(0)
103 {
104 CleanupType cleanup(this, &DGXMLScanner::cleanUp);
105
106 try
107 {
108 commonInit();
109 }
110 catch(const OutOfMemoryException&)
111 {
112 // Don't cleanup when out of memory, since executing the
113 // code can cause problems.
114 cleanup.release();
115
116 throw;
117 }
118
119 cleanup.release();
120 }
121
~DGXMLScanner()122 DGXMLScanner::~DGXMLScanner()
123 {
124 cleanUp();
125 }
126
127 // ---------------------------------------------------------------------------
//  DGXMLScanner: Getter methods
129 // ---------------------------------------------------------------------------
getEntityDeclPool()130 NameIdPool<DTDEntityDecl>* DGXMLScanner::getEntityDeclPool()
131 {
132 if(!fGrammar)
133 return 0;
134 return ((DTDGrammar*)fGrammar)->getEntityDeclPool();
135 }
136
getEntityDeclPool() const137 const NameIdPool<DTDEntityDecl>* DGXMLScanner::getEntityDeclPool() const
138 {
139 if(!fGrammar)
140 return 0;
141 return ((DTDGrammar*)fGrammar)->getEntityDeclPool();
142 }
143
144 // ---------------------------------------------------------------------------
145 // DGXMLScanner: Main entry point to scan a document
146 // ---------------------------------------------------------------------------
scanDocument(const InputSource & src)147 void DGXMLScanner::scanDocument(const InputSource& src)
148 {
149 // Bump up the sequence id for this parser instance. This will invalidate
150 // any previous progressive scan tokens.
151 fSequenceId++;
152
153 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
154
155 try
156 {
157 // Reset the scanner and its plugged in stuff for a new run. This
158 // resets all the data structures, creates the initial reader and
159 // pushes it on the stack, and sets up the base document path.
160 scanReset(src);
161
162 // If we have a document handler, then call the start document
163 if (fDocHandler)
164 fDocHandler->startDocument();
165
166 // Scan the prolog part, which is everything before the root element
167 // including the DTD subsets.
168 scanProlog();
169
170 // If we got to the end of input, then its not a valid XML file.
171 // Else, go on to scan the content.
172 if (fReaderMgr.atEOF())
173 {
174 emitError(XMLErrs::EmptyMainEntity);
175 }
176 else
177 {
178 // Scan content, and tell it its not an external entity
179 if (scanContent())
180 {
181 // Do post-parse validation if required
182 if (fValidate)
183 {
184 // We handle ID reference semantics at this level since
185 // its required by XML 1.0.
186 checkIDRefs();
187
188 // Then allow the validator to do any extra stuff it wants
189 // fValidator->postParseValidation();
190 }
191
192 // That went ok, so scan for any miscellaneous stuff
193 if (!fReaderMgr.atEOF())
194 scanMiscellaneous();
195 }
196 }
197
198 // If we have a document handler, then call the end document
199 if (fDocHandler)
200 fDocHandler->endDocument();
201 }
202 // NOTE:
203 //
204 // In all of the error processing below, the emitError() call MUST come
205 // before the flush of the reader mgr, or it will fail because it tries
206 // to find out the position in the XML source of the error.
207 catch(const XMLErrs::Codes)
208 {
209 // This is a 'first failure' exception, so fall through
210 }
211 catch(const XMLValid::Codes)
212 {
213 // This is a 'first fatal error' type exit, so fall through
214 }
215 catch(const XMLException& excToCatch)
216 {
217 // Emit the error and catch any user exception thrown from here. Make
218 // sure in all cases we flush the reader manager.
219 fInException = true;
220 try
221 {
222 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
223 emitError
224 (
225 XMLErrs::XMLException_Warning
226 , excToCatch.getCode()
227 , excToCatch.getMessage()
228 );
229 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
230 emitError
231 (
232 XMLErrs::XMLException_Fatal
233 , excToCatch.getCode()
234 , excToCatch.getMessage()
235 );
236 else
237 emitError
238 (
239 XMLErrs::XMLException_Error
240 , excToCatch.getCode()
241 , excToCatch.getMessage()
242 );
243 }
244 catch(const OutOfMemoryException&)
245 {
246 // This is a special case for out-of-memory
247 // conditions, because resetting the ReaderMgr
248 // can be problematic.
249 resetReaderMgr.release();
250
251 throw;
252 }
253 }
254 catch(const OutOfMemoryException&)
255 {
256 // This is a special case for out-of-memory
257 // conditions, because resetting the ReaderMgr
258 // can be problematic.
259 resetReaderMgr.release();
260
261 throw;
262 }
263 }
264
265
scanNext(XMLPScanToken & token)266 bool DGXMLScanner::scanNext(XMLPScanToken& token)
267 {
268 // Make sure this token is still legal
269 if (!isLegalToken(token))
270 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);
271
272 // Find the next token and remember the reader id
273 XMLSize_t orgReader;
274 XMLTokens curToken;
275
276 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
277
278 bool retVal = true;
279
280 try
281 {
282 while (true)
283 {
284 // We have to handle any end of entity exceptions that happen here.
285 // We could be at the end of X nested entities, each of which will
286 // generate an end of entity exception as we try to move forward.
287 try
288 {
289 curToken = senseNextToken(orgReader);
290 break;
291 }
292 catch(const EndOfEntityException& toCatch)
293 {
294 // Send an end of entity reference event
295 if (fDocHandler)
296 fDocHandler->endEntityReference(toCatch.getEntity());
297 }
298 }
299
300 if (curToken == Token_CharData)
301 {
302 scanCharData(fCDataBuf);
303 }
304 else if (curToken == Token_EOF)
305 {
306 if (!fElemStack.isEmpty())
307 {
308 const ElemStack::StackElem* topElem = fElemStack.popTop();
309 emitError
310 (
311 XMLErrs::EndedWithTagsOnStack
312 , topElem->fThisElement->getFullName()
313 );
314 }
315
316 retVal = false;
317 }
318 else
319 {
320 // Its some sort of markup
321 bool gotData = true;
322 switch(curToken)
323 {
324 case Token_CData :
325 // Make sure we are within content
326 if (fElemStack.isEmpty())
327 emitError(XMLErrs::CDATAOutsideOfContent);
328 scanCDSection();
329 break;
330
331 case Token_Comment :
332 scanComment();
333 break;
334
335 case Token_EndTag :
336 scanEndTag(gotData);
337 break;
338
339 case Token_PI :
340 scanPI();
341 break;
342
343 case Token_StartTag :
344 if (fDoNamespaces)
345 scanStartTagNS(gotData);
346 else
347 scanStartTag(gotData);
348 break;
349
350 default :
351 fReaderMgr.skipToChar(chOpenAngle);
352 break;
353 }
354
355 if (orgReader != fReaderMgr.getCurrentReaderNum())
356 emitError(XMLErrs::PartialMarkupInEntity);
357
358 // If we hit the end, then do the miscellaneous part
359 if (!gotData)
360 {
361 // Do post-parse validation if required
362 if (fValidate)
363 {
364 // We handle ID reference semantics at this level since
365 // its required by XML 1.0.
366 checkIDRefs();
367
368 // Then allow the validator to do any extra stuff it wants
369 // fValidator->postParseValidation();
370 }
371
372 // That went ok, so scan for any miscellaneous stuff
373 scanMiscellaneous();
374
375 if (fDocHandler)
376 fDocHandler->endDocument();
377 }
378 }
379 }
380 // NOTE:
381 //
382 // In all of the error processing below, the emitError() call MUST come
383 // before the flush of the reader mgr, or it will fail because it tries
384 // to find out the position in the XML source of the error.
385 catch(const XMLErrs::Codes)
386 {
387 // This is a 'first failure' exception, so return failure
388 retVal = false;
389 }
390 catch(const XMLValid::Codes)
391 {
392 // This is a 'first fatal error' type exit, so return failure
393 retVal = false;
394 }
395 catch(const XMLException& excToCatch)
396 {
397 // Emit the error and catch any user exception thrown from here. Make
398 // sure in all cases we flush the reader manager.
399 fInException = true;
400 try
401 {
402 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
403 emitError
404 (
405 XMLErrs::XMLException_Warning
406 , excToCatch.getCode()
407 , excToCatch.getMessage()
408 );
409 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
410 emitError
411 (
412 XMLErrs::XMLException_Fatal
413 , excToCatch.getCode()
414 , excToCatch.getMessage()
415 );
416 else
417 emitError
418 (
419 XMLErrs::XMLException_Error
420 , excToCatch.getCode()
421 , excToCatch.getMessage()
422 );
423 }
424 catch(const OutOfMemoryException&)
425 {
426 // This is a special case for out-of-memory
427 // conditions, because resetting the ReaderMgr
428 // can be problematic.
429 resetReaderMgr.release();
430
431 throw;
432 }
433
434 retVal = false;
435 }
436 catch(const OutOfMemoryException&)
437 {
438 // This is a special case for out-of-memory
439 // conditions, because resetting the ReaderMgr
440 // can be problematic.
441 resetReaderMgr.release();
442
443 throw;
444 }
445
446 // If we are not at the end, release the object that will
447 // reset the ReaderMgr.
448 if (retVal)
449 resetReaderMgr.release();
450
451 return retVal;
452 }
453
454
455 // ---------------------------------------------------------------------------
456 // DGXMLScanner: Private scanning methods
457 // ---------------------------------------------------------------------------
458
459 // This method will kick off the scanning of the primary content of the
460 // document, i.e. the elements.
scanContent()461 bool DGXMLScanner::scanContent()
462 {
463 // Go into a loop until we hit the end of the root element, or we fall
464 // out because there is no root element.
465 //
466 // We have to do kind of a deeply nested double loop here in order to
467 // avoid doing the setup/teardown of the exception handler on each
468 // round. Doing it this way we only do it when an exception actually
469 // occurs.
470 bool gotData = true;
471 bool inMarkup = false;
472 while (gotData)
473 {
474 try
475 {
476 while (gotData)
477 {
478 // Sense what the next top level token is. According to what
479 // this tells us, we will call something to handle that kind
480 // of thing.
481 XMLSize_t orgReader;
482 const XMLTokens curToken = senseNextToken(orgReader);
483
484 // Handle character data and end of file specially. Char data
485 // is not markup so we don't want to handle it in the loop
486 // below.
487 if (curToken == Token_CharData)
488 {
489 // Scan the character data and call appropriate events. Let
490 // him use our local character data buffer for efficiency.
491 scanCharData(fCDataBuf);
492 continue;
493 }
494 else if (curToken == Token_EOF)
495 {
496 // The element stack better be empty at this point or we
497 // ended prematurely before all elements were closed.
498 if (!fElemStack.isEmpty())
499 {
500 const ElemStack::StackElem* topElem = fElemStack.popTop();
501 emitError
502 (
503 XMLErrs::EndedWithTagsOnStack
504 , topElem->fThisElement->getFullName()
505 );
506 }
507
508 // Its the end of file, so clear the got data flag
509 gotData = false;
510 continue;
511 }
512
513 // We are in some sort of markup now
514 inMarkup = true;
515
516 // According to the token we got, call the appropriate
517 // scanning method.
518 switch(curToken)
519 {
520 case Token_CData :
521 // Make sure we are within content
522 if (fElemStack.isEmpty())
523 emitError(XMLErrs::CDATAOutsideOfContent);
524 scanCDSection();
525 break;
526
527 case Token_Comment :
528 scanComment();
529 break;
530
531 case Token_EndTag :
532 scanEndTag(gotData);
533 break;
534
535 case Token_PI :
536 scanPI();
537 break;
538
539 case Token_StartTag :
540 if (fDoNamespaces)
541 scanStartTagNS(gotData);
542 else
543 scanStartTag(gotData);
544 break;
545
546 default :
547 fReaderMgr.skipToChar(chOpenAngle);
548 break;
549 }
550
551 if (orgReader != fReaderMgr.getCurrentReaderNum())
552 emitError(XMLErrs::PartialMarkupInEntity);
553
554 // And we are back out of markup again
555 inMarkup = false;
556 }
557 }
558 catch(const EndOfEntityException& toCatch)
559 {
560 // If we were in some markup when this happened, then its a
561 // partial markup error.
562 if (inMarkup)
563 emitError(XMLErrs::PartialMarkupInEntity);
564
565 // Send an end of entity reference event
566 if (fDocHandler)
567 fDocHandler->endEntityReference(toCatch.getEntity());
568
569 inMarkup = false;
570 }
571 }
572
573 // It went ok, so return success
574 return true;
575 }
576
577
scanEndTag(bool & gotData)578 void DGXMLScanner::scanEndTag(bool& gotData)
579 {
580 // Assume we will still have data until proven otherwise. It will only
581 // ever be false if this is the end of the root element.
582 gotData = true;
583
584 // Check if the element stack is empty. If so, then this is an unbalanced
585 // element (i.e. more ends than starts, perhaps because of bad text
586 // causing one to be skipped.)
587 if (fElemStack.isEmpty())
588 {
589 emitError(XMLErrs::MoreEndThanStartTags);
590 fReaderMgr.skipPastChar(chCloseAngle);
591 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager);
592 }
593
594 // Pop the stack of the element we are supposed to be ending. Remember
595 // that we don't own this. The stack just keeps them and reuses them.
596 unsigned int uriId = (fDoNamespaces)
597 ? fElemStack.getCurrentURI() : fEmptyNamespaceId;
598
599 // Pop the stack of the element we are supposed to be ending. Remember
600 // that we don't own this. The stack just keeps them and reuses them.
601 const ElemStack::StackElem* topElem = fElemStack.popTop();
602 XMLElementDecl *tempElement = topElem->fThisElement;
603
604 // See if it was the root element, to avoid multiple calls below
605 const bool isRoot = fElemStack.isEmpty();
606
607 // Make sure that its the end of the element that we expect
608 if (!fReaderMgr.skippedStringLong(tempElement->getFullName()))
609 {
610 emitError
611 (
612 XMLErrs::ExpectedEndOfTagX
613 , tempElement->getFullName()
614 );
615 fReaderMgr.skipPastChar(chCloseAngle);
616 return;
617 }
618
619 // Make sure we are back on the same reader as where we started
620 if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum())
621 emitError(XMLErrs::PartialTagMarkupError);
622
623 // Skip optional whitespace
624 fReaderMgr.skipPastSpaces();
625
626 // Make sure we find the closing bracket
627 if (!fReaderMgr.skippedChar(chCloseAngle))
628 {
629 emitError
630 (
631 XMLErrs::UnterminatedEndTag
632 , topElem->fThisElement->getFullName()
633 );
634 }
635
636 // If validation is enabled, then lets pass him the list of children and
637 // this element and let him validate it.
638 if (fValidate)
639 {
640
641 //
642 // XML1.0-3rd
643 // Validity Constraint:
644 // The declaration matches EMPTY and the element has no content (not even
645 // entity references, comments, PIs or white space).
646 //
647 if ( (topElem->fCommentOrPISeen) &&
648 (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Empty))
649 {
650 fValidator->emitError
651 (
652 XMLValid::EmptyElemHasContent
653 , topElem->fThisElement->getFullName()
654 );
655 }
656
657 //
658 // XML1.0-3rd
659 // Validity Constraint:
660 //
661 // The declaration matches children and the sequence of child elements
662 // belongs to the language generated by the regular expression in the
663 // content model, with optional white space, comments and PIs
664 // (i.e. markup matching production [27] Misc) between the start-tag and
665 // the first child element, between child elements, or between the last
666 // child element and the end-tag.
667 //
668 // Note that
669 // a CDATA section containing only white space or
670 // a reference to an entity whose replacement text is character references
671 // expanding to white space do not match the nonterminal S, and hence
672 // cannot appear in these positions; however,
673 // a reference to an internal entity with a literal value consisting
674 // of character references expanding to white space does match S,
675 // since its replacement text is the white space resulting from expansion
676 // of the character references.
677 //
678 if ( (topElem->fReferenceEscaped) &&
679 (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Children))
680 {
681 fValidator->emitError
682 (
683 XMLValid::ElemChildrenHasInvalidWS
684 , topElem->fThisElement->getFullName()
685 );
686 }
687
688 XMLSize_t failure;
689 bool res = fValidator->checkContent
690 (
691 topElem->fThisElement
692 , topElem->fChildren
693 , topElem->fChildCount
694 , &failure
695 );
696
697 if (!res)
698 {
699 // One of the elements is not valid for the content. NOTE that
700 // if no children were provided but the content model requires
701 // them, it comes back with a zero value. But we cannot use that
702 // to index the child array in this case, and have to put out a
703 // special message.
704 if (!topElem->fChildCount)
705 {
706 fValidator->emitError
707 (
708 XMLValid::EmptyNotValidForContent
709 , topElem->fThisElement->getFormattedContentModel()
710 );
711 }
712 else if (failure >= topElem->fChildCount)
713 {
714 fValidator->emitError
715 (
716 XMLValid::NotEnoughElemsForCM
717 , topElem->fThisElement->getFormattedContentModel()
718 );
719 }
720 else
721 {
722 fValidator->emitError
723 (
724 XMLValid::ElementNotValidForContent
725 , topElem->fChildren[failure]->getRawName()
726 , topElem->fThisElement->getFormattedContentModel()
727 );
728 }
729 }
730 }
731
732 // If we have a doc handler, tell it about the end tag
733 if (fDocHandler)
734 {
735 fDocHandler->endElement
736 (
737 *topElem->fThisElement
738 , uriId
739 , isRoot
740 , (fDoNamespaces)
741 ? topElem->fThisElement->getElementName()->getPrefix()
742 : XMLUni::fgZeroLenString
743 );
744 }
745
746 // If this was the root, then done with content
747 gotData = !isRoot;
748 }
749
750
751 // This method handles the high level logic of scanning the DOCType
752 // declaration. This calls the DTDScanner and kicks off both the scanning of
753 // the internal subset and the scanning of the external subset, if any.
754 //
755 // When we get here the '<!DOCTYPE' part has already been scanned, which is
756 // what told us that we had a doc type decl to parse.
scanDocTypeDecl()757 void DGXMLScanner::scanDocTypeDecl()
758 {
759 if (fDocTypeHandler)
760 fDocTypeHandler->resetDocType();
761
762 // There must be some space after DOCTYPE
763 bool skippedSomething;
764 fReaderMgr.skipPastSpaces(skippedSomething);
765 if (!skippedSomething)
766 {
767 emitError(XMLErrs::ExpectedWhitespace);
768
769 // Just skip the Doctype declaration and return
770 fReaderMgr.skipPastChar(chCloseAngle);
771 return;
772 }
773
774 // Get a buffer for the root element
775 XMLBufBid bbRootName(&fBufMgr);
776
777 // Get a name from the input, which should be the name of the root
778 // element of the upcoming content.
779 int colonPosition;
780 bool validName = fDoNamespaces ? fReaderMgr.getQName(bbRootName.getBuffer(), &colonPosition) :
781 fReaderMgr.getName(bbRootName.getBuffer());
782 if (!validName)
783 {
784 if (bbRootName.isEmpty())
785 emitError(XMLErrs::NoRootElemInDOCTYPE);
786 else
787 emitError(XMLErrs::InvalidRootElemInDOCTYPE, bbRootName.getRawBuffer());
788 fReaderMgr.skipPastChar(chCloseAngle);
789 return;
790 }
791
792 // Store the root element name for later check
793 setRootElemName(bbRootName.getRawBuffer());
794
795 // This element obviously is not going to exist in the element decl
796 // pool yet, but we need to call docTypeDecl. So force it into
797 // the element decl pool, marked as being there because it was in
798 // the DOCTYPE. Later, when its declared, the status will be updated.
799 //
800 // Only do this if we are not reusing the validator! If we are reusing,
801 // then look it up instead. It has to exist!
802 MemoryManager* const rootDeclMgr =
803 fUseCachedGrammar ? fMemoryManager : fGrammarPoolMemoryManager;
804
805 DTDElementDecl* rootDecl = new (rootDeclMgr) DTDElementDecl
806 (
807 bbRootName.getRawBuffer()
808 , fEmptyNamespaceId
809 , DTDElementDecl::Any
810 , rootDeclMgr
811 );
812
813 Janitor<DTDElementDecl> rootDeclJanitor(rootDecl);
814 rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
815 rootDecl->setExternalElemDeclaration(true);
816 if(!fUseCachedGrammar)
817 {
818 fGrammar->putElemDecl(rootDecl);
819 rootDeclJanitor.release();
820 } else
821 {
822 // put this in the undeclared pool so it gets deleted...
823 XMLElementDecl* elemDecl = fDTDElemNonDeclPool->getByKey(bbRootName.getRawBuffer());
824 if (elemDecl)
825 {
826 rootDecl->setId(elemDecl->getId());
827 }
828 else
829 {
830 rootDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)rootDecl));
831 rootDeclJanitor.release();
832 }
833 }
834
835 // Skip any spaces after the name
836 fReaderMgr.skipPastSpaces();
837
838 // And now if we are looking at a >, then we are done. It is not
839 // required to have an internal or external subset, though why you
840 // would not escapes me.
841 if (fReaderMgr.skippedChar(chCloseAngle)) {
842
843 // If we have a doc type handler and advanced callbacks are enabled,
844 // call the doctype event.
845 if (fDocTypeHandler)
846 fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false);
847 return;
848 }
849
850 // either internal/external subset
851 if (fValScheme == Val_Auto && !fValidate)
852 fValidate = true;
853
854 bool hasIntSubset = false;
855 bool hasExtSubset = false;
856 XMLCh* sysId = 0;
857 XMLCh* pubId = 0;
858
859 DTDScanner dtdScanner
860 (
861 (DTDGrammar*) fGrammar
862 , fDocTypeHandler
863 , fGrammarPoolMemoryManager
864 , fMemoryManager
865 );
866 dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr);
867
868 // If the next character is '[' then we have no external subset cause
869 // there is no system id, just the opening character of the internal
870 // subset. Else, has to be an id.
871 //
872 // Just look at the next char, don't eat it.
873 if (fReaderMgr.peekNextChar() == chOpenSquare)
874 {
875 hasIntSubset = true;
876 }
877 else
878 {
879 // Indicate we have an external subset
880 hasExtSubset = true;
881 fHasNoDTD = false;
882
883 // Get buffers for the ids
884 XMLBufBid bbPubId(&fBufMgr);
885 XMLBufBid bbSysId(&fBufMgr);
886
887 // Get the external subset id
888 if (!dtdScanner.scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), DTDScanner::IDType_External))
889 {
890 fReaderMgr.skipPastChar(chCloseAngle);
891 return;
892 }
893
894 // Get copies of the ids we got
895 pubId = XMLString::replicate(bbPubId.getRawBuffer(), fMemoryManager);
896 sysId = XMLString::replicate(bbSysId.getRawBuffer(), fMemoryManager);
897
898 // Skip spaces and check again for the opening of an internal subset
899 fReaderMgr.skipPastSpaces();
900
901 // Just look at the next char, don't eat it.
902 if (fReaderMgr.peekNextChar() == chOpenSquare) {
903 hasIntSubset = true;
904 }
905 }
906
907 // Insure that the ids get cleaned up, if they got allocated
908 ArrayJanitor<XMLCh> janSysId(sysId, fMemoryManager);
909 ArrayJanitor<XMLCh> janPubId(pubId, fMemoryManager);
910
911 // If we have a doc type handler and advanced callbacks are enabled,
912 // call the doctype event.
913 if (fDocTypeHandler)
914 fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset, hasExtSubset);
915
916 // Ok, if we had an internal subset, we are just past the [ character
917 // and need to parse that first.
918 if (hasIntSubset)
919 {
920 // Eat the opening square bracket
921 fReaderMgr.getNextChar();
922
923 checkInternalDTD(hasExtSubset, sysId, pubId);
924
925 // And try to scan the internal subset. If we fail, try to recover
926 // by skipping forward tot he close angle and returning.
927 if (!dtdScanner.scanInternalSubset())
928 {
929 fReaderMgr.skipPastChar(chCloseAngle);
930 return;
931 }
932
933 // Do a sanity check that some expanded PE did not propogate out of
934 // the doctype. This could happen if it was terminated early by bad
935 // syntax.
936 if (fReaderMgr.getReaderDepth() > 1)
937 {
938 emitError(XMLErrs::PEPropogated);
939
940 // Ask the reader manager to pop back down to the main level
941 fReaderMgr.cleanStackBackTo(1);
942 }
943
944 fReaderMgr.skipPastSpaces();
945 }
946
947 // And that should leave us at the closing > of the DOCTYPE line
948 if (!fReaderMgr.skippedChar(chCloseAngle))
949 {
950 // Do a special check for the common scenario of an extra ] char at
951 // the end. This is easy to recover from.
952 if (fReaderMgr.skippedChar(chCloseSquare)
953 && fReaderMgr.skippedChar(chCloseAngle))
954 {
955 emitError(XMLErrs::ExtraCloseSquare);
956 }
957 else
958 {
959 emitError(XMLErrs::UnterminatedDOCTYPE);
960 fReaderMgr.skipPastChar(chCloseAngle);
961 }
962 }
963
964 // If we had an external subset, then we need to deal with that one
965 // next. If we are reusing the validator, then don't scan it.
966 if (hasExtSubset) {
967
968 InputSource* srcUsed=0;
969 Janitor<InputSource> janSrc(srcUsed);
970 // If we had an internal subset and we're using the cached grammar, it
971 // means that the ignoreCachedDTD is set, so we ignore the cached
972 // grammar
973 if (fUseCachedGrammar && !hasIntSubset)
974 {
975 srcUsed = resolveSystemId(sysId, pubId);
976 if (srcUsed) {
977 janSrc.reset(srcUsed);
978 Grammar* grammar = fGrammarResolver->getGrammar(srcUsed->getSystemId());
979
980 if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) {
981
982 fDTDGrammar = (DTDGrammar*) grammar;
983 fGrammar = fDTDGrammar;
984 fValidator->setGrammar(fGrammar);
985 // If we don't report at least the external subset boundaries,
986 // an advanced document handler cannot know when the DTD end,
987 // since we've already sent a doctype decl that indicates there's
988 // there's an external subset.
989 if (fDocTypeHandler)
990 {
991 fDocTypeHandler->startExtSubset();
992 fDocTypeHandler->endExtSubset();
993 }
994
995 return;
996 }
997 }
998 }
999
1000 if (fLoadExternalDTD || fValidate)
1001 {
1002 // And now create a reader to read this entity
1003 XMLReader* reader;
1004 if(srcUsed) {
1005 reader = fReaderMgr.createReader
1006 (
1007 *srcUsed
1008 , false
1009 , XMLReader::RefFrom_NonLiteral
1010 , XMLReader::Type_General
1011 , XMLReader::Source_External
1012 , fCalculateSrcOfs
1013 , fLowWaterMark
1014 );
1015 }
1016 else {
1017 reader = fReaderMgr.createReader
1018 (
1019 sysId
1020 , pubId
1021 , false
1022 , XMLReader::RefFrom_NonLiteral
1023 , XMLReader::Type_General
1024 , XMLReader::Source_External
1025 , srcUsed
1026 , fCalculateSrcOfs
1027 , fLowWaterMark
1028 , fDisableDefaultEntityResolution
1029 );
1030 janSrc.reset(srcUsed);
1031 }
1032 // If it failed then throw an exception
1033 if (!reader)
1034 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed ? srcUsed->getSystemId() : sysId, fMemoryManager);
1035
1036 if (fToCacheGrammar) {
1037
1038 unsigned int stringId = fGrammarResolver->getStringPool()->addOrFind(srcUsed->getSystemId());
1039 const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(stringId);
1040
1041 fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString);
1042 ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr);
1043 fGrammarResolver->putGrammar(fGrammar);
1044 }
1045
1046 // In order to make the processing work consistently, we have to
1047 // make this look like an external entity. So create an entity
1048 // decl and fill it in and push it with the reader, as happens
1049 // with an external entity. Put a janitor on it to insure it gets
1050 // cleaned up. The reader manager does not adopt them.
1051 const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
1052 DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager);
1053 declDTD->setSystemId(sysId);
1054 declDTD->setIsExternal(true);
1055 Janitor<DTDEntityDecl> janDecl(declDTD);
1056
1057 // Mark this one as a throw at end
1058 reader->setThrowAtEnd(true);
1059
1060 // And push it onto the stack, with its pseudo name
1061 fReaderMgr.pushReader(reader, declDTD);
1062
1063 // Tell it its not in an include section
1064 dtdScanner.scanExtSubsetDecl(false, true);
1065 }
1066 }
1067 }
1068
scanStartTag(bool & gotData)1069 bool DGXMLScanner::scanStartTag(bool& gotData)
1070 {
1071 // Assume we will still have data until proven otherwise. It will only
1072 // ever be false if this is the root and its empty.
1073 gotData = true;
1074
1075 // Get the QName. In this case, we are not doing namespaces, so we just
1076 // use it as is and don't have to break it into parts.
1077
1078 bool validName = fReaderMgr.getName(fQNameBuf);
1079 if (!validName)
1080 {
1081 if (fQNameBuf.isEmpty())
1082 emitError(XMLErrs::ExpectedElementName);
1083 else
1084 emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer());
1085 fReaderMgr.skipToChar(chOpenAngle);
1086 return false;
1087 }
1088
1089 // Assume it won't be an empty tag
1090 bool isEmpty = false;
1091
1092 // See if its the root element
1093 const bool isRoot = fElemStack.isEmpty();
1094
1095 // Lets try to look up the element in the validator's element decl pool
1096 // We can pass bogus values for the URI id and the base name. We know that
1097 // this can only be called if we are doing a DTD style validator and that
1098 // he will only look at the QName.
1099 //
1100 // We *do not* tell him to fault in a decl if he does not find one - NG.
1101 bool wasAdded = false;
1102 const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
1103
1104 XMLElementDecl* elemDecl = fGrammar->getElemDecl
1105 (
1106 fEmptyNamespaceId
1107 , 0
1108 , qnameRawBuf
1109 , Grammar::TOP_LEVEL_SCOPE
1110 );
1111 // look in the undeclared pool:
1112 if(!elemDecl)
1113 {
1114 elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf);
1115 }
1116 if(!elemDecl)
1117 {
1118 wasAdded = true;
1119 elemDecl = new (fMemoryManager) DTDElementDecl
1120 (
1121 qnameRawBuf
1122 , fEmptyNamespaceId
1123 , DTDElementDecl::Any
1124 , fMemoryManager
1125 );
1126 elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl));
1127 }
1128
1129 if (fValidate) {
1130
1131 if (wasAdded)
1132 {
1133 // This is to tell the reuse Validator that this element was
1134 // faulted-in, was not an element in the validator pool originally
1135 elemDecl->setCreateReason(XMLElementDecl::JustFaultIn);
1136
1137 fValidator->emitError
1138 (
1139 XMLValid::ElementNotDefined
1140 , qnameRawBuf
1141 );
1142 }
1143 // If its not marked declared, then emit an error
1144 else if (!elemDecl->isDeclared())
1145 {
1146 fValidator->emitError
1147 (
1148 XMLValid::ElementNotDefined
1149 , qnameRawBuf
1150 );
1151 }
1152
1153
1154 fValidator->validateElement(elemDecl);
1155 }
1156
1157 // Expand the element stack and add the new element
1158 fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
1159
1160 // If this is the first element and we are validating, check the root
1161 // element.
1162 if (isRoot)
1163 {
1164 fRootGrammar = fGrammar;
1165
1166 if (fValidate)
1167 {
1168 // If a DocType exists, then check if it matches the root name there.
1169 if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName))
1170 fValidator->emitError(XMLValid::RootElemNotLikeDocType);
1171 }
1172 }
1173 else if (fValidate)
1174 {
1175 // If the element stack is not empty, then add this element as a
1176 // child of the previous top element. If its empty, this is the root
1177 // elem and is not the child of anything.
1178 fElemStack.addChild(elemDecl->getElementName(), true);
1179 }
1180
1181 // Skip any whitespace after the name
1182 fReaderMgr.skipPastSpaces();
1183
1184 // We loop until we either see a /> or >, handling attribute/value
1185 // pairs until we get there.
1186 XMLSize_t attCount = 0;
1187 XMLSize_t curAttListSize = fAttrList->size();
1188 wasAdded = false;
1189
1190 fElemCount++;
1191
1192 while (true)
1193 {
1194 // And get the next non-space character
1195 XMLCh nextCh = fReaderMgr.peekNextChar();
1196
1197 // If the next character is not a slash or closed angle bracket,
1198 // then it must be whitespace, since whitespace is required
1199 // between the end of the last attribute and the name of the next
1200 // one.
1201 if (attCount)
1202 {
1203 if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
1204 {
1205 if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
1206 {
1207 // Ok, skip by them and peek another char
1208 fReaderMgr.skipPastSpaces();
1209 nextCh = fReaderMgr.peekNextChar();
1210 }
1211 else
1212 {
1213 // Emit the error but keep on going
1214 emitError(XMLErrs::ExpectedWhitespace);
1215 }
1216 }
1217 }
1218
1219 // Ok, here we first check for any of the special case characters.
1220 // If its not one, then we do the normal case processing, which
1221 // assumes that we've hit an attribute value, Otherwise, we do all
1222 // the special case checks.
1223 if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
1224 {
1225 // Assume its going to be an attribute, so get a name from
1226 // the input.
1227
1228 validName = fReaderMgr.getName(fAttNameBuf);
1229 if (!validName)
1230 {
1231 if (fAttNameBuf.isEmpty())
1232 emitError(XMLErrs::ExpectedAttrName);
1233 else
1234 emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer());
1235 fReaderMgr.skipPastChar(chCloseAngle);
1236 return false;
1237 }
1238
1239 // And next must be an equal sign
1240 if (!scanEq())
1241 {
1242 static const XMLCh tmpList[] =
1243 {
1244 chSingleQuote, chDoubleQuote, chCloseAngle
1245 , chOpenAngle, chForwardSlash, chNull
1246 };
1247
1248 emitError(XMLErrs::ExpectedEqSign);
1249
1250 // Try to sync back up by skipping forward until we either
1251 // hit something meaningful.
1252 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1253
1254 if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
1255 {
1256 // Jump back to top for normal processing of these
1257 continue;
1258 }
1259 else if ((chFound == chSingleQuote)
1260 || (chFound == chDoubleQuote)
1261 || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1262 {
1263 // Just fall through assuming that the value is to follow
1264 }
1265 else if (chFound == chOpenAngle)
1266 {
1267 // Assume a malformed tag and that new one is starting
1268 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1269 return false;
1270 }
1271 else
1272 {
1273 // Something went really wrong
1274 return false;
1275 }
1276 }
1277
1278 // See if this attribute is declared for this element. If we are
1279 // not validating of course it will not be at first, but we will
1280 // fault it into the pool (to avoid lots of redundant errors.)
1281 XMLCh * namePtr = fAttNameBuf.getRawBuffer();
1282 XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr);
1283
1284 // Skip any whitespace before the value and then scan the att
1285 // value. This will come back normalized with entity refs and
1286 // char refs expanded.
1287 fReaderMgr.skipPastSpaces();
1288 if (!scanAttValue(attDef, namePtr, fAttValueBuf))
1289 {
1290 static const XMLCh tmpList[] =
1291 {
1292 chCloseAngle, chOpenAngle, chForwardSlash, chNull
1293 };
1294
1295 emitError(XMLErrs::ExpectedAttrValue);
1296
1297 // It failed, so lets try to get synced back up. We skip
1298 // forward until we find some whitespace or one of the
1299 // chars in our list.
1300 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1301
1302 if ((chFound == chCloseAngle)
1303 || (chFound == chForwardSlash)
1304 || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1305 {
1306 // Just fall through and process this attribute, though
1307 // the value will be "".
1308 }
1309 else if (chFound == chOpenAngle)
1310 {
1311 // Assume a malformed tag and that new one is starting
1312 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1313 return false;
1314 }
1315 else
1316 {
1317 // Something went really wrong
1318 return false;
1319 }
1320 }
1321
1322 // Add this attribute to the attribute list that we use to
1323 // pass them to the handler. We reuse its existing elements
1324 // but expand it as required.
1325 // Note that we want to this first since this will
1326 // make a copy of the namePtr; we can then make use of
1327 // that copy in the hashtable lookup that checks
1328 // for duplicates. This will mean we may have to update
1329 // the type of the XMLAttr later.
1330 XMLAttr* curAtt;
1331 const XMLCh* attrValue = fAttValueBuf.getRawBuffer();
1332
1333 if (attCount >= curAttListSize) {
1334 curAtt = new (fMemoryManager) XMLAttr(fMemoryManager);
1335 fAttrList->addElement(curAtt);
1336 }
1337 else {
1338 curAtt = fAttrList->elementAt(attCount);
1339 }
1340
1341 curAtt->setSpecified(true);
1342
1343 // NO NAMESPACE CODE
1344 {
1345 curAtt->set(
1346 0, namePtr, XMLUni::fgZeroLenString, XMLUni::fgZeroLenString
1347 , (attDef)?attDef->getType():XMLAttDef::CData
1348 );
1349
1350 // now need to prepare for duplicate detection
1351 if (attDef) {
1352 unsigned int *curCountPtr = fAttDefRegistry->get(attDef);
1353 if (!curCountPtr) {
1354 curCountPtr = getNewUIntPtr();
1355 *curCountPtr = fElemCount;
1356 fAttDefRegistry->put(attDef, curCountPtr);
1357 }
1358 else if (*curCountPtr < fElemCount) {
1359 *curCountPtr = fElemCount;
1360 }
1361 else {
1362 emitError(
1363 XMLErrs::AttrAlreadyUsedInSTag
1364 , attDef->getFullName(), elemDecl->getFullName()
1365 );
1366 }
1367 }
1368 else
1369 {
1370 // reset namePtr so it refers to newly-allocated memory
1371 namePtr = (XMLCh *)curAtt->getQName();
1372 if (!fUndeclaredAttrRegistry->putIfNotPresent(namePtr, 0))
1373 {
1374 emitError(
1375 XMLErrs::AttrAlreadyUsedInSTag
1376 , namePtr, elemDecl->getFullName()
1377 );
1378 }
1379 }
1380 }
1381
1382 if (fValidate)
1383 {
1384 if (attDef) {
1385 // Let the validator pass judgement on the attribute value
1386 fValidator->validateAttrValue(
1387 attDef, fAttValueBuf.getRawBuffer(), false, elemDecl
1388 );
1389 }
1390 else
1391 {
1392 fValidator->emitError
1393 (
1394 XMLValid::AttNotDefinedForElement
1395 , fAttNameBuf.getRawBuffer(), qnameRawBuf
1396 );
1397 }
1398 }
1399
1400 // must set the newly-minted value on the XMLAttr:
1401 curAtt->setValue(attrValue);
1402 attCount++;
1403
1404 // And jump back to the top of the loop
1405 continue;
1406 }
1407
1408 // It was some special case character so do all of the checks and
1409 // deal with it.
1410 if (!nextCh)
1411 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1412
1413 if (nextCh == chForwardSlash)
1414 {
1415 fReaderMgr.getNextChar();
1416 isEmpty = true;
1417 if (!fReaderMgr.skippedChar(chCloseAngle))
1418 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1419 break;
1420 }
1421 else if (nextCh == chCloseAngle)
1422 {
1423 fReaderMgr.getNextChar();
1424 break;
1425 }
1426 else if (nextCh == chOpenAngle)
1427 {
1428 // Check for this one specially, since its going to be common
1429 // and it is kind of auto-recovering since we've already hit the
1430 // next open bracket, which is what we would have seeked to (and
1431 // skipped this whole tag.)
1432 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1433 break;
1434 }
1435 else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
1436 {
1437 // Check for this one specially, which is probably a missing
1438 // attribute name, e.g. ="value". Just issue expected name
1439 // error and eat the quoted string, then jump back to the
1440 // top again.
1441 emitError(XMLErrs::ExpectedAttrName);
1442 fReaderMgr.getNextChar();
1443 fReaderMgr.skipQuotedString(nextCh);
1444 fReaderMgr.skipPastSpaces();
1445 continue;
1446 }
1447 }
1448
1449 if(attCount)
1450 {
1451 // clean up after ourselves:
1452 // clear the map used to detect duplicate attributes
1453 fUndeclaredAttrRegistry->removeAll();
1454 }
1455
1456 // Now lets get the fAttrList filled in. This involves faulting in any
1457 // defaulted and fixed attributes and normalizing the values of any that
1458 // we got explicitly.
1459 //
1460 // We update the attCount value with the total number of attributes, but
1461 // it goes in with the number of values we got during the raw scan of
1462 // explictly provided attrs above.
1463 attCount = buildAttList(attCount, elemDecl, *fAttrList);
1464
1465 // If we have a document handler, then tell it about this start tag. We
1466 // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
1467 // any prefix since its just one big name if we are not doing namespaces.
1468 unsigned int uriId = fEmptyNamespaceId;
1469 if (fDocHandler)
1470 {
1471 fDocHandler->startElement
1472 (
1473 *elemDecl
1474 , uriId
1475 , 0
1476 , *fAttrList
1477 , attCount
1478 , isEmpty
1479 , isRoot
1480 );
1481 }
1482
1483 // If empty, validate content right now if we are validating and then
1484 // pop the element stack top. Else, we have to update the current stack
1485 // top's namespace mapping elements.
1486 if (isEmpty)
1487 {
1488 // If validating, then insure that its legal to have no content
1489 if (fValidate)
1490 {
1491 XMLSize_t failure;
1492 bool res = fValidator->checkContent(elemDecl, 0, 0, &failure);
1493 if (!res)
1494 {
1495 fValidator->emitError
1496 (
1497 XMLValid::ElementNotValidForContent
1498 , qnameRawBuf
1499 , elemDecl->getFormattedContentModel()
1500 );
1501 }
1502 }
1503
1504 // Pop the element stack back off since it'll never be used now
1505 fElemStack.popTop();
1506
1507 // If the elem stack is empty, then it was an empty root
1508 if (isRoot)
1509 gotData = false;
1510 }
1511
1512 return true;
1513 }
1514
1515
scanStartTagNS(bool & gotData)1516 bool DGXMLScanner::scanStartTagNS(bool& gotData)
1517 {
1518 // Assume we will still have data until proven otherwise. It will only
1519 // ever be false if this is the root and its empty.
1520 gotData = true;
1521
1522 // Get the QName. In this case, we are not doing namespaces, so we just
1523 // use it as is and don't have to break it into parts.
1524
1525 int colonPosition;
1526 bool validName = fReaderMgr.getQName(fQNameBuf, &colonPosition);
1527 if (!validName)
1528 {
1529 if (fQNameBuf.isEmpty())
1530 emitError(XMLErrs::ExpectedElementName);
1531 else
1532 emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer());
1533 fReaderMgr.skipToChar(chOpenAngle);
1534 return false;
1535 }
1536
1537 // Assume it won't be an empty tag
1538 bool isEmpty = false;
1539
1540 // See if its the root element
1541 const bool isRoot = fElemStack.isEmpty();
1542
1543 // Lets try to look up the element in the validator's element decl pool
1544 // We can pass bogus values for the URI id and the base name. We know that
1545 // this can only be called if we are doing a DTD style validator and that
1546 // he will only look at the QName.
1547 //
1548 // We *do not* tell him to fault in a decl if he does not find one - NG.
1549 bool wasAdded = false;
1550 const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
1551
1552 XMLElementDecl* elemDecl = fGrammar->getElemDecl
1553 (
1554 fEmptyNamespaceId
1555 , 0
1556 , qnameRawBuf
1557 , Grammar::TOP_LEVEL_SCOPE
1558 );
1559 // look in the undeclared pool:
1560 if(!elemDecl)
1561 {
1562 elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf);
1563 }
1564 if(!elemDecl)
1565 {
1566 wasAdded = true;
1567 elemDecl = new (fMemoryManager) DTDElementDecl
1568 (
1569 qnameRawBuf
1570 , fEmptyNamespaceId
1571 , DTDElementDecl::Any
1572 , fMemoryManager
1573 );
1574 elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl));
1575 }
1576
1577 if (fValidate) {
1578
1579 if (wasAdded)
1580 {
1581 // This is to tell the reuse Validator that this element was
1582 // faulted-in, was not an element in the validator pool originally
1583 elemDecl->setCreateReason(XMLElementDecl::JustFaultIn);
1584
1585 fValidator->emitError
1586 (
1587 XMLValid::ElementNotDefined
1588 , qnameRawBuf
1589 );
1590 }
1591 // If its not marked declared, then emit an error
1592 else if (!elemDecl->isDeclared())
1593 {
1594 fValidator->emitError
1595 (
1596 XMLValid::ElementNotDefined
1597 , qnameRawBuf
1598 );
1599 }
1600
1601
1602 fValidator->validateElement(elemDecl);
1603 }
1604
1605 // Expand the element stack and add the new element
1606 fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
1607
1608 // If this is the first element and we are validating, check the root
1609 // element.
1610 if (isRoot)
1611 {
1612 fRootGrammar = fGrammar;
1613
1614 if (fValidate)
1615 {
1616 // If a DocType exists, then check if it matches the root name there.
1617 if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName))
1618 fValidator->emitError(XMLValid::RootElemNotLikeDocType);
1619 }
1620 }
1621 else if (fValidate)
1622 {
1623 // If the element stack is not empty, then add this element as a
1624 // child of the previous top element. If its empty, this is the root
1625 // elem and is not the child of anything.
1626 fElemStack.addChild(elemDecl->getElementName(), true);
1627 }
1628
1629 // Skip any whitespace after the name
1630 fReaderMgr.skipPastSpaces();
1631
1632 // We loop until we either see a /> or >, handling attribute/value
1633 // pairs until we get there.
1634 XMLSize_t attCount = 0;
1635 XMLSize_t curAttListSize = fAttrList->size();
1636 wasAdded = false;
1637
1638 fElemCount++;
1639
1640 while (true)
1641 {
1642 // And get the next non-space character
1643 XMLCh nextCh = fReaderMgr.peekNextChar();
1644
1645 // If the next character is not a slash or closed angle bracket,
1646 // then it must be whitespace, since whitespace is required
1647 // between the end of the last attribute and the name of the next
1648 // one.
1649 if (attCount)
1650 {
1651 if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
1652 {
1653 if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
1654 {
1655 // Ok, skip by them and peek another char
1656 fReaderMgr.skipPastSpaces();
1657 nextCh = fReaderMgr.peekNextChar();
1658 }
1659 else
1660 {
1661 // Emit the error but keep on going
1662 emitError(XMLErrs::ExpectedWhitespace);
1663 }
1664 }
1665 }
1666
1667 // Ok, here we first check for any of the special case characters.
1668 // If its not one, then we do the normal case processing, which
1669 // assumes that we've hit an attribute value, Otherwise, we do all
1670 // the special case checks.
1671 if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
1672 {
1673 // Assume its going to be an attribute, so get a name from
1674 // the input.
1675
1676 validName = fReaderMgr.getQName(fAttNameBuf, &colonPosition);
1677 if (!validName)
1678 {
1679 if (fAttNameBuf.isEmpty())
1680 emitError(XMLErrs::ExpectedAttrName);
1681 else
1682 emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer());
1683 fReaderMgr.skipPastChar(chCloseAngle);
1684 return false;
1685 }
1686
1687 // And next must be an equal sign
1688 if (!scanEq())
1689 {
1690 static const XMLCh tmpList[] =
1691 {
1692 chSingleQuote, chDoubleQuote, chCloseAngle
1693 , chOpenAngle, chForwardSlash, chNull
1694 };
1695
1696 emitError(XMLErrs::ExpectedEqSign);
1697
1698 // Try to sync back up by skipping forward until we either
1699 // hit something meaningful.
1700 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1701
1702 if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
1703 {
1704 // Jump back to top for normal processing of these
1705 continue;
1706 }
1707 else if ((chFound == chSingleQuote)
1708 || (chFound == chDoubleQuote)
1709 || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1710 {
1711 // Just fall through assuming that the value is to follow
1712 }
1713 else if (chFound == chOpenAngle)
1714 {
1715 // Assume a malformed tag and that new one is starting
1716 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1717 return false;
1718 }
1719 else
1720 {
1721 // Something went really wrong
1722 return false;
1723 }
1724 }
1725
1726 // See if this attribute is declared for this element. If we are
1727 // not validating of course it will not be at first, but we will
1728 // fault it into the pool (to avoid lots of redundant errors.)
1729 XMLCh * namePtr = fAttNameBuf.getRawBuffer();
1730 XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr);
1731
1732 // Skip any whitespace before the value and then scan the att
1733 // value. This will come back normalized with entity refs and
1734 // char refs expanded.
1735 fReaderMgr.skipPastSpaces();
1736 if (!scanAttValue(attDef, namePtr, fAttValueBuf))
1737 {
1738 static const XMLCh tmpList[] =
1739 {
1740 chCloseAngle, chOpenAngle, chForwardSlash, chNull
1741 };
1742
1743 emitError(XMLErrs::ExpectedAttrValue);
1744
1745 // It failed, so lets try to get synced back up. We skip
1746 // forward until we find some whitespace or one of the
1747 // chars in our list.
1748 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
1749
1750 if ((chFound == chCloseAngle)
1751 || (chFound == chForwardSlash)
1752 || fReaderMgr.getCurrentReader()->isWhitespace(chFound))
1753 {
1754 // Just fall through and process this attribute, though
1755 // the value will be "".
1756 }
1757 else if (chFound == chOpenAngle)
1758 {
1759 // Assume a malformed tag and that new one is starting
1760 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1761 return false;
1762 }
1763 else
1764 {
1765 // Something went really wrong
1766 return false;
1767 }
1768 }
1769
1770 // Add this attribute to the attribute list that we use to
1771 // pass them to the handler. We reuse its existing elements
1772 // but expand it as required.
1773 // Note that we want to this first since this will
1774 // make a copy of the namePtr; we can then make use of
1775 // that copy in the hashtable lookup that checks
1776 // for duplicates. This will mean we may have to update
1777 // the type of the XMLAttr later.
1778 XMLAttr* curAtt;
1779 const XMLCh* attrValue = fAttValueBuf.getRawBuffer();
1780
1781 if (attCount >= curAttListSize) {
1782 curAtt = new (fMemoryManager) XMLAttr(fMemoryManager);
1783 fAttrList->addElement(curAtt);
1784 }
1785 else {
1786 curAtt = fAttrList->elementAt(attCount);
1787 }
1788
1789 curAtt->setSpecified(true);
1790 // DO NAMESPACES
1791 {
1792 curAtt->set(
1793 fEmptyNamespaceId, namePtr, XMLUni::fgZeroLenString
1794 , (attDef)? attDef->getType() : XMLAttDef::CData
1795 );
1796
1797 // each attribute has the prefix:suffix="value"
1798 const XMLCh* attPrefix = curAtt->getPrefix();
1799 const XMLCh* attLocalName = curAtt->getName();
1800
1801 if (attPrefix && *attPrefix) {
1802 if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
1803 curAtt->setURIId(fXMLNamespaceId);
1804 }
1805 else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {
1806 curAtt->setURIId(fXMLNSNamespaceId);
1807 updateNSMap(attPrefix, attLocalName, attrValue);
1808 }
1809 else {
1810 fAttrNSList->addElement(curAtt);
1811 }
1812 }
1813 else if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName))
1814 {
1815 updateNSMap(attPrefix, XMLUni::fgZeroLenString, attrValue);
1816 }
1817
1818 // NOTE: duplicate attribute check will be done, when we map
1819 // namespaces to all attributes
1820 if (attDef) {
1821 unsigned int *curCountPtr = fAttDefRegistry->get(attDef);
1822 if (!curCountPtr) {
1823 curCountPtr = getNewUIntPtr();
1824 *curCountPtr = fElemCount;
1825 fAttDefRegistry->put(attDef, curCountPtr);
1826 }
1827 else if (*curCountPtr < fElemCount) {
1828 *curCountPtr = fElemCount;
1829 }
1830 }
1831 }
1832
1833 if (fValidate)
1834 {
1835 if (attDef) {
1836 // Let the validator pass judgement on the attribute value
1837 fValidator->validateAttrValue(
1838 attDef, fAttValueBuf.getRawBuffer(), false, elemDecl
1839 );
1840 }
1841 else
1842 {
1843 fValidator->emitError
1844 (
1845 XMLValid::AttNotDefinedForElement
1846 , fAttNameBuf.getRawBuffer(), qnameRawBuf
1847 );
1848 }
1849 }
1850
1851 // must set the newly-minted value on the XMLAttr:
1852 curAtt->setValue(attrValue);
1853 attCount++;
1854
1855 // And jump back to the top of the loop
1856 continue;
1857 }
1858
1859 // It was some special case character so do all of the checks and
1860 // deal with it.
1861 if (!nextCh)
1862 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1863
1864 if (nextCh == chForwardSlash)
1865 {
1866 fReaderMgr.getNextChar();
1867 isEmpty = true;
1868 if (!fReaderMgr.skippedChar(chCloseAngle))
1869 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1870 break;
1871 }
1872 else if (nextCh == chCloseAngle)
1873 {
1874 fReaderMgr.getNextChar();
1875 break;
1876 }
1877 else if (nextCh == chOpenAngle)
1878 {
1879 // Check for this one specially, since its going to be common
1880 // and it is kind of auto-recovering since we've already hit the
1881 // next open bracket, which is what we would have seeked to (and
1882 // skipped this whole tag.)
1883 emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
1884 break;
1885 }
1886 else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
1887 {
1888 // Check for this one specially, which is probably a missing
1889 // attribute name, e.g. ="value". Just issue expected name
1890 // error and eat the quoted string, then jump back to the
1891 // top again.
1892 emitError(XMLErrs::ExpectedAttrName);
1893 fReaderMgr.getNextChar();
1894 fReaderMgr.skipQuotedString(nextCh);
1895 fReaderMgr.skipPastSpaces();
1896 continue;
1897 }
1898 }
1899
1900 // Make an initial pass through the list and find any xmlns attributes.
1901 if (attCount)
1902 scanAttrListforNameSpaces(fAttrList, attCount, elemDecl);
1903
1904 if(attCount)
1905 {
1906 // clean up after ourselves:
1907 // clear the map used to detect duplicate attributes
1908 fUndeclaredAttrRegistry->removeAll();
1909 }
1910
1911 // Now lets get the fAttrList filled in. This involves faulting in any
1912 // defaulted and fixed attributes and normalizing the values of any that
1913 // we got explicitly.
1914 //
1915 // We update the attCount value with the total number of attributes, but
1916 // it goes in with the number of values we got during the raw scan of
1917 // explictly provided attrs above.
1918 attCount = buildAttList(attCount, elemDecl, *fAttrList);
1919
1920 // If we have a document handler, then tell it about this start tag. We
1921 // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
1922 // any prefix since its just one big name if we are not doing namespaces.
1923 if (fDocHandler)
1924 {
1925 unsigned int uriId = resolvePrefix
1926 (
1927 elemDecl->getElementName()->getPrefix()
1928 , ElemStack::Mode_Element
1929 );
1930
1931 fDocHandler->startElement
1932 (
1933 *elemDecl
1934 , uriId
1935 , elemDecl->getElementName()->getPrefix()
1936 , *fAttrList
1937 , attCount
1938 , isEmpty
1939 , isRoot
1940 );
1941 }
1942
1943 // If empty, validate content right now if we are validating and then
1944 // pop the element stack top. Else, we have to update the current stack
1945 // top's namespace mapping elements.
1946 if (isEmpty)
1947 {
1948 // If validating, then insure that its legal to have no content
1949 if (fValidate)
1950 {
1951 XMLSize_t failure;
1952 bool res = fValidator->checkContent(elemDecl, 0, 0, &failure);
1953 if (!res)
1954 {
1955 fValidator->emitError
1956 (
1957 XMLValid::ElementNotValidForContent
1958 , qnameRawBuf
1959 , elemDecl->getFormattedContentModel()
1960 );
1961 }
1962 }
1963
1964 // Pop the element stack back off since it'll never be used now
1965 fElemStack.popTop();
1966
1967 // If the elem stack is empty, then it was an empty root
1968 if (isRoot)
1969 gotData = false;
1970 }
1971
1972 return true;
1973 }
1974
// ---------------------------------------------------------------------------
// DGXMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
loadGrammar(const InputSource & src,const short grammarType,const bool toCache)1978 Grammar* DGXMLScanner::loadGrammar(const InputSource& src
1979 , const short grammarType
1980 , const bool toCache)
1981 {
1982 Grammar* loadedGrammar = 0;
1983
1984 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
1985
1986 try
1987 {
1988 fGrammarResolver->cacheGrammarFromParse(false);
1989 fGrammarResolver->useCachedGrammarInParse(false);
1990 fRootGrammar = 0;
1991
1992 if (fValScheme == Val_Auto) {
1993 fValidate = true;
1994 }
1995
1996 // Reset some status flags
1997 fInException = false;
1998 fStandalone = false;
1999 fErrorCount = 0;
2000 fHasNoDTD = true;
2001
2002 if (grammarType == Grammar::DTDGrammarType) {
2003 loadedGrammar = loadDTDGrammar(src, toCache);
2004 }
2005 }
2006 // NOTE:
2007 //
2008 // In all of the error processing below, the emitError() call MUST come
2009 // before the flush of the reader mgr, or it will fail because it tries
2010 // to find out the position in the XML source of the error.
2011 catch(const XMLErrs::Codes)
2012 {
2013 // This is a 'first failure' exception, so fall through
2014 }
2015 catch(const XMLValid::Codes)
2016 {
2017 // This is a 'first fatal error' type exit, so fall through
2018 }
2019 catch(const XMLException& excToCatch)
2020 {
2021 // Emit the error and catch any user exception thrown from here. Make
2022 // sure in all cases we flush the reader manager.
2023 fInException = true;
2024 try
2025 {
2026 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
2027 emitError
2028 (
2029 XMLErrs::XMLException_Warning
2030 , excToCatch.getCode()
2031 , excToCatch.getMessage()
2032 );
2033 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
2034 emitError
2035 (
2036 XMLErrs::XMLException_Fatal
2037 , excToCatch.getCode()
2038 , excToCatch.getMessage()
2039 );
2040 else
2041 emitError
2042 (
2043 XMLErrs::XMLException_Error
2044 , excToCatch.getCode()
2045 , excToCatch.getMessage()
2046 );
2047 }
2048 catch(const OutOfMemoryException&)
2049 {
2050 // This is a special case for out-of-memory
2051 // conditions, because resetting the ReaderMgr
2052 // can be problematic.
2053 resetReaderMgr.release();
2054
2055 throw;
2056 }
2057 }
2058 catch(const OutOfMemoryException&)
2059 {
2060 // This is a special case for out-of-memory
2061 // conditions, because resetting the ReaderMgr
2062 // can be problematic.
2063 resetReaderMgr.release();
2064
2065 throw;
2066 }
2067
2068 return loadedGrammar;
2069 }
2070
loadDTDGrammar(const InputSource & src,const bool toCache)2071 Grammar* DGXMLScanner::loadDTDGrammar(const InputSource& src,
2072 const bool toCache)
2073 {
2074 // Reset the validators
2075 fDTDValidator->reset();
2076 if (fValidatorFromUser)
2077 fValidator->reset();
2078
2079 fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager);
2080 fGrammarResolver->putGrammar(fDTDGrammar);
2081 fGrammar = fDTDGrammar;
2082 fValidator->setGrammar(fGrammar);
2083
2084 // And for all installed handlers, send reset events. This gives them
2085 // a chance to flush any cached data.
2086 if (fDocHandler)
2087 fDocHandler->resetDocument();
2088 if (fEntityHandler)
2089 fEntityHandler->resetEntities();
2090 if (fErrorReporter)
2091 fErrorReporter->resetErrors();
2092
2093 // Clear out the id reference list
2094 resetValidationContext();
2095
2096 if (toCache) {
2097
2098 unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId());
2099 const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId);
2100
2101 fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString);
2102 ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr);
2103 fGrammarResolver->putGrammar(fGrammar);
2104 }
2105
2106 // Handle the creation of the XML reader object for this input source.
2107 // This will provide us with transcoding and basic lexing services.
2108 XMLReader* newReader = fReaderMgr.createReader
2109 (
2110 src
2111 , false
2112 , XMLReader::RefFrom_NonLiteral
2113 , XMLReader::Type_General
2114 , XMLReader::Source_External
2115 , fCalculateSrcOfs
2116 , fLowWaterMark
2117 );
2118 if (!newReader) {
2119 if (src.getIssueFatalErrorIfNotFound())
2120 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager);
2121 else
2122 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager);
2123 }
2124
2125 // In order to make the processing work consistently, we have to
2126 // make this look like an external entity. So create an entity
2127 // decl and fill it in and push it with the reader, as happens
2128 // with an external entity. Put a janitor on it to insure it gets
2129 // cleaned up. The reader manager does not adopt them.
2130 const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
2131 DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager);
2132 declDTD->setSystemId(src.getSystemId());
2133 declDTD->setIsExternal(true);
2134 Janitor<DTDEntityDecl> janDecl(declDTD);
2135
2136 // Mark this one as a throw at end
2137 newReader->setThrowAtEnd(true);
2138
2139 // And push it onto the stack, with its pseudo name
2140 fReaderMgr.pushReader(newReader, declDTD);
2141
2142 // If we have a doc type handler and advanced callbacks are enabled,
2143 // call the doctype event.
2144 if (fDocTypeHandler) {
2145
2146 // Create a dummy root
2147 DTDElementDecl* rootDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
2148 (
2149 gDTDStr
2150 , fEmptyNamespaceId
2151 , DTDElementDecl::Any
2152 , fGrammarPoolMemoryManager
2153 );
2154 rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
2155 rootDecl->setExternalElemDeclaration(true);
2156 Janitor<DTDElementDecl> janSrc(rootDecl);
2157
2158 fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false, true);
2159 }
2160
2161 // Create DTDScanner
2162 DTDScanner dtdScanner
2163 (
2164 (DTDGrammar*)fGrammar
2165 , fDocTypeHandler
2166 , fGrammarPoolMemoryManager
2167 , fMemoryManager
2168 );
2169 dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr);
2170
2171 // Tell it its not in an include section
2172 dtdScanner.scanExtSubsetDecl(false, true);
2173
2174 if (fValidate) {
2175 // validate the DTD scan so far
2176 fValidator->preContentValidation(false, true);
2177 }
2178
2179 if (toCache)
2180 fGrammarResolver->cacheGrammars();
2181
2182 return fDTDGrammar;
2183 }
2184
2185
2186 // ---------------------------------------------------------------------------
2187 // DGXMLScanner: Private helper methods
2188 // ---------------------------------------------------------------------------
2189 // This method handles the common initialization, to avoid having to do
2190 // it redundantly in multiple constructors.
commonInit()2191 void DGXMLScanner::commonInit()
2192 {
2193 // And we need one for the raw attribute scan. This just stores key/
2194 // value string pairs (prior to any processing.)
2195 fAttrNSList = new (fMemoryManager) ValueVectorOf<XMLAttr*>(8, fMemoryManager);
2196
2197 // Create the Validator and init them
2198 fDTDValidator = new (fMemoryManager) DTDValidator();
2199 initValidator(fDTDValidator);
2200 fDTDElemNonDeclPool = new (fMemoryManager) NameIdPool<DTDElementDecl>(29, 128, fMemoryManager);
2201 fAttDefRegistry = new (fMemoryManager) RefHashTableOf<unsigned int, PtrHasher>
2202 (
2203 131, false, fMemoryManager
2204 );
2205 fUndeclaredAttrRegistry = new (fMemoryManager) Hash2KeysSetOf<StringHasher>(7, fMemoryManager);
2206
2207 if (fValidator)
2208 {
2209 if (!fValidator->handlesDTD())
2210 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager);
2211 }
2212 else
2213 {
2214 fValidator = fDTDValidator;
2215 }
2216 }
2217
// Deletes the five objects that commonInit() allocates. The pointers are
// not reset afterwards, and fValidator is left untouched even though
// commonInit() may have pointed it at fDTDValidator.
void DGXMLScanner::cleanUp()
{
    delete fAttrNSList;
    delete fDTDValidator;
    delete fDTDElemNonDeclPool;
    delete fAttDefRegistry;
    delete fUndeclaredAttrRegistry;
}
2226
2227
2228 // This method is called from scanStartTagNS() to build up the list of
2229 // XMLAttr objects that will be passed out in the start tag callout. We
2230 // get the key/value pairs from the raw scan of explicitly provided attrs,
2231 // which have not been normalized. And we get the element declaration from
2232 // which we will get any defaulted or fixed attribute defs and add those
2233 // in as well.
2234 XMLSize_t
buildAttList(const XMLSize_t attCount,XMLElementDecl * elemDecl,RefVectorOf<XMLAttr> & toFill)2235 DGXMLScanner::buildAttList(const XMLSize_t attCount
2236 , XMLElementDecl* elemDecl
2237 , RefVectorOf<XMLAttr>& toFill)
2238 {
2239 // Ask the element to clear the 'provided' flag on all of the att defs
2240 // that it owns, and to return us a boolean indicating whether it has
2241 // any defs.
2242 const bool hasDefs = elemDecl->hasAttDefs();
2243
2244 // If there are no expliclitily provided attributes and there are no
2245 // defined attributes for the element, the we don't have anything to do.
2246 // So just return zero in this case.
2247 if (!hasDefs && !attCount)
2248 return 0;
2249
2250 // Keep up with how many attrs we end up with total
2251 XMLSize_t retCount = attCount;
2252
2253 // And get the current size of the output vector. This lets us use
2254 // existing elements until we fill it, then start adding new ones.
2255 const XMLSize_t curAttListSize = toFill.size();
2256
2257 // Ok, so lets get an enumerator for the attributes of this element
2258 // and run through them for well formedness and validity checks. But
2259 // make sure that we had any attributes before we do it, since the list
2260 // would have have gotten faulted in anyway.
2261 if (hasDefs)
2262 {
2263 XMLAttDefList& attDefList = elemDecl->getAttDefList();
2264 for(XMLSize_t i=0; i<attDefList.getAttDefCount(); i++)
2265 {
2266 // Get the current att def, for convenience and its def type
2267 XMLAttDef& curDef = attDefList.getAttDef(i);
2268
2269 unsigned int *attCountPtr = fAttDefRegistry->get(&curDef);
2270 if (!attCountPtr || *attCountPtr < fElemCount)
2271 { // did not occur
2272 const XMLAttDef::DefAttTypes defType = curDef.getDefaultType();
2273
2274 if (fValidate)
2275 {
2276 // If we are validating and its required, then an error
2277 if (defType == XMLAttDef::Required)
2278 {
2279 fValidator->emitError
2280 (
2281 XMLValid::RequiredAttrNotProvided
2282 , curDef.getFullName()
2283 );
2284 }
2285 else if ((defType == XMLAttDef::Default) ||
2286 (defType == XMLAttDef::Fixed) )
2287 {
2288 if (fStandalone && curDef.isExternal())
2289 {
2290 // XML 1.0 Section 2.9
2291 // Document is standalone, so attributes must not be defaulted.
2292 fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName());
2293 }
2294 }
2295 }
2296
2297 // Fault in the value if needed, and bump the att count
2298 if ((defType == XMLAttDef::Default)
2299 || (defType == XMLAttDef::Fixed))
2300 {
2301 // Let the validator pass judgement on the attribute value
2302 if (fValidate)
2303 {
2304 fValidator->validateAttrValue
2305 (
2306 &curDef
2307 , curDef.getValue()
2308 , false
2309 , elemDecl
2310 );
2311 }
2312
2313 XMLAttr* curAtt;
2314 if (retCount >= curAttListSize)
2315 {
2316 if (fDoNamespaces)
2317 {
2318 curAtt = new (fMemoryManager) XMLAttr
2319 (
2320 fEmptyNamespaceId
2321 , curDef.getFullName()
2322 , curDef.getValue()
2323 , curDef.getType()
2324 , false
2325 , fMemoryManager
2326 );
2327 }
2328 else
2329 {
2330 curAtt = new (fMemoryManager) XMLAttr
2331 (
2332 0
2333 , curDef.getFullName()
2334 , XMLUni::fgZeroLenString
2335 , curDef.getValue()
2336 , curDef.getType()
2337 , false
2338 , fMemoryManager
2339 );
2340 }
2341
2342 fAttrList->addElement(curAtt);
2343 }
2344 else
2345 {
2346 curAtt = fAttrList->elementAt(retCount);
2347 if (fDoNamespaces)
2348 {
2349 curAtt->set
2350 (
2351 fEmptyNamespaceId
2352 , curDef.getFullName()
2353 , curDef.getValue()
2354 , curDef.getType()
2355 );
2356 }
2357 else
2358 {
2359 curAtt->set
2360 (
2361 0
2362 , curDef.getFullName()
2363 , XMLUni::fgZeroLenString
2364 , curDef.getValue()
2365 , curDef.getType()
2366 );
2367 }
2368 curAtt->setSpecified(false);
2369 }
2370
2371 if (fDoNamespaces)
2372 {
2373 // Map the new attribute's prefix to a URI id and store
2374 // that in the attribute object.
2375 const XMLCh* attPrefix = curAtt->getPrefix();
2376 if (attPrefix && *attPrefix) {
2377 curAtt->setURIId
2378 (
2379 resolvePrefix(attPrefix, ElemStack::Mode_Attribute)
2380 );
2381 }
2382 }
2383
2384 retCount++;
2385 }
2386 }
2387 }
2388 }
2389
2390 return retCount;
2391 }
2392
2393
2394 // This method will reset the scanner data structures, and related plugged
2395 // in stuff, for a new scan session. We get the input source for the primary
2396 // XML entity, create the reader for it, and push it on the stack so that
2397 // upon successful return from here we are ready to go.
scanReset(const InputSource & src)2398 void DGXMLScanner::scanReset(const InputSource& src)
2399 {
2400
2401 // This call implicitly tells us that we are going to reuse the scanner
2402 // if it was previously used. So tell the validator to reset itself.
2403 //
2404 // But, if the fUseCacheGrammar flag is set, then don't reset it.
2405 //
2406 // NOTE: The ReaderMgr is flushed on the way out, because that is
2407 // required to insure that files are closed.
2408 fGrammarResolver->cacheGrammarFromParse(fToCacheGrammar);
2409 fGrammarResolver->useCachedGrammarInParse(fUseCachedGrammar);
2410
2411 fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager);
2412 fGrammarResolver->putGrammar(fDTDGrammar);
2413 fGrammar = fDTDGrammar;
2414 fRootGrammar = 0;
2415 fValidator->setGrammar(fGrammar);
2416
2417 // Reset validation
2418 fValidate = (fValScheme == Val_Always) ? true : false;
2419
2420 // And for all installed handlers, send reset events. This gives them
2421 // a chance to flush any cached data.
2422 if (fDocHandler)
2423 fDocHandler->resetDocument();
2424 if (fEntityHandler)
2425 fEntityHandler->resetEntities();
2426 if (fErrorReporter)
2427 fErrorReporter->resetErrors();
2428
2429 // Clear out the id reference list
2430 resetValidationContext();
2431
2432 // Reset the Root Element Name
2433 fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName;
2434 fRootElemName = 0;
2435
2436 // Reset the element stack, and give it the latest ids for the special
2437 // URIs it has to know about.
2438 fElemStack.reset
2439 (
2440 fEmptyNamespaceId
2441 , fUnknownNamespaceId
2442 , fXMLNamespaceId
2443 , fXMLNSNamespaceId
2444 );
2445
2446 // Reset some status flags
2447 fInException = false;
2448 fStandalone = false;
2449 fErrorCount = 0;
2450 fHasNoDTD = true;
2451
2452 // Reset the validators
2453 fDTDValidator->reset();
2454 fDTDValidator->setErrorReporter(fErrorReporter);
2455 if (fValidatorFromUser)
2456 fValidator->reset();
2457
2458 // Handle the creation of the XML reader object for this input source.
2459 // This will provide us with transcoding and basic lexing services.
2460 XMLReader* newReader = fReaderMgr.createReader
2461 (
2462 src
2463 , true
2464 , XMLReader::RefFrom_NonLiteral
2465 , XMLReader::Type_General
2466 , XMLReader::Source_External
2467 , fCalculateSrcOfs
2468 , fLowWaterMark
2469 );
2470
2471 if (!newReader) {
2472 if (src.getIssueFatalErrorIfNotFound())
2473 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager);
2474 else
2475 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager);
2476 }
2477
2478 // Push this read onto the reader manager
2479 fReaderMgr.pushReader(newReader, 0);
2480
2481 // and reset security-related things if necessary:
2482 if(fSecurityManager != 0)
2483 {
2484 fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit();
2485 fEntityExpansionCount = 0;
2486 }
2487 if(fUIntPoolRowTotal >= 32)
2488 { // 8 KB tied up with validating attributes...
2489 fAttDefRegistry->removeAll();
2490 recreateUIntPool();
2491 }
2492 else
2493 {
2494 // note that this will implicitly reset the values of the hashtables,
2495 // though their buckets will still be tied up
2496 resetUIntPool();
2497 }
2498 fUndeclaredAttrRegistry->removeAll();
2499 fAttrNSList->removeAllElements();
2500 }
2501
2502
2503 // This method is called between markup in content. It scans for character
2504 // data that is sent to the document handler. It watches for any markup
2505 // characters that would indicate that the character data has ended. It also
2506 // handles expansion of general and character entities.
2507 //
2508 // sendData() is a local static helper for this method which handles some
2509 // code that must be done in three different places here.
sendCharData(XMLBuffer & toSend)2510 void DGXMLScanner::sendCharData(XMLBuffer& toSend)
2511 {
2512 // If no data in the buffer, then nothing to do
2513 if (toSend.isEmpty())
2514 return;
2515
2516 // We do different things according to whether we are validating or
2517 // not. If not, its always just characters; else, it depends on the
2518 // current element's content model.
2519 if (fValidate)
2520 {
2521 // Get the raw data we need for the callback
2522 const XMLCh* const rawBuf = toSend.getRawBuffer();
2523 const XMLSize_t len = toSend.getLen();
2524
2525 // And see if the current element is a 'Children' style content model
2526 const ElemStack::StackElem* topElem = fElemStack.topElement();
2527
2528 // Get the character data opts for the current element
2529 XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();
2530
2531 if (charOpts == XMLElementDecl::NoCharData)
2532 {
2533 // They definitely cannot handle any type of char data
2534 fValidator->emitError(XMLValid::NoCharDataInCM);
2535 }
2536 else if (fReaderMgr.getCurrentReader()->isAllSpaces(rawBuf, len))
2537 {
2538 // Its all spaces. So, if they can take spaces, then send it
2539 // as ignorable whitespace. If they can handle any char data
2540 // send it as characters.
2541 if (charOpts == XMLElementDecl::SpacesOk) {
2542 if (fDocHandler)
2543 fDocHandler->ignorableWhitespace(rawBuf, len, false);
2544 }
2545 else if (charOpts == XMLElementDecl::AllCharData)
2546 {
2547 if (fDocHandler)
2548 fDocHandler->docCharacters(rawBuf, len, false);
2549 }
2550 }
2551 else
2552 {
2553 // If they can take any char data, then send it. Otherwise, they
2554 // can only handle whitespace and can't handle this stuff so
2555 // issue an error.
2556 if (charOpts == XMLElementDecl::AllCharData)
2557 {
2558 if (fDocHandler)
2559 fDocHandler->docCharacters(rawBuf, len, false);
2560 }
2561 else
2562 {
2563 fValidator->emitError(XMLValid::NoCharDataInCM);
2564 }
2565 }
2566 }
2567 else
2568 {
2569 // Always assume its just char data if not validating
2570 if (fDocHandler)
2571 fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);
2572 }
2573
2574 // Reset buffer
2575 toSend.reset();
2576 }
2577
2578
2579
2580 // This method is called with a key/value string pair that represents an
2581 // xmlns="yyy" or xmlns:xxx="yyy" attribute. This method will update the
2582 // current top of the element stack based on this data. We know that when
2583 // we get here, that it is one of these forms, so we don't bother confirming
2584 // it.
2585 //
2586 // But we have to ensure
2587 // 1. xxx is not xmlns
2588 // 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa
2589 // 3. yyy is not XMLUni::fgXMLNSURIName
2590 // 4. if xxx is not null, then yyy cannot be an empty string.
updateNSMap(const XMLCh * const attrPrefix,const XMLCh * const attrLocalName,const XMLCh * const attrValue)2591 void DGXMLScanner::updateNSMap(const XMLCh* const attrPrefix
2592 , const XMLCh* const attrLocalName
2593 , const XMLCh* const attrValue)
2594 {
2595 // We either have the default prefix (""), or we point it into the attr
2596 // name parameter. Note that the xmlns is not the prefix we care about
2597 // here. To us, the 'prefix' is really the local part of the attrName
2598 // parameter.
2599 //
2600 // Check 1. xxx is not xmlns
2601 // 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa
2602 // 3. yyy is not XMLUni::fgXMLNSURIName
2603 // 4. if xxx is not null, then yyy cannot be an empty string.
2604 if (attrPrefix && *attrPrefix) {
2605
2606 if (XMLString::equals(attrLocalName, XMLUni::fgXMLNSString))
2607 emitError(XMLErrs::NoUseOfxmlnsAsPrefix);
2608 else if (XMLString::equals(attrLocalName, XMLUni::fgXMLString)) {
2609 if (!XMLString::equals(attrValue, XMLUni::fgXMLURIName))
2610 emitError(XMLErrs::PrefixXMLNotMatchXMLURI);
2611 }
2612
2613 if (!attrValue)
2614 emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName);
2615 else if(!*attrValue && fXMLVersion == XMLReader::XMLV1_0)
2616 emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName);
2617 }
2618
2619 if (XMLString::equals(attrValue, XMLUni::fgXMLNSURIName))
2620 emitError(XMLErrs::NoUseOfxmlnsURI);
2621 else if (XMLString::equals(attrValue, XMLUni::fgXMLURIName)) {
2622 if (!XMLString::equals(attrLocalName, XMLUni::fgXMLString))
2623 emitError(XMLErrs::XMLURINotMatchXMLPrefix);
2624 }
2625
2626 // Ok, we have to get the unique id for the attribute value, which is the
2627 // URI that this value should be mapped to. The validator has the
2628 // namespace string pool, so we ask him to find or add this new one. Then
2629 // we ask the element stack to add this prefix to URI Id mapping.
2630 fElemStack.addPrefix
2631 (
2632 attrLocalName
2633 , fURIStringPool->addOrFind(attrValue)
2634 );
2635 }
2636
scanAttrListforNameSpaces(RefVectorOf<XMLAttr> * theAttrList,XMLSize_t attCount,XMLElementDecl * elemDecl)2637 void DGXMLScanner::scanAttrListforNameSpaces(RefVectorOf<XMLAttr>* theAttrList, XMLSize_t attCount,
2638 XMLElementDecl* elemDecl)
2639 {
2640 // Map prefixes to uris
2641 for (XMLSize_t i=0; i < fAttrNSList->size(); i++) {
2642 XMLAttr* providedAttr = fAttrNSList->elementAt(i);
2643 providedAttr->setURIId(
2644 resolvePrefix(providedAttr->getPrefix(), ElemStack::Mode_Attribute)
2645 );
2646 }
2647
2648 fAttrNSList->removeAllElements();
2649
2650 // Decide if to use hash table to do duplicate checking
2651 bool toUseHashTable = false;
2652
2653 setAttrDupChkRegistry(attCount, toUseHashTable);
2654 for (XMLSize_t index = 0; index < attCount; index++)
2655 {
2656 // check for duplicate namespace attributes:
2657 // by checking for qualified names with the same local part and with prefixes
2658 // which have been bound to namespace names that are identical.
2659 XMLAttr* curAttr = theAttrList->elementAt(index);
2660 if (!toUseHashTable)
2661 {
2662 XMLAttr* loopAttr;
2663 for (XMLSize_t attrIndex=0; attrIndex < index; attrIndex++) {
2664 loopAttr = theAttrList->elementAt(attrIndex);
2665 if (loopAttr->getURIId() == curAttr->getURIId() &&
2666 XMLString::equals(loopAttr->getName(), curAttr->getName())) {
2667 emitError(
2668 XMLErrs::AttrAlreadyUsedInSTag, curAttr->getName()
2669 , elemDecl->getFullName()
2670 );
2671 }
2672 }
2673 }
2674 else
2675 {
2676 if (fAttrDupChkRegistry->containsKey((void*)curAttr->getName(), curAttr->getURIId()))
2677 {
2678 emitError(
2679 XMLErrs::AttrAlreadyUsedInSTag
2680 , curAttr->getName(), elemDecl->getFullName()
2681 );
2682 }
2683
2684 fAttrDupChkRegistry->put((void*)curAttr->getName(), curAttr->getURIId(), curAttr);
2685 }
2686 }
2687 }
2688
resolveSystemId(const XMLCh * const sysId,const XMLCh * const pubId)2689 InputSource* DGXMLScanner::resolveSystemId(const XMLCh* const sysId
2690 ,const XMLCh* const pubId)
2691 {
2692 //Normalize sysId
2693 XMLBufBid nnSys(&fBufMgr);
2694 XMLBuffer& normalizedSysId = nnSys.getBuffer();
2695 XMLString::removeChar(sysId, 0xFFFF, normalizedSysId);
2696 const XMLCh* normalizedURI = normalizedSysId.getRawBuffer();
2697
2698 // Create a buffer for expanding the normalized system id
2699 XMLBufBid bbSys(&fBufMgr);
2700 XMLBuffer& expSysId = bbSys.getBuffer();
2701
2702 // Allow the entity handler to expand the system id if they choose
2703 // to do so.
2704 InputSource* srcToFill = 0;
2705 if (fEntityHandler)
2706 {
2707 if (!fEntityHandler->expandSystemId(normalizedURI, expSysId))
2708 expSysId.set(normalizedURI);
2709
2710 ReaderMgr::LastExtEntityInfo lastInfo;
2711 fReaderMgr.getLastExtEntityInfo(lastInfo);
2712 XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
2713 expSysId.getRawBuffer(), 0, pubId, lastInfo.systemId,
2714 &fReaderMgr);
2715 srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier);
2716 }
2717 else
2718 {
2719 expSysId.set(normalizedURI);
2720 }
2721
2722 // If they didn't create a source via the entity handler, then we
2723 // have to create one on our own.
2724 if (!srcToFill)
2725 {
2726 if (fDisableDefaultEntityResolution)
2727 return srcToFill;
2728
2729 ReaderMgr::LastExtEntityInfo lastInfo;
2730 fReaderMgr.getLastExtEntityInfo(lastInfo);
2731
2732 XMLURL urlTmp(fMemoryManager);
2733 if ((!urlTmp.setURL(lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)) ||
2734 (urlTmp.isRelative()))
2735 {
2736 if (!fStandardUriConformant)
2737 {
2738 XMLBufBid ddSys(&fBufMgr);
2739 XMLBuffer& resolvedSysId = ddSys.getBuffer();
2740 XMLUri::normalizeURI(expSysId.getRawBuffer(), resolvedSysId);
2741
2742 srcToFill = new (fMemoryManager) LocalFileInputSource
2743 (
2744 lastInfo.systemId
2745 , resolvedSysId.getRawBuffer()
2746 , fMemoryManager
2747 );
2748 }
2749 else
2750 ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
2751 }
2752 else
2753 {
2754 if (fStandardUriConformant && urlTmp.hasInvalidChar())
2755 ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
2756 srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager);
2757 }
2758 }
2759
2760 return srcToFill;
2761 }
2762
2763 // ---------------------------------------------------------------------------
2764 // DGXMLScanner: Private parsing methods
2765 // ---------------------------------------------------------------------------
// Scans a quoted attribute value from the reader manager into 'toFill',
// normalizing whitespace on the way.
//
//  attDef    Definition of the attribute, or null if there is none. A null
//            definition is handled as type CData and as not external.
//  attrName  Name of the attribute; only used in the errors emitted here.
//  toFill    Buffer that is reset and then receives the value.
//
// Returns false when the next character is not a quote, or when the quote
// character is met while the current reader number is lower than the one
// the value started in (PartialMarkupInEntity is emitted). Returns true
// when the quote character is met in the reader the value started in.
// Throws UnexpectedEOFException when a null character is returned by the
// reader manager before that.
bool DGXMLScanner::scanAttValue( const XMLAttDef* const attDef
                               , const XMLCh *const attrName
                               , XMLBuffer& toFill)
{
    // States of the whitespace normalization used for non-CDATA types
    enum States
    {
        InWhitespace
        , InContent
    };

    // Get the type and name
    const XMLAttDef::AttTypes type = (attDef)
                                     ?attDef->getType()
                                     :XMLAttDef::CData;

    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    // We have to get the current reader because we have to ignore closing
    // quotes until we hit the same reader again.
    const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum();

    // Get attribute def - to check to see if it's declared externally or not
    bool isAttExternal = (attDef)
                         ?attDef->isExternal()
                         :false;

    // Loop until we get the attribute value. Note that we use a double
    // loop here to avoid the setup/teardown overhead of the exception
    // handler on every round.
    XMLCh nextCh;
    // Second character of a pair handed back by scanEntityRef(); zero when unused
    XMLCh secondCh = 0;
    States curState = InContent;
    // Becomes true once a non-whitespace character has been seen (non-CDATA only)
    bool firstNonWS = false;
    bool gotLeadingSurrogate = false;
    // Assigned at the top of every round of the inner loop before it is read
    bool escaped;
    while (true)
    {
        try
        {
            while(true)
            {
                nextCh = fReaderMgr.getNextChar();

                if (!nextCh)
                    ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);

                // Check for our ending quote in the same entity
                if (nextCh == quoteCh)
                {
                    if (curReader == fReaderMgr.getCurrentReaderNum())
                        return true;

                    // Watch for spillover into a previous entity
                    if (curReader > fReaderMgr.getCurrentReaderNum())
                    {
                        emitError(XMLErrs::PartialMarkupInEntity);
                        return false;
                    }

                    // Otherwise the quote came from a higher-numbered
                    // reader and is handled like any other character.
                }

                // Check for an entity ref now, before we let it affect our
                // whitespace normalization logic below. We ignore the empty flag
                // in this one.
                escaped = false;
                if (nextCh == chAmpersand)
                {
                    // Anything but EntityExp_Returned means no character was
                    // handed back in nextCh, so start the next round.
                    if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
                    {
                        gotLeadingSurrogate = false;
                        continue;
                    }
                }
                else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
                {
                    // Deal with surrogate pairs
                    // Its a leading surrogate. If we already got one, then
                    // issue an error, else set leading flag to make sure that
                    // we look for a trailing next time.
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);
                    else
                        gotLeadingSurrogate = true;
                }
                else
                {
                    // If its a trailing surrogate, make sure that we are
                    // prepared for that. Else, its just a regular char so make
                    // sure that we were not expected a trailing surrogate.
                    if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                    {
                        // Its trailing, so make sure we were expecting it
                        if (!gotLeadingSurrogate)
                            emitError(XMLErrs::Unexpected2ndSurrogateChar);
                    }
                    else
                    {
                        // Its just a char, so make sure we were not expecting a
                        // trailing surrogate.
                        if (gotLeadingSurrogate)
                            emitError(XMLErrs::Expected2ndSurrogateChar);

                        // Its got to at least be a valid XML character
                        if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
                        {
                            // Format the offending character in base 16 for the message
                            XMLCh tmpBuf[9];
                            XMLString::binToText
                            (
                                nextCh
                                , tmpBuf
                                , 8
                                , 16
                                , fMemoryManager
                            );
                            emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
                        }
                    }
                    gotLeadingSurrogate = false;
                }

                // If its not escaped, then make sure its not a < character, which
                // is not allowed in attribute values.
                if (!escaped && (nextCh == chOpenAngle))
                    emitError(XMLErrs::BracketInAttrValue, attrName);

                // If the attribute is a CDATA type we do simple replacement of
                // tabs and new lines with spaces, if the character is not escaped
                // by way of a char ref.
                //
                // Otherwise, we do the standard non-CDATA normalization of
                // compressing whitespace to single spaces and getting rid of leading
                // and trailing whitespace.
                if (type == XMLAttDef::CData)
                {
                    if (!escaped)
                    {
                        if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
                        {
                            // Check Validity Constraint for Standalone document declaration
                            // XML 1.0, Section 2.9
                            if (fStandalone && fValidate && isAttExternal)
                            {
                                // Can't have a standalone document declaration of "yes" if attribute
                                // values are subject to normalisation
                                fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
                            }
                            nextCh = chSpace;
                        }
                    }
                }
                else
                {
                    if (curState == InWhitespace)
                    {
                        // A non-space character ends the whitespace run. One
                        // separating space is written first, unless no content
                        // has been seen yet (leading whitespace is dropped).
                        if ((escaped && nextCh != chSpace) || !fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
                        {
                            if (firstNonWS)
                                toFill.append(chSpace);
                            curState = InContent;
                            firstNonWS = true;
                        }
                        else
                        {
                            // Still inside the whitespace run: drop the character
                            continue;
                        }
                    }
                    else if (curState == InContent)
                    {
                        // A space, or unescaped whitespace, starts a whitespace
                        // run; nothing is written until content follows.
                        if ((nextCh == chSpace) ||
                            (fReaderMgr.getCurrentReader()->isWhitespace(nextCh) && !escaped))
                        {
                            curState = InWhitespace;

                            // Check Validity Constraint for Standalone document declaration
                            // XML 1.0, Section 2.9
                            if (fStandalone && fValidate && isAttExternal)
                            {
                                if (!firstNonWS || (nextCh != chSpace) || (fReaderMgr.lookingAtSpace()))
                                {
                                    // Can't have a standalone document declaration of "yes" if attribute
                                    // values are subject to normalisation
                                    fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
                                }
                            }
                            continue;
                        }
                        firstNonWS = true;
                    }
                }

                // Else add it to the buffer
                toFill.append(nextCh);

                // Append the second character of a pair, if there is one,
                // and clear it for the next round.
                if (secondCh)
                {
                    toFill.append(secondCh);
                    secondCh=0;
                }
            }
        }
        catch(const EndOfEntityException&)
        {
            // Just eat it and continue.
            gotLeadingSurrogate = false;
            escaped = false;
        }
    }
    // Not reached: the loops above are only left by return or by an exception
    return true;
}
2980
2981
2982 // This method scans a CDATA section. It collects the character into one
2983 // of the temp buffers and calls the document handler, if any, with the
2984 // characters. It assumes that the <![CDATA string has been scanned before
2985 // this call.
scanCDSection()2986 void DGXMLScanner::scanCDSection()
2987 {
2988 static const XMLCh CDataClose[] =
2989 {
2990 chCloseSquare, chCloseAngle, chNull
2991 };
2992
2993 // The next character should be the opening square bracket. If not
2994 // issue an error, but then try to recover by skipping any whitespace
2995 // and checking again.
2996 if (!fReaderMgr.skippedChar(chOpenSquare))
2997 {
2998 emitError(XMLErrs::ExpectedOpenSquareBracket);
2999 fReaderMgr.skipPastSpaces();
3000
3001 // If we still don't find it, then give up, else keep going
3002 if (!fReaderMgr.skippedChar(chOpenSquare))
3003 return;
3004 }
3005
3006 // Get a buffer for this
3007 XMLBufBid bbCData(&fBufMgr);
3008
3009 // We just scan forward until we hit the end of CDATA section sequence.
3010 // CDATA is effectively a big escape mechanism so we don't treat markup
3011 // characters specially here.
3012 bool emittedError = false;
3013 bool gotLeadingSurrogate = false;
3014
3015 // Get the character data opts for the current element
3016 const ElemStack::StackElem* topElem = fElemStack.topElement();
3017 XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();
3018
3019 while (true)
3020 {
3021 const XMLCh nextCh = fReaderMgr.getNextChar();
3022
3023 // Watch for unexpected end of file
3024 if (!nextCh)
3025 {
3026 emitError(XMLErrs::UnterminatedCDATASection);
3027 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3028 }
3029
3030 if (fValidate && fStandalone && (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)))
3031 {
3032 // This document is standalone; this ignorable CDATA whitespace is forbidden.
3033 // XML 1.0, Section 2.9
3034 // And see if the current element is a 'Children' style content model
3035 if (topElem->fThisElement->isExternal()) {
3036
3037 if (charOpts == XMLElementDecl::SpacesOk) // Element Content
3038 {
3039 // Error - standalone should have a value of "no" as whitespace detected in an
3040 // element type with element content whose element declaration was external
3041 fValidator->emitError(XMLValid::NoWSForStandalone);
3042 }
3043 }
3044 }
3045
3046 // If this is a close square bracket it could be our closing
3047 // sequence.
3048 if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
3049 {
3050 // make sure we were not expecting a trailing surrogate.
3051 if (gotLeadingSurrogate)
3052 emitError(XMLErrs::Expected2ndSurrogateChar);
3053
3054 if (fValidate) {
3055
3056 if (charOpts != XMLElementDecl::AllCharData)
3057 {
3058 // They definitely cannot handle any type of char data
3059 fValidator->emitError(XMLValid::NoCharDataInCM);
3060 }
3061 }
3062
3063 // If we have a doc handler, call it
3064 if (fDocHandler)
3065 {
3066 fDocHandler->docCharacters
3067 (
3068 bbCData.getRawBuffer()
3069 , bbCData.getLen()
3070 , true
3071 );
3072 }
3073
3074 // And we are done
3075 break;
3076 }
3077
3078 // Make sure its a valid character. But if we've emitted an error
3079 // already, don't bother with the overhead since we've already told
3080 // them about it.
3081 if (!emittedError)
3082 {
3083 // Deal with surrogate pairs
3084 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
3085 {
3086 // Its a leading surrogate. If we already got one, then
3087 // issue an error, else set leading flag to make sure that
3088 // we look for a trailing next time.
3089 if (gotLeadingSurrogate)
3090 emitError(XMLErrs::Expected2ndSurrogateChar);
3091 else
3092 gotLeadingSurrogate = true;
3093 }
3094 else
3095 {
3096 // If its a trailing surrogate, make sure that we are
3097 // prepared for that. Else, its just a regular char so make
3098 // sure that we were not expected a trailing surrogate.
3099 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
3100 {
3101 // Its trailing, so make sure we were expecting it
3102 if (!gotLeadingSurrogate)
3103 emitError(XMLErrs::Unexpected2ndSurrogateChar);
3104 }
3105 else
3106 {
3107 // Its just a char, so make sure we were not expecting a
3108 // trailing surrogate.
3109 if (gotLeadingSurrogate)
3110 emitError(XMLErrs::Expected2ndSurrogateChar);
3111
3112 // Its got to at least be a valid XML character
3113 else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
3114 {
3115 XMLCh tmpBuf[9];
3116 XMLString::binToText
3117 (
3118 nextCh
3119 , tmpBuf
3120 , 8
3121 , 16
3122 , fMemoryManager
3123 );
3124 emitError(XMLErrs::InvalidCharacter, tmpBuf);
3125 emittedError = true;
3126 }
3127 }
3128 gotLeadingSurrogate = false;
3129 }
3130 }
3131
3132 // Add it to the buffer
3133 bbCData.append(nextCh);
3134 }
3135 }
3136
3137
scanCharData(XMLBuffer & toUse)3138 void DGXMLScanner::scanCharData(XMLBuffer& toUse)
3139 {
3140 // We have to watch for the stupid ]]> sequence, which is illegal in
3141 // character data. So this is a little state machine that handles that.
3142 enum States
3143 {
3144 State_Waiting
3145 , State_GotOne
3146 , State_GotTwo
3147 };
3148
3149 // Reset the buffer before we start
3150 toUse.reset();
3151
3152 // Turn on the 'throw at end' flag of the reader manager
3153 ThrowEOEJanitor jan(&fReaderMgr, true);
3154
3155 // In order to be more efficient we have to use kind of a deeply nested
3156 // set of blocks here. The outer block puts on a try and catches end of
3157 // entity exceptions. The inner loop is the per-character loop. If we
3158 // put the try inside the inner loop, it would work but would require
3159 // the exception handling code setup/teardown code to be invoked for
3160 // each character.
3161 XMLCh nextCh;
3162 XMLCh secondCh = 0;
3163 States curState = State_Waiting;
3164 bool escaped = false;
3165 bool gotLeadingSurrogate = false;
3166 bool notDone = true;
3167 while (notDone)
3168 {
3169 try
3170 {
3171 while (true)
3172 {
3173 // Eat through as many plain content characters as possible without
3174 // needing special handling. Moving most content characters here,
3175 // in this one call, rather than running the overall loop once
3176 // per content character, is a speed optimization.
3177 if (curState == State_Waiting && !gotLeadingSurrogate)
3178 {
3179 fReaderMgr.movePlainContentChars(toUse);
3180 }
3181
3182 // Try to get another char from the source
3183 // The code from here on down covers all contengencies,
3184 if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
3185 {
3186 // If we were waiting for a trailing surrogate, its an error
3187 if (gotLeadingSurrogate)
3188 emitError(XMLErrs::Expected2ndSurrogateChar);
3189
3190 notDone = false;
3191 break;
3192 }
3193
3194 // Watch for a reference. Note that the escapement mechanism
3195 // is ignored in this content.
3196 escaped = false;
3197 if (nextCh == chAmpersand)
3198 {
3199 sendCharData(toUse);
3200
3201 // Turn off the throwing at the end of entity during this
3202 ThrowEOEJanitor jan(&fReaderMgr, false);
3203
3204 if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
3205 {
3206 gotLeadingSurrogate = false;
3207 continue;
3208 }
3209 else
3210 {
3211 if (escaped && !fElemStack.isEmpty())
3212 fElemStack.setReferenceEscaped();
3213 }
3214 }
3215 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
3216 {
3217 // Deal with surrogate pairs
3218 // Its a leading surrogate. If we already got one, then
3219 // issue an error, else set leading flag to make sure that
3220 // we look for a trailing next time.
3221 if (gotLeadingSurrogate)
3222 emitError(XMLErrs::Expected2ndSurrogateChar);
3223 else
3224 gotLeadingSurrogate = true;
3225 }
3226 else
3227 {
3228 // If its a trailing surrogate, make sure that we are
3229 // prepared for that. Else, its just a regular char so make
3230 // sure that we were not expected a trailing surrogate.
3231 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
3232 {
3233 // Its trailing, so make sure we were expecting it
3234 if (!gotLeadingSurrogate)
3235 emitError(XMLErrs::Unexpected2ndSurrogateChar);
3236 }
3237 else
3238 {
3239 // Its just a char, so make sure we were not expecting a
3240 // trailing surrogate.
3241 if (gotLeadingSurrogate)
3242 emitError(XMLErrs::Expected2ndSurrogateChar);
3243
3244 // Make sure the returned char is a valid XML char
3245 if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
3246 {
3247 XMLCh tmpBuf[9];
3248 XMLString::binToText
3249 (
3250 nextCh
3251 , tmpBuf
3252 , 8
3253 , 16
3254 , fMemoryManager
3255 );
3256 emitError(XMLErrs::InvalidCharacter, tmpBuf);
3257 }
3258 }
3259 gotLeadingSurrogate = false;
3260 }
3261
3262 // Keep the state machine up to date
3263 if (!escaped)
3264 {
3265 if (nextCh == chCloseSquare)
3266 {
3267 if (curState == State_Waiting)
3268 curState = State_GotOne;
3269 else if (curState == State_GotOne)
3270 curState = State_GotTwo;
3271 }
3272 else if (nextCh == chCloseAngle)
3273 {
3274 if (curState == State_GotTwo)
3275 emitError(XMLErrs::BadSequenceInCharData);
3276 curState = State_Waiting;
3277 }
3278 else
3279 {
3280 curState = State_Waiting;
3281 }
3282 }
3283 else
3284 {
3285 curState = State_Waiting;
3286 }
3287
3288 // Add this char to the buffer
3289 toUse.append(nextCh);
3290
3291 if (secondCh)
3292 {
3293 toUse.append(secondCh);
3294 secondCh=0;
3295 }
3296 }
3297 }
3298 catch(const EndOfEntityException& toCatch)
3299 {
3300 // Some entity ended, so we have to send any accumulated
3301 // chars and send an end of entity event.
3302 sendCharData(toUse);
3303 gotLeadingSurrogate = false;
3304
3305 if (fDocHandler)
3306 fDocHandler->endEntityReference(toCatch.getEntity());
3307 }
3308 }
3309
3310 // Check the validity constraints as per XML 1.0 Section 2.9
3311 if (fValidate && fStandalone)
3312 {
3313 // See if the text contains whitespace
3314 // Get the raw data we need for the callback
3315 const XMLCh* rawBuf = toUse.getRawBuffer();
3316 const XMLSize_t len = toUse.getLen();
3317 const bool isSpaces = fReaderMgr.getCurrentReader()->containsWhiteSpace(rawBuf, len);
3318
3319 if (isSpaces)
3320 {
3321 // And see if the current element is a 'Children' style content model
3322 const ElemStack::StackElem* topElem = fElemStack.topElement();
3323
3324 if (topElem->fThisElement->isExternal()) {
3325
3326 // Get the character data opts for the current element
3327 XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();
3328
3329 if (charOpts == XMLElementDecl::SpacesOk) // => Element Content
3330 {
3331 // Error - standalone should have a value of "no" as whitespace detected in an
3332 // element type with element content whose element declaration was external
3333 //
3334 fValidator->emitError(XMLValid::NoWSForStandalone);
3335 }
3336 }
3337 }
3338 }
3339 // Send any char data that we accumulated into the buffer
3340 sendCharData(toUse);
3341 }
3342
3343
3344 // This method will scan a general/character entity ref. It will either
3345 // expand a char ref and return it directly, or push a reader for a general
3346 // entity.
3347 //
3348 // The return value indicates whether the char parameters hold the value
3349 // or whether the value was pushed as a reader, or that it failed.
3350 //
3351 // The escaped flag tells the caller whether the returned parameter resulted
3352 // from a character reference, which escapes the character in some cases. It
3353 // only makes any difference if the return value indicates the value was
3354 // returned directly.
3355 DGXMLScanner::EntityExpRes
scanEntityRef(const bool inAttVal,XMLCh & firstCh,XMLCh & secondCh,bool & escaped)3356 DGXMLScanner::scanEntityRef( const bool inAttVal
3357 , XMLCh& firstCh
3358 , XMLCh& secondCh
3359 , bool& escaped)
3360 {
3361 // Assume no escape
3362 secondCh = 0;
3363 escaped = false;
3364
3365 // We have to insure that its all in one entity
3366 const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum();
3367
3368 // If the next char is a pound, then its a character reference and we
3369 // need to expand it always.
3370 if (fReaderMgr.skippedChar(chPound))
3371 {
3372 // Its a character reference, so scan it and get back the numeric
3373 // value it represents.
3374 if (!scanCharRef(firstCh, secondCh))
3375 return EntityExp_Failed;
3376
3377 escaped = true;
3378
3379 if (curReader != fReaderMgr.getCurrentReaderNum())
3380 emitError(XMLErrs::PartialMarkupInEntity);
3381
3382 return EntityExp_Returned;
3383 }
3384
3385 // Expand it since its a normal entity ref
3386 XMLBufBid bbName(&fBufMgr);
3387
3388 int colonPosition;
3389 bool validName = fDoNamespaces ? fReaderMgr.getQName(bbName.getBuffer(), &colonPosition) :
3390 fReaderMgr.getName(bbName.getBuffer());
3391 if (!validName)
3392 {
3393 if (bbName.isEmpty())
3394 emitError(XMLErrs::ExpectedEntityRefName);
3395 else
3396 emitError(XMLErrs::InvalidEntityRefName, bbName.getRawBuffer());
3397 return EntityExp_Failed;
3398 }
3399
3400 // Next char must be a semi-colon. But if its not, just emit
3401 // an error and try to continue.
3402 if (!fReaderMgr.skippedChar(chSemiColon))
3403 emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
3404
3405 // Make sure we ended up on the same entity reader as the & char
3406 if (curReader != fReaderMgr.getCurrentReaderNum())
3407 emitError(XMLErrs::PartialMarkupInEntity);
3408
3409 // Look up the name in the general entity pool
3410 XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
3411
3412 // If it does not exist, then obviously an error
3413 if (!decl)
3414 {
3415 // XML 1.0 Section 4.1
3416 // Well-formedness Constraint for entity not found:
3417 // In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references,
3418 // or a document with "standalone='yes'", for an entity reference that does not occur within the external subset
3419 // or a parameter entity
3420 //
3421 // Else it's Validity Constraint
3422 if (fStandalone || fHasNoDTD)
3423 emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
3424 else {
3425 if (fValidate)
3426 fValidator->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
3427 }
3428
3429 return EntityExp_Failed;
3430 }
3431
3432 // XML 1.0 Section 4.1
3433 // If we are a standalone document, then it has to have been declared
3434 // in the internal subset.
3435 if (fStandalone && !decl->getDeclaredInIntSubset())
3436 emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer());
3437
3438 if (decl->isExternal())
3439 {
3440 // If its unparsed, then its not valid here
3441 if (decl->isUnparsed())
3442 {
3443 emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
3444 return EntityExp_Failed;
3445 }
3446
3447 // If we are in an attribute value, then not valid but keep going
3448 if (inAttVal)
3449 emitError(XMLErrs::NoExtRefsInAttValue);
3450
3451 // And now create a reader to read this entity
3452 InputSource* srcUsed;
3453 XMLReader* reader = fReaderMgr.createReader
3454 (
3455 decl->getBaseURI()
3456 , decl->getSystemId()
3457 , decl->getPublicId()
3458 , false
3459 , XMLReader::RefFrom_NonLiteral
3460 , XMLReader::Type_General
3461 , XMLReader::Source_External
3462 , srcUsed
3463 , fCalculateSrcOfs
3464 , fLowWaterMark
3465 , fDisableDefaultEntityResolution
3466 );
3467
3468 // Put a janitor on the source so it gets cleaned up on exit
3469 Janitor<InputSource> janSrc(srcUsed);
3470
3471 // If the creation failed, and its not because the source was empty,
3472 // then emit an error and return.
3473 if (!reader)
3474 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
3475
3476 // Push the reader. If its a recursive expansion, then emit an error
3477 // and return an failure.
3478 if (!fReaderMgr.pushReader(reader, decl))
3479 {
3480 emitError(XMLErrs::RecursiveEntity, decl->getName());
3481 return EntityExp_Failed;
3482 }
3483
3484 // here's where we need to check if there's a SecurityManager,
3485 // how many entity references we've had
3486 if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) {
3487 XMLCh expLimStr[32];
3488 XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager);
3489 emitError
3490 (
3491 XMLErrs::EntityExpansionLimitExceeded
3492 , expLimStr
3493 );
3494 // there seems nothing better to do than reset the entity expansion counter
3495 fEntityExpansionCount = 0;
3496 }
3497
3498 // Do a start entity reference event.
3499 //
3500 // <TBD> For now, we supress them in att values. Later, when
3501 // the stuff is in place to correctly allow DOM to handle them
3502 // we'll turn this back on.
3503 if (fDocHandler && !inAttVal)
3504 fDocHandler->startEntityReference(*decl);
3505
3506 // If it starts with the XML string, then parse a text decl
3507 if (checkXMLDecl(true))
3508 scanXMLDecl(Decl_Text);
3509 }
3510 else
3511 {
3512 // If its one of the special char references, then we can return
3513 // it as a character, and its considered escaped.
3514 if (decl->getIsSpecialChar())
3515 {
3516 firstCh = decl->getValue()[0];
3517 escaped = true;
3518 return EntityExp_Returned;
3519 }
3520
3521 // Create a reader over a memory stream over the entity value
3522 // We force it to assume UTF-16 by passing in an encoding
3523 // string. This way it won't both trying to predecode the
3524 // first line, looking for an XML/TextDecl.
3525 XMLReader* valueReader = fReaderMgr.createIntEntReader
3526 (
3527 decl->getName()
3528 , XMLReader::RefFrom_NonLiteral
3529 , XMLReader::Type_General
3530 , decl->getValue()
3531 , decl->getValueLen()
3532 , false
3533 );
3534
3535 // Try to push the entity reader onto the reader manager stack,
3536 // where it will become the subsequent input. If it fails, that
3537 // means the entity is recursive, so issue an error. The reader
3538 // will have just been discarded, but we just keep going.
3539 if (!fReaderMgr.pushReader(valueReader, decl))
3540 emitError(XMLErrs::RecursiveEntity, decl->getName());
3541
3542 // here's where we need to check if there's a SecurityManager,
3543 // how many entity references we've had
3544 if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) {
3545 XMLCh expLimStr[32];
3546 XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager);
3547 emitError
3548 (
3549 XMLErrs::EntityExpansionLimitExceeded
3550 , expLimStr
3551 );
3552 }
3553
3554 // Do a start entity reference event.
3555 //
3556 // <TBD> For now, we supress them in att values. Later, when
3557 // the stuff is in place to correctly allow DOM to handle them
3558 // we'll turn this back on.
3559 if (fDocHandler && !inAttVal)
3560 fDocHandler->startEntityReference(*decl);
3561
3562 // If it starts with the XML string, then it's an error
3563 if (checkXMLDecl(true)) {
3564 emitError(XMLErrs::TextDeclNotLegalHere);
3565 fReaderMgr.skipPastChar(chCloseAngle);
3566 }
3567 }
3568 return EntityExp_Pushed;
3569 }
3570
3571
3572 XERCES_CPP_NAMESPACE_END
3573