1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * $Id$
20 */
21
22
23 // ---------------------------------------------------------------------------
24 // Includes
25 // ---------------------------------------------------------------------------
26 #include <xercesc/util/BinMemInputStream.hpp>
27 #include <xercesc/util/FlagJanitor.hpp>
28 #include <xercesc/util/Janitor.hpp>
29 #include <xercesc/util/XMLUniDefs.hpp>
30 #include <xercesc/util/ValueStackOf.hpp>
31 #include <xercesc/util/UnexpectedEOFException.hpp>
32 #include <xercesc/util/OutOfMemoryException.hpp>
33 #include <xercesc/sax/InputSource.hpp>
34 #include <xercesc/framework/XMLDocumentHandler.hpp>
35 #include <xercesc/framework/XMLEntityHandler.hpp>
36 #include <xercesc/framework/XMLValidator.hpp>
37 #include <xercesc/internal/EndOfEntityException.hpp>
38 #include <xercesc/internal/XMLScanner.hpp>
39 #include <xercesc/validators/common/ContentSpecNode.hpp>
40 #include <xercesc/validators/common/MixedContentModel.hpp>
41 #include <xercesc/validators/DTD/DTDEntityDecl.hpp>
42 #include <xercesc/validators/DTD/DocTypeHandler.hpp>
43 #include <xercesc/validators/DTD/DTDScanner.hpp>
44
45 XERCES_CPP_NAMESPACE_BEGIN
46
47 #define CONTENTSPEC_DEPTH_LIMIT 1000
48
49 // ---------------------------------------------------------------------------
50 // Local methods
51 // ---------------------------------------------------------------------------
52 //
53 // This method automates the grunt work of looking at a char and see if its
54 // a repetition suffix. If so, it creates a new correct rep node and wraps
55 // the pass node in it. Otherwise, it returns the previous node.
56 //
makeRepNode(const XMLCh testCh,ContentSpecNode * const prevNode,MemoryManager * const manager)57 static ContentSpecNode* makeRepNode(const XMLCh testCh,
58 ContentSpecNode* const prevNode,
59 MemoryManager* const manager)
60 {
61 if (testCh == chQuestion)
62 {
63 return new (manager) ContentSpecNode
64 (
65 ContentSpecNode::ZeroOrOne
66 , prevNode
67 , 0
68 , true
69 , true
70 , manager
71 );
72 }
73 else if (testCh == chPlus)
74 {
75 return new (manager) ContentSpecNode
76 (
77 ContentSpecNode::OneOrMore
78 , prevNode
79 , 0
80 , true
81 , true
82 , manager
83 );
84 }
85 else if (testCh == chAsterisk)
86 {
87 return new (manager) ContentSpecNode
88 (
89 ContentSpecNode::ZeroOrMore
90 , prevNode
91 , 0
92 , true
93 , true
94 , manager
95 );
96 }
97
98 // Just return the incoming node
99 return prevNode;
100 }
101
102 // ---------------------------------------------------------------------------
//  DTDScanner: Constructors and Destructor
104 // ---------------------------------------------------------------------------
DTDScanner(DTDGrammar * dtdGrammar,DocTypeHandler * const docTypeHandler,MemoryManager * const grammarPoolMemoryManager,MemoryManager * const manager)105 DTDScanner::DTDScanner( DTDGrammar* dtdGrammar
106 , DocTypeHandler* const docTypeHandler
107 , MemoryManager* const grammarPoolMemoryManager
108 , MemoryManager* const manager) :
109 fMemoryManager(manager)
110 , fGrammarPoolMemoryManager(grammarPoolMemoryManager)
111 , fDocTypeHandler(docTypeHandler)
112 , fDumAttDef(0)
113 , fDumElemDecl(0)
114 , fDumEntityDecl(0)
115 , fInternalSubset(false)
116 , fNextAttrId(1)
117 , fDTDGrammar(dtdGrammar)
118 , fBufMgr(0)
119 , fReaderMgr(0)
120 , fScanner(0)
121 , fPEntityDeclPool(0)
122 , fEmptyNamespaceId(0)
123 , fDocTypeReaderId(0)
124 {
125 fPEntityDeclPool = new (fMemoryManager) NameIdPool<DTDEntityDecl>(109, 128, fMemoryManager);
126 }
127
~DTDScanner()128 DTDScanner::~DTDScanner()
129 {
130 delete fDumAttDef;
131 delete fDumElemDecl;
132 delete fDumEntityDecl;
133 delete fPEntityDeclPool;
134 }
135
136 // -----------------------------------------------------------------------
137 // Setter methods
138 // -----------------------------------------------------------------------
setScannerInfo(XMLScanner * const owningScanner,ReaderMgr * const readerMgr,XMLBufferMgr * const bufMgr)139 void DTDScanner::setScannerInfo(XMLScanner* const owningScanner
140 , ReaderMgr* const readerMgr
141 , XMLBufferMgr* const bufMgr)
142 {
143 // We don't own any of these, we just reference them
144 fScanner = owningScanner;
145 fReaderMgr = readerMgr;
146 fBufMgr = bufMgr;
147
148 if (fScanner->getDoNamespaces())
149 fEmptyNamespaceId = fScanner->getEmptyNamespaceId();
150 else
151 fEmptyNamespaceId = 0;
152
153 fDocTypeReaderId = fReaderMgr->getCurrentReaderNum();
154 }
155
156
157 // ---------------------------------------------------------------------------
158 // DTDScanner: Private scanning methods
159 // ---------------------------------------------------------------------------
checkForPERef(const bool inLiteral,const bool inMarkup)160 bool DTDScanner::checkForPERef( const bool inLiteral
161 , const bool inMarkup)
162 {
163 bool gotSpace = false;
164
165 //
166 // See if we have any spaces up front. If so, then skip them and set
167 // the gotSpaces flag.
168 //
169 if (fReaderMgr->skippedSpace())
170 {
171 fReaderMgr->skipPastSpaces();
172 gotSpace = true;
173 }
174
175 // If the next char is a percent, then expand the PERef
176 if (!fReaderMgr->skippedChar(chPercent))
177 return gotSpace;
178
179 while (true)
180 {
181 if (!expandPERef(false, inLiteral, inMarkup, false))
182 fScanner->emitError(XMLErrs::ExpectedEntityRefName);
183 // And skip any more spaces in the expanded value
184 if (fReaderMgr->skippedSpace())
185 {
186 fReaderMgr->skipPastSpaces();
187 gotSpace = true;
188 }
189 if (!fReaderMgr->skippedChar(chPercent))
190 break;
191 }
192 return gotSpace;
193 }
194
195
expandPERef(const bool scanExternal,const bool inLiteral,const bool inMarkup,const bool throwEndOfExt)196 bool DTDScanner::expandPERef( const bool scanExternal
197 , const bool inLiteral
198 , const bool inMarkup
199 , const bool throwEndOfExt)
200 {
201 fScanner->setHasNoDTD(false);
202 XMLBufBid bbName(fBufMgr);
203
204 //
205 // If we are in the internal subset and in markup, then this is
206 // an error but we go ahead and do it anyway.
207 //
208 if (fInternalSubset && inMarkup)
209 fScanner->emitError(XMLErrs::PERefInMarkupInIntSubset);
210
211 if (!fReaderMgr->getName(bbName.getBuffer()))
212 {
213 fScanner->emitError(XMLErrs::ExpectedPEName);
214
215 // Skip the semicolon if that's what we ended up on
216 fReaderMgr->skippedChar(chSemiColon);
217 return false;
218 }
219
220 // If no terminating semicolon, emit an error but try to keep going
221 if (!fReaderMgr->skippedChar(chSemiColon))
222 fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
223
224 //
225 // Look it up in the PE decl pool and see if it exists. If not, just
226 // emit an error and continue.
227 //
228 XMLEntityDecl* decl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
229 if (!decl)
230 {
231 // XML 1.0 Section 4.1
232 if (fScanner->getStandalone()) {
233 // no need to check fScanner->fHasNoDTD which is for sure false
234 // since we are in expandPERef already
235 fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
236 }
237 else {
238 if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
239 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
240 }
241
242 return false;
243 }
244
245 //
246 // XML 1.0 Section 2.9
247 // If we are a standalone document, then it has to have been declared
248 // in the internal subset. Keep going though.
249 //
250 if (fScanner->getValidationScheme() == XMLScanner::Val_Always && fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
251 fScanner->getValidator()->emitError(XMLValid::VC_IllegalRefInStandalone, bbName.getRawBuffer());
252
253 //
254 // Okee dokee, we found it. So create either a memory stream with
255 // the entity value contents, or a file stream if its an external
256 // entity.
257 //
258 if (decl->isExternal())
259 {
260 // And now create a reader to read this entity
261 InputSource* srcUsed;
262 XMLReader* reader = fReaderMgr->createReader
263 (
264 decl->getBaseURI()
265 , decl->getSystemId()
266 , decl->getPublicId()
267 , false
268 , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
269 , XMLReader::Type_PE
270 , XMLReader::Source_External
271 , srcUsed
272 , fScanner->getCalculateSrcOfs()
273 , fScanner->getLowWaterMark()
274 , fScanner->getDisableDefaultEntityResolution()
275 );
276
277 // Put a janitor on the source so its cleaned up on exit
278 Janitor<InputSource> janSrc(srcUsed);
279
280 // If the creation failed then throw an exception
281 if (!reader)
282 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
283
284 // Set the 'throw at end' flag, to the one we were given
285 reader->setThrowAtEnd(throwEndOfExt);
286
287 //
288 // Push the reader. If its a recursive expansion, then emit an error
289 // and return an failure.
290 //
291 if (!fReaderMgr->pushReader(reader, decl))
292 {
293 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
294 return false;
295 }
296
297 //
298 // If the caller wants us to scan the external entity, then lets
299 // do that now.
300 //
301 if (scanExternal)
302 {
303 XMLEntityHandler* entHandler = fScanner->getEntityHandler();
304
305 // If we have an entity handler, tell it we are starting this entity
306 if (entHandler)
307 entHandler->startInputSource(*srcUsed);
308
309 //
310 // Scan the external entity now. The parameter tells it that
311 // it is not in an include section. Get the current reader
312 // level so we can catch partial markup errors and be sure
313 // to get back to here if we get an exception out of the
314 // ext subset scan.
315 //
316 const XMLSize_t readerNum = fReaderMgr->getCurrentReaderNum();
317 try
318 {
319 scanExtSubsetDecl(false, false);
320 }
321 catch(const OutOfMemoryException&)
322 {
323 throw;
324 }
325 catch(...)
326 {
327 // Pop the reader back to the original level
328 fReaderMgr->cleanStackBackTo(readerNum);
329
330 // End the input source, even though its not happy
331 if (entHandler)
332 entHandler->endInputSource(*srcUsed);
333 throw;
334 }
335
336 // If we have an entity handler, tell it we are ending this entity
337 if (entHandler)
338 entHandler->endInputSource(*srcUsed);
339 }
340 else {
341 // If it starts with the XML string, then parse a text decl
342 if (fScanner->checkXMLDecl(true))
343 scanTextDecl();
344 }
345 }
346 else
347 {
348 // Create a reader over a memory stream over the entity value
349 XMLReader* valueReader = fReaderMgr->createIntEntReader
350 (
351 decl->getName()
352 , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
353 , XMLReader::Type_PE
354 , decl->getValue()
355 , decl->getValueLen()
356 , false
357 );
358
359 //
360 // Trt to push the entity reader onto the reader manager stack,
361 // where it will become the subsequent input. If it fails, that
362 // means the entity is recursive, so issue an error. The reader
363 // will have just been discarded, but we just keep going.
364 //
365 if (!fReaderMgr->pushReader(valueReader, decl))
366 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
367 }
368
369 return true;
370 }
371
372
getQuotedString(XMLBuffer & toFill)373 bool DTDScanner::getQuotedString(XMLBuffer& toFill)
374 {
375 // Reset the target buffer
376 toFill.reset();
377
378 // Get the next char which must be a single or double quote
379 XMLCh quoteCh;
380 if (!fReaderMgr->skipIfQuote(quoteCh))
381 return false;
382
383 XMLCh nextCh;
384 // Get another char and see if it matches the starting quote char
385 while ((nextCh=fReaderMgr->getNextChar())!=quoteCh)
386 {
387 //
388 // We should never get either an end of file null char here. If we
389 // do, just fail. It will be handled more gracefully in the higher
390 // level code that called us.
391 //
392 if (!nextCh)
393 return false;
394
395 // Else add it to the buffer
396 toFill.append(nextCh);
397 }
398 return true;
399 }
400
401
402 XMLAttDef*
scanAttDef(DTDElementDecl & parentElem,XMLBuffer & bufToUse)403 DTDScanner::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse)
404 {
405 // Check for PE ref or optional whitespace
406 checkForPERef(false, true);
407
408 // Get the name of the attribute
409 if (!fReaderMgr->getName(bufToUse))
410 {
411 fScanner->emitError(XMLErrs::ExpectedAttrName);
412 return 0;
413 }
414
415 //
416 // Look up this attribute in the parent element's attribute list. If
417 // it already exists, then use the dummy.
418 //
419 DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer());
420 if (decl)
421 {
422 // It already exists, so put out a warning
423 fScanner->emitError
424 (
425 XMLErrs::AttListAlreadyExists
426 , bufToUse.getRawBuffer()
427 , parentElem.getFullName()
428 );
429
430 // Use the dummy decl to parse into and set its name to the name we got
431 if (!fDumAttDef)
432 {
433 fDumAttDef = new (fMemoryManager) DTDAttDef(fMemoryManager);
434 fDumAttDef->setId(fNextAttrId++);
435 }
436 fDumAttDef->setName(bufToUse.getRawBuffer());
437 decl = fDumAttDef;
438 }
439 else
440 {
441 //
442 // It does not already exist so create a new one, give it the next
443 // available unique id, and add it
444 //
445 decl = new (fGrammarPoolMemoryManager) DTDAttDef
446 (
447 bufToUse.getRawBuffer()
448 , XMLAttDef::CData
449 , XMLAttDef::Implied
450 , fGrammarPoolMemoryManager
451 );
452 decl->setId(fNextAttrId++);
453 decl->setExternalAttDeclaration(isReadingExternalEntity());
454 parentElem.addAttDef(decl);
455 }
456
457 // Set a flag to indicate whether we are doing a dummy parse
458 const bool isIgnored = (decl == fDumAttDef);
459
460 // Space is required here, so check for PE ref, and require space
461 if (!checkForPERef(false, true))
462 fScanner->emitError(XMLErrs::ExpectedWhitespace);
463
464 //
465 // Next has to be one of the attribute type strings. This tells us what
466 // is to follow.
467 //
468 if (fReaderMgr->skippedString(XMLUni::fgCDATAString))
469 {
470 decl->setType(XMLAttDef::CData);
471 }
472 else if (fReaderMgr->skippedString(XMLUni::fgIDString))
473 {
474 if (!fReaderMgr->skippedString(XMLUni::fgRefString))
475 decl->setType(XMLAttDef::ID);
476 else if (!fReaderMgr->skippedChar(chLatin_S))
477 decl->setType(XMLAttDef::IDRef);
478 else
479 decl->setType(XMLAttDef::IDRefs);
480 }
481 else if (fReaderMgr->skippedString(XMLUni::fgEntitString))
482 {
483 if (fReaderMgr->skippedChar(chLatin_Y))
484 {
485 decl->setType(XMLAttDef::Entity);
486 }
487 else if (fReaderMgr->skippedString(XMLUni::fgIESString))
488 {
489 decl->setType(XMLAttDef::Entities);
490 }
491 else
492 {
493 fScanner->emitError
494 (
495 XMLErrs::ExpectedAttributeType
496 , decl->getFullName()
497 , parentElem.getFullName()
498 );
499 return 0;
500 }
501 }
502 else if (fReaderMgr->skippedString(XMLUni::fgNmTokenString))
503 {
504 if (fReaderMgr->skippedChar(chLatin_S))
505 decl->setType(XMLAttDef::NmTokens);
506 else
507 decl->setType(XMLAttDef::NmToken);
508 }
509 else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
510 {
511 // Check for PE ref and require space
512 if (!checkForPERef(false, true))
513 fScanner->emitError(XMLErrs::ExpectedWhitespace);
514
515 decl->setType(XMLAttDef::Notation);
516 if (!scanEnumeration(*decl, bufToUse, true))
517 return 0;
518
519 // Set the value as the enumeration for this decl
520 decl->setEnumeration(bufToUse.getRawBuffer());
521 }
522 else if (fReaderMgr->skippedChar(chOpenParen))
523 {
524 decl->setType(XMLAttDef::Enumeration);
525 if (!scanEnumeration(*decl, bufToUse, false))
526 return 0;
527
528 // Set the value as the enumeration for this decl
529 decl->setEnumeration(bufToUse.getRawBuffer());
530 }
531 else
532 {
533 fScanner->emitError
534 (
535 XMLErrs::ExpectedAttributeType
536 , decl->getFullName()
537 , parentElem.getFullName()
538 );
539 return 0;
540 }
541
542 // Space is required here, so check for PE ref, and require space
543 if (!checkForPERef(false, true))
544 fScanner->emitError(XMLErrs::ExpectedWhitespace);
545
546 // And then scan for the optional default value declaration
547 scanDefaultDecl(*decl);
548
549 // If validating, then do a couple of validation constraints
550 if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
551 {
552 if (decl->getType() == XMLAttDef::ID)
553 {
554 if ((decl->getDefaultType() != XMLAttDef::Implied)
555 && (decl->getDefaultType() != XMLAttDef::Required))
556 {
557 fScanner->getValidator()->emitError(XMLValid::BadIDAttrDefType, decl->getFullName());
558 }
559 }
560
561 // if attdef is xml:space, check correct enumeration (default|preserve)
562 const XMLCh fgXMLSpace[] = { chLatin_x, chLatin_m, chLatin_l, chColon, chLatin_s, chLatin_p, chLatin_a, chLatin_c, chLatin_e, chNull };
563
564 if (XMLString::equals(decl->getFullName(),fgXMLSpace)) {
565 const XMLCh fgPreserve[] = { chLatin_p, chLatin_r, chLatin_e, chLatin_s, chLatin_e, chLatin_r, chLatin_v, chLatin_e, chNull };
566 const XMLCh fgDefault[] = { chLatin_d, chLatin_e, chLatin_f, chLatin_a, chLatin_u, chLatin_l, chLatin_t, chNull };
567 bool ok = false;
568 if (decl->getType() == XMLAttDef::Enumeration) {
569 BaseRefVectorOf<XMLCh>* enumVector = XMLString::tokenizeString(decl->getEnumeration(), fMemoryManager);
570 XMLSize_t size = enumVector->size();
571 ok = (size == 1 &&
572 (XMLString::equals(enumVector->elementAt(0), fgDefault) ||
573 XMLString::equals(enumVector->elementAt(0), fgPreserve))) ||
574 (size == 2 &&
575 (XMLString::equals(enumVector->elementAt(0), fgDefault) &&
576 XMLString::equals(enumVector->elementAt(1), fgPreserve))) ||
577 (size == 2 &&
578 (XMLString::equals(enumVector->elementAt(1), fgDefault) &&
579 XMLString::equals(enumVector->elementAt(0), fgPreserve)));
580 delete enumVector;
581 }
582 if (!ok)
583 fScanner->getValidator()->emitError(XMLValid::IllegalXMLSpace);
584 }
585 }
586
587 // If we have a doc type handler, tell it about this attdef.
588 if (fDocTypeHandler)
589 fDocTypeHandler->attDef(parentElem, *decl, isIgnored);
590 return decl;
591 }
592
593
scanAttListDecl()594 void DTDScanner::scanAttListDecl()
595 {
596 // Space is required here, so check for a PE ref
597 if (!checkForPERef(false, true))
598 {
599 fScanner->emitError(XMLErrs::ExpectedWhitespace);
600 fReaderMgr->skipPastChar(chCloseAngle);
601 return;
602 }
603
604 //
605 // Next should be the name of the element it belongs to, so get a buffer
606 // and get the name into it.
607 //
608 XMLBufBid bbName(fBufMgr);
609 if (!fReaderMgr->getName(bbName.getBuffer()))
610 {
611 fScanner->emitError(XMLErrs::ExpectedElementName);
612 fReaderMgr->skipPastChar(chCloseAngle);
613 return;
614 }
615
616 //
617 // Find this element's declaration. If it has not been declared yet,
618 // we will force one into the list, but not mark it as declared.
619 //
620 DTDElementDecl* elemDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
621 if (!elemDecl)
622 {
623 //
624 // Lets fault in a declaration and add it to the pool. We mark
625 // it having been created because of an attlist. Later, if its
626 // declared, this will be updated.
627 //
628 elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
629 (
630 bbName.getRawBuffer()
631 , fEmptyNamespaceId
632 , DTDElementDecl::Any
633 , fGrammarPoolMemoryManager
634 );
635 elemDecl->setCreateReason(XMLElementDecl::AttList);
636 elemDecl->setExternalElemDeclaration(isReadingExternalEntity());
637 fDTDGrammar->putElemDecl((XMLElementDecl*) elemDecl);
638 }
639
640 // If we have a doc type handler, tell it the att list is starting
641 if (fDocTypeHandler)
642 fDocTypeHandler->startAttList(*elemDecl);
643
644 //
645 // Now we loop until we are done with all of the attributes in this
646 // list. We need a buffer to use for local processing.
647 //
648 XMLBufBid bbTmp(fBufMgr);
649 XMLBuffer& tmpBuf = bbTmp.getBuffer();
650 bool seenAnId = false;
651 while (true)
652 {
653 // Get the next char out and see what it tells us to do
654 const XMLCh nextCh = fReaderMgr->peekNextChar();
655
656 // Watch for EOF
657 if (!nextCh)
658 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
659
660 if (nextCh == chCloseAngle)
661 {
662 // We are done with this attribute list
663 fReaderMgr->getNextChar();
664 break;
665 }
666 else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
667 {
668 //
669 // If advanced callbacks are enabled and we have a doc
670 // type handler, then gather up the white space and call
671 // back on the doctype handler. Otherwise, just skip
672 // whitespace.
673 //
674 if (fDocTypeHandler)
675 {
676 fReaderMgr->getSpaces(tmpBuf);
677 fDocTypeHandler->doctypeWhitespace
678 (
679 tmpBuf.getRawBuffer()
680 , tmpBuf.getLen()
681 );
682 }
683 else
684 {
685 fReaderMgr->skipPastSpaces();
686 }
687 }
688 else if (nextCh == chPercent)
689 {
690 // Eat the percent and expand the ref
691 fReaderMgr->getNextChar();
692 expandPERef(false, false, true);
693 }
694 else
695 {
696 //
697 // It must be an attribute name, so scan it. We let
698 // it use our local buffer for its name scanning.
699 //
700 XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf);
701
702 if (!attDef)
703 {
704 fReaderMgr->skipPastChar(chCloseAngle);
705 break;
706 }
707
708 //
709 // If we are validating and its an ID type, then we have to
710 // make sure that we have not seen an id attribute yet. Set
711 // the flag to say that we've seen one now also.
712 //
713 if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
714 {
715 if (attDef->getType() == XMLAttDef::ID)
716 {
717 if (seenAnId)
718 fScanner->getValidator()->emitError(XMLValid::MultipleIdAttrs, elemDecl->getFullName());
719 seenAnId = true;
720 }
721 }
722 }
723 }
724
725 // If we have a doc type handler, tell it the att list is ending
726 if (fDocTypeHandler)
727 fDocTypeHandler->endAttList(*elemDecl);
728 }
729
730
731 //
732 // This method is called to scan the value of an attribute in content. This
733 // involves some normalization and replacement of general entity and
734 // character references.
735 //
736 // End of entity's must be dealt with here. During DTD scan, they can come
737 // from external entities. During content, they can come from any entity.
738 // We just eat the end of entity and continue with our scan until we come
739 // to the closing quote. If an unterminated value causes us to go through
740 // subsequent entities, that will cause errors back in the calling code,
741 // but there's little we can do about it here.
742 //
scanAttValue(const XMLCh * const attrName,XMLBuffer & toFill,const XMLAttDef::AttTypes type)743 bool DTDScanner::scanAttValue(const XMLCh* const attrName
744 , XMLBuffer& toFill
745 , const XMLAttDef::AttTypes type)
746 {
747 enum States
748 {
749 InWhitespace
750 , InContent
751 };
752
753 // Reset the target buffer
754 toFill.reset();
755
756 // Get the next char which must be a single or double quote
757 XMLCh quoteCh;
758 if (!fReaderMgr->skipIfQuote(quoteCh))
759 return false;
760
761 //
762 // We have to get the current reader because we have to ignore closing
763 // quotes until we hit the same reader again.
764 //
765 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
766
767 //
768 // Loop until we get the attribute value. Note that we use a double
769 // loop here to avoid the setup/teardown overhead of the exception
770 // handler on every round.
771 //
772 XMLCh nextCh;
773 XMLCh secondCh = 0;
774 States curState = InContent;
775 bool firstNonWS = false;
776 bool gotLeadingSurrogate = false;
777 bool escaped;
778 while (true)
779 {
780 try
781 {
782 while(true)
783 {
784 nextCh = fReaderMgr->getNextChar();
785
786 if (!nextCh)
787 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
788
789 // Check for our ending quote in the same entity
790 if (nextCh == quoteCh)
791 {
792 if (curReader == fReaderMgr->getCurrentReaderNum())
793 return true;
794
795 // Watch for spillover into a previous entity
796 if (curReader > fReaderMgr->getCurrentReaderNum())
797 {
798 fScanner->emitError(XMLErrs::PartialMarkupInEntity);
799 return false;
800 }
801 }
802
803 //
804 // Check for an entity ref now, before we let it affect our
805 // whitespace normalization logic below. We ignore the empty flag
806 // in this one.
807 //
808 escaped = false;
809 if (nextCh == chAmpersand)
810 {
811 if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned)
812 {
813 gotLeadingSurrogate = false;
814 continue;
815 }
816 }
817 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
818 {
819 // Check for correct surrogate pairs
820 if (gotLeadingSurrogate)
821 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
822 else
823 gotLeadingSurrogate = true;
824 }
825 else
826 {
827 if (gotLeadingSurrogate)
828 {
829 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
830 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
831 }
832 // Its got to at least be a valid XML character
833 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
834 {
835 XMLCh tmpBuf[9];
836 XMLString::binToText
837 (
838 nextCh
839 , tmpBuf
840 , 8
841 , 16
842 , fMemoryManager
843 );
844 fScanner->emitError
845 (
846 XMLErrs::InvalidCharacterInAttrValue
847 , attrName
848 , tmpBuf
849 );
850 }
851
852 gotLeadingSurrogate = false;
853 }
854
855 //
856 // If its not escaped, then make sure its not a < character, which
857 // is not allowed in attribute values.
858 //
859 if (!escaped && (nextCh == chOpenAngle))
860 fScanner->emitError(XMLErrs::BracketInAttrValue, attrName);
861
862 //
863 // If the attribute is a CDATA type we do simple replacement of
864 // tabs and new lines with spaces, if the character is not escaped
865 // by way of a char ref.
866 //
867 // Otherwise, we do the standard non-CDATA normalization of
868 // compressing whitespace to single spaces and getting rid of
869 // leading and trailing whitespace.
870 //
871 if (type == XMLAttDef::CData)
872 {
873 if (!escaped)
874 {
875 if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
876 nextCh = chSpace;
877 }
878 }
879 else
880 {
881 if (curState == InWhitespace)
882 {
883 if (!fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
884 {
885 if (firstNonWS)
886 toFill.append(chSpace);
887 curState = InContent;
888 firstNonWS = true;
889 }
890 else
891 {
892 continue;
893 }
894 }
895 else if (curState == InContent)
896 {
897 if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
898 {
899 curState = InWhitespace;
900 continue;
901 }
902 firstNonWS = true;
903 }
904 }
905
906 // Else add it to the buffer
907 toFill.append(nextCh);
908
909 if (secondCh)
910 {
911 toFill.append(secondCh);
912 secondCh=0;
913 }
914 }
915 }
916
917 catch(const EndOfEntityException&)
918 {
919 // Just eat it and continue.
920 gotLeadingSurrogate = false;
921 escaped = false;
922 }
923 }
924 return true;
925 }
926
927
scanCharRef(XMLCh & first,XMLCh & second)928 bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second)
929 {
930 bool gotOne = false;
931 unsigned int value = 0;
932
933 //
934 // Set the radix. Its supposed to be a lower case x if hex. But, in
935 // order to recover well, we check for an upper and put out an error
936 // for that.
937 //
938 unsigned int radix = 10;
939
940 if (fReaderMgr->skippedChar(chLatin_x))
941 {
942 radix = 16;
943 }
944 else if (fReaderMgr->skippedChar(chLatin_X))
945 {
946 fScanner->emitError(XMLErrs::HexRadixMustBeLowerCase);
947 radix = 16;
948 }
949
950 while (true)
951 {
952 const XMLCh nextCh = fReaderMgr->peekNextChar();
953
954 // Watch for EOF
955 if (!nextCh)
956 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
957
958 // Break out on the terminating semicolon
959 if (nextCh == chSemiColon)
960 {
961 fReaderMgr->getNextChar();
962 break;
963 }
964
965 //
966 // Convert this char to a binary value, or bail out if its not
967 // one.
968 //
969 unsigned int nextVal;
970 if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
971 nextVal = (unsigned int)(nextCh - chDigit_0);
972 else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
973 nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
974 else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
975 nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
976 else
977 {
978 //
979 // If we got at least a sigit, then do an unterminated ref
980 // error. Else, do an expected a numerical ref thing.
981 //
982 if (gotOne)
983 fScanner->emitError(XMLErrs::UnterminatedCharRef);
984 else
985 fScanner->emitError(XMLErrs::ExpectedNumericalCharRef);
986
987 return false;
988 }
989
990 //
991 // Make sure its valid for the radix. If not, then just eat the
992 // digit and go on after issueing an error. Else, update the
993 // running value with this new digit.
994 //
995 if (nextVal >= radix)
996 {
997 XMLCh tmpStr[2];
998 tmpStr[0] = nextCh;
999 tmpStr[1] = chNull;
1000 fScanner->emitError(XMLErrs::BadDigitForRadix, tmpStr);
1001 }
1002 else
1003 {
1004 value = (value * radix) + nextVal;
1005 }
1006
1007 // Indicate that we got at least one good digit
1008 gotOne = true;
1009
1010 // Eat the char we just processed
1011 fReaderMgr->getNextChar();
1012 }
1013
1014 // Return the char (or chars)
1015 // And check if the character expanded is valid or not
1016 if (value >= 0x10000 && value <= 0x10FFFF)
1017 {
1018 value -= 0x10000;
1019 first = XMLCh((value >> 10) + 0xD800);
1020 second = XMLCh((value & 0x3FF) + 0xDC00);
1021 }
1022 else if (value <= 0xFFFD)
1023 {
1024 first = XMLCh(value);
1025 second = 0;
1026 if (!fReaderMgr->getCurrentReader()->isXMLChar(first) && !fReaderMgr->getCurrentReader()->isControlChar(first)) {
1027 // Character reference was not in the valid range
1028 fScanner->emitError(XMLErrs::InvalidCharacterRef);
1029 return false;
1030 }
1031 }
1032 else {
1033 // Character reference was not in the valid range
1034 fScanner->emitError(XMLErrs::InvalidCharacterRef);
1035 return false;
1036 }
1037
1038 return true;
1039 }
1040
1041
1042 ContentSpecNode*
scanChildren(const DTDElementDecl & elemDecl,XMLBuffer & bufToUse,unsigned int & depth)1043 DTDScanner::scanChildren(const DTDElementDecl& elemDecl, XMLBuffer& bufToUse, unsigned int& depth)
1044 {
1045 if (depth++ > CONTENTSPEC_DEPTH_LIMIT) {
1046 fScanner->emitError(XMLErrs::UnterminatedDOCTYPE);
1047 return 0;
1048 }
1049
1050 // Check for a PE ref here, but don't require spaces
1051 checkForPERef(false, true);
1052
1053 ValueStackOf<XMLSize_t>* arrNestedDecl=NULL;
1054 //
1055 // We know that the caller just saw an opening parenthesis, so we need
1056 // to parse until we hit the end of it; if we find several parenthesis,
1057 // store them in an array to be processed later.
1058 //
1059 // We have to check for one up front, since it could be something like
1060 // (((a)*)) etc...
1061 //
1062 ContentSpecNode* curNode = 0;
1063 while(fReaderMgr->skippedChar(chOpenParen))
1064 {
1065 // to check entity nesting
1066 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1067 if(arrNestedDecl==NULL)
1068 arrNestedDecl=new (fMemoryManager) ValueStackOf<XMLSize_t>(5, fMemoryManager);
1069 arrNestedDecl->push(curReader);
1070
1071 // Check for a PE ref here, but don't require spaces
1072 checkForPERef(false, true);
1073 }
1074
1075 // We must find a leaf node here, either standalone or nested in the parenthesis
1076 if (!fReaderMgr->getName(bufToUse))
1077 {
1078 fScanner->emitError(XMLErrs::ExpectedElementName);
1079 return 0;
1080 }
1081
1082 //
1083 // Create a leaf node for it. If we can find the element id for
1084 // this element, then use it. Else, we have to fault in an element
1085 // decl, marked as created because of being in a content model.
1086 //
1087 XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
1088 if (!decl)
1089 {
1090 decl = new (fGrammarPoolMemoryManager) DTDElementDecl
1091 (
1092 bufToUse.getRawBuffer()
1093 , fEmptyNamespaceId
1094 , DTDElementDecl::Any
1095 , fGrammarPoolMemoryManager
1096 );
1097 decl->setCreateReason(XMLElementDecl::InContentModel);
1098 decl->setExternalElemDeclaration(isReadingExternalEntity());
1099 fDTDGrammar->putElemDecl(decl);
1100 }
1101 curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
1102 (
1103 decl->getElementName()
1104 , fGrammarPoolMemoryManager
1105 );
1106
1107 // Check for a PE ref here, but don't require spaces
1108 const bool gotSpaces = checkForPERef(false, true);
1109
1110 // Check for a repetition character after the leaf
1111 XMLCh repCh = fReaderMgr->peekNextChar();
1112 ContentSpecNode* tmpNode = makeRepNode(repCh, curNode, fGrammarPoolMemoryManager);
1113 if (tmpNode != curNode)
1114 {
1115 if (gotSpaces)
1116 {
1117 if (fScanner->emitErrorWillThrowException(XMLErrs::UnexpectedWhitespace))
1118 {
1119 delete tmpNode;
1120 }
1121 fScanner->emitError(XMLErrs::UnexpectedWhitespace);
1122 }
1123 fReaderMgr->getNextChar();
1124 curNode = tmpNode;
1125 }
1126
1127 while(arrNestedDecl==NULL || !arrNestedDecl->empty())
1128 {
1129 // Check for a PE ref here, but don't require spaces
1130 checkForPERef(false, true);
1131
1132 //
1133 // Ok, the next character tells us what kind of content this particular
1134 // model this particular parentesized section is. Its either a choice if
1135 // we see ',', a sequence if we see '|', or a single leaf node if we see
1136 // a closing paren.
1137 //
1138 const XMLCh opCh = fReaderMgr->peekNextChar();
1139
1140 if ((opCh != chComma)
1141 && (opCh != chPipe)
1142 && (opCh != chCloseParen))
1143 {
1144 // Not a legal char, so delete our node and return failure
1145 delete curNode;
1146 fScanner->emitError(XMLErrs::ExpectedSeqChoiceLeaf);
1147 return 0;
1148 }
1149
1150 //
1151 // Create the head node of the correct type. We need this to remember
1152 // the top of the local tree. If it was a single subexpr, then just
1153 // set the head node to the current node. For the others, we'll build
1154 // the tree off the second child as we move across.
1155 //
1156 ContentSpecNode* headNode = 0;
1157 ContentSpecNode::NodeTypes curType = ContentSpecNode::UnknownType;
1158 if (opCh == chComma)
1159 {
1160 curType = ContentSpecNode::Sequence;
1161 headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
1162 (
1163 curType
1164 , curNode
1165 , 0
1166 , true
1167 , true
1168 , fGrammarPoolMemoryManager
1169 );
1170 curNode = headNode;
1171 }
1172 else if (opCh == chPipe)
1173 {
1174 curType = ContentSpecNode::Choice;
1175 headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
1176 (
1177 curType
1178 , curNode
1179 , 0
1180 , true
1181 , true
1182 , fGrammarPoolMemoryManager
1183 );
1184 curNode = headNode;
1185 }
1186 else
1187 {
1188 headNode = curNode;
1189 fReaderMgr->getNextChar();
1190 }
1191
1192 //
1193 // If it was a sequence or choice, we just loop until we get to the
1194 // end of our section, adding each new leaf or sub expression to the
1195 // right child of the current node, and making that new node the current
1196 // node.
1197 //
1198 if ((opCh == chComma) || (opCh == chPipe))
1199 {
1200 ContentSpecNode* lastNode = 0;
1201 while (true)
1202 {
1203 //
1204 // The next thing must either be another | or , character followed
1205 // by another leaf or subexpression, or a closing parenthesis, or a
1206 // PE ref.
1207 //
1208 if (fReaderMgr->lookingAtChar(chPercent))
1209 {
1210 checkForPERef(false, true);
1211 }
1212 else if (fReaderMgr->skippedSpace())
1213 {
1214 // Just skip whitespace
1215 fReaderMgr->skipPastSpaces();
1216 }
1217 else if (fReaderMgr->skippedChar(chCloseParen))
1218 {
1219 //
1220 // We've hit the end of this section, so break out. But, we
1221 // need to see if we left a partial sequence of choice node
1222 // without a second node. If so, we have to undo that and
1223 // put its left child into the right node of the previous
1224 // node.
1225 //
1226 if ((curNode->getType() == ContentSpecNode::Choice)
1227 || (curNode->getType() == ContentSpecNode::Sequence))
1228 {
1229 if (!curNode->getSecond() && lastNode)
1230 {
1231 ContentSpecNode* saveFirst = curNode->orphanFirst();
1232 lastNode->setSecond(saveFirst);
1233 curNode = lastNode;
1234 }
1235 }
1236 break;
1237 }
1238 else if (fReaderMgr->skippedChar(opCh))
1239 {
1240 // Check for a PE ref here, but don't require spaces
1241 checkForPERef(false, true);
1242
1243 if (fReaderMgr->skippedChar(chOpenParen))
1244 {
1245 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1246
1247 // Recurse to handle this new guy
1248 ContentSpecNode* subNode;
1249 try {
1250 subNode = scanChildren(elemDecl, bufToUse, depth);
1251 }
1252 catch (const XMLErrs::Codes)
1253 {
1254 delete headNode;
1255 throw;
1256 }
1257
1258 // If it failed, we are done, clean up here and return failure
1259 if (!subNode)
1260 {
1261 delete headNode;
1262 return 0;
1263 }
1264
1265 if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
1266 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
1267
1268 // Else patch it in and make it the new current
1269 ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode
1270 (
1271 curType
1272 , subNode
1273 , 0
1274 , true
1275 , true
1276 , fGrammarPoolMemoryManager
1277 );
1278 curNode->setSecond(newCur);
1279 lastNode = curNode;
1280 curNode = newCur;
1281 }
1282 else
1283 {
1284 //
1285 // Got to be a leaf node, so get a name. If we cannot get
1286 // one, then clean up and get outa here.
1287 //
1288 if (!fReaderMgr->getName(bufToUse))
1289 {
1290 delete headNode;
1291 fScanner->emitError(XMLErrs::ExpectedElementName);
1292 return 0;
1293 }
1294
1295 //
1296 // Create a leaf node for it. If we can find the element
1297 // id for this element, then use it. Else, we have to
1298 // fault in an element decl, marked as created because
1299 // of being in a content model.
1300 //
1301 XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
1302 if (!decl)
1303 {
1304 decl = new (fGrammarPoolMemoryManager) DTDElementDecl
1305 (
1306 bufToUse.getRawBuffer()
1307 , fEmptyNamespaceId
1308 , DTDElementDecl::Any
1309 , fGrammarPoolMemoryManager
1310 );
1311 decl->setCreateReason(XMLElementDecl::InContentModel);
1312 decl->setExternalElemDeclaration(isReadingExternalEntity());
1313 fDTDGrammar->putElemDecl(decl);
1314 }
1315
1316 ContentSpecNode* tmpLeaf = new (fGrammarPoolMemoryManager) ContentSpecNode
1317 (
1318 decl->getElementName()
1319 , fGrammarPoolMemoryManager
1320 );
1321
1322 // Check for a repetition character after the leaf
1323 const XMLCh repCh = fReaderMgr->peekNextChar();
1324 ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf, fGrammarPoolMemoryManager);
1325 if (tmpLeaf != tmpLeaf2)
1326 fReaderMgr->getNextChar();
1327
1328 //
1329 // Create a new sequence or choice node, with the leaf
1330 // (or rep surrounding it) we just got as its first node.
1331 // Make the new node the second node of the current node,
1332 // and then make it the current node.
1333 //
1334 ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode
1335 (
1336 curType
1337 , tmpLeaf2
1338 , 0
1339 , true
1340 , true
1341 , fGrammarPoolMemoryManager
1342 );
1343 curNode->setSecond(newCur);
1344 lastNode = curNode;
1345 curNode = newCur;
1346 }
1347 }
1348 else
1349 {
1350 // Cannot be valid
1351 delete headNode; // emitError may do a throw so need to clean-up first
1352 if (opCh == chComma)
1353 {
1354 fScanner->emitError(XMLErrs::ExpectedChoiceOrCloseParen);
1355 }
1356 else
1357 {
1358 fScanner->emitError
1359 (
1360 XMLErrs::ExpectedSeqOrCloseParen
1361 , elemDecl.getFullName()
1362 );
1363 }
1364 return 0;
1365 }
1366 }
1367 }
1368
1369 //
1370 // We saw the terminating parenthesis so lets check for any repetition
1371 // character, and create a node for that, making the head node the child
1372 // of it.
1373 //
1374 const XMLCh repCh = fReaderMgr->peekNextChar();
1375 curNode = makeRepNode(repCh, headNode, fGrammarPoolMemoryManager);
1376 if (curNode != headNode)
1377 fReaderMgr->getNextChar();
1378
1379 // prepare for recursion
1380 if(arrNestedDecl==NULL)
1381 break;
1382 else
1383 {
1384 // If that failed, no need to go further, return failure
1385 if (!curNode)
1386 return 0;
1387
1388 const XMLSize_t curReader = arrNestedDecl->pop();
1389 if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
1390 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
1391
1392 if(arrNestedDecl->empty())
1393 {
1394 delete arrNestedDecl;
1395 arrNestedDecl=NULL;
1396 }
1397 }
1398 }
1399
1400 return curNode;
1401 }
1402
1403
1404 //
1405 // We get here after the '<!--' part of the comment. We scan past the
1406 // terminating '-->' It will calls the appropriate handler with the comment
1407 // text, if one is provided. A comment can be in either the document or
1408 // the DTD, so the fInDocument flag is used to know which handler to send
1409 // it to.
1410 //
scanComment()1411 void DTDScanner::scanComment()
1412 {
1413 enum States
1414 {
1415 InText
1416 , OneDash
1417 , TwoDashes
1418 };
1419
1420 // Get a buffer for this
1421 XMLBufBid bbComment(fBufMgr);
1422
1423 //
1424 // Get the comment text into a temp buffer. Be sure to use temp buffer
1425 // two here, since its to be used for stuff that is potentially longer
1426 // than just a name.
1427 //
1428 bool gotLeadingSurrogate = false;
1429 States curState = InText;
1430 while (true)
1431 {
1432 // Get the next character
1433 const XMLCh nextCh = fReaderMgr->getNextChar();
1434
1435 // Watch for an end of file
1436 if (!nextCh)
1437 {
1438 fScanner->emitError(XMLErrs::UnterminatedComment);
1439 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1440 }
1441
1442 // Check for correct surrogate pairs
1443 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
1444 {
1445 if (gotLeadingSurrogate)
1446 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
1447 else
1448 gotLeadingSurrogate = true;
1449 }
1450 else
1451 {
1452 if (gotLeadingSurrogate)
1453 {
1454 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
1455 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
1456 }
1457 // Its got to at least be a valid XML character
1458 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
1459
1460 XMLCh tmpBuf[9];
1461 XMLString::binToText
1462 (
1463 nextCh
1464 , tmpBuf
1465 , 8
1466 , 16
1467 , fMemoryManager
1468 );
1469 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
1470 }
1471
1472 gotLeadingSurrogate = false;
1473 }
1474
1475 if (curState == InText)
1476 {
1477 // If its a dash, go to OneDash state. Otherwise take as text
1478 if (nextCh == chDash)
1479 curState = OneDash;
1480 else
1481 bbComment.append(nextCh);
1482 }
1483 else if (curState == OneDash)
1484 {
1485 //
1486 // If its another dash, then we change to the two dashes states.
1487 // Otherwise, we have to put in the deficit dash and the new
1488 // character and go back to InText.
1489 //
1490 if (nextCh == chDash)
1491 {
1492 curState = TwoDashes;
1493 }
1494 else
1495 {
1496 bbComment.append(chDash);
1497 bbComment.append(nextCh);
1498 curState = InText;
1499 }
1500 }
1501 else if (curState == TwoDashes)
1502 {
1503 // The next character must be the closing bracket
1504 if (nextCh != chCloseAngle)
1505 {
1506 fScanner->emitError(XMLErrs::IllegalSequenceInComment);
1507 fReaderMgr->skipPastChar(chCloseAngle);
1508 return;
1509 }
1510 break;
1511 }
1512 }
1513
1514 // If there is a doc type handler, then pass on the comment stuff
1515 if (fDocTypeHandler)
1516 fDocTypeHandler->doctypeComment(bbComment.getRawBuffer());
1517 }
1518
1519
scanContentSpec(DTDElementDecl & toFill)1520 bool DTDScanner::scanContentSpec(DTDElementDecl& toFill)
1521 {
1522 //
1523 // Check for for a couple of the predefined content type strings. If
1524 // its not one of these, its got to be a parenthesized reg ex type
1525 // expression.
1526 //
1527 if (fReaderMgr->skippedString(XMLUni::fgEmptyString))
1528 {
1529 toFill.setModelType(DTDElementDecl::Empty);
1530 return true;
1531 }
1532
1533 if (fReaderMgr->skippedString(XMLUni::fgAnyString))
1534 {
1535 toFill.setModelType(DTDElementDecl::Any);
1536 return true;
1537 }
1538
1539 // Its got to be a parenthesized regular expression
1540 if (!fReaderMgr->skippedChar(chOpenParen))
1541 {
1542 fScanner->emitError
1543 (
1544 XMLErrs::ExpectedContentSpecExpr
1545 , toFill.getFullName()
1546 );
1547 return false;
1548 }
1549
1550 // Get the current reader id, so we can test for partial markup
1551 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1552
1553 // We could have a PE ref here, but don't require space
1554 checkForPERef(false, true);
1555
1556 //
1557 // Now we look for a PCDATA string. If its PCDATA, then it must be a
1558 // MIXED model. Otherwise, it must be a regular list of children in
1559 // a regular expression perhaps.
1560 //
1561 bool status;
1562 if (fReaderMgr->skippedString(XMLUni::fgPCDATAString))
1563 {
1564 // Set the model to mixed
1565 toFill.setModelType(DTDElementDecl::Mixed_Simple);
1566 status = scanMixed(toFill);
1567
1568 //
1569 // If we are validating we have to check that there are no multiple
1570 // uses of any child elements.
1571 //
1572 if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
1573 {
1574 if (((const MixedContentModel*)toFill.getContentModel())->hasDups())
1575 fScanner->getValidator()->emitError(XMLValid::RepElemInMixed);
1576 }
1577 }
1578 else
1579 {
1580 //
1581 // We have to do a recursive scan of the content model. Create a
1582 // buffer for it to use, for efficiency. It returns the top ofthe
1583 // content spec node tree, which we set if successful.
1584 //
1585 toFill.setModelType(DTDElementDecl::Children);
1586 XMLBufBid bbTmp(fBufMgr);
1587 unsigned int depth = 0;
1588 ContentSpecNode* resNode = scanChildren(toFill, bbTmp.getBuffer(), depth);
1589 status = (resNode != 0);
1590 if (status)
1591 toFill.setContentSpec(resNode);
1592 }
1593
1594 // Make sure we are on the same reader as where we started
1595 if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
1596 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
1597
1598 return status;
1599 }
1600
1601
scanDefaultDecl(DTDAttDef & toFill)1602 void DTDScanner::scanDefaultDecl(DTDAttDef& toFill)
1603 {
1604 if (fReaderMgr->skippedString(XMLUni::fgRequiredString))
1605 {
1606 toFill.setDefaultType(XMLAttDef::Required);
1607 return;
1608 }
1609
1610 if (fReaderMgr->skippedString(XMLUni::fgImpliedString))
1611 {
1612 toFill.setDefaultType(XMLAttDef::Implied);
1613 return;
1614 }
1615
1616 if (fReaderMgr->skippedString(XMLUni::fgFixedString))
1617 {
1618 //
1619 // There must be space before the fixed value. If there is not, then
1620 // emit an error but keep going.
1621 //
1622 if (!fReaderMgr->skippedSpace())
1623 fScanner->emitError(XMLErrs::ExpectedWhitespace);
1624 else
1625 fReaderMgr->skipPastSpaces();
1626 toFill.setDefaultType(XMLAttDef::Fixed);
1627 }
1628 else
1629 {
1630 toFill.setDefaultType(XMLAttDef::Default);
1631 }
1632
1633 //
1634 // If we got here, its fixed or default, so we need to get a value.
1635 // If we don't, then emit an error but just set the default value to
1636 // an empty string and try to keep going.
1637 //
1638 // Check for PE ref or optional whitespace
1639 checkForPERef(false, true);
1640
1641 XMLBufBid bbValue(fBufMgr);
1642 if (!scanAttValue(toFill.getFullName(), bbValue.getBuffer(), toFill.getType()))
1643 fScanner->emitError(XMLErrs::ExpectedDefAttrDecl);
1644
1645 toFill.setValue(bbValue.getRawBuffer());
1646 }
1647
1648
1649 //
1650 // This is called after seeing '<!ELEMENT' which indicates that an element
1651 // markup is starting. This guy scans the rest of it and adds it to the
1652 // element decl pool if it has not already been declared.
1653 //
scanElementDecl()1654 void DTDScanner::scanElementDecl()
1655 {
1656 //
1657 // Space is legal (required actually) here so check for a PE ref. If
1658 // we don't get our whitespace, then issue and error, but try to keep
1659 // going.
1660 //
1661 if (!checkForPERef(false, true))
1662 fScanner->emitError(XMLErrs::ExpectedWhitespace);
1663
1664 // Get a buffer for the element name and scan in the name
1665 XMLBufBid bbName(fBufMgr);
1666 if (!fReaderMgr->getName(bbName.getBuffer()))
1667 {
1668 fScanner->emitError(XMLErrs::ExpectedElementName);
1669 fReaderMgr->skipPastChar(chCloseAngle);
1670 return;
1671 }
1672
1673 // Look this guy up in the element decl pool
1674 DTDElementDecl* decl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
1675
1676 //
1677 // If it does not exist, then we need to create it. If it does and
1678 // its marked as declared, then that's an error, but we still need to
1679 // scan over the content model so use the dummy declaration that the
1680 // parsing code can fill in.
1681 //
1682 if (decl)
1683 {
1684 if (decl->isDeclared())
1685 {
1686 if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
1687 fScanner->getValidator()->emitError(XMLValid::ElementAlreadyExists, bbName.getRawBuffer());
1688
1689 if (!fDumElemDecl)
1690 fDumElemDecl = new (fMemoryManager) DTDElementDecl
1691 (
1692 bbName.getRawBuffer()
1693 , fEmptyNamespaceId
1694 , DTDElementDecl::Any
1695 , fMemoryManager
1696 );
1697 else
1698 fDumElemDecl->setElementName(bbName.getRawBuffer(),fEmptyNamespaceId);
1699 }
1700 }
1701 else
1702 {
1703 //
1704 // Create the new empty declaration to fill in and put it into
1705 // the decl pool.
1706 //
1707 decl = new (fGrammarPoolMemoryManager) DTDElementDecl
1708 (
1709 bbName.getRawBuffer()
1710 , fEmptyNamespaceId
1711 , DTDElementDecl::Any
1712 , fGrammarPoolMemoryManager
1713 );
1714 fDTDGrammar->putElemDecl(decl);
1715 }
1716
1717 // Set a flag for whether we will ignore this one
1718 const bool isIgnored = (decl == fDumElemDecl);
1719
1720 // Mark this one if being externally declared
1721 decl->setExternalElemDeclaration(isReadingExternalEntity());
1722
1723 // Mark this one as being declared
1724 decl->setCreateReason(XMLElementDecl::Declared);
1725
1726 // Another check for a PE ref, with at least required whitespace
1727 if (!checkForPERef(false, true))
1728 fScanner->emitError(XMLErrs::ExpectedWhitespace);
1729
1730 // And now scan the content model for this guy.
1731 if (!scanContentSpec(*decl))
1732 {
1733 fReaderMgr->skipPastChar(chCloseAngle);
1734 return;
1735 }
1736
1737 // Another check for a PE ref, but we don't require whitespace here
1738 checkForPERef(false, true);
1739
1740 // And we should have the ending angle bracket
1741 if (!fReaderMgr->skippedChar(chCloseAngle))
1742 {
1743 fScanner->emitError(XMLErrs::UnterminatedElementDecl, bbName.getRawBuffer());
1744 fReaderMgr->skipPastChar(chCloseAngle);
1745 }
1746
1747 //
1748 // If we have a DTD handler tell it about the new element decl. We
1749 // tell it if its one that can be ignored, cause its an override of a
1750 // previously existing decl. If it is being ignored, only call back
1751 // if advanced callbacks are enabled.
1752 //
1753 if (fDocTypeHandler)
1754 fDocTypeHandler->elementDecl(*decl, isIgnored);
1755 }
1756
1757
1758 //
1759 // This method will process a general or parameter entity reference. The
1760 // entity name and entity text will be stored in the entity pool. The value
1761 // of the entity will be scanned for any other parameter entity or char
1762 // references which will be expanded. So the stored value can only have
1763 // general entity references when done.
1764 //
scanEntityDecl()1765 void DTDScanner::scanEntityDecl()
1766 {
1767 //
1768 // Space is required here, but we cannot check for a PE Ref since
1769 // there could be a legal (no-ref) percent sign here. Since any
1770 // entity that ended here would be illegal, we just skip spaces
1771 // and then check for a percent.
1772 //
1773 if (!fReaderMgr->lookingAtSpace())
1774 fScanner->emitError(XMLErrs::ExpectedWhitespace);
1775 else
1776 fReaderMgr->skipPastSpaces();
1777 bool isPEDecl = fReaderMgr->skippedChar(chPercent);
1778
1779 //
1780 // If a PE decl, then check if it is followed by a space; if it is so,
1781 // eat the percent and check for spaces or a PE ref on the other side of it.
1782 // Otherwise, it has to be an entity reference for a general entity.
1783 //
1784 if (isPEDecl)
1785 {
1786 if(!fReaderMgr->getCurrentReader()->isWhitespace(fReaderMgr->peekNextChar()))
1787 {
1788 isPEDecl=false;
1789 while (true)
1790 {
1791 if (!expandPERef(false, false, true, false))
1792 fScanner->emitError(XMLErrs::ExpectedEntityRefName);
1793 // And skip any more spaces in the expanded value
1794 if (fReaderMgr->skippedSpace())
1795 fReaderMgr->skipPastSpaces();
1796 if (!fReaderMgr->skippedChar(chPercent))
1797 break;
1798 }
1799 }
1800 else if (!checkForPERef(false, true))
1801 fScanner->emitError(XMLErrs::ExpectedWhitespace);
1802 }
1803
1804 //
1805 // Now lets get a name, which should be the name of the entity. We
1806 // have to get a buffer for this.
1807 //
1808 XMLBufBid bbName(fBufMgr);
1809 if (!fReaderMgr->getName(bbName.getBuffer()))
1810 {
1811 fScanner->emitError(XMLErrs::ExpectedPEName);
1812 fReaderMgr->skipPastChar(chCloseAngle);
1813 return;
1814 }
1815
1816 // If namespaces are enabled, then no colons allowed
1817 if (fScanner->getDoNamespaces())
1818 {
1819 if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
1820 fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
1821 }
1822
1823 //
1824 // See if this entity already exists. If so, then the existing one
1825 // takes precendence. So we use the local dummy decl to parse into
1826 // and just ignore the results.
1827 //
1828 DTDEntityDecl* entityDecl;
1829 if (isPEDecl)
1830 entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
1831 else
1832 entityDecl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
1833
1834 if (entityDecl)
1835 {
1836 if (!fDumEntityDecl)
1837 fDumEntityDecl = new (fMemoryManager) DTDEntityDecl(fMemoryManager);
1838 fDumEntityDecl->setName(bbName.getRawBuffer());
1839 entityDecl = fDumEntityDecl;
1840 }
1841 else
1842 {
1843 // Its not in existence already, then create an entity decl for it
1844 entityDecl = new (fGrammarPoolMemoryManager) DTDEntityDecl(bbName.getRawBuffer(), false, fGrammarPoolMemoryManager);
1845
1846 //
1847 // Set the declaration location. The parameter indicates whether its
1848 // declared in the content/internal subset, so we know whether or not
1849 // its in the external subset.
1850 //
1851 entityDecl->setDeclaredInIntSubset(fInternalSubset);
1852
1853 // Add it to the appropriate entity decl pool
1854 if (isPEDecl)
1855 fPEntityDeclPool->put(entityDecl);
1856 else
1857 fDTDGrammar->putEntityDecl(entityDecl);
1858 }
1859
1860 // Set a flag that indicates whether we are ignoring this one
1861 const bool isIgnored = (entityDecl == fDumEntityDecl);
1862
1863 // Set the PE flag on it
1864 entityDecl->setIsParameter(isPEDecl);
1865
1866 //
1867 // Space is legal (required actually) here so check for a PE ref. If
1868 // we don't get our whitespace, then issue an error, but try to keep
1869 // going.
1870 //
1871 if (!checkForPERef(false, true))
1872 fScanner->emitError(XMLErrs::ExpectedWhitespace);
1873
1874 // save the hasNoDTD status for Entity Constraint Checking
1875 bool hasNoDTD = fScanner->getHasNoDTD();
1876 if (hasNoDTD && isPEDecl)
1877 fScanner->setHasNoDTD(false);
1878
1879 // According to the type call the value scanning method
1880 if (!scanEntityDef(*entityDecl, isPEDecl))
1881 {
1882 fReaderMgr->skipPastChar(chCloseAngle);
1883 fScanner->setHasNoDTD(true);
1884 fScanner->emitError(XMLErrs::ExpectedEntityValue);
1885 return;
1886 }
1887 if (hasNoDTD)
1888 fScanner->setHasNoDTD(true);
1889
1890 // Space is legal (but not required) here so check for a PE ref
1891 checkForPERef(false, true);
1892
1893 // And then we have to have the closing angle bracket
1894 if (!fReaderMgr->skippedChar(chCloseAngle))
1895 {
1896 fScanner->emitError(XMLErrs::UnterminatedEntityDecl, entityDecl->getName());
1897 fReaderMgr->skipPastChar(chCloseAngle);
1898 }
1899
1900 //
1901 // If we have a doc type handler, then call it. But only call it for
1902 // ignored elements if advanced callbacks are enabled.
1903 //
1904 if (fDocTypeHandler)
1905 fDocTypeHandler->entityDecl(*entityDecl, isPEDecl, isIgnored);
1906 }
1907
1908
1909 //
1910 // This method will scan a general/character entity ref. It will either
1911 // expand a char ref and return the value directly, or it will expand
1912 // a general entity and a reader for it onto the reader stack.
1913 //
1914 // The return value indicates whether the value was returned directly or
1915 // pushed as a reader or it failed.
1916 //
1917 // The escaped flag tells the caller whether the returnd parameter resulted
1918 // from a character reference, which escapes the character in some cases. It
1919 // only makes any difference if the return indicates the value was returned
1920 // directly.
1921 //
1922 // NOTE: This is only called when scanning attribute values, so we always
1923 // expand general entities.
1924 //
1925 DTDScanner::EntityExpRes
scanEntityRef(XMLCh & firstCh,XMLCh & secondCh,bool & escaped)1926 DTDScanner::scanEntityRef(XMLCh& firstCh, XMLCh& secondCh, bool& escaped)
1927 {
1928 // Assume no escape and no second char
1929 escaped = false;
1930 secondCh = 0;
1931
1932 // We have to insure its all done in a single entity
1933 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
1934
1935 //
1936 // If the next char is a pound, then its a character reference and we
1937 // need to expand it always.
1938 //
1939 if (fReaderMgr->skippedChar(chPound))
1940 {
1941 //
1942 // Its a character reference, so scan it and get back the numeric
1943 // value it represents. If it fails, just return immediately.
1944 //
1945 if (!scanCharRef(firstCh, secondCh))
1946 return EntityExp_Failed;
1947
1948 if (curReader != fReaderMgr->getCurrentReaderNum())
1949 fScanner->emitError(XMLErrs::PartialMarkupInEntity);
1950
1951 // Its now escaped since it was a char ref
1952 escaped = true;
1953 return EntityExp_Returned;
1954 }
1955
1956 // Get the name of the general entity
1957 XMLBufBid bbName(fBufMgr);
1958 if (!fReaderMgr->getName(bbName.getBuffer()))
1959 {
1960 fScanner->emitError(XMLErrs::ExpectedEntityRefName);
1961 return EntityExp_Failed;
1962 }
1963
1964 //
1965 // Next char must be a semi-colon. But if its not, just emit
1966 // an error and try to continue.
1967 //
1968 if (!fReaderMgr->skippedChar(chSemiColon))
1969 fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
1970
1971 // Make sure it was all in one entity reader
1972 if (curReader != fReaderMgr->getCurrentReaderNum())
1973 fScanner->emitError(XMLErrs::PartialMarkupInEntity);
1974
1975 // Look it up the name the general entity pool
1976 XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
1977
1978 // If it does not exist, then obviously an error
1979 if (!decl)
1980 {
1981 // XML 1.0 Section 4.1
1982 if (fScanner->getStandalone() || fScanner->getHasNoDTD()) {
1983 fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
1984 }
1985 else {
1986 if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
1987 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
1988 }
1989
1990 return EntityExp_Failed;
1991 }
1992
1993
1994 //
1995 // XML 1.0 Section 4.1
1996 // If we are a standalone document, then it has to have been declared
1997 // in the internal subset.
1998 //
1999 if (fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
2000 fScanner->emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer());
2001
2002 //
2003 // If its a special char reference, then its escaped and we can return
2004 // it directly.
2005 //
2006 if (decl->getIsSpecialChar())
2007 {
2008 firstCh = decl->getValue()[0];
2009 escaped = true;
2010 return EntityExp_Returned;
2011 }
2012
2013 if (decl->isExternal())
2014 {
2015 // If its unparsed, then its not valid here
2016 // XML 1.0 Section 4.4.4 the appearance of a reference to an unparsed entity is forbidden.
2017 if (decl->isUnparsed())
2018 {
2019 fScanner->emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
2020 return EntityExp_Failed;
2021 }
2022
2023 // We are in an attribute value, so not valid.
2024 // XML 1.0 Section 4.4.4 a reference to an external entity in an attribute value is forbidden.
2025 fScanner->emitError(XMLErrs::NoExtRefsInAttValue);
2026
2027 // And now create a reader to read this entity
2028 InputSource* srcUsed;
2029 XMLReader* reader = fReaderMgr->createReader
2030 (
2031 decl->getBaseURI()
2032 , decl->getSystemId()
2033 , decl->getPublicId()
2034 , false
2035 , XMLReader::RefFrom_NonLiteral
2036 , XMLReader::Type_General
2037 , XMLReader::Source_External
2038 , srcUsed
2039 , fScanner->getCalculateSrcOfs()
2040 , fScanner->getLowWaterMark()
2041 , fScanner->getDisableDefaultEntityResolution()
2042 );
2043
2044 // Put a janitor on the source so it gets cleaned up on exit
2045 Janitor<InputSource> janSrc(srcUsed);
2046
2047 //
2048 // If the creation failed then throw an exception
2049 //
2050 if (!reader)
2051 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
2052
2053 //
2054 // Push the reader. If its a recursive expansion, then emit an error
2055 // and return an failure.
2056 //
2057 if (!fReaderMgr->pushReader(reader, decl))
2058 {
2059 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
2060 return EntityExp_Failed;
2061 }
2062
2063 // If it starts with the XML string, then parse a text decl
2064 if (fScanner->checkXMLDecl(true))
2065 scanTextDecl();
2066 }
2067 else
2068 {
2069 //
2070 // Create a reader over a memory stream over the entity value
2071 // We force it to assume UTF-16 by passing in an encoding
2072 // string. This way it won't both trying to predecode the
2073 // first line, looking for an XML/TextDecl.
2074 //
2075 XMLReader* valueReader = fReaderMgr->createIntEntReader
2076 (
2077 decl->getName()
2078 , XMLReader::RefFrom_NonLiteral
2079 , XMLReader::Type_General
2080 , decl->getValue()
2081 , decl->getValueLen()
2082 , false
2083 );
2084
2085 //
2086 // Trt to push the entity reader onto the reader manager stack,
2087 // where it will become the subsequent input. If it fails, that
2088 // means the entity is recursive, so issue an error. The reader
2089 // will have just been discarded, but we just keep going.
2090 //
2091 if (!fReaderMgr->pushReader(valueReader, decl))
2092 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
2093 }
2094
2095 return EntityExp_Pushed;
2096 }
2097
2098
2099 //
2100 // This method will scan a quoted literal of an entity value. It has to
2101 // deal with replacement of PE references; however, since this is a DTD
2102 // scanner, all such entity literals are in entity decls and therefore
2103 // general entities are not expanded.
2104 //
scanEntityLiteral(XMLBuffer & toFill)2105 bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill)
2106 {
2107 toFill.reset();
2108
2109 // Get the next char which must be a single or double quote
2110 XMLCh quoteCh;
2111 if (!fReaderMgr->skipIfQuote(quoteCh))
2112 return false;
2113
2114 // Get a buffer for pulling in entity names when we see GE refs
2115 XMLBufBid bbName(fBufMgr);
2116 XMLBuffer& nameBuf = bbName.getBuffer();
2117
2118 // Remember the current reader
2119 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
2120
2121 //
2122 // Loop until we see the ending quote character, handling any references
2123 // in the process.
2124 //
2125 XMLCh nextCh;
2126 XMLCh secondCh = 0;
2127 bool gotLeadingSurrogate = false;
2128 while (true)
2129 {
2130 nextCh = fReaderMgr->getNextChar();
2131
2132 //
2133 // Watch specifically for EOF and issue a more meaningful error
2134 // if that occurs (since an unterminated quoted char can cause
2135 // this easily.)
2136 //
2137 if (!nextCh)
2138 {
2139 fScanner->emitError(XMLErrs::UnterminatedEntityLiteral);
2140 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
2141 }
2142
2143 //
2144 // Break out on our terminating quote char when we are back in the
2145 // same reader. Otherwise, we might trigger on a nested quote char
2146 // in an expanded entity.
2147 //
2148 if ((nextCh == quoteCh)
2149 && (fReaderMgr->getCurrentReaderNum() == orgReader))
2150 {
2151 break;
2152 }
2153
2154 if (nextCh == chPercent)
2155 {
2156 //
2157 // Put the PE's value on the reader stack and then jump back
2158 // to the top to start processing it. The parameter indicates
2159 // that it should not scan the reference's content as an external
2160 // subset.
2161 //
2162 expandPERef(false, true, true);
2163 continue;
2164 }
2165
2166 //
2167 // Ok, now that all the other special stuff is checked, we can
2168 // look for a general entity. In here, we cannot have a naked &
2169 // and will only expand numerical char refs or the intrinsic char
2170 // refs. Others will be left alone.
2171 //
2172 if (nextCh == chAmpersand)
2173 {
2174 //
2175 // Here, we only expand numeric char refs, but not any general
2176 // entities. However, the stupid XML spec requires that we check
2177 // and make sure it does refer to a general entity if its not
2178 // a char ref (i.e. no naked '&' chars.)
2179 //
2180 if (fReaderMgr->skippedChar(chPound))
2181 {
2182 // If it failed, then just jump back to the top and try to pick up
2183 if (!scanCharRef(nextCh, secondCh))
2184 {
2185 gotLeadingSurrogate = false;
2186 continue;
2187 }
2188 }
2189 else
2190 {
2191 if (!fReaderMgr->getName(nameBuf))
2192 {
2193 fScanner->emitError(XMLErrs::ExpectedEntityRefName);
2194 }
2195 else
2196 {
2197 //
2198 // Since we are not expanding any of this, we have to
2199 // put the amp and name into the target buffer as data.
2200 //
2201 toFill.append(chAmpersand);
2202 toFill.append(nameBuf.getRawBuffer());
2203
2204 // Make sure we skipped a trailing semicolon
2205 if (!fReaderMgr->skippedChar(chSemiColon))
2206 {
2207 fScanner->emitError
2208 (
2209 XMLErrs::UnterminatedEntityRef
2210 , nameBuf.getRawBuffer()
2211 );
2212 }
2213
2214 // And make the new character the semicolon
2215 nextCh = chSemiColon;
2216 }
2217
2218 // Either way here we reset the surrogate flag
2219 gotLeadingSurrogate = false;
2220 }
2221 }
2222 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
2223 {
2224 if (gotLeadingSurrogate)
2225 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2226 else
2227 gotLeadingSurrogate = true;
2228 }
2229 else
2230 {
2231 if (gotLeadingSurrogate)
2232 {
2233 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
2234 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2235 }
2236 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
2237 {
2238 XMLCh tmpBuf[9];
2239 XMLString::binToText
2240 (
2241 nextCh
2242 , tmpBuf
2243 , 8
2244 , 16
2245 , fMemoryManager
2246 );
2247 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
2248 fReaderMgr->skipPastChar(quoteCh);
2249 return false;
2250 }
2251 gotLeadingSurrogate = false;
2252 }
2253
2254 // Looks ok, so add it to the literal
2255 toFill.append(nextCh);
2256
2257 if (secondCh)
2258 {
2259 toFill.append(secondCh);
2260 secondCh=0;
2261 }
2262 }
2263
2264 //
2265 // If we got here and did not get back to the original reader level,
2266 // then we propogated some entity out of the literal, so issue an
2267 // error, but don't fail.
2268 //
2269 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
2270 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
2271
2272 return true;
2273 }
2274
2275
2276 //
2277 // This method is called after the entity name has been scanned, and any
2278 // PE referenced following the name is handled. The passed decl will be
2279 // filled in with the info scanned.
2280 //
scanEntityDef(DTDEntityDecl & decl,const bool isPEDecl)2281 bool DTDScanner::scanEntityDef(DTDEntityDecl& decl, const bool isPEDecl)
2282 {
2283 // Its got to be an entity literal
2284 if (fReaderMgr->lookingAtChar(chSingleQuote)
2285 || fReaderMgr->lookingAtChar(chDoubleQuote))
2286 {
2287 // Get a buffer for the literal
2288 XMLBufBid bbValue(fBufMgr);
2289
2290 if (!scanEntityLiteral(bbValue.getBuffer()))
2291 return false;
2292
2293 // Set it on the entity decl
2294 decl.setValue(bbValue.getRawBuffer());
2295 return true;
2296 }
2297
2298 //
2299 // Its got to be an external entity, so there must be an external id.
2300 // Get buffers for them and scan an external id into them.
2301 //
2302 XMLBufBid bbPubId(fBufMgr);
2303 XMLBufBid bbSysId(fBufMgr);
2304 if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
2305 return false;
2306
2307 decl.setIsExternal(true);
2308 ReaderMgr::LastExtEntityInfo lastInfo;
2309 fReaderMgr->getLastExtEntityInfo(lastInfo);
2310
2311 // Fill in the id fields of the decl with the info we got
2312 const XMLCh* publicId = bbPubId.getRawBuffer();
2313 const XMLCh* systemId = bbSysId.getRawBuffer();
2314 decl.setPublicId((publicId && *publicId) ? publicId : 0);
2315 decl.setSystemId((systemId && *systemId) ? systemId : 0);
2316 decl.setBaseURI((lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0);
2317
2318 // If its a PE decl, we are done
2319 bool gotSpaces = checkForPERef(false, true);
2320 if (isPEDecl)
2321 {
2322 //
2323 // Check for a common error here. NDATA is not allowed for PEs
2324 // so check for the NDATA string. If found give a nice meaningful
2325 // error and continue parsing to eat the NDATA text.
2326 //
2327 if (gotSpaces)
2328 {
2329 if (fReaderMgr->skippedString(XMLUni::fgNDATAString))
2330 fScanner->emitError(XMLErrs::NDATANotValidForPE);
2331 }
2332 else
2333 {
2334 return true;
2335 }
2336 }
2337
2338 // If looking at close angle now, we are done
2339 if (fReaderMgr->lookingAtChar(chCloseAngle))
2340 return true;
2341
2342 // Else we had to have seem the whitespace
2343 if (!gotSpaces)
2344 fScanner->emitError(XMLErrs::ExpectedWhitespace);
2345
2346 // We now have to see a notation data string
2347 if (!fReaderMgr->skippedString(XMLUni::fgNDATAString))
2348 fScanner->emitError(XMLErrs::ExpectedNDATA);
2349
2350 // Space is required here, but try to go on if not
2351 if (!checkForPERef(false, true))
2352 fScanner->emitError(XMLErrs::ExpectedWhitespace);
2353
2354 // Get a name
2355 XMLBufBid bbName(fBufMgr);
2356 if (!fReaderMgr->getName(bbName.getBuffer()))
2357 {
2358 fScanner->emitError(XMLErrs::ExpectedNotationName);
2359 return false;
2360 }
2361
2362 // Set the decl's notation name
2363 decl.setNotationName(bbName.getRawBuffer());
2364
2365 return true;
2366 }
2367
2368
2369 //
2370 // This method is called after an attribute decl name or a notation decl has
2371 // been scanned and then an opening parenthesis was see, indicating the list
2372 // of values. It scans the enumeration values and creates a single string
2373 // which has a single space between each value.
2374 //
2375 // The terminating close paren ends this scan.
2376 //
scanEnumeration(const DTDAttDef & attDef,XMLBuffer & toFill,const bool notation)2377 bool DTDScanner::scanEnumeration( const DTDAttDef& attDef
2378 , XMLBuffer& toFill
2379 , const bool notation)
2380 {
2381 // Reset the passed buffer
2382 toFill.reset();
2383
2384 // Check for PE ref but don't require space
2385 checkForPERef(false, true);
2386
2387 // If this is a notation, we need an opening paren
2388 if (notation)
2389 {
2390 if (!fReaderMgr->skippedChar(chOpenParen))
2391 fScanner->emitError(XMLErrs::ExpectedOpenParen);
2392 }
2393
2394 // We need a local buffer to use as well
2395 XMLBufBid bbTmp(fBufMgr);
2396
2397 while (true)
2398 {
2399 // Space is allowed here for either type so check for PE ref
2400 checkForPERef(false, true);
2401
2402 // And then get either a name or a name token
2403 bool success;
2404 if (notation)
2405 success = fReaderMgr->getName(bbTmp.getBuffer());
2406 else
2407 success = fReaderMgr->getNameToken(bbTmp.getBuffer());
2408
2409 if (!success)
2410 {
2411 fScanner->emitError
2412 (
2413 XMLErrs::ExpectedEnumValue
2414 , attDef.getFullName()
2415 );
2416 return false;
2417 }
2418
2419 // Append this value to the target value
2420 toFill.append(bbTmp.getRawBuffer(), bbTmp.getLen());
2421
2422 // Space is allowed here for either type so check for PE ref
2423 checkForPERef(false, true);
2424
2425 // Check for the terminating paren
2426 if (fReaderMgr->skippedChar(chCloseParen))
2427 break;
2428
2429 // And append a space separator
2430 toFill.append(chSpace);
2431
2432 // Check for the pipe character separator
2433 if (!fReaderMgr->skippedChar(chPipe))
2434 {
2435 fScanner->emitError(XMLErrs::ExpectedEnumSepOrParen);
2436 return false;
2437 }
2438 }
2439 return true;
2440 }
2441
2442
scanEq()2443 bool DTDScanner::scanEq()
2444 {
2445 fReaderMgr->skipPastSpaces();
2446 if (fReaderMgr->skippedChar(chEqual))
2447 {
2448 fReaderMgr->skipPastSpaces();
2449 return true;
2450 }
2451 return false;
2452 }
2453
2454
//
//  This method is called when an external entity reference is seen in the
//  DTD or an external DTD subset is encountered, and their contents have
//  been pushed onto the reader stack. This method scans that content.
//
//  inIncludeSect is true when we are called recursively (from
//  scanMarkupDecl) to scan the body of an INCLUDE conditional section; in
//  that case the scan ends at the closing "]]>" and a text decl is never
//  accepted. isDTD, together with !inIncludeSect, controls whether the doc
//  type handler is told about the start and end of the external subset.
//
//  The scan ends when:
//    - the reader manager reports no more input (plain return),
//    - the end of an include section is seen (plain return), or
//    - an EndOfEntityException arrives for the reader we started on.
//
//  NOTE(review): only the last of these reaches the endExtSubset() call at
//  the bottom. On the "no more input" return, a startExtSubset() call made
//  above is not paired with endExtSubset() - confirm that is intended.
//
void DTDScanner::scanExtSubsetDecl(const bool inIncludeSect, const bool isDTD)
{
    // Indicate we are in the external subset now
    FlagJanitor<bool> janContentFlag(&fInternalSubset, false);


    //
    //  Passed to scanMarkupDecl as its 'parseTextDecl' flag. It is cleared
    //  once a text decl has been scanned below and after every item handled
    //  by the main loop, so only the very first markup can be a text decl.
    //
    bool bAcceptDecl = !inIncludeSect;

    // Get a buffer for whitespace
    XMLBufBid bbSpace(fBufMgr);

    //
    //  If we have a doc type handler and we are not being called recursively
    //  to handle an include section, tell it the ext subset starts
    //
    if (fDocTypeHandler && isDTD && !inIncludeSect)
        fDocTypeHandler->startExtSubset();

    //
    //  We have to play a trick here if the current entity we are parsing
    //  is a PE. Because the spooling code will put out a whitespace before
    //  and after an expanded PE if it is being scanned outside the context
    //  of a literal entity, this will confuse this external subset code.
    //
    //  So, we see if that is what is happening and, if so, eat the single
    //  space, and check for the <?xml string. If we find it, we parse that
    //  markup right now. (Putting the space back is still to be done, see
    //  the TBD below.)
    //
    if (fReaderMgr->isScanningPERefOutOfLiteral())
    {
        if (fReaderMgr->skippedSpace())
        {
            if (fScanner->checkXMLDecl(true))
            {
                scanTextDecl();
                bAcceptDecl = false;

                // <TBD> Figure out how to do this
                // fReaderMgr->unGet(chSpace);
            }
        }
    }

    //
    //  Get the current reader number. This is the value the catch block
    //  below compares against to detect the end of the external subset.
    //
    const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();

    //
    //  Loop until we hit the end of the external subset entity. Note that
    //  we use a double loop here in order to avoid the overhead of doing
    //  the exception setup/teardown work on every loop.
    //
    //  inMarkup and inCharData record what was being scanned at the moment
    //  an EndOfEntityException is thrown, so the handler can react to it.
    //
    bool inMarkup = false;
    bool inCharData = false;
    while (true)
    {
        bool bDoBreak=false;    // workaround for Borland bug with 'break' in 'catch'
        try
        {
            while (true)
            {
                XMLCh nextCh;

                // A failure to peek is reported and then treated as end of input
                try {
                    nextCh = fReaderMgr->peekNextChar();
                }
                catch (XMLException& ex) {
                    fScanner->emitError(XMLErrs::XMLException_Fatal, ex.getCode(), ex.getMessage(), NULL, NULL);
                    nextCh = chNull;
                }

                if (!nextCh)
                {
                    return; // nothing left
                }
                else if (nextCh == chOpenAngle)
                {
                    //
                    //  Get the reader this markup starts on, for checking
                    //  XML 1.0 P28a Well-formedness constraint: PE Between Declarations
                    //
                    //  Note that this local deliberately or accidentally
                    //  shadows the function level orgReader; the catch block
                    //  below still sees the outer one.
                    //
                    const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
                    bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);

                    //
                    //  Now scan the markup. Set the flag so that we will know that
                    //  we were in markup if an end of entity exception occurs.
                    //
                    fReaderMgr->getNextChar();
                    inMarkup = true;
                    scanMarkupDecl(bAcceptDecl);
                    inMarkup = false;

                    //
                    //  And see if we got back to the same level. If not, then it is
                    //  a partial markup error.
                    //
                    if (fReaderMgr->getCurrentReaderNum() != orgReader){
                        if (wasInPE)
                            fScanner->emitError(XMLErrs::PEBetweenDecl);
                        else if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
                            fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
                    }

                }
                else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
                {
                    //
                    //  If we have a doc type handler, then gather up the
                    //  whitespace and call back. Otherwise just skip it.
                    //
                    if (fDocTypeHandler)
                    {
                        // Flag set so a partial run is still delivered by the handler
                        inCharData = true;
                        fReaderMgr->getSpaces(bbSpace.getBuffer());
                        inCharData = false;

                        fDocTypeHandler->doctypeWhitespace
                        (
                            bbSpace.getRawBuffer()
                            , bbSpace.getLen()
                        );
                    }
                    else
                    {
                        //
                        //  If we hit an end of entity in the middle of white
                        //  space, that's fine. We'll just come back in here
                        //  again on the next round and skip some more.
                        //
                        fReaderMgr->skipPastSpaces();
                    }
                }
                else if (nextCh == chPercent)
                {
                    //
                    //  Expand (and scan if external) the reference value. Tell
                    //  it to throw an end of entity exception at the end of the
                    //  entity.
                    //
                    fReaderMgr->getNextChar();
                    expandPERef(true, false, false, true);
                }
                else if (inIncludeSect && (nextCh == chCloseSquare))
                {
                    //
                    //  It is the end of a conditional include section. Consume
                    //  the "]]>" (reporting an error and resyncing past the
                    //  next '>' if it is malformed) and return to the caller
                    //  that is handling the include section.
                    //
                    fReaderMgr->getNextChar();
                    if (!fReaderMgr->skippedChar(chCloseSquare))
                    {
                        fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
                        fReaderMgr->skipPastChar(chCloseAngle);
                    }
                    else if (!fReaderMgr->skippedChar(chCloseAngle))
                    {
                        fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
                        fReaderMgr->skipPastChar(chCloseAngle);
                    }
                    return;
                }
                else
                {
                    //
                    //  Nothing that may appear at this level. Consume the char
                    //  and report it, as an invalid character if it is not a
                    //  legal XML char, else as a document structure error.
                    //
                    fReaderMgr->getNextChar();
                    if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
                    {
                        XMLCh tmpBuf[9];
                        XMLString::binToText
                        (
                            nextCh
                            , tmpBuf
                            , 8
                            , 16
                            , fMemoryManager
                        );
                        fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
                    }
                    else
                    {
                        fScanner->emitError(XMLErrs::InvalidDocumentStructure);
                    }

                    // Try to get realigned
                    static const XMLCh toSkip[] =
                    {
                        chPercent, chCloseSquare, chOpenAngle, chNull
                    };
                    fReaderMgr->skipUntilInOrWS(toSkip);
                }

                // Something has been processed, so a text decl is no longer legal
                bAcceptDecl = false;
            }
        }
        catch(const EndOfEntityException& toCatch)
        {
            //
            //  If the external entity ended while we were in markup, then that's
            //  a partial markup error.
            //
            if (inMarkup)
            {
                fScanner->emitError(XMLErrs::PartialMarkupInEntity);
                inMarkup = false;
            }

            // If we were in char data, then send what we got
            if (inCharData)
            {
                // Pass on the whitespace gathered before the entity ended
                if (fDocTypeHandler)
                {
                    fDocTypeHandler->doctypeWhitespace
                    (
                        bbSpace.getRawBuffer()
                        , bbSpace.getLen()
                    );
                }
                inCharData = false;
            }

            //
            //  If the entity that just ended was the entity that we started
            //  on, then this is the end of the external subset. Otherwise
            //  the outer loop re-enters the try block and scanning goes on.
            //
            if (orgReader == toCatch.getReaderNum())
                bDoBreak=true;
        }
        if(bDoBreak)
            break;
    }

    // If we have a doc type handler, tell it the ext subset ends
    if (fDocTypeHandler && isDTD && !inIncludeSect)
        fDocTypeHandler->endExtSubset();
}
2693
2694
2695 //
2696 // This method will scan for an id, either public or external.
2697 //
2698 //
2699 // [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2700 // | 'PUBLIC' S PubidLiteral S SystemLiteral
2701 // [83] PublicID ::= 'PUBLIC' S PubidLiteral
2702 //
scanId(XMLBuffer & pubIdToFill,XMLBuffer & sysIdToFill,const IDTypes whatKind)2703 bool DTDScanner::scanId( XMLBuffer& pubIdToFill
2704 , XMLBuffer& sysIdToFill
2705 , const IDTypes whatKind)
2706 {
2707 // Clean out both return buffers
2708 pubIdToFill.reset();
2709 sysIdToFill.reset();
2710
2711 //
2712 // Check first for the system id first. If we find it, and system id
2713 // is one of the legal values, then lets try to scan it.
2714 //
2715 // 'SYSTEM' S SystemLiteral
2716 if (fReaderMgr->skippedString(XMLUni::fgSysIDString))
2717 {
2718 // If they were looking for a public id, then we failed
2719 if (whatKind == IDType_Public)
2720 {
2721 fScanner->emitError(XMLErrs::ExpectedPublicId);
2722 return false;
2723 }
2724
2725 // We must skip spaces
2726 bool skippedSomething;
2727 fReaderMgr->skipPastSpaces(skippedSomething);
2728 if (!skippedSomething)
2729 {
2730 fScanner->emitError(XMLErrs::ExpectedWhitespace);
2731 return false;
2732 }
2733
2734 // Get the system literal value
2735 return scanSystemLiteral(sysIdToFill);
2736 }
2737
2738 // Now scan for public id
2739 // 'PUBLIC' S PubidLiteral S SystemLiteral
2740 // or
2741 // 'PUBLIC' S PubidLiteral
2742
2743 // If we don't have any public id string => Error
2744 if (!fReaderMgr->skippedString(XMLUni::fgPubIDString)) {
2745 fScanner->emitError(XMLErrs::ExpectedSystemOrPublicId);
2746 return false;
2747 }
2748
2749 //
2750 // So following this we must have whitespace, a public literal, whitespace,
2751 // and a system literal.
2752 //
2753 bool skippedSomething;
2754 fReaderMgr->skipPastSpaces(skippedSomething);
2755 if (!skippedSomething)
2756 {
2757 fScanner->emitError(XMLErrs::ExpectedWhitespace);
2758
2759 //
2760 // Just in case, if they just forgot the whitespace but the next char
2761 // is a single or double quote, then keep going.
2762 //
2763 const XMLCh chPeek = fReaderMgr->peekNextChar();
2764 if ((chPeek != chDoubleQuote) && (chPeek != chSingleQuote))
2765 return false;
2766 }
2767
2768 if (!scanPublicLiteral(pubIdToFill))
2769 return false;
2770
2771 // If they wanted a public id, then this is all
2772 if (whatKind == IDType_Public)
2773 return true;
2774
2775 // check if there is any space follows
2776 bool hasSpace;
2777 fReaderMgr->skipPastSpaces(hasSpace);
2778
2779 //
2780 // In order to recover best here we need to see if
2781 // the next thing is a quote or not
2782 //
2783 const XMLCh chPeek = fReaderMgr->peekNextChar();
2784 const bool bIsQuote = ((chPeek == chDoubleQuote)
2785 || (chPeek == chSingleQuote));
2786
2787 if (!hasSpace)
2788 {
2789 if (whatKind == IDType_External)
2790 {
2791 //
2792 // If its an external Id, then we need to see the system id.
2793 // So, emit the error. But, if the next char is a quote, don't
2794 // give up since its probably going to work. The user just
2795 // missed the separating space. Otherwise, fail.
2796 //
2797 fScanner->emitError(XMLErrs::ExpectedWhitespace);
2798 if (!bIsQuote)
2799 return false;
2800 }
2801 else
2802 {
2803 //
2804 // We can legally return here. But, if the next char is a quote,
2805 // then that's probably not what was desired, since its probably
2806 // just that space was forgotten and there really is a system
2807 // id to follow.
2808 //
2809 // So treat it like missing whitespace if so and keep going.
2810 // Else, just return success.
2811 //
2812 if (bIsQuote)
2813 fScanner->emitError(XMLErrs::ExpectedWhitespace);
2814 else
2815 return true;
2816 }
2817 }
2818
2819 if (bIsQuote) {
2820 // there is a quote coming, scan the system literal
2821 if (!scanSystemLiteral(sysIdToFill))
2822 return false;
2823 }
2824 else {
2825 // no quote, if expecting exteral id, this is an error
2826 if (whatKind == IDType_External)
2827 fScanner->emitError(XMLErrs::ExpectedQuotedString);
2828 }
2829
2830 return true;
2831 }
2832
2833
2834 //
2835 // This method will scan the contents of an ignored section. It assumes that
2836 // we already are in the body, i.e. we've seen <![IGNORE[ at this point. So
2837 // we have to just scan until we see a matching ]]> closing markup.
2838 //
scanIgnoredSection()2839 void DTDScanner::scanIgnoredSection()
2840 {
2841 //
2842 // Depth starts at one because we are already in one section and want
2843 // to parse until we hit its end.
2844 //
2845 unsigned long depth = 1;
2846 bool gotLeadingSurrogate = false;
2847 while (true)
2848 {
2849 const XMLCh nextCh = fReaderMgr->getNextChar();
2850
2851 if (!nextCh)
2852 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
2853
2854 if (nextCh == chOpenAngle)
2855 {
2856 if (fReaderMgr->skippedChar(chBang)
2857 && fReaderMgr->skippedChar(chOpenSquare))
2858 {
2859 depth++;
2860 }
2861 }
2862 else if (nextCh == chCloseSquare)
2863 {
2864 if (fReaderMgr->skippedChar(chCloseSquare))
2865 {
2866 while (fReaderMgr->skippedChar(chCloseSquare))
2867 {
2868 // Do nothing, just skip them
2869 }
2870
2871 if (fReaderMgr->skippedChar(chCloseAngle))
2872 {
2873 depth--;
2874 if (!depth)
2875 break;
2876 }
2877 }
2878 }
2879 // Deal with surrogate pairs
2880 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
2881 {
2882 // Its a leading surrogate. If we already got one, then
2883 // issue an error, else set leading flag to make sure that
2884 // we look for a trailing next time.
2885 if (gotLeadingSurrogate)
2886 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2887 else
2888 gotLeadingSurrogate = true;
2889 }
2890 else
2891 {
2892 // If its a trailing surrogate, make sure that we are
2893 // prepared for that. Else, its just a regular char so make
2894 // sure that we were not expected a trailing surrogate.
2895 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
2896 {
2897 // Its trailing, so make sure we were expecting it
2898 if (!gotLeadingSurrogate)
2899 fScanner->emitError(XMLErrs::Unexpected2ndSurrogateChar);
2900 }
2901 else
2902 {
2903 // Its just a char, so make sure we were not expecting a
2904 // trailing surrogate.
2905 if (gotLeadingSurrogate)
2906 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
2907
2908 // Its got to at least be a valid XML character
2909 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
2910 {
2911 XMLCh tmpBuf[9];
2912 XMLString::binToText
2913 (
2914 nextCh
2915 , tmpBuf
2916 , 8
2917 , 16
2918 , fMemoryManager
2919 );
2920 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
2921 }
2922 }
2923 gotLeadingSurrogate = false;
2924 }
2925 }
2926 }
2927
2928
2929 //
2930 // This method scans the entire internal subset. All we can have here is
2931 // decl markup, and PE references. The expanded PE references must contain
2932 // whole markup, so we don't have to worry about their content at this
2933 // level. We just scan them, expand them, push them, and parse their content
2934 // right there, via the expandERef() method.
2935 //
scanInternalSubset()2936 bool DTDScanner::scanInternalSubset()
2937 {
2938 // Indicate we are in the internal subset now
2939 FlagJanitor<bool> janContentFlag(&fInternalSubset, true);
2940
2941 // If we have a doc type handler, tell it the internal subset starts
2942 if (fDocTypeHandler)
2943 fDocTypeHandler->startIntSubset();
2944
2945 // Get a buffer for whitespace
2946 XMLBufBid bbSpace(fBufMgr);
2947
2948 bool noErrors = true;
2949 while (true)
2950 {
2951 const XMLCh nextCh = fReaderMgr->peekNextChar();
2952
2953 //
2954 // If we get an end of file marker, just unget it and return a
2955 // failure status. The caller will then see the end of file and
2956 // faill out correctly.
2957 //
2958 if (!nextCh)
2959 return false;
2960
2961 // Watch for the end of internal subset marker
2962 if (nextCh == chCloseSquare)
2963 {
2964 fReaderMgr->getNextChar();
2965 break;
2966 }
2967
2968 if (nextCh == chPercent)
2969 {
2970 //
2971 // Expand (and scan if external) the reference value. Tell
2972 // it to set the reader to cause an end of entity exception
2973 // when this reader dies, which is what the scanExtSubset
2974 // method wants (who is called to scan this.)
2975 //
2976 fReaderMgr->getNextChar();
2977 expandPERef(true, false, false, true);
2978 }
2979 else if (nextCh == chOpenAngle)
2980 {
2981 // Remember this reader before we start the scan, for checking
2982 // XML 1.0 P28a Well-formedness constraint: PE Between Declarations
2983 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
2984 bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);
2985
2986 // And scan this markup
2987 fReaderMgr->getNextChar();
2988 scanMarkupDecl(false);
2989
2990 // If we did not get back to entry level, then partial markup
2991 if (fReaderMgr->getCurrentReaderNum() != orgReader) {
2992 if (wasInPE)
2993 fScanner->emitError(XMLErrs::PEBetweenDecl);
2994 else if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
2995 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
2996 }
2997 }
2998 else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
2999 {
3000 //
3001 // IF we are doing advanced callbacks and have a doc type
3002 // handler, then get the whitespace and call the doc type
3003 // handler with it. Otherwise, just skip whitespace.
3004 //
3005 if (fDocTypeHandler)
3006 {
3007 fReaderMgr->getSpaces(bbSpace.getBuffer());
3008 fDocTypeHandler->doctypeWhitespace
3009 (
3010 bbSpace.getRawBuffer()
3011 , bbSpace.getLen()
3012 );
3013 }
3014 else
3015 {
3016 fReaderMgr->skipPastSpaces();
3017 }
3018 }
3019 else
3020 {
3021 // Not valid, so emit an error
3022 XMLCh tmpBuf[9];
3023 XMLString::binToText
3024 (
3025 fReaderMgr->getNextChar()
3026 , tmpBuf
3027 , 8
3028 , 16
3029 , fMemoryManager
3030 );
3031 fScanner->emitError
3032 (
3033 XMLErrs::InvalidCharacterInIntSubset
3034 , tmpBuf
3035 );
3036
3037 //
3038 // If an '>', then probably an abnormally terminated
3039 // internal subset so just return.
3040 //
3041 if (nextCh == chCloseAngle)
3042 {
3043 noErrors = false;
3044 break;
3045 }
3046
3047 //
3048 // Otherwise, try to sync back up by scanning forward for
3049 // a reasonable start character.
3050 //
3051 static const XMLCh toSkip[] =
3052 {
3053 chPercent, chCloseSquare, chOpenAngle, chNull
3054 };
3055 fReaderMgr->skipUntilInOrWS(toSkip);
3056 }
3057 }
3058
3059 // If we have a doc type handler, tell it the internal subset ends
3060 if (fDocTypeHandler)
3061 fDocTypeHandler->endIntSubset();
3062
3063 return noErrors;
3064 }
3065
3066
3067 //
3068 // This method is called once we see a < in the input of an int/ext subset,
3069 // which indicates the start of some sort of markup.
3070 //
scanMarkupDecl(const bool parseTextDecl)3071 void DTDScanner::scanMarkupDecl(const bool parseTextDecl)
3072 {
3073 //
3074 // We only have two valid first characters here. One is a ! which opens
3075 // some markup decl. The other is a ?, which could begin either a PI
3076 // or a text decl. If parseTextDecl is false, we cannot accept a text
3077 // decl.
3078 //
3079 const XMLCh nextCh = fReaderMgr->getNextChar();
3080
3081 if (nextCh == chBang)
3082 {
3083 if (fReaderMgr->skippedChar(chDash))
3084 {
3085 if (fReaderMgr->skippedChar(chDash))
3086 {
3087 scanComment();
3088 }
3089 else
3090 {
3091 fScanner->emitError(XMLErrs::CommentsMustStartWith);
3092 fReaderMgr->skipPastChar(chCloseAngle);
3093 }
3094 }
3095 else if (fReaderMgr->skippedChar(chOpenSquare))
3096 {
3097 //
3098 // Its a conditional section. This is only valid in the external
3099 // subset, so issue an error if we aren't there.
3100 //
3101 if (fInternalSubset)
3102 {
3103 fScanner->emitError(XMLErrs::ConditionalSectInIntSubset);
3104 fReaderMgr->skipPastChar(chCloseAngle);
3105 return;
3106 }
3107
3108 // A PE ref can happen here, but space is not required
3109 checkForPERef(false, true);
3110
3111 if (fReaderMgr->skippedString(XMLUni::fgIncludeString))
3112 {
3113 checkForPERef(false, true);
3114
3115 // Check for the following open square bracket
3116 if (!fReaderMgr->skippedChar(chOpenSquare))
3117 fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
3118
3119 // Get the reader we started this on
3120 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
3121
3122 checkForPERef(false, true);
3123
3124 //
3125 // Recurse back to the ext subset call again, telling it its
3126 // in an include section.
3127 //
3128 scanExtSubsetDecl(true, false);
3129
3130 //
3131 // And see if we got back to the same level. If not, then its
3132 // a partial markup error.
3133 //
3134 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
3135 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
3136
3137 }
3138 else if (fReaderMgr->skippedString(XMLUni::fgIgnoreString))
3139 {
3140 checkForPERef(false, true);
3141
3142 // Check for the following open square bracket
3143 if (!fReaderMgr->skippedChar(chOpenSquare))
3144 fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
3145
3146 // Get the reader we started this on
3147 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
3148
3149 // And scan over the ignored part
3150 scanIgnoredSection();
3151
3152 //
3153 // And see if we got back to the same level. If not, then its
3154 // a partial markup error.
3155 //
3156 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
3157 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
3158
3159 }
3160 else
3161 {
3162 fScanner->emitError(XMLErrs::ExpectedIncOrIgn);
3163 fReaderMgr->skipPastChar(chCloseAngle);
3164 }
3165 }
3166 else if (fReaderMgr->skippedString(XMLUni::fgAttListString))
3167 {
3168 scanAttListDecl();
3169 }
3170 else if (fReaderMgr->skippedString(XMLUni::fgElemString))
3171 {
3172 scanElementDecl();
3173 }
3174 else if (fReaderMgr->skippedString(XMLUni::fgEntityString))
3175 {
3176 scanEntityDecl();
3177 }
3178 else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
3179 {
3180 scanNotationDecl();
3181 }
3182 else
3183 {
3184 fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
3185 fReaderMgr->skipPastChar(chCloseAngle);
3186 }
3187 }
3188 else if (nextCh == chQuestion)
3189 {
3190 // It could be a PI or the XML declaration. Check for Decl
3191 if (fScanner->checkXMLDecl(false))
3192 {
3193 // If we are not accepting text decls, its an error
3194 if (parseTextDecl)
3195 {
3196 scanTextDecl();
3197 }
3198 else
3199 {
3200 // Emit the error and skip past this markup
3201 fScanner->emitError(XMLErrs::TextDeclNotLegalHere);
3202 fReaderMgr->skipPastChar(chCloseAngle);
3203 }
3204 }
3205 else
3206 {
3207 // It has to be a PI
3208 scanPI();
3209 }
3210 }
3211 else
3212 {
3213 // Can't be valid so emit error and try to skip past end of this decl
3214 fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
3215 fReaderMgr->skipPastChar(chCloseAngle);
3216 }
3217 }
3218
3219
3220 //
3221 // This method is called for a mixed model element's content mode. We've
3222 // already scanned past the '(PCDATA' part by the time we get here. So
3223 // everything else is element names separated by | characters until we
3224 // hit the end. The passed element decl's content model is filled in with
3225 // the information found.
3226 //
scanMixed(DTDElementDecl & toFill)3227 bool DTDScanner::scanMixed(DTDElementDecl& toFill)
3228 {
3229 //
3230 // The terminating star is only required if there is something more
3231 // than (PCDATA).
3232 //
3233 bool starRequired = false;
3234
3235 // Get a buffer to be used below to get element names
3236 XMLBufBid bbName(fBufMgr);
3237 XMLBuffer& nameBuf = bbName.getBuffer();
3238
3239 //
3240 // Create an initial content spec node. Its just a leaf node with a
3241 // PCDATA element id. This current node pointer will be pushed down the
3242 // tree as we go.
3243 //
3244 ContentSpecNode* curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
3245 (
3246 new (fGrammarPoolMemoryManager) QName
3247 (
3248 XMLUni::fgZeroLenString
3249 , XMLUni::fgZeroLenString
3250 , XMLElementDecl::fgPCDataElemId
3251 , fGrammarPoolMemoryManager
3252 )
3253 , false
3254 , fGrammarPoolMemoryManager
3255 );
3256
3257 //
3258 // Set the initial leaf as the temporary head. If we hit the first choice
3259 // node, it will be set up here. When done, this is the node that's set
3260 // as the content spec for the element.
3261 //
3262 ContentSpecNode* headNode = curNode;
3263
3264 // Remember the original node so we can sense the first choice node
3265 ContentSpecNode* orgNode = curNode;
3266
3267 //
3268 // We just loop around, getting the | character at the top and then
3269 // looking for the next element name. We keep up with the last node
3270 // and add each new one to its right node.
3271 //
3272 while (true)
3273 {
3274 //
3275 // First of all we check for some grunt work details of skipping
3276 // whitespace, expand PE refs, and catching invalid reps.
3277 //
3278 if (fReaderMgr->lookingAtChar(chPercent))
3279 {
3280 // Expand it and continue
3281 checkForPERef(false, true);
3282 }
3283 else if (fReaderMgr->skippedChar(chAsterisk))
3284 {
3285 //
3286 // Tell them they can't have reps in mixed model, but eat
3287 // it and keep going if we are allowed to.
3288 //
3289 if (fScanner->emitErrorWillThrowException(XMLErrs::NoRepInMixed))
3290 {
3291 delete headNode;
3292 }
3293 fScanner->emitError(XMLErrs::NoRepInMixed);
3294 }
3295 else if (fReaderMgr->skippedSpace())
3296 {
3297 // Spaces are ok at this point, just eat them and continue
3298 fReaderMgr->skipPastSpaces();
3299 }
3300 else
3301 {
3302 if (!fReaderMgr->skippedChar(chPipe))
3303 {
3304 // Has to be the closing paren now.
3305 if (!fReaderMgr->skippedChar(chCloseParen))
3306 {
3307 delete headNode;
3308 fScanner->emitError(XMLErrs::UnterminatedContentModel, toFill.getElementName()->getLocalPart());
3309 return false;
3310 }
3311
3312 bool starSkipped = true;
3313 if (!fReaderMgr->skippedChar(chAsterisk)) {
3314
3315 starSkipped = false;
3316
3317 if (starRequired)
3318 {
3319 if (fScanner->emitErrorWillThrowException(XMLErrs::ExpectedAsterisk))
3320 {
3321 delete headNode;
3322 }
3323 fScanner->emitError(XMLErrs::ExpectedAsterisk);
3324 }
3325 }
3326
3327 //
3328 // Create a zero or more node and make the original head
3329 // node its first child.
3330 //
3331 if (starRequired || starSkipped) {
3332 headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
3333 (
3334 ContentSpecNode::ZeroOrMore
3335 , headNode
3336 , 0
3337 , true
3338 , true
3339 , fGrammarPoolMemoryManager
3340 );
3341 }
3342
3343 // Store the head node as the content spec of the element.
3344 toFill.setContentSpec(headNode);
3345 break;
3346 }
3347
3348 // Its more than just a PCDATA, so an ending star will be required now
3349 starRequired = true;
3350
3351 // Space is legal here so check for a PE ref, but don't require space
3352 checkForPERef(false, true);
3353
3354 // Get a name token
3355 if (!fReaderMgr->getName(nameBuf))
3356 {
3357 delete headNode;
3358 fScanner->emitError(XMLErrs::ExpectedElementName);
3359 return false;
3360 }
3361
3362 //
3363 // Create a leaf node for it. If we can find the element id for
3364 // this element, then use it. Else, we have to fault in an element
3365 // decl, marked as created because of being in a content model.
3366 //
3367 XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, nameBuf.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
3368 if (!decl)
3369 {
3370 decl = new (fGrammarPoolMemoryManager) DTDElementDecl
3371 (
3372 nameBuf.getRawBuffer()
3373 , fEmptyNamespaceId
3374 , DTDElementDecl::Any
3375 , fGrammarPoolMemoryManager
3376 );
3377 decl->setCreateReason(XMLElementDecl::InContentModel);
3378 decl->setExternalElemDeclaration(isReadingExternalEntity());
3379 fDTDGrammar->putElemDecl(decl);
3380 }
3381
3382 //
3383 // If the current node is the original node, this is the first choice
3384 // node, so create an initial choice node with the current node and
3385 // the new element id. Store this as the head node.
3386 //
3387 // Otherwise, we have to steal the right node of the previous choice
3388 // and weave in another choice node there, which has the old choice
3389 // as its left and the new leaf as its right.
3390 //
3391 if (curNode == orgNode)
3392 {
3393 curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
3394 (
3395 ContentSpecNode::Choice
3396 , curNode
3397 , new (fGrammarPoolMemoryManager) ContentSpecNode
3398 (
3399 decl->getElementName()
3400 , fGrammarPoolMemoryManager
3401 )
3402 , true
3403 , true
3404 , fGrammarPoolMemoryManager
3405 );
3406
3407 // Remember the top node
3408 headNode = curNode;
3409 }
3410 else
3411 {
3412 ContentSpecNode* oldRight = curNode->orphanSecond();
3413 curNode->setSecond
3414 (
3415 new (fGrammarPoolMemoryManager) ContentSpecNode
3416 (
3417 ContentSpecNode::Choice
3418 , oldRight
3419 , new (fGrammarPoolMemoryManager) ContentSpecNode
3420 (
3421 decl->getElementName()
3422 , fGrammarPoolMemoryManager
3423 )
3424 , true
3425 , true
3426 , fGrammarPoolMemoryManager
3427 )
3428 );
3429
3430 // Make the new right node the current node
3431 curNode = curNode->getSecond();
3432 }
3433 }
3434 }
3435
3436 return true;
3437 }
3438
3439
3440 //
3441 // This method is called when we see a '<!NOTATION' string while scanning
3442 // markup decl. It parses out the notation and its id and stores a new
3443 // notation decl object in the notation decl pool.
3444 //
scanNotationDecl()3445 void DTDScanner::scanNotationDecl()
3446 {
3447 // Space is required here so check for a PE ref, and require space
3448 if (!checkForPERef(false, true))
3449 {
3450 fScanner->emitError(XMLErrs::ExpectedWhitespace);
3451 fReaderMgr->skipPastChar(chCloseAngle);
3452 return;
3453 }
3454
3455 //
3456 // And now we get a name, which is the name of the notation. Get a
3457 // buffer for the name.
3458 //
3459 XMLBufBid bbName(fBufMgr);
3460 if (!fReaderMgr->getName(bbName.getBuffer()))
3461 {
3462 fScanner->emitError(XMLErrs::ExpectedNotationName);
3463 fReaderMgr->skipPastChar(chCloseAngle);
3464 return;
3465 }
3466
3467 // If namespaces are enabled, then no colons allowed
3468 if (fScanner->getDoNamespaces())
3469 {
3470 if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
3471 fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
3472 }
3473
3474 // Space is required here so check for a PE ref, and require space
3475 if (!checkForPERef(false, true))
3476 {
3477 fScanner->emitError(XMLErrs::ExpectedWhitespace);
3478 fReaderMgr->skipPastChar(chCloseAngle);
3479 return;
3480 }
3481
3482 //
3483 // And scan an external or public id. We need buffers to use for both
3484 // of these.
3485 //
3486 XMLBufBid bbPubId(fBufMgr);
3487 XMLBufBid bbSysId(fBufMgr);
3488 if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_Either))
3489 {
3490 fReaderMgr->skipPastChar(chCloseAngle);
3491 return;
3492 }
3493
3494 // We can have an optional space or PE ref here
3495 checkForPERef(false, true);
3496
3497 //
3498 // See if it already exists. If so, add it to the notatino decl pool.
3499 // Otherwise, if advanced callbacks are on, create a temp one and
3500 // call out for that one.
3501 //
3502 XMLNotationDecl* decl = fDTDGrammar->getNotationDecl(bbName.getRawBuffer());
3503 bool isIgnoring = (decl != 0);
3504 if (isIgnoring)
3505 {
3506 fScanner->emitError(XMLErrs::NotationAlreadyExists, bbName.getRawBuffer());
3507 }
3508 else
3509 {
3510 // Fill in a new notation declaration and add it to the pool
3511 const XMLCh* publicId = bbPubId.getRawBuffer();
3512 const XMLCh* systemId = bbSysId.getRawBuffer();
3513 ReaderMgr::LastExtEntityInfo lastInfo;
3514 fReaderMgr->getLastExtEntityInfo(lastInfo);
3515
3516 decl = new (fGrammarPoolMemoryManager) XMLNotationDecl
3517 (
3518 bbName.getRawBuffer()
3519 , (publicId && *publicId) ? publicId : 0
3520 , (systemId && *systemId) ? systemId : 0
3521 , (lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0
3522 , fGrammarPoolMemoryManager
3523 );
3524 fDTDGrammar->putNotationDecl(decl);
3525 }
3526
3527 //
3528 // If we have a document type handler, then tell it about this. If we
3529 // are ignoring it, only call out if advanced callbacks are enabled.
3530 //
3531 if (fDocTypeHandler)
3532 {
3533 fDocTypeHandler->notationDecl
3534 (
3535 *decl
3536 , isIgnoring
3537 );
3538 }
3539
3540 // And one more optional space or PE ref
3541 checkForPERef(false, true);
3542
3543 // And skip the terminating bracket
3544 if (!fReaderMgr->skippedChar(chCloseAngle))
3545 fScanner->emitError(XMLErrs::UnterminatedNotationDecl);
3546 }
3547
3548
3549 //
3550 // Scans a PI and calls the appropriate callbacks. A PI can happen in either
3551 // the document or the DTD, so it calls the appropriate handler according
3552 // to the fInDocument flag.
3553 //
3554 // At entry we have just scanned the <? part, and need to now start on the
3555 // PI target name.
3556 //
scanPI()3557 void DTDScanner::scanPI()
3558 {
3559 const XMLCh* namePtr = 0;
3560 const XMLCh* targetPtr = 0;
3561
3562 //
3563 // If there are any spaces here, then warn about it. If we aren't in
3564 // 'first error' mode, then we'll come back and can easily pick up
3565 // again by just skipping them.
3566 //
3567 if (fReaderMgr->lookingAtSpace())
3568 {
3569 fScanner->emitError(XMLErrs::PINameExpected);
3570 fReaderMgr->skipPastSpaces();
3571 }
3572
3573 // Get a buffer for the PI name and scan it in
3574 XMLBufBid bbName(fBufMgr);
3575 if (!fReaderMgr->getName(bbName.getBuffer()))
3576 {
3577 fScanner->emitError(XMLErrs::PINameExpected);
3578 fReaderMgr->skipPastChar(chCloseAngle);
3579 return;
3580 }
3581
3582 // Point the name pointer at the raw data
3583 namePtr = bbName.getRawBuffer();
3584
3585 // See if it issome form of 'xml' and emit a warning
3586 //if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
3587 if (bbName.getLen() == 3 &&
3588 (((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) &&
3589 ((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) &&
3590 ((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L))))
3591 fScanner->emitError(XMLErrs::NoPIStartsWithXML);
3592
3593 // If namespaces are enabled, then no colons allowed
3594 if (fScanner->getDoNamespaces())
3595 {
3596 if (XMLString::indexOf(namePtr, chColon) != -1)
3597 fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
3598 }
3599
3600 //
3601 // If we don't hit a space next, then the PI has no target. If we do
3602 // then get out the target. Get a buffer for it as well
3603 //
3604 XMLBufBid bbTarget(fBufMgr);
3605 if (fReaderMgr->skippedSpace())
3606 {
3607 // Skip any leading spaces
3608 fReaderMgr->skipPastSpaces();
3609
3610 bool gotLeadingSurrogate = false;
3611
3612 // It does have a target, so lets move on to deal with that.
3613 while (1)
3614 {
3615 const XMLCh nextCh = fReaderMgr->getNextChar();
3616
3617 // Watch for an end of file, which is always bad here
3618 if (!nextCh)
3619 {
3620 fScanner->emitError(XMLErrs::UnterminatedPI);
3621 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3622 }
3623
3624 // Watch for potential terminating character
3625 if (nextCh == chQuestion)
3626 {
3627 // It must be followed by '>' to be a termination of the target
3628 if (fReaderMgr->skippedChar(chCloseAngle))
3629 break;
3630 }
3631
3632 // Check for correct surrogate pairs
3633 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
3634 {
3635 if (gotLeadingSurrogate)
3636 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
3637 else
3638 gotLeadingSurrogate = true;
3639 }
3640 else
3641 {
3642 if (gotLeadingSurrogate)
3643 {
3644 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
3645 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
3646 }
3647 // Its got to at least be a valid XML character
3648 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
3649
3650 XMLCh tmpBuf[9];
3651 XMLString::binToText
3652 (
3653 nextCh
3654 , tmpBuf
3655 , 8
3656 , 16
3657 , fMemoryManager
3658 );
3659 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
3660 }
3661
3662 gotLeadingSurrogate = false;
3663 }
3664 bbTarget.append(nextCh);
3665 }
3666 }
3667 else
3668 {
3669 // No target, but make sure its terminated ok
3670 if (!fReaderMgr->skippedChar(chQuestion))
3671 {
3672 fScanner->emitError(XMLErrs::UnterminatedPI);
3673 fReaderMgr->skipPastChar(chCloseAngle);
3674 return;
3675 }
3676
3677 if (!fReaderMgr->skippedChar(chCloseAngle))
3678 {
3679 fScanner->emitError(XMLErrs::UnterminatedPI);
3680 fReaderMgr->skipPastChar(chCloseAngle);
3681 return;
3682 }
3683 }
3684
3685 // Point the target pointer at the raw data
3686 targetPtr = bbTarget.getRawBuffer();
3687
3688 //
3689 // If we have a handler, then call it.
3690 //
3691 if (fDocTypeHandler)
3692 {
3693 fDocTypeHandler->doctypePI
3694 (
3695 namePtr
3696 , targetPtr
3697 );
3698 }
3699 }
3700
3701
3702 //
3703 // This method scans a public literal. It must be quoted and all of its
3704 // characters must be valid public id characters. The quotes are discarded
3705 // and the results are returned.
3706 //
scanPublicLiteral(XMLBuffer & toFill)3707 bool DTDScanner::scanPublicLiteral(XMLBuffer& toFill)
3708 {
3709 toFill.reset();
3710
3711 // Get the next char which must be a single or double quote
3712 XMLCh quoteCh;
3713 if (!fReaderMgr->skipIfQuote(quoteCh)) {
3714 fScanner->emitError(XMLErrs::ExpectedQuotedString);
3715 return false;
3716 }
3717
3718 while (true)
3719 {
3720 const XMLCh nextCh = fReaderMgr->getNextChar();
3721
3722 // Watch for EOF
3723 if (!nextCh)
3724 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3725
3726 if (nextCh == quoteCh)
3727 break;
3728
3729 //
3730 // If its not a valid public id char, then report it but keep going
3731 // since that's the best recovery scheme.
3732 //
3733 if (!fReaderMgr->getCurrentReader()->isPublicIdChar(nextCh))
3734 {
3735 XMLCh tmpBuf[9];
3736 XMLString::binToText
3737 (
3738 nextCh
3739 , tmpBuf
3740 , 8
3741 , 16
3742 , fMemoryManager
3743 );
3744 fScanner->emitError(XMLErrs::InvalidPublicIdChar, tmpBuf);
3745 }
3746
3747 toFill.append(nextCh);
3748 }
3749 return true;
3750 }
3751
3752
3753 //
3754 // This method handles scanning in a quoted system literal. It expects to
3755 // start on the open quote and returns after eating the ending quote. There
3756 // are not really any restrictions on the contents of system literals.
3757 //
scanSystemLiteral(XMLBuffer & toFill)3758 bool DTDScanner::scanSystemLiteral(XMLBuffer& toFill)
3759 {
3760 toFill.reset();
3761
3762 // Get the next char which must be a single or double quote
3763 XMLCh quoteCh;
3764 if (!fReaderMgr->skipIfQuote(quoteCh)) {
3765 fScanner->emitError(XMLErrs::ExpectedQuotedString);
3766 return false;
3767 }
3768
3769 XMLCh nextCh;
3770 // Break out on terminating quote
3771 while ((nextCh=fReaderMgr->getNextChar())!=quoteCh)
3772 {
3773 // Watch for EOF
3774 if (!nextCh)
3775 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
3776 toFill.append(nextCh);
3777 }
3778 return true;
3779 }
3780
3781
3782
3783 //
3784 // This method is called to scan a text decl line, which can be the first
3785 // line in an external entity or external subset.
3786 //
3787 // On entry the <? has been scanned, and next should be 'xml' followed by
3788 // some whitespace, version string, etc...
3789 // [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
3790 //
scanTextDecl()3791 void DTDScanner::scanTextDecl()
3792 {
3793 // Skip any subsequent whitespace before the version string
3794 fReaderMgr->skipPastSpaces();
3795
3796 // Next should be the version string
3797 XMLBufBid bbVersion(fBufMgr);
3798 if (fReaderMgr->skippedString(XMLUni::fgVersionString))
3799 {
3800 if (!scanEq())
3801 {
3802 fScanner->emitError(XMLErrs::ExpectedEqSign);
3803 fReaderMgr->skipPastChar(chCloseAngle);
3804 return;
3805 }
3806
3807 //
3808 // Followed by a single or double quoted version. Get a buffer for
3809 // the string.
3810 //
3811 if (!getQuotedString(bbVersion.getBuffer()))
3812 {
3813 fScanner->emitError(XMLErrs::BadXMLVersion);
3814 fReaderMgr->skipPastChar(chCloseAngle);
3815 return;
3816 }
3817
3818 // If its not our supported version, issue an error but continue
3819 if (XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_1)) {
3820 if (fScanner->getXMLVersion() != XMLReader::XMLV1_1)
3821 fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
3822 }
3823 else if (!XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_0))
3824 fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
3825 }
3826
3827 // Ok, now we must have an encoding string
3828 XMLBufBid bbEncoding(fBufMgr);
3829 fReaderMgr->skipPastSpaces();
3830 bool gotEncoding = false;
3831 if (fReaderMgr->skippedString(XMLUni::fgEncodingString))
3832 {
3833 // There must be a equal sign next
3834 if (!scanEq())
3835 {
3836 fScanner->emitError(XMLErrs::ExpectedEqSign);
3837 fReaderMgr->skipPastChar(chCloseAngle);
3838 return;
3839 }
3840
3841 // Followed by a single or double quoted version string
3842 getQuotedString(bbEncoding.getBuffer());
3843 if (bbEncoding.isEmpty() || !XMLString::isValidEncName(bbEncoding.getRawBuffer()))
3844 {
3845 fScanner->emitError(XMLErrs::BadXMLEncoding, bbEncoding.getRawBuffer());
3846 fReaderMgr->skipPastChar(chCloseAngle);
3847 return;
3848 }
3849
3850 // Indicate that we got an encoding
3851 gotEncoding = true;
3852 }
3853
3854 //
3855 // Encoding declarations are required in the external entity
3856 // if there is a text declaration present
3857 //
3858 if (!gotEncoding)
3859 {
3860 fScanner->emitError(XMLErrs::EncodingRequired);
3861 fReaderMgr->skipPastChar(chCloseAngle);
3862 return;
3863
3864 }
3865
3866 fReaderMgr->skipPastSpaces();
3867 if (!fReaderMgr->skippedChar(chQuestion))
3868 {
3869 fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
3870 fReaderMgr->skipPastChar(chCloseAngle);
3871 }
3872 else if (!fReaderMgr->skippedChar(chCloseAngle))
3873 {
3874 fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
3875 fReaderMgr->skipPastChar(chCloseAngle);
3876 }
3877
3878 //
3879 // If we have a document type handler and advanced callbacks are on,
3880 // then call the TextDecl callback
3881 //
3882 if (fDocTypeHandler)
3883 {
3884 fDocTypeHandler->TextDecl
3885 (
3886 bbVersion.getRawBuffer()
3887 , bbEncoding.getRawBuffer()
3888 );
3889 }
3890
3891 //
3892 // If we got an encoding string, then we have to call back on the reader
3893 // to tell it what the encoding is.
3894 //
3895 if (!bbEncoding.isEmpty())
3896 {
3897 if (!fReaderMgr->getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
3898 fScanner->emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
3899 }
3900 }
3901
3902 XERCES_CPP_NAMESPACE_END
3903